Mirror of https://github.com/neondatabase/neon.git, synced 2026-02-13 23:50:36 +00:00

Compare commits: gc_feedbac... -> add-trace- (1 commit: fcc3acfd7e)
184  .github/actions/allure-report-generate/action.yml  vendored
@@ -1,184 +0,0 @@
|
||||
name: 'Create Allure report'
|
||||
description: 'Generate Allure report from test results uploaded by actions/allure-report-store'
|
||||
|
||||
outputs:
|
||||
report-url:
|
||||
description: 'Allure report URL'
|
||||
value: ${{ steps.generate-report.outputs.report-url }}
|
||||
report-json-url:
|
||||
description: 'Allure report JSON URL'
|
||||
value: ${{ steps.generate-report.outputs.report-json-url }}
|
||||
|
||||
runs:
|
||||
using: "composite"
|
||||
|
||||
steps:
|
||||
# We use some of these env variables quite often, so let's set them once.
|
||||
#
|
||||
# It would be nice to have them set in common runs.env[0] section, but it doesn't work[1]
|
||||
#
|
||||
# - [0] https://docs.github.com/en/actions/creating-actions/metadata-syntax-for-github-actions#runsenv
|
||||
# - [1] https://github.com/neondatabase/neon/pull/3907#discussion_r1154703456
|
||||
#
|
||||
- name: Set variables
|
||||
shell: bash -euxo pipefail {0}
|
||||
run: |
|
||||
PR_NUMBER=$(jq --raw-output .pull_request.number "$GITHUB_EVENT_PATH" || true)
|
||||
if [ "${PR_NUMBER}" != "null" ]; then
|
||||
BRANCH_OR_PR=pr-${PR_NUMBER}
|
||||
elif [ "${GITHUB_REF_NAME}" = "main" ] || [ "${GITHUB_REF_NAME}" = "release" ]; then
|
||||
# Shortcut for special branches
|
||||
BRANCH_OR_PR=${GITHUB_REF_NAME}
|
||||
else
|
||||
BRANCH_OR_PR=branch-$(printf "${GITHUB_REF_NAME}" | tr -c "[:alnum:]._-" "-")
|
||||
fi
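For illustration, a quick sketch of the keys this derivation yields (the ref names below are hypothetical; `tr -c "[:alnum:]._-" "-"` replaces every character outside that set with a dash, and the same derivation reappears in the new allure-report action further down):

# pull_request event with number 1234      -> BRANCH_OR_PR=pr-1234
# push to main or release                  -> BRANCH_OR_PR=main or BRANCH_OR_PR=release
# push to any other branch, e.g. feature/allure-v2:
printf "feature/allure-v2" | tr -c "[:alnum:]._-" "-"   # prints "feature-allure-v2"
#                                           -> BRANCH_OR_PR=branch-feature-allure-v2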
|
||||
|
||||
LOCK_FILE=reports/${BRANCH_OR_PR}/lock.txt
|
||||
|
||||
WORKDIR=/tmp/${BRANCH_OR_PR}-$(date +%s)
|
||||
mkdir -p ${WORKDIR}
|
||||
|
||||
echo "BRANCH_OR_PR=${BRANCH_OR_PR}" >> $GITHUB_ENV
|
||||
echo "LOCK_FILE=${LOCK_FILE}" >> $GITHUB_ENV
|
||||
echo "WORKDIR=${WORKDIR}" >> $GITHUB_ENV
|
||||
echo "BUCKET=${BUCKET}" >> $GITHUB_ENV
|
||||
env:
|
||||
BUCKET: neon-github-public-dev
|
||||
|
||||
# TODO: We can replace with a special docker image with Java and Allure pre-installed
|
||||
- uses: actions/setup-java@v3
|
||||
with:
|
||||
distribution: 'temurin'
|
||||
java-version: '17'
|
||||
|
||||
- name: Install Allure
|
||||
shell: bash -euxo pipefail {0}
|
||||
run: |
|
||||
if ! which allure; then
|
||||
ALLURE_ZIP=allure-${ALLURE_VERSION}.zip
|
||||
wget -q https://github.com/allure-framework/allure2/releases/download/${ALLURE_VERSION}/${ALLURE_ZIP}
|
||||
echo "${ALLURE_ZIP_MD5} ${ALLURE_ZIP}" | md5sum -c
|
||||
unzip -q ${ALLURE_ZIP}
|
||||
echo "$(pwd)/allure-${ALLURE_VERSION}/bin" >> $GITHUB_PATH
|
||||
rm -f ${ALLURE_ZIP}
|
||||
fi
|
||||
env:
|
||||
ALLURE_VERSION: 2.22.0
|
||||
ALLURE_ZIP_MD5: d5c9f0989b896482536956340a7d5ec9
|
||||
|
||||
# Potentially we could have several running builds for the same key (for example, for the main branch), so we use an improvised lock for this
|
||||
- name: Acquire lock
|
||||
shell: bash -euxo pipefail {0}
|
||||
run: |
|
||||
LOCK_TIMEOUT=300 # seconds
|
||||
|
||||
LOCK_CONTENT="${GITHUB_RUN_ID}-${GITHUB_RUN_ATTEMPT}"
|
||||
echo ${LOCK_CONTENT} > ${WORKDIR}/lock.txt
|
||||
|
||||
# Do it up to 5 times to avoid race condition
|
||||
for _ in $(seq 1 5); do
|
||||
for i in $(seq 1 ${LOCK_TIMEOUT}); do
|
||||
LOCK_ACQUIRED=$(aws s3api head-object --bucket neon-github-public-dev --key ${LOCK_FILE} | jq --raw-output '.LastModified' || true)
|
||||
# `date --date="..."` is supported only by gnu date (i.e. it doesn't work on BSD/macOS)
|
||||
if [ -z "${LOCK_ACQUIRED}" ] || [ "$(( $(date +%s) - $(date --date="${LOCK_ACQUIRED}" +%s) ))" -gt "${LOCK_TIMEOUT}" ]; then
|
||||
break
|
||||
fi
|
||||
sleep 1
|
||||
done
|
||||
|
||||
aws s3 mv --only-show-errors ${WORKDIR}/lock.txt "s3://${BUCKET}/${LOCK_FILE}"
|
||||
|
||||
# Double-check that exactly THIS run has acquired the lock
|
||||
aws s3 cp --only-show-errors "s3://${BUCKET}/${LOCK_FILE}" ./lock.txt
|
||||
if [ "$(cat lock.txt)" = "${LOCK_CONTENT}" ]; then
|
||||
break
|
||||
fi
|
||||
done
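The locking scheme is easy to miss in the middle of that step, so here is a condensed sketch of the same idea as a standalone bash function (a sketch only: names are made up, `cp` is used instead of `mv`, and S3 offers no atomic compare-and-swap, which is exactly why the code re-reads the lock object and retries up to five times — the same pattern the new allure-report action below reuses):

# Sketch of the improvised S3 lock used above; BUCKET and LOCK_FILE are assumed to be set.
acquire_s3_lock() {
    local owner="$1" timeout="${2:-300}"
    echo "${owner}" > my-lock.txt
    for _ in $(seq 1 5); do
        # Wait until the lock object is missing or stale (older than the timeout).
        for _ in $(seq 1 "${timeout}"); do
            held_since=$(aws s3api head-object --bucket "${BUCKET}" --key "${LOCK_FILE}" | jq --raw-output '.LastModified' || true)
            # `date --date=...` is GNU-only, same caveat as in the step above
            if [ -z "${held_since}" ] || [ "$(( $(date +%s) - $(date --date="${held_since}" +%s) ))" -gt "${timeout}" ]; then
                break
            fi
            sleep 1
        done
        # Claim the lock, then read it back: if another run won the race, try again.
        aws s3 cp --only-show-errors my-lock.txt "s3://${BUCKET}/${LOCK_FILE}"
        aws s3 cp --only-show-errors "s3://${BUCKET}/${LOCK_FILE}" current-lock.txt
        if [ "$(cat current-lock.txt)" = "${owner}" ]; then
            return 0
        fi
    done
    return 1
}

# Usage (hypothetical): acquire_s3_lock "${GITHUB_RUN_ID}-${GITHUB_RUN_ATTEMPT}" 300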
|
||||
|
||||
- name: Generate and publish final Allure report
|
||||
id: generate-report
|
||||
shell: bash -euxo pipefail {0}
|
||||
run: |
|
||||
REPORT_PREFIX=reports/${BRANCH_OR_PR}
|
||||
RAW_PREFIX=reports-raw/${BRANCH_OR_PR}/${GITHUB_RUN_ID}
|
||||
|
||||
# Get previously uploaded data for this run
|
||||
ZSTD_NBTHREADS=0
|
||||
|
||||
S3_FILEPATHS=$(aws s3api list-objects-v2 --bucket ${BUCKET} --prefix ${RAW_PREFIX}/ | jq --raw-output '.Contents[].Key')
|
||||
if [ -z "$S3_FILEPATHS" ]; then
|
||||
# There's no previously uploaded data for this $GITHUB_RUN_ID
|
||||
exit 0
|
||||
fi
|
||||
for S3_FILEPATH in ${S3_FILEPATHS}; do
|
||||
time aws s3 cp --only-show-errors "s3://${BUCKET}/${S3_FILEPATH}" "${WORKDIR}"
|
||||
|
||||
archive=${WORKDIR}/$(basename $S3_FILEPATH)
|
||||
mkdir -p ${archive%.tar.zst}
|
||||
time tar -xf ${archive} -C ${archive%.tar.zst}
|
||||
rm -f ${archive}
|
||||
done
|
||||
|
||||
# Get history trend
|
||||
time aws s3 cp --recursive --only-show-errors "s3://${BUCKET}/${REPORT_PREFIX}/latest/history" "${WORKDIR}/latest/history" || true
|
||||
|
||||
# Generate report
|
||||
time allure generate --clean --output ${WORKDIR}/report ${WORKDIR}/*
|
||||
|
||||
# Replace a logo link with a redirect to the latest version of the report
|
||||
sed -i 's|<a href="." class=|<a href="https://'${BUCKET}'.s3.amazonaws.com/'${REPORT_PREFIX}'/latest/index.html?nocache='"'+Date.now()+'"'" class=|g' ${WORKDIR}/report/app.js
|
||||
|
||||
# Upload the history and the final report (in this particular order so as not to have duplicated history in 2 places)
|
||||
time aws s3 mv --recursive --only-show-errors "${WORKDIR}/report/history" "s3://${BUCKET}/${REPORT_PREFIX}/latest/history"
|
||||
time aws s3 mv --recursive --only-show-errors "${WORKDIR}/report" "s3://${BUCKET}/${REPORT_PREFIX}/${GITHUB_RUN_ID}"
|
||||
|
||||
REPORT_URL=https://${BUCKET}.s3.amazonaws.com/${REPORT_PREFIX}/${GITHUB_RUN_ID}/index.html
|
||||
|
||||
# Generate redirect
|
||||
cat <<EOF > ${WORKDIR}/index.html
|
||||
<!DOCTYPE html>
|
||||
|
||||
<meta charset="utf-8">
|
||||
<title>Redirecting to ${REPORT_URL}</title>
|
||||
<meta http-equiv="refresh" content="0; URL=${REPORT_URL}">
|
||||
EOF
|
||||
time aws s3 cp --only-show-errors ${WORKDIR}/index.html "s3://${BUCKET}/${REPORT_PREFIX}/latest/index.html"
|
||||
|
||||
echo "report-url=${REPORT_URL}" >> $GITHUB_OUTPUT
|
||||
echo "report-json-url=${REPORT_URL%/index.html}/data/suites.json" >> $GITHUB_OUTPUT
|
||||
|
||||
- name: Release lock
|
||||
if: always()
|
||||
shell: bash -euxo pipefail {0}
|
||||
run: |
|
||||
aws s3 cp --only-show-errors "s3://${BUCKET}/${LOCK_FILE}" ./lock.txt || exit 0
|
||||
|
||||
if [ "$(cat lock.txt)" = "${GITHUB_RUN_ID}-${GITHUB_RUN_ATTEMPT}" ]; then
|
||||
aws s3 rm "s3://${BUCKET}/${LOCK_FILE}"
|
||||
fi
|
||||
|
||||
- name: Cleanup
|
||||
if: always()
|
||||
shell: bash -euxo pipefail {0}
|
||||
run: |
|
||||
if [ -d "${WORKDIR}" ]; then
|
||||
rm -rf ${WORKDIR}
|
||||
fi
|
||||
|
||||
- uses: actions/github-script@v6
|
||||
if: always()
|
||||
env:
|
||||
REPORT_URL: ${{ steps.generate-report.outputs.report-url }}
|
||||
COMMIT_SHA: ${{ github.event.pull_request.head.sha || github.sha }}
|
||||
with:
|
||||
script: |
|
||||
const { REPORT_URL, COMMIT_SHA } = process.env
|
||||
|
||||
await github.rest.repos.createCommitStatus({
|
||||
owner: context.repo.owner,
|
||||
repo: context.repo.repo,
|
||||
sha: `${COMMIT_SHA}`,
|
||||
state: 'success',
|
||||
target_url: `${REPORT_URL}`,
|
||||
context: 'Allure report',
|
||||
})
|
||||
72  .github/actions/allure-report-store/action.yml  vendored
@@ -1,72 +0,0 @@
name: 'Store Allure results'
description: 'Upload test results to be used by actions/allure-report-generate'

inputs:
  report-dir:
    description: 'directory with test results generated by tests'
    required: true
  unique-key:
    description: 'string to distinguish different results in the same run'
    required: true

runs:
  using: "composite"

  steps:
    - name: Set variables
      shell: bash -euxo pipefail {0}
      run: |
        PR_NUMBER=$(jq --raw-output .pull_request.number "$GITHUB_EVENT_PATH" || true)
        if [ "${PR_NUMBER}" != "null" ]; then
          BRANCH_OR_PR=pr-${PR_NUMBER}
        elif [ "${GITHUB_REF_NAME}" = "main" ] || [ "${GITHUB_REF_NAME}" = "release" ]; then
          # Shortcut for special branches
          BRANCH_OR_PR=${GITHUB_REF_NAME}
        else
          BRANCH_OR_PR=branch-$(printf "${GITHUB_REF_NAME}" | tr -c "[:alnum:]._-" "-")
        fi

        echo "BRANCH_OR_PR=${BRANCH_OR_PR}" >> $GITHUB_ENV
        echo "REPORT_DIR=${REPORT_DIR}" >> $GITHUB_ENV
      env:
        REPORT_DIR: ${{ inputs.report-dir }}

    - name: Upload test results
      shell: bash -euxo pipefail {0}
      run: |
        REPORT_PREFIX=reports/${BRANCH_OR_PR}
        RAW_PREFIX=reports-raw/${BRANCH_OR_PR}/${GITHUB_RUN_ID}

        # Add metadata
        cat <<EOF > ${REPORT_DIR}/executor.json
        {
          "name": "GitHub Actions",
          "type": "github",
          "url": "https://${BUCKET}.s3.amazonaws.com/${REPORT_PREFIX}/latest/index.html",
          "buildOrder": ${GITHUB_RUN_ID},
          "buildName": "GitHub Actions Run #${GITHUB_RUN_NUMBER}/${GITHUB_RUN_ATTEMPT}",
          "buildUrl": "${GITHUB_SERVER_URL}/${GITHUB_REPOSITORY}/actions/runs/${GITHUB_RUN_ID}/attempts/${GITHUB_RUN_ATTEMPT}",
          "reportUrl": "https://${BUCKET}.s3.amazonaws.com/${REPORT_PREFIX}/${GITHUB_RUN_ID}/index.html",
          "reportName": "Allure Report"
        }
        EOF

        cat <<EOF > ${REPORT_DIR}/environment.properties
        COMMIT_SHA=${COMMIT_SHA}
        EOF

        ARCHIVE="${UNIQUE_KEY}-${GITHUB_RUN_ATTEMPT}-$(date +%s).tar.zst"
        ZSTD_NBTHREADS=0

        time tar -C ${REPORT_DIR} -cf ${ARCHIVE} --zstd .
        time aws s3 mv --only-show-errors ${ARCHIVE} "s3://${BUCKET}/${RAW_PREFIX}/${ARCHIVE}"
      env:
        UNIQUE_KEY: ${{ inputs.unique-key }}
        COMMIT_SHA: ${{ github.event.pull_request.head.sha || github.sha }}
        BUCKET: neon-github-public-dev

    - name: Cleanup
      if: always()
      shell: bash -euxo pipefail {0}
      run: |
        rm -rf ${REPORT_DIR}
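Taken together with the generate action above, the S3 layout these two (now superseded) actions maintained looks roughly like this — bucket and key patterns come from the code above, while the PR number, run id and timestamp are hypothetical:

s3://neon-github-public-dev/
  reports-raw/pr-1234/4700000000/release-1-1680000000.tar.zst   # raw results uploaded by allure-report-store
  reports/pr-1234/lock.txt                                      # improvised lock while a report is being generated
  reports/pr-1234/4700000000/index.html                         # generated report for one run
  reports/pr-1234/latest/index.html                             # redirect to the newest report
  reports/pr-1234/latest/history/                               # history trend carried between runs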
254  .github/actions/allure-report/action.yml  vendored  Normal file
@@ -0,0 +1,254 @@
|
||||
name: 'Create Allure report'
|
||||
description: 'Create and publish Allure report'
|
||||
|
||||
inputs:
|
||||
action:
|
||||
description: 'generate or store'
|
||||
required: true
|
||||
build_type:
|
||||
description: '`build_type` from run-python-test-set action'
|
||||
required: true
|
||||
test_selection:
|
||||
description: '`test_selector` from run-python-test-set action'
|
||||
required: false
|
||||
outputs:
|
||||
report-url:
|
||||
description: 'Allure report URL'
|
||||
value: ${{ steps.generate-report.outputs.report-url }}
|
||||
report-json-url:
|
||||
description: 'Allure report JSON URL'
|
||||
value: ${{ steps.generate-report.outputs.report-json-url }}
|
||||
|
||||
runs:
|
||||
using: "composite"
|
||||
|
||||
steps:
|
||||
# We use some of these env variables quite often, so let's set them once.
|
||||
#
|
||||
# It would be nice to have them set in common runs.env[0] section, but it doesn't work[1]
|
||||
#
|
||||
# - [0] https://docs.github.com/en/actions/creating-actions/metadata-syntax-for-github-actions#runsenv
|
||||
# - [1] https://github.com/neondatabase/neon/pull/3907#discussion_r1154703456
|
||||
#
|
||||
- name: Set common environment variables
|
||||
shell: bash -euxo pipefail {0}
|
||||
run: |
|
||||
echo "BUILD_TYPE=${BUILD_TYPE}" >> $GITHUB_ENV
|
||||
echo "BUCKET=${BUCKET}" >> $GITHUB_ENV
|
||||
echo "TEST_OUTPUT=${TEST_OUTPUT}" >> $GITHUB_ENV
|
||||
env:
|
||||
BUILD_TYPE: ${{ inputs.build_type }}
|
||||
BUCKET: neon-github-public-dev
|
||||
TEST_OUTPUT: /tmp/test_output
|
||||
|
||||
- name: Validate input parameters
|
||||
shell: bash -euxo pipefail {0}
|
||||
run: |
|
||||
if [ "${{ inputs.action }}" != "store" ] && [ "${{ inputs.action }}" != "generate" ]; then
|
||||
echo >&2 "Unknown inputs.action type '${{ inputs.action }}'; allowed 'generate' or 'store' only"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
if [ -z "${{ inputs.test_selection }}" ] && [ "${{ inputs.action }}" == "store" ]; then
|
||||
echo >&2 "inputs.test_selection must be set for 'store' action"
|
||||
exit 2
|
||||
fi
|
||||
|
||||
- name: Calculate variables
|
||||
id: calculate-vars
|
||||
shell: bash -euxo pipefail {0}
|
||||
run: |
|
||||
# TODO: for manually triggered workflows (via workflow_dispatch) we need to have a separate key
|
||||
|
||||
pr_number=$(jq --raw-output .pull_request.number "$GITHUB_EVENT_PATH" || true)
|
||||
if [ "${pr_number}" != "null" ]; then
|
||||
key=pr-${pr_number}
|
||||
elif [ "${GITHUB_REF_NAME}" = "main" ]; then
|
||||
# Shortcut for a special branch
|
||||
key=main
|
||||
elif [ "${GITHUB_REF_NAME}" = "release" ]; then
|
||||
# Shortcut for a special branch
|
||||
key=release
|
||||
else
|
||||
key=branch-$(printf "${GITHUB_REF_NAME}" | tr -c "[:alnum:]._-" "-")
|
||||
fi
|
||||
echo "KEY=${key}" >> $GITHUB_OUTPUT
|
||||
|
||||
# Sanitize test selection to remove `/` and any other special characters
|
||||
# Use printf instead of echo to avoid having `\n` at the end of the string
|
||||
test_selection=$(printf "${{ inputs.test_selection }}" | tr -c "[:alnum:]._-" "-" )
|
||||
echo "TEST_SELECTION=${test_selection}" >> $GITHUB_OUTPUT
|
||||
|
||||
- uses: actions/setup-java@v3
|
||||
if: ${{ inputs.action == 'generate' }}
|
||||
with:
|
||||
distribution: 'temurin'
|
||||
java-version: '17'
|
||||
|
||||
- name: Install Allure
|
||||
if: ${{ inputs.action == 'generate' }}
|
||||
shell: bash -euxo pipefail {0}
|
||||
run: |
|
||||
if ! which allure; then
|
||||
ALLURE_ZIP=allure-${ALLURE_VERSION}.zip
|
||||
wget -q https://github.com/allure-framework/allure2/releases/download/${ALLURE_VERSION}/${ALLURE_ZIP}
|
||||
echo "${ALLURE_ZIP_MD5} ${ALLURE_ZIP}" | md5sum -c
|
||||
unzip -q ${ALLURE_ZIP}
|
||||
echo "$(pwd)/allure-${ALLURE_VERSION}/bin" >> $GITHUB_PATH
|
||||
rm -f ${ALLURE_ZIP}
|
||||
fi
|
||||
env:
|
||||
ALLURE_VERSION: 2.21.0
|
||||
ALLURE_ZIP_MD5: c8db4dd8e2a7882583d569ed2c82879c
|
||||
|
||||
- name: Upload Allure results
|
||||
if: ${{ inputs.action == 'store' }}
|
||||
env:
|
||||
REPORT_PREFIX: reports/${{ steps.calculate-vars.outputs.KEY }}/${{ inputs.build_type }}
|
||||
RAW_PREFIX: reports-raw/${{ steps.calculate-vars.outputs.KEY }}/${{ inputs.build_type }}
|
||||
TEST_SELECTION: ${{ steps.calculate-vars.outputs.TEST_SELECTION }}
|
||||
shell: bash -euxo pipefail {0}
|
||||
run: |
|
||||
# Add metadata
|
||||
cat <<EOF > $TEST_OUTPUT/allure/results/executor.json
|
||||
{
|
||||
"name": "GitHub Actions",
|
||||
"type": "github",
|
||||
"url": "https://${BUCKET}.s3.amazonaws.com/${REPORT_PREFIX}/latest/index.html",
|
||||
"buildOrder": ${GITHUB_RUN_ID},
|
||||
"buildName": "GitHub Actions Run #${{ github.run_number }}/${GITHUB_RUN_ATTEMPT}",
|
||||
"buildUrl": "${GITHUB_SERVER_URL}/${GITHUB_REPOSITORY}/actions/runs/${GITHUB_RUN_ID}/attempts/${GITHUB_RUN_ATTEMPT}",
|
||||
"reportUrl": "https://${BUCKET}.s3.amazonaws.com/${REPORT_PREFIX}/${GITHUB_RUN_ID}/index.html",
|
||||
"reportName": "Allure Report"
|
||||
}
|
||||
EOF
|
||||
cat <<EOF > $TEST_OUTPUT/allure/results/environment.properties
|
||||
TEST_SELECTION=${{ inputs.test_selection }}
|
||||
BUILD_TYPE=${BUILD_TYPE}
|
||||
EOF
|
||||
|
||||
ARCHIVE="${GITHUB_RUN_ID}-${TEST_SELECTION}-${GITHUB_RUN_ATTEMPT}-$(date +%s).tar.zst"
|
||||
ZSTD_NBTHREADS=0
|
||||
|
||||
tar -C ${TEST_OUTPUT}/allure/results -cf ${ARCHIVE} --zstd .
|
||||
aws s3 mv --only-show-errors ${ARCHIVE} "s3://${BUCKET}/${RAW_PREFIX}/${ARCHIVE}"
|
||||
|
||||
# Potentially we could have several running builds for the same key (for example, for the main branch), so we use an improvised lock for this
|
||||
- name: Acquire Allure lock
|
||||
if: ${{ inputs.action == 'generate' }}
|
||||
shell: bash -euxo pipefail {0}
|
||||
env:
|
||||
LOCK_FILE: reports/${{ steps.calculate-vars.outputs.KEY }}/lock.txt
|
||||
TEST_SELECTION: ${{ steps.calculate-vars.outputs.TEST_SELECTION }}
|
||||
run: |
|
||||
LOCK_TIMEOUT=300 # seconds
|
||||
|
||||
for _ in $(seq 1 5); do
|
||||
for i in $(seq 1 ${LOCK_TIMEOUT}); do
|
||||
LOCK_ADDED=$(aws s3api head-object --bucket neon-github-public-dev --key ${LOCK_FILE} | jq --raw-output '.LastModified' || true)
|
||||
# `date --date="..."` is supported only by gnu date (i.e. it doesn't work on BSD/macOS)
|
||||
if [ -z "${LOCK_ADDED}" ] || [ "$(( $(date +%s) - $(date --date="${LOCK_ADDED}" +%s) ))" -gt "${LOCK_TIMEOUT}" ]; then
|
||||
break
|
||||
fi
|
||||
sleep 1
|
||||
done
|
||||
echo "${GITHUB_RUN_ID}-${GITHUB_RUN_ATTEMPT}-${TEST_SELECTION}" > lock.txt
|
||||
aws s3 mv --only-show-errors lock.txt "s3://${BUCKET}/${LOCK_FILE}"
|
||||
|
||||
# Double-check that exactly WE have acquired the lock
|
||||
aws s3 cp --only-show-errors "s3://${BUCKET}/${LOCK_FILE}" ./lock.txt
|
||||
if [ "$(cat lock.txt)" = "${GITHUB_RUN_ID}-${GITHUB_RUN_ATTEMPT}-${TEST_SELECTION}" ]; then
|
||||
break
|
||||
fi
|
||||
done
|
||||
|
||||
- name: Generate and publish final Allure report
|
||||
if: ${{ inputs.action == 'generate' }}
|
||||
id: generate-report
|
||||
env:
|
||||
REPORT_PREFIX: reports/${{ steps.calculate-vars.outputs.KEY }}/${{ inputs.build_type }}
|
||||
RAW_PREFIX: reports-raw/${{ steps.calculate-vars.outputs.KEY }}/${{ inputs.build_type }}
|
||||
shell: bash -euxo pipefail {0}
|
||||
run: |
|
||||
# Get previously uploaded data for this run
|
||||
ZSTD_NBTHREADS=0
|
||||
|
||||
s3_filepaths=$(aws s3api list-objects-v2 --bucket ${BUCKET} --prefix ${RAW_PREFIX}/${GITHUB_RUN_ID}- | jq --raw-output '.Contents[].Key')
|
||||
if [ -z "$s3_filepaths" ]; then
|
||||
# There's no previously uploaded data for this run
|
||||
exit 0
|
||||
fi
|
||||
for s3_filepath in ${s3_filepaths}; do
|
||||
aws s3 cp --only-show-errors "s3://${BUCKET}/${s3_filepath}" "${TEST_OUTPUT}/allure/"
|
||||
|
||||
archive=${TEST_OUTPUT}/allure/$(basename $s3_filepath)
|
||||
mkdir -p ${archive%.tar.zst}
|
||||
tar -xf ${archive} -C ${archive%.tar.zst}
|
||||
rm -f ${archive}
|
||||
done
|
||||
|
||||
# Get history trend
|
||||
aws s3 cp --recursive --only-show-errors "s3://${BUCKET}/${REPORT_PREFIX}/latest/history" "${TEST_OUTPUT}/allure/latest/history" || true
|
||||
|
||||
# Generate report
|
||||
allure generate --clean --output $TEST_OUTPUT/allure/report $TEST_OUTPUT/allure/*
|
||||
|
||||
# Replace a logo link with a redirect to the latest version of the report
|
||||
sed -i 's|<a href="." class=|<a href="https://'${BUCKET}'.s3.amazonaws.com/'${REPORT_PREFIX}'/latest/index.html" class=|g' $TEST_OUTPUT/allure/report/app.js
|
||||
|
||||
# Upload the history and the final report (in this particular order so as not to have duplicated history in 2 places)
|
||||
aws s3 mv --recursive --only-show-errors "${TEST_OUTPUT}/allure/report/history" "s3://${BUCKET}/${REPORT_PREFIX}/latest/history"
|
||||
aws s3 mv --recursive --only-show-errors "${TEST_OUTPUT}/allure/report" "s3://${BUCKET}/${REPORT_PREFIX}/${GITHUB_RUN_ID}"
|
||||
|
||||
REPORT_URL=https://${BUCKET}.s3.amazonaws.com/${REPORT_PREFIX}/${GITHUB_RUN_ID}/index.html
|
||||
|
||||
# Generate redirect
|
||||
cat <<EOF > ${TEST_OUTPUT}/allure/index.html
|
||||
<!DOCTYPE html>
|
||||
|
||||
<meta charset="utf-8">
|
||||
<title>Redirecting to ${REPORT_URL}</title>
|
||||
<meta http-equiv="refresh" content="0; URL=${REPORT_URL}">
|
||||
EOF
|
||||
aws s3 cp --only-show-errors ${TEST_OUTPUT}/allure/index.html "s3://${BUCKET}/${REPORT_PREFIX}/latest/index.html"
|
||||
|
||||
echo "[Allure Report](${REPORT_URL})" >> ${GITHUB_STEP_SUMMARY}
|
||||
echo "report-url=${REPORT_URL}" >> $GITHUB_OUTPUT
|
||||
echo "report-json-url=${REPORT_URL%/index.html}/data/suites.json" >> $GITHUB_OUTPUT
|
||||
|
||||
- name: Release Allure lock
|
||||
if: ${{ inputs.action == 'generate' && always() }}
|
||||
shell: bash -euxo pipefail {0}
|
||||
env:
|
||||
LOCK_FILE: reports/${{ steps.calculate-vars.outputs.KEY }}/lock.txt
|
||||
TEST_SELECTION: ${{ steps.calculate-vars.outputs.TEST_SELECTION }}
|
||||
run: |
|
||||
aws s3 cp --only-show-errors "s3://${BUCKET}/${LOCK_FILE}" ./lock.txt || exit 0
|
||||
|
||||
if [ "$(cat lock.txt)" = "${GITHUB_RUN_ID}-${GITHUB_RUN_ATTEMPT}-${TEST_SELECTION}" ]; then
|
||||
aws s3 rm "s3://${BUCKET}/${LOCK_FILE}"
|
||||
fi
|
||||
|
||||
- name: Cleanup
|
||||
if: always()
|
||||
shell: bash -euxo pipefail {0}
|
||||
run: |
|
||||
rm -rf ${TEST_OUTPUT}/allure
|
||||
|
||||
- uses: actions/github-script@v6
|
||||
if: ${{ inputs.action == 'generate' && always() }}
|
||||
env:
|
||||
REPORT_URL: ${{ steps.generate-report.outputs.report-url }}
|
||||
SHA: ${{ github.event.pull_request.head.sha || github.sha }}
|
||||
with:
|
||||
script: |
|
||||
const { REPORT_URL, BUILD_TYPE, SHA } = process.env
|
||||
|
||||
await github.rest.repos.createCommitStatus({
|
||||
owner: context.repo.owner,
|
||||
repo: context.repo.repo,
|
||||
sha: `${SHA}`,
|
||||
state: 'success',
|
||||
target_url: `${REPORT_URL}`,
|
||||
context: `Allure report / ${BUILD_TYPE}`,
|
||||
})
|
||||
13  .github/actions/run-python-test-set/action.yml  vendored
@@ -197,13 +197,14 @@ runs:
     uses: ./.github/actions/upload
     with:
       name: compatibility-snapshot-${{ inputs.build_type }}-pg14-${{ github.run_id }}
-      # Directory is created by test_compatibility.py::test_create_snapshot, keep the path in sync with the test
-      path: /tmp/test_output/compatibility_snapshot_pg14/
+      # The path includes a test name (test_create_snapshot) and directory that the test creates (compatibility_snapshot_pg14), keep the path in sync with the test
+      path: /tmp/test_output/test_create_snapshot/compatibility_snapshot_pg14/
       prefix: latest

-  - name: Upload test results
+  - name: Create Allure report
     if: ${{ !cancelled() }}
-    uses: ./.github/actions/allure-report-store
+    uses: ./.github/actions/allure-report
     with:
-      report-dir: /tmp/test_output/allure/results
-      unique-key: ${{ inputs.build_type }}
+      action: store
+      build_type: ${{ inputs.build_type }}
+      test_selection: ${{ inputs.test_selection }}
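The hunk above only shows the `store` side; the `generate` side is called from workflow files that are not part of this excerpt. A sketch of what such a caller might look like, based on the inputs and outputs the new composite action defines (step name and id are hypothetical):

- name: Create Allure report
  if: ${{ !cancelled() }}
  id: create-allure-report
  uses: ./.github/actions/allure-report
  with:
    action: generate
    build_type: ${{ inputs.build_type }}
# The step then exposes steps.create-allure-report.outputs.report-url and .report-json-url.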
5  .github/ansible/.gitignore  vendored  Normal file
@@ -0,0 +1,5 @@
neon_install.tar.gz
.neon_current_version

collections/*
!collections/.keep
12  .github/ansible/ansible.cfg  vendored  Normal file
@@ -0,0 +1,12 @@
[defaults]

localhost_warning = False
host_key_checking = False
timeout = 30

[ssh_connection]
ssh_args = -F ./ansible.ssh.cfg
# teleport doesn't support sftp yet https://github.com/gravitational/teleport/issues/7127
# and scp didn't work for me either
transfer_method = piped
pipelining = True
15  .github/ansible/ansible.ssh.cfg  vendored  Normal file
@@ -0,0 +1,15 @@
# Remove this once https://github.com/gravitational/teleport/issues/10918 is fixed
# (use pre 8.5 option name to cope with old ssh in CI)
PubkeyAcceptedKeyTypes +ssh-rsa-cert-v01@openssh.com

Host tele.zenith.tech
    User admin
    Port 3023
    StrictHostKeyChecking no
    UserKnownHostsFile /dev/null

Host * !tele.zenith.tech
    User admin
    StrictHostKeyChecking no
    UserKnownHostsFile /dev/null
    ProxyJump tele.zenith.tech
0  .github/ansible/collections/.keep  vendored  Normal file
211  .github/ansible/deploy.yaml  vendored  Normal file
@@ -0,0 +1,211 @@
|
||||
- name: Upload Neon binaries
|
||||
hosts: storage
|
||||
gather_facts: False
|
||||
remote_user: "{{ remote_user }}"
|
||||
|
||||
tasks:
|
||||
|
||||
- name: get latest version of Neon binaries
|
||||
register: current_version_file
|
||||
set_fact:
|
||||
current_version: "{{ lookup('file', '.neon_current_version') | trim }}"
|
||||
tags:
|
||||
- pageserver
|
||||
- safekeeper
|
||||
|
||||
- name: inform about versions
|
||||
debug:
|
||||
msg: "Version to deploy - {{ current_version }}"
|
||||
tags:
|
||||
- pageserver
|
||||
- safekeeper
|
||||
|
||||
- name: upload and extract Neon binaries to /usr/local
|
||||
ansible.builtin.unarchive:
|
||||
owner: root
|
||||
group: root
|
||||
src: neon_install.tar.gz
|
||||
dest: /usr/local
|
||||
become: true
|
||||
tags:
|
||||
- pageserver
|
||||
- safekeeper
|
||||
- binaries
|
||||
- putbinaries
|
||||
|
||||
- name: Deploy pageserver
|
||||
hosts: pageservers
|
||||
gather_facts: False
|
||||
remote_user: "{{ remote_user }}"
|
||||
|
||||
tasks:
|
||||
|
||||
- name: upload init script
|
||||
when: console_mgmt_base_url is defined
|
||||
ansible.builtin.template:
|
||||
src: scripts/init_pageserver.sh
|
||||
dest: /tmp/init_pageserver.sh
|
||||
owner: root
|
||||
group: root
|
||||
mode: '0755'
|
||||
become: true
|
||||
tags:
|
||||
- pageserver
|
||||
|
||||
- name: init pageserver
|
||||
shell:
|
||||
cmd: /tmp/init_pageserver.sh
|
||||
args:
|
||||
creates: "/storage/pageserver/data/tenants"
|
||||
environment:
|
||||
NEON_REPO_DIR: "/storage/pageserver/data"
|
||||
LD_LIBRARY_PATH: "/usr/local/v14/lib"
|
||||
become: true
|
||||
tags:
|
||||
- pageserver
|
||||
|
||||
- name: read the existing remote pageserver config
|
||||
ansible.builtin.slurp:
|
||||
src: /storage/pageserver/data/pageserver.toml
|
||||
register: _remote_ps_config
|
||||
tags:
|
||||
- pageserver
|
||||
|
||||
- name: parse the existing pageserver configuration
|
||||
ansible.builtin.set_fact:
|
||||
_existing_ps_config: "{{ _remote_ps_config['content'] | b64decode | sivel.toiletwater.from_toml }}"
|
||||
tags:
|
||||
- pageserver
|
||||
|
||||
- name: construct the final pageserver configuration dict
|
||||
ansible.builtin.set_fact:
|
||||
pageserver_config: "{{ pageserver_config_stub | combine({'id': _existing_ps_config.id }) }}"
|
||||
tags:
|
||||
- pageserver
|
||||
|
||||
- name: template the pageserver config
|
||||
template:
|
||||
src: templates/pageserver.toml.j2
|
||||
dest: /storage/pageserver/data/pageserver.toml
|
||||
become: true
|
||||
tags:
|
||||
- pageserver
|
||||
|
||||
# used in `pageserver.service` template
|
||||
- name: learn current availability_zone
|
||||
shell:
|
||||
cmd: "curl -s http://169.254.169.254/latest/meta-data/placement/availability-zone"
|
||||
register: ec2_availability_zone
|
||||
|
||||
- set_fact:
|
||||
ec2_availability_zone={{ ec2_availability_zone.stdout }}
|
||||
|
||||
- name: upload systemd service definition
|
||||
ansible.builtin.template:
|
||||
src: systemd/pageserver.service
|
||||
dest: /etc/systemd/system/pageserver.service
|
||||
owner: root
|
||||
group: root
|
||||
mode: '0644'
|
||||
become: true
|
||||
tags:
|
||||
- pageserver
|
||||
|
||||
- name: start systemd service
|
||||
ansible.builtin.systemd:
|
||||
daemon_reload: yes
|
||||
name: pageserver
|
||||
enabled: yes
|
||||
state: restarted
|
||||
become: true
|
||||
tags:
|
||||
- pageserver
|
||||
|
||||
- name: post version to console
|
||||
when: console_mgmt_base_url is defined
|
||||
shell:
|
||||
cmd: |
|
||||
INSTANCE_ID=$(curl -s http://169.254.169.254/latest/meta-data/instance-id)
|
||||
curl -sfS -H "Authorization: Bearer {{ CONSOLE_API_TOKEN }}" {{ console_mgmt_base_url }}/management/api/v2/pageservers/$INSTANCE_ID | jq '.version = {{ current_version }}' > /tmp/new_version
|
||||
curl -sfS -H "Authorization: Bearer {{ CONSOLE_API_TOKEN }}" -H "Content-Type: application/json" -X POST -d@/tmp/new_version {{ console_mgmt_base_url }}/management/api/v2/pageservers
|
||||
tags:
|
||||
- pageserver
|
||||
|
||||
- name: Deploy safekeeper
|
||||
hosts: safekeepers
|
||||
gather_facts: False
|
||||
remote_user: "{{ remote_user }}"
|
||||
|
||||
tasks:
|
||||
|
||||
- name: upload init script
|
||||
when: console_mgmt_base_url is defined
|
||||
ansible.builtin.template:
|
||||
src: scripts/init_safekeeper.sh
|
||||
dest: /tmp/init_safekeeper.sh
|
||||
owner: root
|
||||
group: root
|
||||
mode: '0755'
|
||||
become: true
|
||||
tags:
|
||||
- safekeeper
|
||||
|
||||
- name: init safekeeper
|
||||
shell:
|
||||
cmd: /tmp/init_safekeeper.sh
|
||||
args:
|
||||
creates: "/storage/safekeeper/data/safekeeper.id"
|
||||
environment:
|
||||
NEON_REPO_DIR: "/storage/safekeeper/data"
|
||||
LD_LIBRARY_PATH: "/usr/local/v14/lib"
|
||||
become: true
|
||||
tags:
|
||||
- safekeeper
|
||||
|
||||
# used in `safekeeper.service` template
|
||||
- name: learn current availability_zone
|
||||
shell:
|
||||
cmd: "curl -s http://169.254.169.254/latest/meta-data/placement/availability-zone"
|
||||
register: ec2_availability_zone
|
||||
|
||||
- set_fact:
|
||||
ec2_availability_zone={{ ec2_availability_zone.stdout }}
|
||||
|
||||
# in the future safekeepers should discover pageservers by themselves,
# but currently we use the first pageserver that was discovered
|
||||
- name: set first pageserver var for safekeepers
|
||||
set_fact:
|
||||
first_pageserver: "{{ hostvars[groups['pageservers'][0]]['inventory_hostname'] }}"
|
||||
tags:
|
||||
- safekeeper
|
||||
|
||||
- name: upload systemd service definition
|
||||
ansible.builtin.template:
|
||||
src: systemd/safekeeper.service
|
||||
dest: /etc/systemd/system/safekeeper.service
|
||||
owner: root
|
||||
group: root
|
||||
mode: '0644'
|
||||
become: true
|
||||
tags:
|
||||
- safekeeper
|
||||
|
||||
- name: start systemd service
|
||||
ansible.builtin.systemd:
|
||||
daemon_reload: yes
|
||||
name: safekeeper
|
||||
enabled: yes
|
||||
state: restarted
|
||||
become: true
|
||||
tags:
|
||||
- safekeeper
|
||||
|
||||
- name: post version to console
|
||||
when: console_mgmt_base_url is defined
|
||||
shell:
|
||||
cmd: |
|
||||
INSTANCE_ID=$(curl -s http://169.254.169.254/latest/meta-data/instance-id)
|
||||
curl -sfS -H "Authorization: Bearer {{ CONSOLE_API_TOKEN }}" {{ console_mgmt_base_url }}/management/api/v2/safekeepers/$INSTANCE_ID | jq '.version = {{ current_version }}' > /tmp/new_version
|
||||
curl -sfS -H "Authorization: Bearer {{ CONSOLE_API_TOKEN }}" -H "Content-Type: application/json" -X POST -d@/tmp/new_version {{ console_mgmt_base_url }}/management/api/v2/safekeepers
|
||||
tags:
|
||||
- safekeeper
|
||||
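The playbook is driven by the inventories and helper files added alongside it. The exact CI wiring is not part of this diff, but a plausible manual invocation looks roughly like this (the token and DSN values are placeholders, and get_binaries.sh below must have produced neon_install.tar.gz and .neon_current_version first):

cd .github/ansible
ansible-playbook -i staging.us-east-2.hosts.yaml deploy.yaml \
    --tags pageserver,safekeeper \
    -e CONSOLE_API_TOKEN=... \
    -e SENTRY_URL_PAGESERVER=... \
    -e SENTRY_URL_SAFEKEEPER=...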
42  .github/ansible/get_binaries.sh  vendored  Executable file
@@ -0,0 +1,42 @@
#!/bin/bash

set -e

if [ -n "${DOCKER_TAG}" ]; then
    # Version is DOCKER_TAG but without the prefix
    VERSION=$(echo $DOCKER_TAG | sed 's/^.*-//g')
else
    echo "Please set DOCKER_TAG environment variable"
    exit 1
fi


# do initial cleanup
rm -rf neon_install postgres_install.tar.gz neon_install.tar.gz .neon_current_version
mkdir neon_install

# retrieve binaries from docker image
echo "getting binaries from docker image"
docker pull --quiet neondatabase/neon:${DOCKER_TAG}
ID=$(docker create neondatabase/neon:${DOCKER_TAG})
docker cp ${ID}:/data/postgres_install.tar.gz .
tar -xzf postgres_install.tar.gz -C neon_install
mkdir neon_install/bin/
docker cp ${ID}:/usr/local/bin/pageserver neon_install/bin/
docker cp ${ID}:/usr/local/bin/pageserver_binutils neon_install/bin/
docker cp ${ID}:/usr/local/bin/safekeeper neon_install/bin/
docker cp ${ID}:/usr/local/bin/storage_broker neon_install/bin/
docker cp ${ID}:/usr/local/bin/proxy neon_install/bin/
docker cp ${ID}:/usr/local/v14/bin/ neon_install/v14/bin/
docker cp ${ID}:/usr/local/v15/bin/ neon_install/v15/bin/
docker cp ${ID}:/usr/local/v14/lib/ neon_install/v14/lib/
docker cp ${ID}:/usr/local/v15/lib/ neon_install/v15/lib/
docker rm -vf ${ID}

# store version to file (for ansible playbooks) and create binaries tarball
echo ${VERSION} > neon_install/.neon_current_version
echo ${VERSION} > .neon_current_version
tar -czf neon_install.tar.gz -C neon_install .

# do final cleanup
rm -rf neon_install postgres_install.tar.gz
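A usage sketch (the tag is hypothetical; VERSION is whatever follows the last '-' in DOCKER_TAG):

export DOCKER_TAG=release-4242
./get_binaries.sh
# leaves behind:
#   neon_install.tar.gz      - binaries + Postgres install, uploaded by deploy.yaml
#   .neon_current_version    - contains "4242", read by the playbook's version lookup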
48  .github/ansible/prod.ap-southeast-1.hosts.yaml  vendored  Normal file
@@ -0,0 +1,48 @@
|
||||
storage:
|
||||
vars:
|
||||
bucket_name: neon-prod-storage-ap-southeast-1
|
||||
bucket_region: ap-southeast-1
|
||||
console_mgmt_base_url: http://neon-internal-api.aws.neon.tech
|
||||
broker_endpoint: http://storage-broker-lb.epsilon.ap-southeast-1.internal.aws.neon.tech:50051
|
||||
pageserver_config_stub:
|
||||
pg_distrib_dir: /usr/local
|
||||
metric_collection_endpoint: http://neon-internal-api.aws.neon.tech/billing/api/v1/usage_events
|
||||
metric_collection_interval: 10min
|
||||
disk_usage_based_eviction:
|
||||
max_usage_pct: 85 # TODO: decrease to 80 after all pageservers are below 80
|
||||
min_avail_bytes: 0
|
||||
period: "10s"
|
||||
tenant_config:
|
||||
eviction_policy:
|
||||
kind: "LayerAccessThreshold"
|
||||
period: "10m"
|
||||
threshold: &default_eviction_threshold "24h"
|
||||
evictions_low_residence_duration_metric_threshold: *default_eviction_threshold
|
||||
remote_storage:
|
||||
bucket_name: "{{ bucket_name }}"
|
||||
bucket_region: "{{ bucket_region }}"
|
||||
prefix_in_bucket: "pageserver/v1"
|
||||
safekeeper_s3_prefix: safekeeper/v1/wal
|
||||
hostname_suffix: ""
|
||||
remote_user: ssm-user
|
||||
ansible_aws_ssm_region: ap-southeast-1
|
||||
ansible_aws_ssm_bucket_name: neon-prod-storage-ap-southeast-1
|
||||
console_region_id: aws-ap-southeast-1
|
||||
sentry_environment: production
|
||||
|
||||
children:
|
||||
pageservers:
|
||||
hosts:
|
||||
pageserver-0.ap-southeast-1.aws.neon.tech:
|
||||
ansible_host: i-064de8ea28bdb495b
|
||||
pageserver-1.ap-southeast-1.aws.neon.tech:
|
||||
ansible_host: i-0b180defcaeeb6b93
|
||||
|
||||
safekeepers:
|
||||
hosts:
|
||||
safekeeper-0.ap-southeast-1.aws.neon.tech:
|
||||
ansible_host: i-0d6f1dc5161eef894
|
||||
safekeeper-2.ap-southeast-1.aws.neon.tech:
|
||||
ansible_host: i-04fb63634e4679eb9
|
||||
safekeeper-3.ap-southeast-1.aws.neon.tech:
|
||||
ansible_host: i-05481f3bc88cfc2d4
|
||||
50  .github/ansible/prod.eu-central-1.hosts.yaml  vendored  Normal file
@@ -0,0 +1,50 @@
|
||||
storage:
|
||||
vars:
|
||||
bucket_name: neon-prod-storage-eu-central-1
|
||||
bucket_region: eu-central-1
|
||||
console_mgmt_base_url: http://neon-internal-api.aws.neon.tech
|
||||
broker_endpoint: http://storage-broker-lb.gamma.eu-central-1.internal.aws.neon.tech:50051
|
||||
pageserver_config_stub:
|
||||
pg_distrib_dir: /usr/local
|
||||
metric_collection_endpoint: http://neon-internal-api.aws.neon.tech/billing/api/v1/usage_events
|
||||
metric_collection_interval: 10min
|
||||
disk_usage_based_eviction:
|
||||
max_usage_pct: 85 # TODO: decrease to 80 after all pageservers are below 80
|
||||
min_avail_bytes: 0
|
||||
period: "10s"
|
||||
tenant_config:
|
||||
eviction_policy:
|
||||
kind: "LayerAccessThreshold"
|
||||
period: "10m"
|
||||
threshold: &default_eviction_threshold "24h"
|
||||
evictions_low_residence_duration_metric_threshold: *default_eviction_threshold
|
||||
remote_storage:
|
||||
bucket_name: "{{ bucket_name }}"
|
||||
bucket_region: "{{ bucket_region }}"
|
||||
prefix_in_bucket: "pageserver/v1"
|
||||
safekeeper_s3_prefix: safekeeper/v1/wal
|
||||
hostname_suffix: ""
|
||||
remote_user: ssm-user
|
||||
ansible_aws_ssm_region: eu-central-1
|
||||
ansible_aws_ssm_bucket_name: neon-prod-storage-eu-central-1
|
||||
console_region_id: aws-eu-central-1
|
||||
sentry_environment: production
|
||||
|
||||
children:
|
||||
pageservers:
|
||||
hosts:
|
||||
pageserver-0.eu-central-1.aws.neon.tech:
|
||||
ansible_host: i-0cd8d316ecbb715be
|
||||
pageserver-1.eu-central-1.aws.neon.tech:
|
||||
ansible_host: i-090044ed3d383fef0
|
||||
pageserver-2.eu-central-1.aws.neon.tech:
|
||||
ansible_host: i-033584edf3f4b6742
|
||||
|
||||
safekeepers:
|
||||
hosts:
|
||||
safekeeper-0.eu-central-1.aws.neon.tech:
|
||||
ansible_host: i-0b238612d2318a050
|
||||
safekeeper-1.eu-central-1.aws.neon.tech:
|
||||
ansible_host: i-07b9c45e5c2637cd4
|
||||
safekeeper-2.eu-central-1.aws.neon.tech:
|
||||
ansible_host: i-020257302c3c93d88
|
||||
50  .github/ansible/prod.us-east-1.hosts.yaml  vendored  Normal file
@@ -0,0 +1,50 @@
|
||||
storage:
|
||||
vars:
|
||||
bucket_name: neon-prod-storage-us-east-1
|
||||
bucket_region: us-east-1
|
||||
console_mgmt_base_url: http://neon-internal-api.aws.neon.tech
|
||||
broker_endpoint: http://storage-broker-lb.theta.us-east-1.internal.aws.neon.tech:50051
|
||||
pageserver_config_stub:
|
||||
pg_distrib_dir: /usr/local
|
||||
metric_collection_endpoint: http://neon-internal-api.aws.neon.tech/billing/api/v1/usage_events
|
||||
metric_collection_interval: 10min
|
||||
disk_usage_based_eviction:
|
||||
max_usage_pct: 85 # TODO: decrease to 80 after all pageservers are below 80
|
||||
min_avail_bytes: 0
|
||||
period: "10s"
|
||||
tenant_config:
|
||||
eviction_policy:
|
||||
kind: "LayerAccessThreshold"
|
||||
period: "10m"
|
||||
threshold: &default_eviction_threshold "24h"
|
||||
evictions_low_residence_duration_metric_threshold: *default_eviction_threshold
|
||||
remote_storage:
|
||||
bucket_name: "{{ bucket_name }}"
|
||||
bucket_region: "{{ bucket_region }}"
|
||||
prefix_in_bucket: "pageserver/v1"
|
||||
safekeeper_s3_prefix: safekeeper/v1/wal
|
||||
hostname_suffix: ""
|
||||
remote_user: ssm-user
|
||||
ansible_aws_ssm_region: us-east-1
|
||||
ansible_aws_ssm_bucket_name: neon-prod-storage-us-east-1
|
||||
console_region_id: aws-us-east-1
|
||||
sentry_environment: production
|
||||
|
||||
children:
|
||||
pageservers:
|
||||
hosts:
|
||||
pageserver-0.us-east-1.aws.neon.tech:
|
||||
ansible_host: i-085222088b0d2e0c7
|
||||
pageserver-1.us-east-1.aws.neon.tech:
|
||||
ansible_host: i-0969d4f684d23a21e
|
||||
pageserver-2.us-east-1.aws.neon.tech:
|
||||
ansible_host: i-05dee87895da58dad
|
||||
|
||||
safekeepers:
|
||||
hosts:
|
||||
safekeeper-0.us-east-1.aws.neon.tech:
|
||||
ansible_host: i-04ce739e88793d864
|
||||
safekeeper-1.us-east-1.aws.neon.tech:
|
||||
ansible_host: i-0e9e6c9227fb81410
|
||||
safekeeper-2.us-east-1.aws.neon.tech:
|
||||
ansible_host: i-072f4dd86a327d52f
|
||||
51  .github/ansible/prod.us-east-2.hosts.yaml  vendored  Normal file
@@ -0,0 +1,51 @@
|
||||
storage:
|
||||
vars:
|
||||
bucket_name: neon-prod-storage-us-east-2
|
||||
bucket_region: us-east-2
|
||||
console_mgmt_base_url: http://neon-internal-api.aws.neon.tech
|
||||
broker_endpoint: http://storage-broker-lb.delta.us-east-2.internal.aws.neon.tech:50051
|
||||
pageserver_config_stub:
|
||||
pg_distrib_dir: /usr/local
|
||||
metric_collection_endpoint: http://neon-internal-api.aws.neon.tech/billing/api/v1/usage_events
|
||||
metric_collection_interval: 10min
|
||||
disk_usage_based_eviction:
|
||||
max_usage_pct: 85 # TODO: decrease to 80 after all pageservers are below 80
|
||||
min_avail_bytes: 0
|
||||
period: "10s"
|
||||
tenant_config:
|
||||
eviction_policy:
|
||||
kind: "LayerAccessThreshold"
|
||||
period: "10m"
|
||||
threshold: &default_eviction_threshold "24h"
|
||||
evictions_low_residence_duration_metric_threshold: *default_eviction_threshold
|
||||
remote_storage:
|
||||
bucket_name: "{{ bucket_name }}"
|
||||
bucket_region: "{{ bucket_region }}"
|
||||
prefix_in_bucket: "pageserver/v1"
|
||||
safekeeper_s3_prefix: safekeeper/v1/wal
|
||||
hostname_suffix: ""
|
||||
remote_user: ssm-user
|
||||
ansible_aws_ssm_region: us-east-2
|
||||
ansible_aws_ssm_bucket_name: neon-prod-storage-us-east-2
|
||||
console_region_id: aws-us-east-2
|
||||
sentry_environment: production
|
||||
|
||||
children:
|
||||
pageservers:
|
||||
hosts:
|
||||
pageserver-0.us-east-2.aws.neon.tech:
|
||||
ansible_host: i-062227ba7f119eb8c
|
||||
pageserver-1.us-east-2.aws.neon.tech:
|
||||
ansible_host: i-0b3ec0afab5968938
|
||||
pageserver-2.us-east-2.aws.neon.tech:
|
||||
ansible_host: i-0d7a1c4325e71421d
|
||||
|
||||
safekeepers:
|
||||
hosts:
|
||||
safekeeper-0.us-east-2.aws.neon.tech:
|
||||
ansible_host: i-0e94224750c57d346
|
||||
safekeeper-1.us-east-2.aws.neon.tech:
|
||||
ansible_host: i-06d113fb73bfddeb0
|
||||
safekeeper-2.us-east-2.aws.neon.tech:
|
||||
ansible_host: i-09f66c8e04afff2e8
|
||||
|
||||
72  .github/ansible/prod.us-west-2.hosts.yaml  vendored  Normal file
@@ -0,0 +1,72 @@
|
||||
storage:
|
||||
vars:
|
||||
bucket_name: neon-prod-storage-us-west-2
|
||||
bucket_region: us-west-2
|
||||
console_mgmt_base_url: http://neon-internal-api.aws.neon.tech
|
||||
broker_endpoint: http://storage-broker-lb.eta.us-west-2.internal.aws.neon.tech:50051
|
||||
pageserver_config_stub:
|
||||
pg_distrib_dir: /usr/local
|
||||
metric_collection_endpoint: http://neon-internal-api.aws.neon.tech/billing/api/v1/usage_events
|
||||
metric_collection_interval: 10min
|
||||
disk_usage_based_eviction:
|
||||
max_usage_pct: 85 # TODO: decrease to 80 after all pageservers are below 80
|
||||
min_avail_bytes: 0
|
||||
period: "10s"
|
||||
tenant_config:
|
||||
eviction_policy:
|
||||
kind: "LayerAccessThreshold"
|
||||
period: "10m"
|
||||
threshold: &default_eviction_threshold "24h"
|
||||
evictions_low_residence_duration_metric_threshold: *default_eviction_threshold
|
||||
remote_storage:
|
||||
bucket_name: "{{ bucket_name }}"
|
||||
bucket_region: "{{ bucket_region }}"
|
||||
prefix_in_bucket: "pageserver/v1"
|
||||
safekeeper_s3_prefix: safekeeper/v1/wal
|
||||
hostname_suffix: ""
|
||||
remote_user: ssm-user
|
||||
ansible_aws_ssm_region: us-west-2
|
||||
ansible_aws_ssm_bucket_name: neon-prod-storage-us-west-2
|
||||
console_region_id: aws-us-west-2-new
|
||||
sentry_environment: production
|
||||
|
||||
children:
|
||||
pageservers:
|
||||
hosts:
|
||||
pageserver-0.us-west-2.aws.neon.tech:
|
||||
ansible_host: i-0d9f6dfae0e1c780d
|
||||
pageserver-1.us-west-2.aws.neon.tech:
|
||||
ansible_host: i-0c834be1dddba8b3f
|
||||
pageserver-2.us-west-2.aws.neon.tech:
|
||||
ansible_host: i-051642d372c0a4f32
|
||||
pageserver-3.us-west-2.aws.neon.tech:
|
||||
ansible_host: i-00c3844beb9ad1c6b
|
||||
pageserver-4.us-west-2.aws.neon.tech:
|
||||
ansible_host: i-013263dd1c239adcc
|
||||
pageserver-5.us-west-2.aws.neon.tech:
|
||||
ansible_host: i-00ca6417c7bf96820
|
||||
pageserver-6.us-west-2.aws.neon.tech:
|
||||
ansible_host: i-01cdf7d2bc1433b6a
|
||||
pageserver-7.us-west-2.aws.neon.tech:
|
||||
ansible_host: i-02eec9b40617db5bc
|
||||
|
||||
safekeepers:
|
||||
hosts:
|
||||
safekeeper-0.us-west-2.aws.neon.tech:
|
||||
ansible_host: i-00719d8a74986fda6
|
||||
safekeeper-1.us-west-2.aws.neon.tech:
|
||||
ansible_host: i-074682f9d3c712e7c
|
||||
safekeeper-2.us-west-2.aws.neon.tech:
|
||||
ansible_host: i-042b7efb1729d7966
|
||||
safekeeper-3.us-west-2.aws.neon.tech:
|
||||
ansible_host: i-089f6b9ef426dff76
|
||||
safekeeper-4.us-west-2.aws.neon.tech:
|
||||
ansible_host: i-0fe6bf912c4710c82
|
||||
safekeeper-5.us-west-2.aws.neon.tech:
|
||||
ansible_host: i-0a83c1c46d2b4e409
|
||||
safekeeper-6.us-west-2.aws.neon.tech:
|
||||
ansible_host: i-0fef5317b8fdc9f8d
|
||||
safekeeper-7.us-west-2.aws.neon.tech:
|
||||
ansible_host: i-0be739190d4289bf9
|
||||
safekeeper-8.us-west-2.aws.neon.tech:
|
||||
ansible_host: i-00e851803669e5cfe
|
||||
37  .github/ansible/scripts/init_pageserver.sh  vendored  Normal file
@@ -0,0 +1,37 @@
#!/bin/sh

# fetch params from meta-data service
INSTANCE_ID=$(curl -s http://169.254.169.254/latest/meta-data/instance-id)
AZ_ID=$(curl -s http://169.254.169.254/latest/meta-data/placement/availability-zone)
INSTANCE_TYPE=$(curl -s http://169.254.169.254/latest/meta-data/instance-type)
DISK_SIZE=$(df -B1 /storage | tail -1 | awk '{print $2}')

# store fqdn hostname in var
HOST=$(hostname -f)


cat <<EOF | tee /tmp/payload
{
  "version": 1,
  "host": "${HOST}",
  "port": 6400,
  "region_id": "{{ console_region_id }}",
  "instance_id": "${INSTANCE_ID}",
  "http_host": "${HOST}",
  "http_port": 9898,
  "active": false,
  "availability_zone_id": "${AZ_ID}",
  "disk_size": ${DISK_SIZE},
  "instance_type": "${INSTANCE_TYPE}"
}
EOF

# check if pageserver already registered or not
if ! curl -sf -H "Authorization: Bearer {{ CONSOLE_API_TOKEN }}" {{ console_mgmt_base_url }}/management/api/v2/pageservers/${INSTANCE_ID} -o /dev/null; then

    # not registered, so register it now
    ID=$(curl -sf -X POST -H "Authorization: Bearer {{ CONSOLE_API_TOKEN }}" -H "Content-Type: application/json" {{ console_mgmt_base_url }}/management/api/v2/pageservers -d@/tmp/payload | jq -r '.id')

    # init pageserver
    sudo -u pageserver /usr/local/bin/pageserver -c "id=${ID}" -c "pg_distrib_dir='/usr/local'" --init -D /storage/pageserver/data
fi
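With real metadata filled in, the registration payload written to /tmp/payload looks something like this (all values below are illustrative: the hostname follows the inventory naming scheme, region_id comes from console_region_id in the hosts file, and the instance id, AZ, disk size and type are made up):

{
  "version": 1,
  "host": "pageserver-0.us-east-2.aws.neon.tech",
  "port": 6400,
  "region_id": "aws-us-east-2",
  "instance_id": "i-0123456789abcdef0",
  "http_host": "pageserver-0.us-east-2.aws.neon.tech",
  "http_port": 9898,
  "active": false,
  "availability_zone_id": "us-east-2a",
  "disk_size": 1099511627776,
  "instance_type": "i3en.3xlarge"
}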
31  .github/ansible/scripts/init_safekeeper.sh  vendored  Normal file
@@ -0,0 +1,31 @@
#!/bin/sh

# fetch params from meta-data service
INSTANCE_ID=$(curl -s http://169.254.169.254/latest/meta-data/instance-id)
AZ_ID=$(curl -s http://169.254.169.254/latest/meta-data/placement/availability-zone)

# store fqdn hostname in var
HOST=$(hostname -f)


cat <<EOF | tee /tmp/payload
{
  "version": 1,
  "host": "${HOST}",
  "port": 6500,
  "http_port": 7676,
  "region_id": "{{ console_region_id }}",
  "instance_id": "${INSTANCE_ID}",
  "availability_zone_id": "${AZ_ID}",
  "active": false
}
EOF

# check if safekeeper already registered or not
if ! curl -sf -H "Authorization: Bearer {{ CONSOLE_API_TOKEN }}" {{ console_mgmt_base_url }}/management/api/v2/safekeepers/${INSTANCE_ID} -o /dev/null; then

    # not registered, so register it now
    ID=$(curl -sf -X POST -H "Authorization: Bearer {{ CONSOLE_API_TOKEN }}" -H "Content-Type: application/json" {{ console_mgmt_base_url }}/management/api/v2/safekeepers -d@/tmp/payload | jq -r '.id')
    # init safekeeper
    sudo -u safekeeper /usr/local/bin/safekeeper --id ${ID} --init -D /storage/safekeeper/data
fi
2  .github/ansible/ssm_config  vendored  Normal file
@@ -0,0 +1,2 @@
ansible_connection: aws_ssm
ansible_python_interpreter: /usr/bin/python3
47  .github/ansible/staging.eu-central-1.hosts.yaml  vendored  Normal file
@@ -0,0 +1,47 @@
|
||||
storage:
|
||||
vars:
|
||||
bucket_name: neon-dev-storage-eu-central-1
|
||||
bucket_region: eu-central-1
|
||||
# We only register/update storage in one preview console and manually copy to other instances
|
||||
console_mgmt_base_url: http://neon-internal-api.helium.aws.neon.build
|
||||
broker_endpoint: http://storage-broker-lb.alpha.eu-central-1.internal.aws.neon.build:50051
|
||||
pageserver_config_stub:
|
||||
pg_distrib_dir: /usr/local
|
||||
metric_collection_endpoint: http://neon-internal-api.helium.aws.neon.build/billing/api/v1/usage_events
|
||||
metric_collection_interval: 10min
|
||||
disk_usage_based_eviction:
|
||||
max_usage_pct: 80
|
||||
min_avail_bytes: 0
|
||||
period: "10s"
|
||||
tenant_config:
|
||||
eviction_policy:
|
||||
kind: "LayerAccessThreshold"
|
||||
period: "20m"
|
||||
threshold: &default_eviction_threshold "20m"
|
||||
evictions_low_residence_duration_metric_threshold: *default_eviction_threshold
|
||||
remote_storage:
|
||||
bucket_name: "{{ bucket_name }}"
|
||||
bucket_region: "{{ bucket_region }}"
|
||||
prefix_in_bucket: "pageserver/v1"
|
||||
safekeeper_s3_prefix: safekeeper/v1/wal
|
||||
hostname_suffix: ""
|
||||
remote_user: ssm-user
|
||||
ansible_aws_ssm_region: eu-central-1
|
||||
ansible_aws_ssm_bucket_name: neon-dev-storage-eu-central-1
|
||||
console_region_id: aws-eu-central-1
|
||||
sentry_environment: staging
|
||||
|
||||
children:
|
||||
pageservers:
|
||||
hosts:
|
||||
pageserver-0.eu-central-1.aws.neon.build:
|
||||
ansible_host: i-011f93ec26cfba2d4
|
||||
|
||||
safekeepers:
|
||||
hosts:
|
||||
safekeeper-0.eu-central-1.aws.neon.build:
|
||||
ansible_host: i-0ff026d27babf8ddd
|
||||
safekeeper-1.eu-central-1.aws.neon.build:
|
||||
ansible_host: i-03983a49ee54725d9
|
||||
safekeeper-2.eu-central-1.aws.neon.build:
|
||||
ansible_host: i-0bd025ecdb61b0db3
|
||||
60  .github/ansible/staging.eu-west-1.hosts.yaml  vendored  Normal file
@@ -0,0 +1,60 @@
|
||||
storage:
|
||||
vars:
|
||||
bucket_name: neon-dev-storage-eu-west-1
|
||||
bucket_region: eu-west-1
|
||||
console_mgmt_base_url: http://neon-internal-api.aws.neon.build
|
||||
broker_endpoint: http://storage-broker-lb.zeta.eu-west-1.internal.aws.neon.build:50051
|
||||
pageserver_config_stub:
|
||||
pg_distrib_dir: /usr/local
|
||||
metric_collection_endpoint: http://neon-internal-api.aws.neon.build/billing/api/v1/usage_events
|
||||
metric_collection_interval: 10min
|
||||
disk_usage_based_eviction:
|
||||
max_usage_pct: 80
|
||||
min_avail_bytes: 0
|
||||
period: "10s"
|
||||
tenant_config:
|
||||
eviction_policy:
|
||||
kind: "LayerAccessThreshold"
|
||||
period: "20m"
|
||||
threshold: &default_eviction_threshold "20m"
|
||||
evictions_low_residence_duration_metric_threshold: *default_eviction_threshold
|
||||
remote_storage:
|
||||
bucket_name: "{{ bucket_name }}"
|
||||
bucket_region: "{{ bucket_region }}"
|
||||
prefix_in_bucket: "pageserver/v1"
|
||||
safekeeper_s3_prefix: safekeeper/v1/wal
|
||||
hostname_suffix: ""
|
||||
remote_user: ssm-user
|
||||
ansible_aws_ssm_region: eu-west-1
|
||||
ansible_aws_ssm_bucket_name: neon-dev-storage-eu-west-1
|
||||
console_region_id: aws-eu-west-1
|
||||
sentry_environment: staging
|
||||
|
||||
children:
|
||||
pageservers:
|
||||
hosts:
|
||||
pageserver-0.eu-west-1.aws.neon.build:
|
||||
ansible_host: i-01d496c5041c7f34c
|
||||
pageserver-1.eu-west-1.aws.neon.build:
|
||||
ansible_host: i-0e8013e239ce3928c
|
||||
|
||||
safekeepers:
|
||||
hosts:
|
||||
safekeeper-0.eu-west-1.aws.neon.build:
|
||||
ansible_host: i-05226ef85722831bf
|
||||
safekeeper-1.eu-west-1.aws.neon.build:
|
||||
ansible_host: i-06969ee1bf2958bfc
|
||||
safekeeper-2.eu-west-1.aws.neon.build:
|
||||
ansible_host: i-087892e9625984a0b
|
||||
safekeeper-3.eu-west-1.aws.neon.build:
|
||||
ansible_host: i-0a6f91660e99e8891
|
||||
safekeeper-4.eu-west-1.aws.neon.build:
|
||||
ansible_host: i-0012e309e28e7c249
|
||||
safekeeper-5.eu-west-1.aws.neon.build:
|
||||
ansible_host: i-085a2b1193287b32e
|
||||
safekeeper-6.eu-west-1.aws.neon.build:
|
||||
ansible_host: i-0c713248465ed0fbd
|
||||
safekeeper-7.eu-west-1.aws.neon.build:
|
||||
ansible_host: i-02ad231aed2a80b7a
|
||||
safekeeper-8.eu-west-1.aws.neon.build:
|
||||
ansible_host: i-0dbbd8ffef66efda8
|
||||
56  .github/ansible/staging.us-east-2.hosts.yaml  vendored  Normal file
@@ -0,0 +1,56 @@
|
||||
storage:
|
||||
vars:
|
||||
bucket_name: neon-staging-storage-us-east-2
|
||||
bucket_region: us-east-2
|
||||
console_mgmt_base_url: http://neon-internal-api.aws.neon.build
|
||||
broker_endpoint: http://storage-broker-lb.beta.us-east-2.internal.aws.neon.build:50051
|
||||
pageserver_config_stub:
|
||||
pg_distrib_dir: /usr/local
|
||||
metric_collection_endpoint: http://neon-internal-api.aws.neon.build/billing/api/v1/usage_events
|
||||
metric_collection_interval: 10min
|
||||
disk_usage_based_eviction:
|
||||
max_usage_pct: 80
|
||||
min_avail_bytes: 0
|
||||
period: "10s"
|
||||
tenant_config:
|
||||
eviction_policy:
|
||||
kind: "LayerAccessThreshold"
|
||||
period: "20m"
|
||||
threshold: &default_eviction_threshold "20m"
|
||||
evictions_low_residence_duration_metric_threshold: *default_eviction_threshold
|
||||
remote_storage:
|
||||
bucket_name: "{{ bucket_name }}"
|
||||
bucket_region: "{{ bucket_region }}"
|
||||
prefix_in_bucket: "pageserver/v1"
|
||||
safekeeper_s3_prefix: safekeeper/v1/wal
|
||||
hostname_suffix: ""
|
||||
remote_user: ssm-user
|
||||
ansible_aws_ssm_region: us-east-2
|
||||
ansible_aws_ssm_bucket_name: neon-staging-storage-us-east-2
|
||||
console_region_id: aws-us-east-2
|
||||
sentry_environment: staging
|
||||
|
||||
children:
|
||||
pageservers:
|
||||
hosts:
|
||||
pageserver-0.us-east-2.aws.neon.build:
|
||||
ansible_host: i-0c3e70929edb5d691
|
||||
pageserver-1.us-east-2.aws.neon.build:
|
||||
ansible_host: i-0565a8b4008aa3f40
|
||||
pageserver-2.us-east-2.aws.neon.build:
|
||||
ansible_host: i-01e31cdf7e970586a
|
||||
pageserver-3.us-east-2.aws.neon.build:
|
||||
ansible_host: i-0602a0291365ef7cc
|
||||
pageserver-99.us-east-2.aws.neon.build:
|
||||
ansible_host: i-0c39491109bb88824
|
||||
|
||||
safekeepers:
|
||||
hosts:
|
||||
safekeeper-0.us-east-2.aws.neon.build:
|
||||
ansible_host: i-027662bd552bf5db0
|
||||
safekeeper-2.us-east-2.aws.neon.build:
|
||||
ansible_host: i-0de0b03a51676a6ce
|
||||
safekeeper-3.us-east-2.aws.neon.build:
|
||||
ansible_host: i-05f8ba2cda243bd18
|
||||
safekeeper-99.us-east-2.aws.neon.build:
|
||||
ansible_host: i-0d61b6a2ea32028d5
|
||||
18  .github/ansible/systemd/pageserver.service  vendored  Normal file
@@ -0,0 +1,18 @@
[Unit]
Description=Neon pageserver
After=network.target auditd.service

[Service]
Type=simple
User=pageserver
Environment=RUST_BACKTRACE=1 NEON_REPO_DIR=/storage/pageserver LD_LIBRARY_PATH=/usr/local/v14/lib SENTRY_DSN={{ SENTRY_URL_PAGESERVER }} SENTRY_ENVIRONMENT={{ sentry_environment }}
ExecStart=/usr/local/bin/pageserver -c "pg_distrib_dir='/usr/local'" -c "listen_pg_addr='0.0.0.0:6400'" -c "listen_http_addr='0.0.0.0:9898'" -c "broker_endpoint='{{ broker_endpoint }}'" -c "availability_zone='{{ ec2_availability_zone }}'" -D /storage/pageserver/data
ExecReload=/bin/kill -HUP $MAINPID
KillMode=mixed
KillSignal=SIGINT
Restart=on-failure
TimeoutSec=10
LimitNOFILE=30000000

[Install]
WantedBy=multi-user.target
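Once installed by the playbook, the unit can be inspected with standard systemd tooling on the storage host (a sketch; any recent journalctl/systemctl supports these flags):

sudo systemctl status pageserver
sudo journalctl -u pageserver --since "10 min ago"
# triggers the ExecReload above (kill -HUP) without a restart:
sudo systemctl reload pageserver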
18  .github/ansible/systemd/safekeeper.service  vendored  Normal file
@@ -0,0 +1,18 @@
[Unit]
Description=Neon safekeeper
After=network.target auditd.service

[Service]
Type=simple
User=safekeeper
Environment=RUST_BACKTRACE=1 NEON_REPO_DIR=/storage/safekeeper/data LD_LIBRARY_PATH=/usr/local/v14/lib SENTRY_DSN={{ SENTRY_URL_SAFEKEEPER }} SENTRY_ENVIRONMENT={{ sentry_environment }}
ExecStart=/usr/local/bin/safekeeper -l {{ inventory_hostname }}{{ hostname_suffix }}:6500 --listen-http {{ inventory_hostname }}{{ hostname_suffix }}:7676 -D /storage/safekeeper/data --broker-endpoint={{ broker_endpoint }} --remote-storage='{bucket_name="{{bucket_name}}", bucket_region="{{bucket_region}}", prefix_in_bucket="{{ safekeeper_s3_prefix }}"}' --availability-zone={{ ec2_availability_zone }}
ExecReload=/bin/kill -HUP $MAINPID
KillMode=mixed
KillSignal=SIGINT
Restart=on-failure
TimeoutSec=10
LimitNOFILE=30000000

[Install]
WantedBy=multi-user.target
1  .github/ansible/templates/pageserver.toml.j2  vendored  Normal file
@@ -0,0 +1 @@
{{ pageserver_config | sivel.toiletwater.to_toml }}
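For orientation, this is roughly what the rendered pageserver.toml looks like for the staging eu-central-1 stub above, once the playbook has merged in the existing `id` — a sketch only, since exact key ordering and formatting depend on the to_toml filter and the id is whatever the host already had:

id = 1
pg_distrib_dir = "/usr/local"
metric_collection_endpoint = "http://neon-internal-api.helium.aws.neon.build/billing/api/v1/usage_events"
metric_collection_interval = "10min"

[disk_usage_based_eviction]
max_usage_pct = 80
min_avail_bytes = 0
period = "10s"

[tenant_config]
evictions_low_residence_duration_metric_threshold = "20m"

[tenant_config.eviction_policy]
kind = "LayerAccessThreshold"
period = "20m"
threshold = "20m"

[remote_storage]
bucket_name = "neon-dev-storage-eu-central-1"
bucket_region = "eu-central-1"
prefix_in_bucket = "pageserver/v1"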
52  .github/helm-values/dev-eu-central-1-alpha.neon-storage-broker.yaml  vendored  Normal file
@@ -0,0 +1,52 @@
|
||||
# Helm chart values for neon-storage-broker
|
||||
podLabels:
|
||||
neon_env: staging
|
||||
neon_service: storage-broker
|
||||
|
||||
# Use L4 LB
|
||||
service:
|
||||
# service.annotations -- Annotations to add to the service
|
||||
annotations:
|
||||
service.beta.kubernetes.io/aws-load-balancer-type: external # use newer AWS Load Balancer Controller
|
||||
service.beta.kubernetes.io/aws-load-balancer-nlb-target-type: ip
|
||||
service.beta.kubernetes.io/aws-load-balancer-scheme: internal # deploy LB to private subnet
|
||||
# assign service to this name at external-dns
|
||||
external-dns.alpha.kubernetes.io/hostname: storage-broker-lb.alpha.eu-central-1.internal.aws.neon.build
|
||||
# service.type -- Service type
|
||||
type: LoadBalancer
|
||||
# service.port -- broker listen port
|
||||
port: 50051
|
||||
|
||||
ingress:
|
||||
enabled: false
|
||||
|
||||
metrics:
|
||||
enabled: false
|
||||
|
||||
extraManifests:
|
||||
- apiVersion: operator.victoriametrics.com/v1beta1
|
||||
kind: VMServiceScrape
|
||||
metadata:
|
||||
name: "{{ include \"neon-storage-broker.fullname\" . }}"
|
||||
labels:
|
||||
helm.sh/chart: neon-storage-broker-{{ .Chart.Version }}
|
||||
app.kubernetes.io/name: neon-storage-broker
|
||||
app.kubernetes.io/instance: neon-storage-broker
|
||||
app.kubernetes.io/version: "{{ .Chart.AppVersion }}"
|
||||
app.kubernetes.io/managed-by: Helm
|
||||
namespace: "{{ .Release.Namespace }}"
|
||||
spec:
|
||||
selector:
|
||||
matchLabels:
|
||||
app.kubernetes.io/name: "neon-storage-broker"
|
||||
endpoints:
|
||||
- port: broker
|
||||
path: /metrics
|
||||
interval: 10s
|
||||
scrapeTimeout: 10s
|
||||
namespaceSelector:
|
||||
matchNames:
|
||||
- "{{ .Release.Namespace }}"
|
||||
|
||||
settings:
|
||||
sentryEnvironment: "staging"
|
||||
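
A values file like this is applied with helm; an invocation along these lines would do it (the release name, chart reference, and namespace are assumptions, since the deploy pipeline itself is not part of this diff):

    helm upgrade --install neon-storage-broker neondatabase/neon-storage-broker \
      --namespace neon-storage-broker \
      -f .github/helm-values/dev-eu-central-1-alpha.neon-storage-broker.yaml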

19  .github/helm-values/dev-eu-central-1-alpha.pg-sni-router.yaml  vendored  Normal file
@@ -0,0 +1,19 @@
useCertManager: true

replicaCount: 3

exposedService:
  # exposedService.port -- Exposed Service proxy port
  port: 4432
  annotations:
    external-dns.alpha.kubernetes.io/hostname: "*.snirouter.alpha.eu-central-1.internal.aws.neon.build"

settings:
  domain: "*.snirouter.alpha.eu-central-1.internal.aws.neon.build"
  sentryEnvironment: "staging"

imagePullSecrets:
  - name: docker-hub-neon

metrics:
  enabled: false
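
Since external-dns publishes a wildcard record for the router, one hedged smoke test is simply resolving an endpoint-shaped name under it from a host inside the VPC (the endpoint ID below is invented):

    dig +short ep-example-123456.snirouter.alpha.eu-central-1.internal.aws.neon.build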

76  .github/helm-values/dev-eu-west-1-zeta.neon-proxy-scram.yaml  vendored  Normal file
@@ -0,0 +1,76 @@
# Helm chart values for neon-proxy-scram.
# This is a YAML-formatted file.

deploymentStrategy:
  type: RollingUpdate
  rollingUpdate:
    maxSurge: 100%
    maxUnavailable: 50%

# Delay the kill signal by 5 minutes (5 * 60)
# The pod(s) will stay in Terminating, keeps the existing connections
# but doesn't receive new ones
containerLifecycle:
  preStop:
    exec:
      command: ["/bin/sh", "-c", "sleep 300"]
terminationGracePeriodSeconds: 604800

image:
  repository: neondatabase/neon

settings:
  authBackend: "console"
  authEndpoint: "http://neon-internal-api.aws.neon.build/management/api/v2"
  domain: "*.eu-west-1.aws.neon.build"
  otelExporterOtlpEndpoint: "https://otel-collector.zeta.eu-west-1.internal.aws.neon.build"
  sentryEnvironment: "staging"
  wssPort: 8443
  metricCollectionEndpoint: "http://neon-internal-api.aws.neon.build/billing/api/v1/usage_events"
  metricCollectionInterval: "1min"

# -- Additional labels for neon-proxy pods
podLabels:
  neon_service: proxy-scram
  neon_env: dev
  neon_region: eu-west-1

exposedService:
  annotations:
    service.beta.kubernetes.io/aws-load-balancer-type: external
    service.beta.kubernetes.io/aws-load-balancer-nlb-target-type: ip
    service.beta.kubernetes.io/aws-load-balancer-scheme: internet-facing
    external-dns.alpha.kubernetes.io/hostname: eu-west-1.aws.neon.build
  httpsPort: 443

#metrics:
#  enabled: true
#  serviceMonitor:
#    enabled: true
#    selector:
#      release: kube-prometheus-stack

extraManifests:
  - apiVersion: operator.victoriametrics.com/v1beta1
    kind: VMServiceScrape
    metadata:
      name: "{{ include \"neon-proxy.fullname\" . }}"
      labels:
        helm.sh/chart: neon-proxy-{{ .Chart.Version }}
        app.kubernetes.io/name: neon-proxy
        app.kubernetes.io/instance: "{{ include \"neon-proxy.fullname\" . }}"
        app.kubernetes.io/version: "{{ .Chart.AppVersion }}"
        app.kubernetes.io/managed-by: Helm
      namespace: "{{ .Release.Namespace }}"
    spec:
      selector:
        matchLabels:
          app.kubernetes.io/name: "neon-proxy"
      endpoints:
        - port: http
          path: /metrics
          interval: 10s
          scrapeTimeout: 10s
      namespaceSelector:
        matchNames:
          - "{{ .Release.Namespace }}"
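
With these values the proxy serves SCRAM-authenticated Postgres on the regional wildcard domain; a client connection would look roughly like this (endpoint ID, role, password, and database are invented):

    psql "postgresql://user:password@ep-example-123456.eu-west-1.aws.neon.build/neondb?sslmode=require"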

52  .github/helm-values/dev-eu-west-1-zeta.neon-storage-broker.yaml  vendored  Normal file
@@ -0,0 +1,52 @@
# Helm chart values for neon-storage-broker
podLabels:
  neon_env: staging
  neon_service: storage-broker

# Use L4 LB
service:
  # service.annotations -- Annotations to add to the service
  annotations:
    service.beta.kubernetes.io/aws-load-balancer-type: external # use newer AWS Load Balancer Controller
    service.beta.kubernetes.io/aws-load-balancer-nlb-target-type: ip
    service.beta.kubernetes.io/aws-load-balancer-scheme: internal # deploy LB to private subnet
    # assign service to this name at external-dns
    external-dns.alpha.kubernetes.io/hostname: storage-broker-lb.zeta.eu-west-1.internal.aws.neon.build
  # service.type -- Service type
  type: LoadBalancer
  # service.port -- broker listen port
  port: 50051

ingress:
  enabled: false

metrics:
  enabled: false

extraManifests:
  - apiVersion: operator.victoriametrics.com/v1beta1
    kind: VMServiceScrape
    metadata:
      name: "{{ include \"neon-storage-broker.fullname\" . }}"
      labels:
        helm.sh/chart: neon-storage-broker-{{ .Chart.Version }}
        app.kubernetes.io/name: neon-storage-broker
        app.kubernetes.io/instance: neon-storage-broker
        app.kubernetes.io/version: "{{ .Chart.AppVersion }}"
        app.kubernetes.io/managed-by: Helm
      namespace: "{{ .Release.Namespace }}"
    spec:
      selector:
        matchLabels:
          app.kubernetes.io/name: "neon-storage-broker"
      endpoints:
        - port: broker
          path: /metrics
          interval: 10s
          scrapeTimeout: 10s
      namespaceSelector:
        matchNames:
          - "{{ .Release.Namespace }}"

settings:
  sentryEnvironment: "staging"

19  .github/helm-values/dev-eu-west-1-zeta.pg-sni-router.yaml  vendored  Normal file
@@ -0,0 +1,19 @@
useCertManager: true

replicaCount: 3

exposedService:
  # exposedService.port -- Exposed Service proxy port
  port: 4432
  annotations:
    external-dns.alpha.kubernetes.io/hostname: "*.snirouter.zeta.eu-west-1.internal.aws.neon.build"

settings:
  domain: "*.snirouter.zeta.eu-west-1.internal.aws.neon.build"
  sentryEnvironment: "staging"

imagePullSecrets:
  - name: docker-hub-neon

metrics:
  enabled: false

68  .github/helm-values/dev-us-east-2-beta.neon-proxy-link.yaml  vendored  Normal file
@@ -0,0 +1,68 @@
# Helm chart values for neon-proxy-link.
# This is a YAML-formatted file.

image:
  repository: neondatabase/neon

settings:
  authBackend: "link"
  authEndpoint: "https://console.stage.neon.tech/authenticate_proxy_request/"
  uri: "https://console.stage.neon.tech/psql_session/"
  domain: "pg.neon.build"
  otelExporterOtlpEndpoint: "https://otel-collector.beta.us-east-2.internal.aws.neon.build"
  sentryEnvironment: "staging"
  metricCollectionEndpoint: "http://neon-internal-api.aws.neon.build/billing/api/v1/usage_events"
  metricCollectionInterval: "1min"

# -- Additional labels for neon-proxy-link pods
podLabels:
  neon_service: proxy
  neon_env: dev
  neon_region: us-east-2

service:
  type: LoadBalancer
  annotations:
    service.beta.kubernetes.io/aws-load-balancer-type: external
    service.beta.kubernetes.io/aws-load-balancer-nlb-target-type: ip
    service.beta.kubernetes.io/aws-load-balancer-scheme: internal
    external-dns.alpha.kubernetes.io/hostname: neon-proxy-link-mgmt.beta.us-east-2.aws.neon.build

exposedService:
  annotations:
    service.beta.kubernetes.io/aws-load-balancer-type: external
    service.beta.kubernetes.io/aws-load-balancer-nlb-target-type: ip
    service.beta.kubernetes.io/aws-load-balancer-scheme: internet-facing
    external-dns.alpha.kubernetes.io/hostname: neon-proxy-link.beta.us-east-2.aws.neon.build

#metrics:
#  enabled: true
#  serviceMonitor:
#    enabled: true
#    selector:
#      release: kube-prometheus-stack

extraManifests:
  - apiVersion: operator.victoriametrics.com/v1beta1
    kind: VMServiceScrape
    metadata:
      name: "{{ include \"neon-proxy.fullname\" . }}"
      labels:
        helm.sh/chart: neon-proxy-{{ .Chart.Version }}
        app.kubernetes.io/name: neon-proxy
        app.kubernetes.io/instance: "{{ include \"neon-proxy.fullname\" . }}"
        app.kubernetes.io/version: "{{ .Chart.AppVersion }}"
        app.kubernetes.io/managed-by: Helm
      namespace: "{{ .Release.Namespace }}"
    spec:
      selector:
        matchLabels:
          app.kubernetes.io/name: "neon-proxy"
      endpoints:
        - port: http
          path: /metrics
          interval: 10s
          scrapeTimeout: 10s
      namespaceSelector:
        matchNames:
          - "{{ .Release.Namespace }}"
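
The link variant carries no in-band credentials: per the authEndpoint/uri settings above, the proxy defers to the console and hands the client a psql_session URL to finish authentication in a browser. A sketch of the flow (output paraphrased, session id invented):

    psql -h pg.neon.build
    # The proxy responds with a prompt along the lines of:
    #   Authenticate by visiting: https://console.stage.neon.tech/psql_session/<session-id>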

77  .github/helm-values/dev-us-east-2-beta.neon-proxy-scram-legacy.yaml  vendored  Normal file
@@ -0,0 +1,77 @@
# Helm chart values for neon-proxy-scram.
# This is a YAML-formatted file.

deploymentStrategy:
  type: RollingUpdate
  rollingUpdate:
    maxSurge: 100%
    maxUnavailable: 50%

# Delay the kill signal by 5 minutes (5 * 60)
# The pod(s) will stay in Terminating, keeps the existing connections
# but doesn't receive new ones
containerLifecycle:
  preStop:
    exec:
      command: ["/bin/sh", "-c", "sleep 300"]
terminationGracePeriodSeconds: 604800


image:
  repository: neondatabase/neon

settings:
  authBackend: "console"
  authEndpoint: "http://neon-internal-api.aws.neon.build/management/api/v2"
  domain: "*.cloud.stage.neon.tech"
  otelExporterOtlpEndpoint: "https://otel-collector.beta.us-east-2.internal.aws.neon.build"
  sentryEnvironment: "staging"
  wssPort: 8443
  metricCollectionEndpoint: "http://neon-internal-api.aws.neon.build/billing/api/v1/usage_events"
  metricCollectionInterval: "1min"

# -- Additional labels for neon-proxy pods
podLabels:
  neon_service: proxy-scram-legacy
  neon_env: dev
  neon_region: us-east-2

exposedService:
  annotations:
    service.beta.kubernetes.io/aws-load-balancer-type: external
    service.beta.kubernetes.io/aws-load-balancer-nlb-target-type: ip
    service.beta.kubernetes.io/aws-load-balancer-scheme: internet-facing
    external-dns.alpha.kubernetes.io/hostname: neon-proxy-scram-legacy.beta.us-east-2.aws.neon.build
  httpsPort: 443

#metrics:
#  enabled: true
#  serviceMonitor:
#    enabled: true
#    selector:
#      release: kube-prometheus-stack

extraManifests:
  - apiVersion: operator.victoriametrics.com/v1beta1
    kind: VMServiceScrape
    metadata:
      name: "{{ include \"neon-proxy.fullname\" . }}"
      labels:
        helm.sh/chart: neon-proxy-{{ .Chart.Version }}
        app.kubernetes.io/name: neon-proxy
        app.kubernetes.io/instance: "{{ include \"neon-proxy.fullname\" . }}"
        app.kubernetes.io/version: "{{ .Chart.AppVersion }}"
        app.kubernetes.io/managed-by: Helm
      namespace: "{{ .Release.Namespace }}"
    spec:
      selector:
        matchLabels:
          app.kubernetes.io/name: "neon-proxy"
      endpoints:
        - port: http
          path: /metrics
          interval: 10s
          scrapeTimeout: 10s
      namespaceSelector:
        matchNames:
          - "{{ .Release.Namespace }}"

78  .github/helm-values/dev-us-east-2-beta.neon-proxy-scram.yaml  vendored  Normal file
@@ -0,0 +1,78 @@
# Helm chart values for neon-proxy-scram.
# This is a YAML-formatted file.

deploymentStrategy:
  type: RollingUpdate
  rollingUpdate:
    maxSurge: 100%
    maxUnavailable: 50%

# Delay the kill signal by 5 minutes (5 * 60)
# The pod(s) will stay in Terminating, keeps the existing connections
# but doesn't receive new ones
containerLifecycle:
  preStop:
    exec:
      command: ["/bin/sh", "-c", "sleep 300"]
terminationGracePeriodSeconds: 604800


image:
  repository: neondatabase/neon

settings:
  authBackend: "console"
  authEndpoint: "http://neon-internal-api.aws.neon.build/management/api/v2"
  domain: "*.us-east-2.aws.neon.build"
  extraDomains: ["*.us-east-2.postgres.zenith.tech", "*.us-east-2.retooldb-staging.com"]
  otelExporterOtlpEndpoint: "https://otel-collector.beta.us-east-2.internal.aws.neon.build"
  sentryEnvironment: "staging"
  wssPort: 8443
  metricCollectionEndpoint: "http://neon-internal-api.aws.neon.build/billing/api/v1/usage_events"
  metricCollectionInterval: "1min"

# -- Additional labels for neon-proxy pods
podLabels:
  neon_service: proxy-scram
  neon_env: dev
  neon_region: us-east-2

exposedService:
  annotations:
    service.beta.kubernetes.io/aws-load-balancer-type: external
    service.beta.kubernetes.io/aws-load-balancer-nlb-target-type: ip
    service.beta.kubernetes.io/aws-load-balancer-scheme: internet-facing
    external-dns.alpha.kubernetes.io/hostname: us-east-2.aws.neon.build
  httpsPort: 443

#metrics:
#  enabled: true
#  serviceMonitor:
#    enabled: true
#    selector:
#      release: kube-prometheus-stack

extraManifests:
  - apiVersion: operator.victoriametrics.com/v1beta1
    kind: VMServiceScrape
    metadata:
      name: "{{ include \"neon-proxy.fullname\" . }}"
      labels:
        helm.sh/chart: neon-proxy-{{ .Chart.Version }}
        app.kubernetes.io/name: neon-proxy
        app.kubernetes.io/instance: "{{ include \"neon-proxy.fullname\" . }}"
        app.kubernetes.io/version: "{{ .Chart.AppVersion }}"
        app.kubernetes.io/managed-by: Helm
      namespace: "{{ .Release.Namespace }}"
    spec:
      selector:
        matchLabels:
          app.kubernetes.io/name: "neon-proxy"
      endpoints:
        - port: http
          path: /metrics
          interval: 10s
          scrapeTimeout: 10s
      namespaceSelector:
        matchNames:
          - "{{ .Release.Namespace }}"

52  .github/helm-values/dev-us-east-2-beta.neon-storage-broker.yaml  vendored  Normal file
@@ -0,0 +1,52 @@
# Helm chart values for neon-storage-broker
podLabels:
  neon_env: staging
  neon_service: storage-broker

# Use L4 LB
service:
  # service.annotations -- Annotations to add to the service
  annotations:
    service.beta.kubernetes.io/aws-load-balancer-type: external # use newer AWS Load Balancer Controller
    service.beta.kubernetes.io/aws-load-balancer-nlb-target-type: ip
    service.beta.kubernetes.io/aws-load-balancer-scheme: internal # deploy LB to private subnet
    # assign service to this name at external-dns
    external-dns.alpha.kubernetes.io/hostname: storage-broker-lb.beta.us-east-2.internal.aws.neon.build
  # service.type -- Service type
  type: LoadBalancer
  # service.port -- broker listen port
  port: 50051

ingress:
  enabled: false

metrics:
  enabled: false

extraManifests:
  - apiVersion: operator.victoriametrics.com/v1beta1
    kind: VMServiceScrape
    metadata:
      name: "{{ include \"neon-storage-broker.fullname\" . }}"
      labels:
        helm.sh/chart: neon-storage-broker-{{ .Chart.Version }}
        app.kubernetes.io/name: neon-storage-broker
        app.kubernetes.io/instance: neon-storage-broker
        app.kubernetes.io/version: "{{ .Chart.AppVersion }}"
        app.kubernetes.io/managed-by: Helm
      namespace: "{{ .Release.Namespace }}"
    spec:
      selector:
        matchLabels:
          app.kubernetes.io/name: "neon-storage-broker"
      endpoints:
        - port: broker
          path: /metrics
          interval: 10s
          scrapeTimeout: 10s
      namespaceSelector:
        matchNames:
          - "{{ .Release.Namespace }}"

settings:
  sentryEnvironment: "staging"

19  .github/helm-values/dev-us-east-2-beta.pg-sni-router.yaml  vendored  Normal file
@@ -0,0 +1,19 @@
useCertManager: true

replicaCount: 3

exposedService:
  # exposedService.port -- Exposed Service proxy port
  port: 4432
  annotations:
    external-dns.alpha.kubernetes.io/hostname: "*.snirouter.beta.us-east-2.internal.aws.neon.build"

settings:
  domain: "*.snirouter.beta.us-east-2.internal.aws.neon.build"
  sentryEnvironment: "staging"

imagePullSecrets:
  - name: docker-hub-neon

metrics:
  enabled: false

67  .github/helm-values/preview-template.neon-proxy-scram.yaml  vendored  Normal file
@@ -0,0 +1,67 @@
# Helm chart values for neon-proxy-scram.
# This is a YAML-formatted file.

deploymentStrategy:
  type: RollingUpdate
  rollingUpdate:
    maxSurge: 100%
    maxUnavailable: 50%

image:
  repository: neondatabase/neon

settings:
  authBackend: "console"
  authEndpoint: "http://neon-internal-api.${PREVIEW_NAME}.aws.neon.build/management/api/v2"
  domain: "*.cloud.${PREVIEW_NAME}.aws.neon.build"
  sentryEnvironment: "staging"
  wssPort: 8443
  metricCollectionEndpoint: "http://neon-internal-api.${PREVIEW_NAME}.aws.neon.build/billing/api/v1/usage_events"
  metricCollectionInterval: "1min"

# -- Additional labels for neon-proxy pods
podLabels:
  neon_service: proxy-scram
  neon_env: test
  neon_region: ${PREVIEW_NAME}.eu-central-1


exposedService:
  annotations:
    service.beta.kubernetes.io/aws-load-balancer-type: external
    service.beta.kubernetes.io/aws-load-balancer-nlb-target-type: ip
    service.beta.kubernetes.io/aws-load-balancer-scheme: internet-facing
    external-dns.alpha.kubernetes.io/hostname: cloud.${PREVIEW_NAME}.aws.neon.build
  httpsPort: 443

#metrics:
#  enabled: true
#  serviceMonitor:
#    enabled: true
#    selector:
#      release: kube-prometheus-stack

extraManifests:
  - apiVersion: operator.victoriametrics.com/v1beta1
    kind: VMServiceScrape
    metadata:
      name: "{{ include \"neon-proxy.fullname\" . }}"
      labels:
        helm.sh/chart: neon-proxy-{{ .Chart.Version }}
        app.kubernetes.io/name: neon-proxy
        app.kubernetes.io/instance: "{{ include \"neon-proxy.fullname\" . }}"
        app.kubernetes.io/version: "{{ .Chart.AppVersion }}"
        app.kubernetes.io/managed-by: Helm
      namespace: "{{ .Release.Namespace }}"
    spec:
      selector:
        matchLabels:
          app.kubernetes.io/name: "neon-proxy"
      endpoints:
        - port: http
          path: /metrics
          interval: 10s
          scrapeTimeout: 10s
      namespaceSelector:
        matchNames:
          - "{{ .Release.Namespace }}"
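
This file is a template keyed on ${PREVIEW_NAME}; the substitution step is not shown in this diff, but the placeholders fit an envsubst-style expansion (sketch, preview name invented):

    PREVIEW_NAME=pr-1234 envsubst '${PREVIEW_NAME}' \
      < .github/helm-values/preview-template.neon-proxy-scram.yaml \
      > /tmp/preview.neon-proxy-scram.yaml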

77  .github/helm-values/prod-ap-southeast-1-epsilon.neon-proxy-scram.yaml  vendored  Normal file
@@ -0,0 +1,77 @@
# Helm chart values for neon-proxy-scram.
# This is a YAML-formatted file.

deploymentStrategy:
  type: RollingUpdate
  rollingUpdate:
    maxSurge: 100%
    maxUnavailable: 50%

# Delay the kill signal by 5 minutes (5 * 60)
# The pod(s) will stay in Terminating, keeps the existing connections
# but doesn't receive new ones
containerLifecycle:
  preStop:
    exec:
      command: ["/bin/sh", "-c", "sleep 300"]
terminationGracePeriodSeconds: 604800


image:
  repository: neondatabase/neon

settings:
  authBackend: "console"
  authEndpoint: "http://neon-internal-api.aws.neon.tech/management/api/v2"
  domain: "*.ap-southeast-1.aws.neon.tech"
  extraDomains: ["*.ap-southeast-1.retooldb.com", "*.ap-southeast-1.postgres.vercel-storage.com"]
  sentryEnvironment: "production"
  wssPort: 8443
  metricCollectionEndpoint: "http://neon-internal-api.aws.neon.tech/billing/api/v1/usage_events"
  metricCollectionInterval: "10min"

# -- Additional labels for neon-proxy pods
podLabels:
  neon_service: proxy-scram
  neon_env: prod
  neon_region: ap-southeast-1

exposedService:
  annotations:
    service.beta.kubernetes.io/aws-load-balancer-type: external
    service.beta.kubernetes.io/aws-load-balancer-nlb-target-type: ip
    service.beta.kubernetes.io/aws-load-balancer-scheme: internet-facing
    external-dns.alpha.kubernetes.io/hostname: ap-southeast-1.aws.neon.tech
  httpsPort: 443

#metrics:
#  enabled: true
#  serviceMonitor:
#    enabled: true
#    selector:
#      release: kube-prometheus-stack

extraManifests:
  - apiVersion: operator.victoriametrics.com/v1beta1
    kind: VMServiceScrape
    metadata:
      name: "{{ include \"neon-proxy.fullname\" . }}"
      labels:
        helm.sh/chart: neon-proxy-{{ .Chart.Version }}
        app.kubernetes.io/name: neon-proxy
        app.kubernetes.io/instance: "{{ include \"neon-proxy.fullname\" . }}"
        app.kubernetes.io/version: "{{ .Chart.AppVersion }}"
        app.kubernetes.io/managed-by: Helm
      namespace: "{{ .Release.Namespace }}"
    spec:
      selector:
        matchLabels:
          app.kubernetes.io/name: "neon-proxy"
      endpoints:
        - port: http
          path: /metrics
          interval: 10s
          scrapeTimeout: 10s
      namespaceSelector:
        matchNames:
          - "{{ .Release.Namespace }}"

52  .github/helm-values/prod-ap-southeast-1-epsilon.neon-storage-broker.yaml  vendored  Normal file
@@ -0,0 +1,52 @@
# Helm chart values for neon-storage-broker
podLabels:
  neon_env: production
  neon_service: storage-broker

# Use L4 LB
service:
  # service.annotations -- Annotations to add to the service
  annotations:
    service.beta.kubernetes.io/aws-load-balancer-type: external # use newer AWS Load Balancer Controller
    service.beta.kubernetes.io/aws-load-balancer-nlb-target-type: ip
    service.beta.kubernetes.io/aws-load-balancer-scheme: internal # deploy LB to private subnet
    # assign service to this name at external-dns
    external-dns.alpha.kubernetes.io/hostname: storage-broker-lb.epsilon.ap-southeast-1.internal.aws.neon.tech
  # service.type -- Service type
  type: LoadBalancer
  # service.port -- broker listen port
  port: 50051

ingress:
  enabled: false

metrics:
  enabled: false

extraManifests:
  - apiVersion: operator.victoriametrics.com/v1beta1
    kind: VMServiceScrape
    metadata:
      name: "{{ include \"neon-storage-broker.fullname\" . }}"
      labels:
        helm.sh/chart: neon-storage-broker-{{ .Chart.Version }}
        app.kubernetes.io/name: neon-storage-broker
        app.kubernetes.io/instance: neon-storage-broker
        app.kubernetes.io/version: "{{ .Chart.AppVersion }}"
        app.kubernetes.io/managed-by: Helm
      namespace: "{{ .Release.Namespace }}"
    spec:
      selector:
        matchLabels:
          app.kubernetes.io/name: "neon-storage-broker"
      endpoints:
        - port: broker
          path: /metrics
          interval: 10s
          scrapeTimeout: 10s
      namespaceSelector:
        matchNames:
          - "{{ .Release.Namespace }}"

settings:
  sentryEnvironment: "production"

19  .github/helm-values/prod-ap-southeast-1-epsilon.pg-sni-router.yaml  vendored  Normal file
@@ -0,0 +1,19 @@
useCertManager: true

replicaCount: 3

exposedService:
  # exposedService.port -- Exposed Service proxy port
  port: 4432
  annotations:
    external-dns.alpha.kubernetes.io/hostname: "*.snirouter.epsilon.ap-southeast-1.internal.aws.neon.tech"

settings:
  domain: "*.snirouter.epsilon.ap-southeast-1.internal.aws.neon.tech"
  sentryEnvironment: "production"

imagePullSecrets:
  - name: docker-hub-neon

metrics:
  enabled: false

77  .github/helm-values/prod-eu-central-1-gamma.neon-proxy-scram.yaml  vendored  Normal file
@@ -0,0 +1,77 @@
# Helm chart values for neon-proxy-scram.
# This is a YAML-formatted file.

deploymentStrategy:
  type: RollingUpdate
  rollingUpdate:
    maxSurge: 100%
    maxUnavailable: 50%

# Delay the kill signal by 5 minutes (5 * 60)
# The pod(s) will stay in Terminating, keeps the existing connections
# but doesn't receive new ones
containerLifecycle:
  preStop:
    exec:
      command: ["/bin/sh", "-c", "sleep 300"]
terminationGracePeriodSeconds: 604800


image:
  repository: neondatabase/neon

settings:
  authBackend: "console"
  authEndpoint: "http://neon-internal-api.aws.neon.tech/management/api/v2"
  domain: "*.eu-central-1.aws.neon.tech"
  extraDomains: ["*.eu-central-1.retooldb.com", "*.eu-central-1.postgres.vercel-storage.com"]
  sentryEnvironment: "production"
  wssPort: 8443
  metricCollectionEndpoint: "http://neon-internal-api.aws.neon.tech/billing/api/v1/usage_events"
  metricCollectionInterval: "10min"

# -- Additional labels for neon-proxy pods
podLabels:
  neon_service: proxy-scram
  neon_env: prod
  neon_region: eu-central-1

exposedService:
  annotations:
    service.beta.kubernetes.io/aws-load-balancer-type: external
    service.beta.kubernetes.io/aws-load-balancer-nlb-target-type: ip
    service.beta.kubernetes.io/aws-load-balancer-scheme: internet-facing
    external-dns.alpha.kubernetes.io/hostname: eu-central-1.aws.neon.tech
  httpsPort: 443

#metrics:
#  enabled: true
#  serviceMonitor:
#    enabled: true
#    selector:
#      release: kube-prometheus-stack

extraManifests:
  - apiVersion: operator.victoriametrics.com/v1beta1
    kind: VMServiceScrape
    metadata:
      name: "{{ include \"neon-proxy.fullname\" . }}"
      labels:
        helm.sh/chart: neon-proxy-{{ .Chart.Version }}
        app.kubernetes.io/name: neon-proxy
        app.kubernetes.io/instance: "{{ include \"neon-proxy.fullname\" . }}"
        app.kubernetes.io/version: "{{ .Chart.AppVersion }}"
        app.kubernetes.io/managed-by: Helm
      namespace: "{{ .Release.Namespace }}"
    spec:
      selector:
        matchLabels:
          app.kubernetes.io/name: "neon-proxy"
      endpoints:
        - port: http
          path: /metrics
          interval: 10s
          scrapeTimeout: 10s
      namespaceSelector:
        matchNames:
          - "{{ .Release.Namespace }}"

52  .github/helm-values/prod-eu-central-1-gamma.neon-storage-broker.yaml  vendored  Normal file
@@ -0,0 +1,52 @@
# Helm chart values for neon-storage-broker
podLabels:
  neon_env: production
  neon_service: storage-broker

# Use L4 LB
service:
  # service.annotations -- Annotations to add to the service
  annotations:
    service.beta.kubernetes.io/aws-load-balancer-type: external # use newer AWS Load Balancer Controller
    service.beta.kubernetes.io/aws-load-balancer-nlb-target-type: ip
    service.beta.kubernetes.io/aws-load-balancer-scheme: internal # deploy LB to private subnet
    # assign service to this name at external-dns
    external-dns.alpha.kubernetes.io/hostname: storage-broker-lb.gamma.eu-central-1.internal.aws.neon.tech
  # service.type -- Service type
  type: LoadBalancer
  # service.port -- broker listen port
  port: 50051

ingress:
  enabled: false

metrics:
  enabled: false

extraManifests:
  - apiVersion: operator.victoriametrics.com/v1beta1
    kind: VMServiceScrape
    metadata:
      name: "{{ include \"neon-storage-broker.fullname\" . }}"
      labels:
        helm.sh/chart: neon-storage-broker-{{ .Chart.Version }}
        app.kubernetes.io/name: neon-storage-broker
        app.kubernetes.io/instance: neon-storage-broker
        app.kubernetes.io/version: "{{ .Chart.AppVersion }}"
        app.kubernetes.io/managed-by: Helm
      namespace: "{{ .Release.Namespace }}"
    spec:
      selector:
        matchLabels:
          app.kubernetes.io/name: "neon-storage-broker"
      endpoints:
        - port: broker
          path: /metrics
          interval: 10s
          scrapeTimeout: 10s
      namespaceSelector:
        matchNames:
          - "{{ .Release.Namespace }}"

settings:
  sentryEnvironment: "production"

19  .github/helm-values/prod-eu-central-1-gamma.pg-sni-router.yaml  vendored  Normal file
@@ -0,0 +1,19 @@
useCertManager: true

replicaCount: 3

exposedService:
  # exposedService.port -- Exposed Service proxy port
  port: 4432
  annotations:
    external-dns.alpha.kubernetes.io/hostname: "*.snirouter.gamma.eu-central-1.internal.aws.neon.tech"

settings:
  domain: "*.snirouter.gamma.eu-central-1.internal.aws.neon.tech"
  sentryEnvironment: "production"

imagePullSecrets:
  - name: docker-hub-neon

metrics:
  enabled: false

69  .github/helm-values/prod-us-east-1-theta.neon-proxy-scram.yaml  vendored  Normal file
@@ -0,0 +1,69 @@
# Helm chart values for neon-proxy-scram.
# This is a YAML-formatted file.

deploymentStrategy:
  type: RollingUpdate
  rollingUpdate:
    maxSurge: 100%
    maxUnavailable: 50%

# Delay the kill signal by 5 minutes (5 * 60)
# The pod(s) will stay in Terminating, keeps the existing connections
# but doesn't receive new ones
containerLifecycle:
  preStop:
    exec:
      command: ["/bin/sh", "-c", "sleep 300"]
terminationGracePeriodSeconds: 604800

image:
  repository: neondatabase/neon

settings:
  authBackend: "console"
  authEndpoint: "http://neon-internal-api.aws.neon.tech/management/api/v2"
  domain: "*.us-east-1.aws.neon.tech"
  # *.us-east-1.retooldb.com hasn't been delegated yet.
  extraDomains: ["*.us-east-1.postgres.vercel-storage.com"]
  sentryEnvironment: "production"
  wssPort: 8443
  metricCollectionEndpoint: "http://neon-internal-api.aws.neon.tech/billing/api/v1/usage_events"
  metricCollectionInterval: "10min"

podLabels:
  neon_service: proxy-scram
  neon_env: prod
  neon_region: us-east-1

exposedService:
  annotations:
    service.beta.kubernetes.io/aws-load-balancer-type: external
    service.beta.kubernetes.io/aws-load-balancer-nlb-target-type: ip
    service.beta.kubernetes.io/aws-load-balancer-scheme: internet-facing
    external-dns.alpha.kubernetes.io/hostname: us-east-1.aws.neon.tech
  httpsPort: 443

extraManifests:
  - apiVersion: operator.victoriametrics.com/v1beta1
    kind: VMServiceScrape
    metadata:
      name: "{{ include \"neon-proxy.fullname\" . }}"
      labels:
        helm.sh/chart: neon-proxy-{{ .Chart.Version }}
        app.kubernetes.io/name: neon-proxy
        app.kubernetes.io/instance: "{{ include \"neon-proxy.fullname\" . }}"
        app.kubernetes.io/version: "{{ .Chart.AppVersion }}"
        app.kubernetes.io/managed-by: Helm
      namespace: "{{ .Release.Namespace }}"
    spec:
      selector:
        matchLabels:
          app.kubernetes.io/name: "neon-proxy"
      endpoints:
        - port: http
          path: /metrics
          interval: 10s
          scrapeTimeout: 10s
      namespaceSelector:
        matchNames:
          - "{{ .Release.Namespace }}"

52  .github/helm-values/prod-us-east-1-theta.neon-storage-broker.yaml  vendored  Normal file
@@ -0,0 +1,52 @@
# Helm chart values for neon-storage-broker
podLabels:
  neon_env: production
  neon_service: storage-broker

# Use L4 LB
service:
  # service.annotations -- Annotations to add to the service
  annotations:
    service.beta.kubernetes.io/aws-load-balancer-type: external # use newer AWS Load Balancer Controller
    service.beta.kubernetes.io/aws-load-balancer-nlb-target-type: ip
    service.beta.kubernetes.io/aws-load-balancer-scheme: internal # deploy LB to private subnet
    # assign service to this name at external-dns
    external-dns.alpha.kubernetes.io/hostname: storage-broker-lb.theta.us-east-1.internal.aws.neon.tech
  # service.type -- Service type
  type: LoadBalancer
  # service.port -- broker listen port
  port: 50051

ingress:
  enabled: false

metrics:
  enabled: false

extraManifests:
  - apiVersion: operator.victoriametrics.com/v1beta1
    kind: VMServiceScrape
    metadata:
      name: "{{ include \"neon-storage-broker.fullname\" . }}"
      labels:
        helm.sh/chart: neon-storage-broker-{{ .Chart.Version }}
        app.kubernetes.io/name: neon-storage-broker
        app.kubernetes.io/instance: neon-storage-broker
        app.kubernetes.io/version: "{{ .Chart.AppVersion }}"
        app.kubernetes.io/managed-by: Helm
      namespace: "{{ .Release.Namespace }}"
    spec:
      selector:
        matchLabels:
          app.kubernetes.io/name: "neon-storage-broker"
      endpoints:
        - port: broker
          path: /metrics
          interval: 10s
          scrapeTimeout: 10s
      namespaceSelector:
        matchNames:
          - "{{ .Release.Namespace }}"

settings:
  sentryEnvironment: "production"

19  .github/helm-values/prod-us-east-1-theta.pg-sni-router.yaml  vendored  Normal file
@@ -0,0 +1,19 @@
useCertManager: true

replicaCount: 3

exposedService:
  # exposedService.port -- Exposed Service proxy port
  port: 4432
  annotations:
    external-dns.alpha.kubernetes.io/hostname: "*.snirouter.theta.us-east-1.internal.aws.neon.tech"

settings:
  domain: "*.snirouter.theta.us-east-1.internal.aws.neon.tech"
  sentryEnvironment: "production"

imagePullSecrets:
  - name: docker-hub-neon

metrics:
  enabled: false

58  .github/helm-values/prod-us-east-2-delta.neon-proxy-link.yaml  vendored  Normal file
@@ -0,0 +1,58 @@
# Helm chart values for neon-proxy-link.
# This is a YAML-formatted file.

image:
  repository: neondatabase/neon

settings:
  authBackend: "link"
  authEndpoint: "https://console.neon.tech/authenticate_proxy_request/"
  uri: "https://console.neon.tech/psql_session/"
  domain: "pg.neon.tech"
  sentryEnvironment: "production"

# -- Additional labels for zenith-proxy pods
podLabels:
  neon_service: proxy
  neon_env: production
  neon_region: us-east-2

service:
  type: LoadBalancer
  annotations:
    service.beta.kubernetes.io/aws-load-balancer-type: external
    service.beta.kubernetes.io/aws-load-balancer-nlb-target-type: ip
    service.beta.kubernetes.io/aws-load-balancer-scheme: internal
    external-dns.alpha.kubernetes.io/hostname: neon-proxy-link-mgmt.delta.us-east-2.aws.neon.tech

exposedService:
  annotations:
    service.beta.kubernetes.io/aws-load-balancer-type: external
    service.beta.kubernetes.io/aws-load-balancer-nlb-target-type: ip
    service.beta.kubernetes.io/aws-load-balancer-scheme: internet-facing
    external-dns.alpha.kubernetes.io/hostname: neon-proxy-link.delta.us-east-2.aws.neon.tech

extraManifests:
  - apiVersion: operator.victoriametrics.com/v1beta1
    kind: VMServiceScrape
    metadata:
      name: "{{ include \"neon-proxy.fullname\" . }}"
      labels:
        helm.sh/chart: neon-proxy-{{ .Chart.Version }}
        app.kubernetes.io/name: neon-proxy
        app.kubernetes.io/instance: "{{ include \"neon-proxy.fullname\" . }}"
        app.kubernetes.io/version: "{{ .Chart.AppVersion }}"
        app.kubernetes.io/managed-by: Helm
      namespace: "{{ .Release.Namespace }}"
    spec:
      selector:
        matchLabels:
          app.kubernetes.io/name: "neon-proxy"
      endpoints:
        - port: http
          path: /metrics
          interval: 10s
          scrapeTimeout: 10s
      namespaceSelector:
        matchNames:
          - "{{ .Release.Namespace }}"

77  .github/helm-values/prod-us-east-2-delta.neon-proxy-scram.yaml  vendored  Normal file
@@ -0,0 +1,77 @@
# Helm chart values for neon-proxy-scram.
# This is a YAML-formatted file.

deploymentStrategy:
  type: RollingUpdate
  rollingUpdate:
    maxSurge: 100%
    maxUnavailable: 50%

# Delay the kill signal by 5 minutes (5 * 60)
# The pod(s) will stay in Terminating, keeps the existing connections
# but doesn't receive new ones
containerLifecycle:
  preStop:
    exec:
      command: ["/bin/sh", "-c", "sleep 300"]
terminationGracePeriodSeconds: 604800


image:
  repository: neondatabase/neon

settings:
  authBackend: "console"
  authEndpoint: "http://neon-internal-api.aws.neon.tech/management/api/v2"
  domain: "*.us-east-2.aws.neon.tech"
  extraDomains: ["*.us-east-2.retooldb.com", "*.us-east-2.postgres.vercel-storage.com"]
  sentryEnvironment: "production"
  wssPort: 8443
  metricCollectionEndpoint: "http://neon-internal-api.aws.neon.tech/billing/api/v1/usage_events"
  metricCollectionInterval: "10min"

# -- Additional labels for neon-proxy pods
podLabels:
  neon_service: proxy-scram
  neon_env: prod
  neon_region: us-east-2

exposedService:
  annotations:
    service.beta.kubernetes.io/aws-load-balancer-type: external
    service.beta.kubernetes.io/aws-load-balancer-nlb-target-type: ip
    service.beta.kubernetes.io/aws-load-balancer-scheme: internet-facing
    external-dns.alpha.kubernetes.io/hostname: us-east-2.aws.neon.tech
  httpsPort: 443

#metrics:
#  enabled: true
#  serviceMonitor:
#    enabled: true
#    selector:
#      release: kube-prometheus-stack

extraManifests:
  - apiVersion: operator.victoriametrics.com/v1beta1
    kind: VMServiceScrape
    metadata:
      name: "{{ include \"neon-proxy.fullname\" . }}"
      labels:
        helm.sh/chart: neon-proxy-{{ .Chart.Version }}
        app.kubernetes.io/name: neon-proxy
        app.kubernetes.io/instance: "{{ include \"neon-proxy.fullname\" . }}"
        app.kubernetes.io/version: "{{ .Chart.AppVersion }}"
        app.kubernetes.io/managed-by: Helm
      namespace: "{{ .Release.Namespace }}"
    spec:
      selector:
        matchLabels:
          app.kubernetes.io/name: "neon-proxy"
      endpoints:
        - port: http
          path: /metrics
          interval: 10s
          scrapeTimeout: 10s
      namespaceSelector:
        matchNames:
          - "{{ .Release.Namespace }}"

52  .github/helm-values/prod-us-east-2-delta.neon-storage-broker.yaml  vendored  Normal file
@@ -0,0 +1,52 @@
# Helm chart values for neon-storage-broker
podLabels:
  neon_env: production
  neon_service: storage-broker

# Use L4 LB
service:
  # service.annotations -- Annotations to add to the service
  annotations:
    service.beta.kubernetes.io/aws-load-balancer-type: external # use newer AWS Load Balancer Controller
    service.beta.kubernetes.io/aws-load-balancer-nlb-target-type: ip
    service.beta.kubernetes.io/aws-load-balancer-scheme: internal # deploy LB to private subnet
    # assign service to this name at external-dns
    external-dns.alpha.kubernetes.io/hostname: storage-broker-lb.delta.us-east-2.internal.aws.neon.tech
  # service.type -- Service type
  type: LoadBalancer
  # service.port -- broker listen port
  port: 50051

ingress:
  enabled: false

metrics:
  enabled: false

extraManifests:
  - apiVersion: operator.victoriametrics.com/v1beta1
    kind: VMServiceScrape
    metadata:
      name: "{{ include \"neon-storage-broker.fullname\" . }}"
      labels:
        helm.sh/chart: neon-storage-broker-{{ .Chart.Version }}
        app.kubernetes.io/name: neon-storage-broker
        app.kubernetes.io/instance: neon-storage-broker
        app.kubernetes.io/version: "{{ .Chart.AppVersion }}"
        app.kubernetes.io/managed-by: Helm
      namespace: "{{ .Release.Namespace }}"
    spec:
      selector:
        matchLabels:
          app.kubernetes.io/name: "neon-storage-broker"
      endpoints:
        - port: broker
          path: /metrics
          interval: 10s
          scrapeTimeout: 10s
      namespaceSelector:
        matchNames:
          - "{{ .Release.Namespace }}"

settings:
  sentryEnvironment: "production"

19  .github/helm-values/prod-us-east-2-delta.pg-sni-router.yaml  vendored  Normal file
@@ -0,0 +1,19 @@
useCertManager: true

replicaCount: 3

exposedService:
  # exposedService.port -- Exposed Service proxy port
  port: 4432
  annotations:
    external-dns.alpha.kubernetes.io/hostname: "*.snirouter.delta.us-east-2.internal.aws.neon.tech"

settings:
  domain: "*.snirouter.delta.us-east-2.internal.aws.neon.tech"
  sentryEnvironment: "production"

imagePullSecrets:
  - name: docker-hub-neon

metrics:
  enabled: false

76  .github/helm-values/prod-us-west-2-eta.neon-proxy-scram-legacy.yaml  vendored  Normal file
@@ -0,0 +1,76 @@
# Helm chart values for neon-proxy-scram.
# This is a YAML-formatted file.

deploymentStrategy:
  type: RollingUpdate
  rollingUpdate:
    maxSurge: 100%
    maxUnavailable: 50%

# Delay the kill signal by 5 minutes (5 * 60)
# The pod(s) will stay in Terminating, keeps the existing connections
# but doesn't receive new ones
containerLifecycle:
  preStop:
    exec:
      command: ["/bin/sh", "-c", "sleep 300"]
terminationGracePeriodSeconds: 604800


image:
  repository: neondatabase/neon

settings:
  authBackend: "console"
  authEndpoint: "http://neon-internal-api.aws.neon.tech/management/api/v2"
  domain: "*.cloud.neon.tech"
  sentryEnvironment: "production"
  wssPort: 8443
  metricCollectionEndpoint: "http://neon-internal-api.aws.neon.tech/billing/api/v1/usage_events"
  metricCollectionInterval: "10min"

# -- Additional labels for neon-proxy pods
podLabels:
  neon_service: proxy-scram
  neon_env: prod
  neon_region: us-west-2

exposedService:
  annotations:
    service.beta.kubernetes.io/aws-load-balancer-type: external
    service.beta.kubernetes.io/aws-load-balancer-nlb-target-type: ip
    service.beta.kubernetes.io/aws-load-balancer-scheme: internet-facing
    external-dns.alpha.kubernetes.io/hostname: neon-proxy-scram-legacy.eta.us-west-2.aws.neon.tech
  httpsPort: 443

#metrics:
#  enabled: true
#  serviceMonitor:
#    enabled: true
#    selector:
#      release: kube-prometheus-stack

extraManifests:
  - apiVersion: operator.victoriametrics.com/v1beta1
    kind: VMServiceScrape
    metadata:
      name: "{{ include \"neon-proxy.fullname\" . }}"
      labels:
        helm.sh/chart: neon-proxy-{{ .Chart.Version }}
        app.kubernetes.io/name: neon-proxy
        app.kubernetes.io/instance: "{{ include \"neon-proxy.fullname\" . }}"
        app.kubernetes.io/version: "{{ .Chart.AppVersion }}"
        app.kubernetes.io/managed-by: Helm
      namespace: "{{ .Release.Namespace }}"
    spec:
      selector:
        matchLabels:
          app.kubernetes.io/name: "neon-proxy"
      endpoints:
        - port: http
          path: /metrics
          interval: 10s
          scrapeTimeout: 10s
      namespaceSelector:
        matchNames:
          - "{{ .Release.Namespace }}"

77  .github/helm-values/prod-us-west-2-eta.neon-proxy-scram.yaml  vendored  Normal file
@@ -0,0 +1,77 @@
# Helm chart values for neon-proxy-scram.
# This is a YAML-formatted file.

deploymentStrategy:
  type: RollingUpdate
  rollingUpdate:
    maxSurge: 100%
    maxUnavailable: 50%

# Delay the kill signal by 5 minutes (5 * 60)
# The pod(s) will stay in Terminating, keeps the existing connections
# but doesn't receive new ones
containerLifecycle:
  preStop:
    exec:
      command: ["/bin/sh", "-c", "sleep 300"]
terminationGracePeriodSeconds: 604800


image:
  repository: neondatabase/neon

settings:
  authBackend: "console"
  authEndpoint: "http://neon-internal-api.aws.neon.tech/management/api/v2"
  domain: "*.us-west-2.aws.neon.tech"
  extraDomains: ["*.us-west-2.retooldb.com", "*.us-west-2.postgres.vercel-storage.com"]
  sentryEnvironment: "production"
  wssPort: 8443
  metricCollectionEndpoint: "http://neon-internal-api.aws.neon.tech/billing/api/v1/usage_events"
  metricCollectionInterval: "10min"

# -- Additional labels for neon-proxy pods
podLabels:
  neon_service: proxy-scram
  neon_env: prod
  neon_region: us-west-2

exposedService:
  annotations:
    service.beta.kubernetes.io/aws-load-balancer-type: external
    service.beta.kubernetes.io/aws-load-balancer-nlb-target-type: ip
    service.beta.kubernetes.io/aws-load-balancer-scheme: internet-facing
    external-dns.alpha.kubernetes.io/hostname: us-west-2.aws.neon.tech
  httpsPort: 443

#metrics:
#  enabled: true
#  serviceMonitor:
#    enabled: true
#    selector:
#      release: kube-prometheus-stack

extraManifests:
  - apiVersion: operator.victoriametrics.com/v1beta1
    kind: VMServiceScrape
    metadata:
      name: "{{ include \"neon-proxy.fullname\" . }}"
      labels:
        helm.sh/chart: neon-proxy-{{ .Chart.Version }}
        app.kubernetes.io/name: neon-proxy
        app.kubernetes.io/instance: "{{ include \"neon-proxy.fullname\" . }}"
        app.kubernetes.io/version: "{{ .Chart.AppVersion }}"
        app.kubernetes.io/managed-by: Helm
      namespace: "{{ .Release.Namespace }}"
    spec:
      selector:
        matchLabels:
          app.kubernetes.io/name: "neon-proxy"
      endpoints:
        - port: http
          path: /metrics
          interval: 10s
          scrapeTimeout: 10s
      namespaceSelector:
        matchNames:
          - "{{ .Release.Namespace }}"

52  .github/helm-values/prod-us-west-2-eta.neon-storage-broker.yaml  vendored  Normal file
@@ -0,0 +1,52 @@
# Helm chart values for neon-storage-broker
podLabels:
  neon_env: production
  neon_service: storage-broker

# Use L4 LB
service:
  # service.annotations -- Annotations to add to the service
  annotations:
    service.beta.kubernetes.io/aws-load-balancer-type: external # use newer AWS Load Balancer Controller
    service.beta.kubernetes.io/aws-load-balancer-nlb-target-type: ip
    service.beta.kubernetes.io/aws-load-balancer-scheme: internal # deploy LB to private subnet
    # assign service to this name at external-dns
    external-dns.alpha.kubernetes.io/hostname: storage-broker-lb.eta.us-west-2.internal.aws.neon.tech
  # service.type -- Service type
  type: LoadBalancer
  # service.port -- broker listen port
  port: 50051

ingress:
  enabled: false

metrics:
  enabled: false

extraManifests:
  - apiVersion: operator.victoriametrics.com/v1beta1
    kind: VMServiceScrape
    metadata:
      name: "{{ include \"neon-storage-broker.fullname\" . }}"
      labels:
        helm.sh/chart: neon-storage-broker-{{ .Chart.Version }}
        app.kubernetes.io/name: neon-storage-broker
        app.kubernetes.io/instance: neon-storage-broker
        app.kubernetes.io/version: "{{ .Chart.AppVersion }}"
        app.kubernetes.io/managed-by: Helm
      namespace: "{{ .Release.Namespace }}"
    spec:
      selector:
        matchLabels:
          app.kubernetes.io/name: "neon-storage-broker"
      endpoints:
        - port: broker
          path: /metrics
          interval: 10s
          scrapeTimeout: 10s
      namespaceSelector:
        matchNames:
          - "{{ .Release.Namespace }}"

settings:
  sentryEnvironment: "production"

19  .github/helm-values/prod-us-west-2-eta.pg-sni-router.yaml  vendored  Normal file
@@ -0,0 +1,19 @@
useCertManager: true

replicaCount: 3

exposedService:
  # exposedService.port -- Exposed Service proxy port
  port: 4432
  annotations:
    external-dns.alpha.kubernetes.io/hostname: "*.snirouter.eta.us-west-2.internal.aws.neon.tech"

settings:
  domain: "*.snirouter.eta.us-west-2.internal.aws.neon.tech"
  sentryEnvironment: "production"

imagePullSecrets:
  - name: docker-hub-neon

metrics:
  enabled: false

27  .github/workflows/benchmarking.yml  vendored
@@ -93,7 +93,10 @@ jobs:
 
       - name: Create Allure report
         if: ${{ !cancelled() }}
-        uses: ./.github/actions/allure-report-generate
+        uses: ./.github/actions/allure-report
+        with:
+          action: generate
+          build_type: ${{ env.BUILD_TYPE }}
 
       - name: Post to a Slack channel
         if: ${{ github.event.schedule && failure() }}
@@ -280,7 +283,10 @@ jobs:
 
       - name: Create Allure report
         if: ${{ !cancelled() }}
-        uses: ./.github/actions/allure-report-generate
+        uses: ./.github/actions/allure-report
+        with:
+          action: generate
+          build_type: ${{ env.BUILD_TYPE }}
 
       - name: Post to a Slack channel
         if: ${{ github.event.schedule && failure() }}
@@ -374,7 +380,10 @@ jobs:
 
       - name: Create Allure report
         if: ${{ !cancelled() }}
-        uses: ./.github/actions/allure-report-generate
+        uses: ./.github/actions/allure-report
+        with:
+          action: generate
+          build_type: ${{ env.BUILD_TYPE }}
 
       - name: Post to a Slack channel
         if: ${{ github.event.schedule && failure() }}
@@ -467,7 +476,10 @@ jobs:
 
       - name: Create Allure report
         if: ${{ !cancelled() }}
-        uses: ./.github/actions/allure-report-generate
+        uses: ./.github/actions/allure-report
+        with:
+          action: generate
+          build_type: ${{ env.BUILD_TYPE }}
 
       - name: Post to a Slack channel
         if: ${{ github.event.schedule && failure() }}
@@ -554,13 +566,16 @@ jobs:
 
       - name: Create Allure report
         if: ${{ !cancelled() }}
-        uses: ./.github/actions/allure-report-generate
+        uses: ./.github/actions/allure-report
+        with:
+          action: generate
+          build_type: ${{ env.BUILD_TYPE }}
 
       - name: Post to a Slack channel
         if: ${{ github.event.schedule && failure() }}
         uses: slackapi/slack-github-action@v1
         with:
          channel-id: "C033QLM5P7D" # dev-staging-stream
-          slack-message: "Periodic User example perf testing ${{ matrix.platform }}: ${{ job.status }}\n${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}"
+          slack-message: "Periodic TPC-H perf testing ${{ matrix.platform }}: ${{ job.status }}\n${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}"
         env:
           SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }}
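
Taken together, these call sites switch the workflows from the standalone allure-report-generate action to a single allure-report composite action keyed by an action input plus a build_type. Its inputs block would look roughly like this; a sketch inferred from the call sites, since the action.yml itself is outside this excerpt:

    # .github/actions/allure-report/action.yml (inferred shape, not the real file)
    inputs:
      action:
        description: 'What to do with the report, e.g. generate'
        required: true
      build_type:
        description: 'Build type the test results belong to (debug, release, ...)'
        required: true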
113
.github/workflows/build_and_test.yml
vendored
113
.github/workflows/build_and_test.yml
vendored
@@ -330,7 +330,6 @@ jobs:
      fail-fast: false
      matrix:
        build_type: [ debug, release ]
        pg_version: [ v14, v15 ]
    steps:
      - name: Checkout
        uses: actions/checkout@v3
@@ -351,12 +350,11 @@ jobs:
          real_s3_secret_access_key: "${{ secrets.AWS_SECRET_ACCESS_KEY_CI_TESTS_S3 }}"
          rerun_flaky: true
        env:
          DEFAULT_PG_VERSION: ${{ matrix.pg_version }}
          TEST_RESULT_CONNSTR: ${{ secrets.REGRESS_TEST_RESULT_CONNSTR }}
          CHECK_ONDISK_DATA_COMPATIBILITY: nonempty

      - name: Merge and upload coverage data
        if: matrix.build_type == 'debug' && matrix.pg_version == 'v14'
        if: matrix.build_type == 'debug'
        uses: ./.github/actions/save-coverage-data

  benchmarks:
@@ -401,10 +399,21 @@ jobs:
    steps:
      - uses: actions/checkout@v3

      - name: Create Allure report
      - name: Create Allure report (debug)
        if: ${{ !cancelled() }}
        id: create-allure-report
        uses: ./.github/actions/allure-report-generate
        id: create-allure-report-debug
        uses: ./.github/actions/allure-report
        with:
          action: generate
          build_type: debug

      - name: Create Allure report (release)
        if: ${{ !cancelled() }}
        id: create-allure-report-release
        uses: ./.github/actions/allure-report
        with:
          action: generate
          build_type: release

      - uses: actions/github-script@v6
        if: >
@@ -414,37 +423,52 @@ jobs:
          # Retry script for 5XX server errors: https://github.com/actions/github-script#retries
          retries: 5
          script: |
            const report = {
              reportUrl: "${{ steps.create-allure-report.outputs.report-url }}",
              reportJsonUrl: "${{ steps.create-allure-report.outputs.report-json-url }}",
            }
            const reports = [{
              buildType: "debug",
              reportUrl: "${{ steps.create-allure-report-debug.outputs.report-url }}",
              jsonUrl: "${{ steps.create-allure-report-debug.outputs.report-json-url }}",
            }, {
              buildType: "release",
              reportUrl: "${{ steps.create-allure-report-release.outputs.report-url }}",
              jsonUrl: "${{ steps.create-allure-report-release.outputs.report-json-url }}",
            }]

            const script = require("./scripts/pr-comment-test-report.js")
            await script({
              github,
              context,
              fetch,
              report,
              reports,
            })

      - name: Store Allure test stat in the DB
        if: ${{ !cancelled() && steps.create-allure-report.outputs.report-json-url }}
        if: >
          !cancelled() && (
            steps.create-allure-report-debug.outputs.report-url ||
            steps.create-allure-report-release.outputs.report-url
          )
        env:
          COMMIT_SHA: ${{ github.event.pull_request.head.sha || github.sha }}
          REPORT_JSON_URL: ${{ steps.create-allure-report.outputs.report-json-url }}
          SHA: ${{ github.event.pull_request.head.sha || github.sha }}
          REPORT_JSON_URL_DEBUG: ${{ steps.create-allure-report-debug.outputs.report-json-url }}
          REPORT_JSON_URL_RELEASE: ${{ steps.create-allure-report-release.outputs.report-json-url }}
          TEST_RESULT_CONNSTR: ${{ secrets.REGRESS_TEST_RESULT_CONNSTR }}
        run: |
          ./scripts/pysync

          curl --fail --output suites.json "${REPORT_JSON_URL}"
          export BUILD_TYPE=unified
          export DATABASE_URL="$TEST_RESULT_CONNSTR"
          for report_url in $REPORT_JSON_URL_DEBUG $REPORT_JSON_URL_RELEASE; do
            if [ -z "$report_url" ]; then
              continue
            fi

            poetry run python3 scripts/ingest_regress_test_result.py \
              --revision ${COMMIT_SHA} \
              --reference ${GITHUB_REF} \
              --build-type ${BUILD_TYPE} \
              --ingest suites.json
            if [[ "$report_url" == "$REPORT_JSON_URL_DEBUG" ]]; then
              BUILD_TYPE=debug
            else
              BUILD_TYPE=release
            fi

            curl --fail --output suites.json "${report_url}"
            DATABASE_URL="$TEST_RESULT_CONNSTR" poetry run python3 scripts/ingest_regress_test_result.py --revision ${SHA} --reference ${GITHUB_REF} --build-type ${BUILD_TYPE} --ingest suites.json
          done

  coverage-report:
    runs-on: [ self-hosted, gen3, small ]
@@ -606,7 +630,6 @@ jobs:
          --cache-repo 369495373322.dkr.ecr.eu-central-1.amazonaws.com/cache
          --context .
          --build-arg GIT_VERSION=${{ github.sha }}
          --build-arg REPOSITORY=369495373322.dkr.ecr.eu-central-1.amazonaws.com
          --destination 369495373322.dkr.ecr.eu-central-1.amazonaws.com/neon:${{needs.tag.outputs.build-tag}}
          --destination neondatabase/neon:${{needs.tag.outputs.build-tag}}

@@ -692,7 +715,6 @@ jobs:
          --cache-repo 369495373322.dkr.ecr.eu-central-1.amazonaws.com/cache
          --context .
          --build-arg GIT_VERSION=${{ github.sha }}
          --build-arg REPOSITORY=369495373322.dkr.ecr.eu-central-1.amazonaws.com
          --dockerfile Dockerfile.compute-tools
          --destination 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-tools:${{needs.tag.outputs.build-tag}}
          --destination neondatabase/compute-tools:${{needs.tag.outputs.build-tag}}
@@ -745,7 +767,6 @@ jobs:
          --context .
          --build-arg GIT_VERSION=${{ github.sha }}
          --build-arg PG_VERSION=${{ matrix.version }}
          --build-arg REPOSITORY=369495373322.dkr.ecr.eu-central-1.amazonaws.com
          --dockerfile Dockerfile.compute-node
          --destination 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-${{ matrix.version }}:${{needs.tag.outputs.build-tag}}
          --destination neondatabase/compute-node-${{ matrix.version }}:${{needs.tag.outputs.build-tag}}
@@ -911,6 +932,42 @@ jobs:
      - name: Cleanup ECR folder
        run: rm -rf ~/.ecr

  deploy-pr-test-new:
    runs-on: [ self-hosted, gen3, small ]
    container: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/ansible:pinned
    # We need both storage **and** compute images for deploy, because control plane picks the compute version based on the storage version.
    # If it notices a fresh storage it may bump the compute version. And if the compute image failed to build, it may break things badly.
    needs: [ promote-images, tag, regress-tests ]
    if: |
      contains(github.event.pull_request.labels.*.name, 'deploy-test-storage') &&
      github.event_name != 'workflow_dispatch'
    defaults:
      run:
        shell: bash
    strategy:
      matrix:
        target_region: [ eu-west-1 ]
    steps:
      - name: Checkout
        uses: actions/checkout@v3
        with:
          submodules: true
          fetch-depth: 0

      - name: Redeploy
        run: |
          export DOCKER_TAG=${{needs.tag.outputs.build-tag}}
          cd "$(pwd)/.github/ansible"

          ./get_binaries.sh

          ansible-galaxy collection install sivel.toiletwater
          ansible-playbook deploy.yaml -i staging.${{ matrix.target_region }}.hosts.yaml -e @ssm_config -e CONSOLE_API_TOKEN=${{ secrets.NEON_STAGING_API_KEY }} -e SENTRY_URL_PAGESERVER=${{ secrets.SENTRY_URL_PAGESERVER }} -e SENTRY_URL_SAFEKEEPER=${{ secrets.SENTRY_URL_SAFEKEEPER }}
          rm -f neon_install.tar.gz .neon_current_version

      - name: Cleanup ansible folder
        run: rm -rf ~/.ansible

  deploy:
    runs-on: [ self-hosted, gen3, small ]
    container: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/ansible:latest
@@ -935,12 +992,12 @@ jobs:

      - name: Trigger deploy workflow
        env:
          GH_TOKEN: ${{ secrets.CI_ACCESS_TOKEN }}
          GH_TOKEN: ${{ github.token }}
        run: |
          if [[ "$GITHUB_REF_NAME" == "main" ]]; then
            gh workflow --repo neondatabase/aws run deploy-dev.yml --ref main -f branch=main -f dockerTag=${{needs.tag.outputs.build-tag}}
            gh workflow run deploy-dev.yml --ref main -f branch=${{ github.sha }} -f dockerTag=${{needs.tag.outputs.build-tag}}
          elif [[ "$GITHUB_REF_NAME" == "release" ]]; then
            gh workflow --repo neondatabase/aws run deploy-prod.yml --ref main -f branch=main -f dockerTag=${{needs.tag.outputs.build-tag}} -f disclamerAcknowledged=true
            gh workflow run deploy-prod.yml --ref release -f branch=${{ github.sha }} -f dockerTag=${{needs.tag.outputs.build-tag}} -f disclamerAcknowledged=true
          else
            echo "GITHUB_REF_NAME (value '$GITHUB_REF_NAME') is not set to either 'main' or 'release'"
            exit 1

280
.github/workflows/deploy-dev.yml
vendored
Normal file
@@ -0,0 +1,280 @@
name: Neon Deploy dev

on:
  workflow_dispatch:
    inputs:
      dockerTag:
        description: 'Docker tag to deploy'
        required: true
        type: string
      branch:
        description: 'Branch or commit used for deploy scripts and configs'
        required: true
        type: string
        default: 'main'
      deployStorage:
        description: 'Deploy storage'
        required: true
        type: boolean
        default: true
      deployProxy:
        description: 'Deploy proxy'
        required: true
        type: boolean
        default: true
      deployStorageBroker:
        description: 'Deploy storage-broker'
        required: true
        type: boolean
        default: true
      deployPgSniRouter:
        description: 'Deploy pg-sni-router'
        required: true
        type: boolean
        default: true

env:
  AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_DEV }}
  AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_KEY_DEV }}

concurrency:
  group: deploy-dev
  cancel-in-progress: false

jobs:
  deploy-storage-new:
    runs-on: [ self-hosted, gen3, small ]
    container:
      image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/ansible:pinned
      options: --user root --privileged
    if: inputs.deployStorage
    defaults:
      run:
        shell: bash
    strategy:
      matrix:
        # TODO(sergey): Fix storage deploy in eu-central-1
        target_region: [ eu-west-1, us-east-2 ]
    environment:
      name: dev-${{ matrix.target_region }}
    steps:
      - name: Checkout
        uses: actions/checkout@v3
        with:
          submodules: true
          fetch-depth: 0
          ref: ${{ inputs.branch }}

      - name: Redeploy
        run: |
          export DOCKER_TAG=${{ inputs.dockerTag }}
          cd "$(pwd)/.github/ansible"

          ./get_binaries.sh

          ansible-galaxy collection install sivel.toiletwater
          ansible-playbook -v deploy.yaml -i staging.${{ matrix.target_region }}.hosts.yaml -e @ssm_config -e CONSOLE_API_TOKEN=${{ secrets.NEON_STAGING_API_KEY }} -e SENTRY_URL_PAGESERVER=${{ secrets.SENTRY_URL_PAGESERVER }} -e SENTRY_URL_SAFEKEEPER=${{ secrets.SENTRY_URL_SAFEKEEPER }}
          rm -f neon_install.tar.gz .neon_current_version

      - name: Cleanup ansible folder
        run: rm -rf ~/.ansible

  deploy-proxy-new:
    runs-on: [ self-hosted, gen3, small ]
    container: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/ansible:pinned
    if: inputs.deployProxy
    defaults:
      run:
        shell: bash
    strategy:
      matrix:
        include:
          - target_region: us-east-2
            target_cluster: dev-us-east-2-beta
            deploy_link_proxy: true
            deploy_legacy_scram_proxy: true
          - target_region: eu-west-1
            target_cluster: dev-eu-west-1-zeta
            deploy_link_proxy: false
            deploy_legacy_scram_proxy: false
    environment:
      name: dev-${{ matrix.target_region }}
    steps:
      - name: Checkout
        uses: actions/checkout@v3
        with:
          submodules: true
          fetch-depth: 0
          ref: ${{ inputs.branch }}

      - name: Configure AWS Credentials
        uses: aws-actions/configure-aws-credentials@v1-node16
        with:
          role-to-assume: arn:aws:iam::369495373322:role/github-runner
          aws-region: eu-central-1
          role-skip-session-tagging: true
          role-duration-seconds: 1800

      - name: Configure environment
        run: |
          helm repo add neondatabase https://neondatabase.github.io/helm-charts
          aws --region ${{ matrix.target_region }} eks update-kubeconfig --name ${{ matrix.target_cluster }}

      - name: Re-deploy scram proxy
        run: |
          DOCKER_TAG=${{ inputs.dockerTag }}
          helm upgrade neon-proxy-scram neondatabase/neon-proxy --namespace neon-proxy --create-namespace --install --atomic -f .github/helm-values/${{ matrix.target_cluster }}.neon-proxy-scram.yaml --set image.tag=${DOCKER_TAG} --set settings.sentryUrl=${{ secrets.SENTRY_URL_PROXY }} --wait --timeout 15m0s

      - name: Re-deploy link proxy
        if: matrix.deploy_link_proxy
        run: |
          DOCKER_TAG=${{ inputs.dockerTag }}
          helm upgrade neon-proxy-link neondatabase/neon-proxy --namespace neon-proxy --create-namespace --install --atomic -f .github/helm-values/${{ matrix.target_cluster }}.neon-proxy-link.yaml --set image.tag=${DOCKER_TAG} --set settings.sentryUrl=${{ secrets.SENTRY_URL_PROXY }} --wait --timeout 15m0s

      - name: Re-deploy legacy scram proxy
        if: matrix.deploy_legacy_scram_proxy
        run: |
          DOCKER_TAG=${{ inputs.dockerTag }}
          helm upgrade neon-proxy-scram-legacy neondatabase/neon-proxy --namespace neon-proxy --create-namespace --install --atomic -f .github/helm-values/${{ matrix.target_cluster }}.neon-proxy-scram-legacy.yaml --set image.tag=${DOCKER_TAG} --set settings.sentryUrl=${{ secrets.SENTRY_URL_PROXY }} --wait --timeout 15m0s

      - name: Cleanup helm folder
        run: rm -rf ~/.cache

  deploy-preview-proxy-new:
    runs-on: [ self-hosted, gen3, small ]
    container: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/ansible:pinned
    if: inputs.deployProxy
    defaults:
      run:
        shell: bash
    strategy:
      matrix:
        include:
          - target_region: eu-central-1
            target_cluster: dev-eu-central-1-alpha
    environment:
      name: dev-${{ matrix.target_region }}
    steps:
      - name: Checkout
        uses: actions/checkout@v3
        with:
          submodules: true
          fetch-depth: 0
          ref: ${{ inputs.branch }}

      - name: Configure AWS Credentials
        uses: aws-actions/configure-aws-credentials@v1-node16
        with:
          role-to-assume: arn:aws:iam::369495373322:role/github-runner
          aws-region: eu-central-1
          role-skip-session-tagging: true
          role-duration-seconds: 1800

      - name: Configure environment
        run: |
          helm repo add neondatabase https://neondatabase.github.io/helm-charts
          aws --region ${{ matrix.target_region }} eks update-kubeconfig --name ${{ matrix.target_cluster }}

      - name: Re-deploy preview proxies
        run: |
          DOCKER_TAG=${{ inputs.dockerTag }}
          for PREVIEW_NAME in helium argon krypton xenon radon oganesson hydrogen nitrogen oxygen fluorine chlorine; do
            export PREVIEW_NAME
            envsubst <.github/helm-values/preview-template.neon-proxy-scram.yaml >preview-${PREVIEW_NAME}.neon-proxy-scram.yaml
            helm upgrade neon-proxy-scram-${PREVIEW_NAME} neondatabase/neon-proxy --namespace neon-proxy-${PREVIEW_NAME} --create-namespace --install --atomic -f preview-${PREVIEW_NAME}.neon-proxy-scram.yaml --set image.tag=${DOCKER_TAG} --set settings.sentryUrl=${{ secrets.SENTRY_URL_PROXY }} --wait --timeout 15m0s
          done

      - name: Cleanup helm folder
        run: rm -rf ~/.cache

  deploy-storage-broker-new:
    runs-on: [ self-hosted, gen3, small ]
    container: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/ansible:pinned
    if: inputs.deployStorageBroker
    defaults:
      run:
        shell: bash
    strategy:
      matrix:
        include:
          - target_region: us-east-2
            target_cluster: dev-us-east-2-beta
          - target_region: eu-west-1
            target_cluster: dev-eu-west-1-zeta
          - target_region: eu-central-1
            target_cluster: dev-eu-central-1-alpha
    environment:
      name: dev-${{ matrix.target_region }}
    steps:
      - name: Checkout
        uses: actions/checkout@v3
        with:
          submodules: true
          fetch-depth: 0
          ref: ${{ inputs.branch }}

      - name: Configure AWS Credentials
        uses: aws-actions/configure-aws-credentials@v1-node16
        with:
          role-to-assume: arn:aws:iam::369495373322:role/github-runner
          aws-region: eu-central-1
          role-skip-session-tagging: true
          role-duration-seconds: 1800

      - name: Configure environment
        run: |
          helm repo add neondatabase https://neondatabase.github.io/helm-charts
          aws --region ${{ matrix.target_region }} eks update-kubeconfig --name ${{ matrix.target_cluster }}

      - name: Deploy storage-broker
        run:
          helm upgrade neon-storage-broker-lb neondatabase/neon-storage-broker --namespace neon-storage-broker-lb --create-namespace --install --atomic -f .github/helm-values/${{ matrix.target_cluster }}.neon-storage-broker.yaml --set image.tag=${{ inputs.dockerTag }} --set settings.sentryUrl=${{ secrets.SENTRY_URL_BROKER }} --wait --timeout 5m0s

      - name: Cleanup helm folder
        run: rm -rf ~/.cache

  deploy-pg-sni-router:
    runs-on: [ self-hosted, gen3, small ]
    container: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/ansible:pinned
    if: inputs.deployPgSniRouter
    defaults:
      run:
        shell: bash
    strategy:
      matrix:
        include:
          - target_region: us-east-2
            target_cluster: dev-us-east-2-beta
          - target_region: eu-west-1
            target_cluster: dev-eu-west-1-zeta
          - target_region: eu-central-1
            target_cluster: dev-eu-central-1-alpha
    environment:
      name: dev-${{ matrix.target_region }}
    steps:
      - name: Checkout
        uses: actions/checkout@v3
        with:
          submodules: true
          fetch-depth: 0
          ref: ${{ inputs.branch }}

      - name: Configure AWS Credentials
        uses: aws-actions/configure-aws-credentials@v1-node16
        with:
          role-to-assume: arn:aws:iam::369495373322:role/github-runner
          aws-region: eu-central-1
          role-skip-session-tagging: true
          role-duration-seconds: 1800

      - name: Configure environment
        run: |
          helm repo add neondatabase https://neondatabase.github.io/helm-charts
          aws --region ${{ matrix.target_region }} eks update-kubeconfig --name ${{ matrix.target_cluster }}

      - name: Deploy pg-sni-router
        run:
          helm upgrade neon-pg-sni-router neondatabase/neon-pg-sni-router --namespace neon-pg-sni-router --create-namespace --install --debug --atomic -f .github/helm-values/${{ matrix.target_cluster }}.pg-sni-router.yaml --set image.tag=${{ inputs.dockerTag }} --set settings.sentryUrl=${{ secrets.SENTRY_URL_BROKER }} --wait --timeout 15m0s

      - name: Cleanup helm folder
        run: rm -rf ~/.cache
217
.github/workflows/deploy-prod.yml
vendored
Normal file
@@ -0,0 +1,217 @@
name: Neon Deploy prod

on:
  workflow_dispatch:
    inputs:
      dockerTag:
        description: 'Docker tag to deploy'
        required: true
        type: string
      branch:
        description: 'Branch or commit used for deploy scripts and configs'
        required: true
        type: string
        default: 'release'
      deployStorage:
        description: 'Deploy storage'
        required: true
        type: boolean
        default: true
      deployProxy:
        description: 'Deploy proxy'
        required: true
        type: boolean
        default: true
      deployStorageBroker:
        description: 'Deploy storage-broker'
        required: true
        type: boolean
        default: true
      deployPgSniRouter:
        description: 'Deploy pg-sni-router'
        required: true
        type: boolean
        default: true
      disclamerAcknowledged:
        description: 'I confirm that there is an emergency and I can not use the regular release workflow'
        required: true
        type: boolean
        default: false

concurrency:
  group: deploy-prod
  cancel-in-progress: false

jobs:
  deploy-prod-new:
    runs-on: prod
    container:
      image: 093970136003.dkr.ecr.eu-central-1.amazonaws.com/ansible:latest
      options: --user root --privileged
    if: inputs.deployStorage && inputs.disclamerAcknowledged
    defaults:
      run:
        shell: bash
    strategy:
      matrix:
        target_region: [ us-east-2, us-west-2, eu-central-1, ap-southeast-1, us-east-1 ]
    environment:
      name: prod-${{ matrix.target_region }}
    steps:
      - name: Checkout
        uses: actions/checkout@v3
        with:
          submodules: true
          fetch-depth: 0
          ref: ${{ inputs.branch }}

      - name: Redeploy
        run: |
          export DOCKER_TAG=${{ inputs.dockerTag }}
          cd "$(pwd)/.github/ansible"

          ./get_binaries.sh

          ansible-galaxy collection install sivel.toiletwater
          ansible-playbook -v deploy.yaml -i prod.${{ matrix.target_region }}.hosts.yaml -e @ssm_config -e CONSOLE_API_TOKEN=${{ secrets.NEON_PRODUCTION_API_KEY }} -e SENTRY_URL_PAGESERVER=${{ secrets.SENTRY_URL_PAGESERVER }} -e SENTRY_URL_SAFEKEEPER=${{ secrets.SENTRY_URL_SAFEKEEPER }}
          rm -f neon_install.tar.gz .neon_current_version

  deploy-proxy-prod-new:
    runs-on: prod
    container: 093970136003.dkr.ecr.eu-central-1.amazonaws.com/ansible:latest
    if: inputs.deployProxy && inputs.disclamerAcknowledged
    defaults:
      run:
        shell: bash
    strategy:
      matrix:
        include:
          - target_region: us-east-2
            target_cluster: prod-us-east-2-delta
            deploy_link_proxy: true
            deploy_legacy_scram_proxy: false
          - target_region: us-west-2
            target_cluster: prod-us-west-2-eta
            deploy_link_proxy: false
            deploy_legacy_scram_proxy: true
          - target_region: eu-central-1
            target_cluster: prod-eu-central-1-gamma
            deploy_link_proxy: false
            deploy_legacy_scram_proxy: false
          - target_region: ap-southeast-1
            target_cluster: prod-ap-southeast-1-epsilon
            deploy_link_proxy: false
            deploy_legacy_scram_proxy: false
          - target_region: us-east-1
            target_cluster: prod-us-east-1-theta
            deploy_link_proxy: false
            deploy_legacy_scram_proxy: false
    environment:
      name: prod-${{ matrix.target_region }}
    steps:
      - name: Checkout
        uses: actions/checkout@v3
        with:
          submodules: true
          fetch-depth: 0
          ref: ${{ inputs.branch }}

      - name: Configure environment
        run: |
          helm repo add neondatabase https://neondatabase.github.io/helm-charts
          aws --region ${{ matrix.target_region }} eks update-kubeconfig --name ${{ matrix.target_cluster }}

      - name: Re-deploy scram proxy
        run: |
          DOCKER_TAG=${{ inputs.dockerTag }}
          helm upgrade neon-proxy-scram neondatabase/neon-proxy --namespace neon-proxy --create-namespace --install --atomic -f .github/helm-values/${{ matrix.target_cluster }}.neon-proxy-scram.yaml --set image.tag=${DOCKER_TAG} --set settings.sentryUrl=${{ secrets.SENTRY_URL_PROXY }} --wait --timeout 15m0s

      - name: Re-deploy link proxy
        if: matrix.deploy_link_proxy
        run: |
          DOCKER_TAG=${{ inputs.dockerTag }}
          helm upgrade neon-proxy-link neondatabase/neon-proxy --namespace neon-proxy --create-namespace --install --atomic -f .github/helm-values/${{ matrix.target_cluster }}.neon-proxy-link.yaml --set image.tag=${DOCKER_TAG} --set settings.sentryUrl=${{ secrets.SENTRY_URL_PROXY }} --wait --timeout 15m0s

      - name: Re-deploy legacy scram proxy
        if: matrix.deploy_legacy_scram_proxy
        run: |
          DOCKER_TAG=${{ inputs.dockerTag }}
          helm upgrade neon-proxy-scram-legacy neondatabase/neon-proxy --namespace neon-proxy --create-namespace --install --atomic -f .github/helm-values/${{ matrix.target_cluster }}.neon-proxy-scram-legacy.yaml --set image.tag=${DOCKER_TAG} --set settings.sentryUrl=${{ secrets.SENTRY_URL_PROXY }} --wait --timeout 15m0s

  deploy-storage-broker-prod-new:
    runs-on: prod
    container: 093970136003.dkr.ecr.eu-central-1.amazonaws.com/ansible:latest
    if: inputs.deployStorageBroker && inputs.disclamerAcknowledged
    defaults:
      run:
        shell: bash
    strategy:
      matrix:
        include:
          - target_region: us-east-2
            target_cluster: prod-us-east-2-delta
          - target_region: us-west-2
            target_cluster: prod-us-west-2-eta
          - target_region: eu-central-1
            target_cluster: prod-eu-central-1-gamma
          - target_region: ap-southeast-1
            target_cluster: prod-ap-southeast-1-epsilon
          - target_region: us-east-1
            target_cluster: prod-us-east-1-theta
    environment:
      name: prod-${{ matrix.target_region }}
    steps:
      - name: Checkout
        uses: actions/checkout@v3
        with:
          submodules: true
          fetch-depth: 0
          ref: ${{ inputs.branch }}

      - name: Configure environment
        run: |
          helm repo add neondatabase https://neondatabase.github.io/helm-charts
          aws --region ${{ matrix.target_region }} eks update-kubeconfig --name ${{ matrix.target_cluster }}

      - name: Deploy storage-broker
        run:
          helm upgrade neon-storage-broker-lb neondatabase/neon-storage-broker --namespace neon-storage-broker-lb --create-namespace --install --atomic -f .github/helm-values/${{ matrix.target_cluster }}.neon-storage-broker.yaml --set image.tag=${{ inputs.dockerTag }} --set settings.sentryUrl=${{ secrets.SENTRY_URL_BROKER }} --wait --timeout 5m0s

  deploy-pg-sni-router:
    runs-on: prod
    container: 093970136003.dkr.ecr.eu-central-1.amazonaws.com/ansible:latest
    if: inputs.deployPgSniRouter && inputs.disclamerAcknowledged
    defaults:
      run:
        shell: bash
    strategy:
      matrix:
        include:
          - target_region: us-east-2
            target_cluster: prod-us-east-2-delta
          - target_region: us-west-2
            target_cluster: prod-us-west-2-eta
          - target_region: eu-central-1
            target_cluster: prod-eu-central-1-gamma
          - target_region: ap-southeast-1
            target_cluster: prod-ap-southeast-1-epsilon
          - target_region: us-east-1
            target_cluster: prod-us-east-1-theta
    environment:
      name: prod-${{ matrix.target_region }}
    steps:
      - name: Checkout
        uses: actions/checkout@v3
        with:
          submodules: true
          fetch-depth: 0
          ref: ${{ inputs.branch }}

      - name: Configure environment
        run: |
          helm repo add neondatabase https://neondatabase.github.io/helm-charts
          aws --region ${{ matrix.target_region }} eks update-kubeconfig --name ${{ matrix.target_cluster }}

      - name: Deploy pg-sni-router
        run:
          helm upgrade neon-pg-sni-router neondatabase/neon-pg-sni-router --namespace neon-pg-sni-router --create-namespace --install --debug --atomic -f .github/helm-values/${{ matrix.target_cluster }}.pg-sni-router.yaml --set image.tag=${{ inputs.dockerTag }} --set settings.sentryUrl=${{ secrets.SENTRY_URL_BROKER }} --wait --timeout 15m0s
@@ -2,7 +2,7 @@
### The image itself is mainly used as a container for the binaries and for starting e2e tests with custom parameters.
### By default, the binaries inside the image have some mock parameters and can start, but are not intended to be used
### inside this image in the real deployments.
ARG REPOSITORY=neondatabase
ARG REPOSITORY=369495373322.dkr.ecr.eu-central-1.amazonaws.com
ARG IMAGE=rust
ARG TAG=pinned

@@ -1,5 +1,5 @@
ARG PG_VERSION
ARG REPOSITORY=neondatabase
ARG REPOSITORY=369495373322.dkr.ecr.eu-central-1.amazonaws.com
ARG IMAGE=rust
ARG TAG=pinned
@@ -393,28 +393,6 @@ RUN case "${PG_VERSION}" in \
    make install -j $(getconf _NPROCESSORS_ONLN) && \
    echo "trusted = true" >> /usr/local/pgsql/share/extension/pg_hint_plan.control

#########################################################################################
#
# Layer "kq-imcx-pg-build"
# compile kq_imcx extension
#
#########################################################################################
FROM build-deps AS kq-imcx-pg-build
COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/

ENV PATH "/usr/local/pgsql/bin/:$PATH"
RUN apt-get update && \
    apt-get install -y git libgtk2.0-dev libpq-dev libpam-dev libxslt-dev libkrb5-dev cmake && \
    wget https://github.com/ketteq-neon/postgres-exts/archive/e0bd1a9d9313d7120c1b9c7bb15c48c0dede4c4e.tar.gz -O kq_imcx.tar.gz && \
    echo "dc93a97ff32d152d32737ba7e196d9687041cda15e58ab31344c2f2de8855336 kq_imcx.tar.gz" | sha256sum --check && \
    mkdir kq_imcx-src && cd kq_imcx-src && tar xvzf ../kq_imcx.tar.gz --strip-components=1 -C . && \
    mkdir build && \
    cd build && \
    cmake .. && \
    make -j $(getconf _NPROCESSORS_ONLN) && \
    make -j $(getconf _NPROCESSORS_ONLN) install && \
    echo 'trusted = true' >> /usr/local/pgsql/share/extension/kq_imcx.control

#########################################################################################
#
# Layer "rust extensions"
@@ -528,7 +506,6 @@ COPY --from=hll-pg-build /usr/local/pgsql/ /usr/local/pgsql/
COPY --from=plpgsql-check-pg-build /usr/local/pgsql/ /usr/local/pgsql/
COPY --from=timescaledb-pg-build /usr/local/pgsql/ /usr/local/pgsql/
COPY --from=pg-hint-plan-pg-build /usr/local/pgsql/ /usr/local/pgsql/
COPY --from=kq-imcx-pg-build /usr/local/pgsql/ /usr/local/pgsql/
COPY pgxn/ pgxn/

RUN make -j $(getconf _NPROCESSORS_ONLN) \
@@ -1,6 +1,6 @@
# First transient image to build compute_tools binaries
# NB: keep in sync with rust image version in .github/workflows/build_and_test.yml
ARG REPOSITORY=neondatabase
ARG REPOSITORY=369495373322.dkr.ecr.eu-central-1.amazonaws.com
ARG IMAGE=rust
ARG TAG=pinned

@@ -1,5 +1,3 @@
[](https://neon.tech)

# Neon

Neon is a serverless open-source alternative to AWS Aurora Postgres. It separates storage and compute and substitutes the PostgreSQL storage layer by redistributing data across a cluster of nodes.

@@ -30,7 +30,6 @@
//! -b /usr/local/bin/postgres
//! ```
//!
use std::collections::HashMap;
use std::fs::File;
use std::panic;
use std::path::Path;
@@ -68,54 +67,6 @@ fn main() -> Result<()> {
    let spec_json = matches.get_one::<String>("spec");
    let spec_path = matches.get_one::<String>("spec-path");

    // Extract OpenTelemetry context for the startup actions from the
    // TRACEPARENT and TRACESTATE env variables, and attach it to the current
    // tracing context.
    //
    // This is used to propagate the context for the 'start_compute' operation
    // from the neon control plane. This allows linking together the wider
    // 'start_compute' operation that creates the compute container, with the
    // startup actions here within the container.
    //
    // There is no standard for passing context in env variables, but a lot of
    // tools use TRACEPARENT/TRACESTATE, so we use that convention too. See
    // https://github.com/open-telemetry/opentelemetry-specification/issues/740
    //
    // Switch to the startup context here, and exit it once the startup has
    // completed and Postgres is up and running.
    //
    // If this pod is pre-created without binding it to any particular endpoint
    // yet, this isn't the right place to enter the startup context. In that
    // case, the control plane should pass the tracing context as part of the
    // /configure API call.
    //
    // NOTE: This is supposed to only cover the *startup* actions. Once
    // postgres is configured and up-and-running, we exit this span. Any other
    // actions that are performed on incoming HTTP requests, for example, are
    // performed in separate spans.
    //
    // XXX: If the pod is restarted, we perform the startup actions in the same
    // context as the original startup actions, which probably doesn't make
    // sense.
    let mut startup_tracing_carrier: HashMap<String, String> = HashMap::new();
    if let Ok(val) = std::env::var("TRACEPARENT") {
        startup_tracing_carrier.insert("traceparent".to_string(), val);
    }
    if let Ok(val) = std::env::var("TRACESTATE") {
        startup_tracing_carrier.insert("tracestate".to_string(), val);
    }
    let startup_context_guard = if !startup_tracing_carrier.is_empty() {
        use opentelemetry::propagation::TextMapPropagator;
        use opentelemetry::sdk::propagation::TraceContextPropagator;
        let guard = TraceContextPropagator::new()
            .extract(&startup_tracing_carrier)
            .attach();
        info!("startup tracing context attached");
        Some(guard)
    } else {
        None
    };

    let compute_id = matches.get_one::<String>("compute-id");
    let control_plane_uri = matches.get_one::<String>("control-plane-uri");
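
For context, a minimal sketch of the parent side of the env-var convention removed above: a launcher injecting its current trace context into TRACEPARENT/TRACESTATE before spawning the child. This assumes the same opentelemetry crate version as the code in the hunk; `launch_compute` and the binary path are illustrative, not actual control-plane code.

// Hypothetical launcher-side sketch: inject the current tracing context
// into TRACEPARENT/TRACESTATE env vars for a child process.
use std::collections::HashMap;
use std::process::Command;

use opentelemetry::propagation::TextMapPropagator;
use opentelemetry::sdk::propagation::TraceContextPropagator;
use opentelemetry::Context;

fn launch_compute(binary: &str) -> std::io::Result<std::process::Child> {
    // HashMap<String, String> implements the Injector trait.
    let mut carrier = HashMap::new();
    TraceContextPropagator::new().inject_context(&Context::current(), &mut carrier);

    let mut cmd = Command::new(binary);
    if let Some(tp) = carrier.get("traceparent") {
        cmd.env("TRACEPARENT", tp);
    }
    if let Some(ts) = carrier.get("tracestate") {
        cmd.env("TRACESTATE", ts);
    }
    cmd.spawn()
}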
@@ -197,6 +148,8 @@ fn main() -> Result<()> {

    // We got all we need, update the state.
    let mut state = compute.state.lock().unwrap();
    let pspec = state.pspec.as_ref().expect("spec must be set");
    let startup_tracing_context = pspec.spec.startup_tracing_context.clone();

    // Record for how long we slept waiting for the spec.
    state.metrics.wait_for_spec_ms = Utc::now()
@@ -212,6 +165,29 @@ fn main() -> Result<()> {
    compute.state_changed.notify_all();
    drop(state);

    // Extract OpenTelemetry context for the startup actions from the spec, and
    // attach it to the current tracing context.
    //
    // This is used to propagate the context for the 'start_compute' operation
    // from the neon control plane. This allows linking together the wider
    // 'start_compute' operation that creates the compute container, with the
    // startup actions here within the container.
    //
    // Switch to the startup context here, and exit it once the startup has
    // completed and Postgres is up and running.
    //
    // NOTE: This is supposed to only cover the *startup* actions. Once
    // postgres is configured and up-and-running, we exit this span. Any other
    // actions that are performed on incoming HTTP requests, for example, are
    // performed in separate spans.
    let startup_context_guard = if let Some(ref carrier) = startup_tracing_context {
        use opentelemetry::propagation::TextMapPropagator;
        use opentelemetry::sdk::propagation::TraceContextPropagator;
        Some(TraceContextPropagator::new().extract(carrier).attach())
    } else {
        None
    };

    // Launch remaining service threads
    let _monitor_handle = launch_monitor(&compute).expect("cannot launch compute monitor thread");
    let _configurator_handle =
@@ -338,27 +338,30 @@ impl ComputeNode {
        // In this case we need to connect with old `zenith_admin` name
        // and create new user. We cannot simply rename connected user,
        // but we can create a new one and grant it all privileges.
        let mut client = match Client::connect(self.connstr.as_str(), NoTls) {
            Err(e) => {
                info!(
                    "cannot connect to postgres: {}, retrying with `zenith_admin` username",
                    e
                );
                let mut zenith_admin_connstr = self.connstr.clone();
        let mut client = {
            let _span = tracing::info_span!("connect").entered();
            match Client::connect(self.connstr.as_str(), NoTls) {
                Err(e) => {
                    info!(
                        "cannot connect to postgres: {}, retrying with `zenith_admin` username",
                        e
                    );
                    let mut zenith_admin_connstr = self.connstr.clone();

                zenith_admin_connstr
                    .set_username("zenith_admin")
                    .map_err(|_| anyhow::anyhow!("invalid connstr"))?;
                    zenith_admin_connstr
                        .set_username("zenith_admin")
                        .map_err(|_| anyhow::anyhow!("invalid connstr"))?;

                let mut client = Client::connect(zenith_admin_connstr.as_str(), NoTls)?;
                client.simple_query("CREATE USER cloud_admin WITH SUPERUSER")?;
                client.simple_query("GRANT zenith_admin TO cloud_admin")?;
                drop(client);
                    let mut client = Client::connect(zenith_admin_connstr.as_str(), NoTls)?;
                    client.simple_query("CREATE USER cloud_admin WITH SUPERUSER")?;
                    client.simple_query("GRANT zenith_admin TO cloud_admin")?;
                    drop(client);

                // reconnect with connstring with expected name
                Client::connect(self.connstr.as_str(), NoTls)?
                    // reconnect with connstring with expected name
                    Client::connect(self.connstr.as_str(), NoTls)?
                }
                Ok(client) => client,
            }
            Ok(client) => client,
        };

        // Proceed with post-startup configuration. Note that the order of operations is important.

@@ -5,6 +5,7 @@
//! and connect it to the storage nodes.
use serde::{Deserialize, Serialize};
use serde_with::{serde_as, DisplayFromStr};
use std::collections::HashMap;
use utils::lsn::Lsn;

/// String type alias representing Postgres identifier and
@@ -30,6 +31,8 @@ pub struct ComputeSpec {
    pub mode: ComputeMode,

    pub storage_auth_token: Option<String>,

    pub startup_tracing_context: Option<HashMap<String, String>>,
}

#[serde_as]
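
The producer side of this new spec field is not shown in the diff; as a hedged sketch under the same opentelemetry API, the control plane could fill `startup_tracing_context` by injecting its current trace context into a plain string map before serializing the spec. The function name is illustrative.

// Hypothetical control-plane-side sketch: capture the current trace
// context into the map that becomes `startup_tracing_context`.
use std::collections::HashMap;

use opentelemetry::propagation::TextMapPropagator;
use opentelemetry::sdk::propagation::TraceContextPropagator;
use opentelemetry::Context;

fn startup_tracing_context() -> Option<HashMap<String, String>> {
    let mut carrier = HashMap::new();
    TraceContextPropagator::new().inject_context(&Context::current(), &mut carrier);
    // Omit the field entirely when there is no active trace.
    if carrier.is_empty() {
        None
    } else {
        Some(carrier)
    }
}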
@@ -48,33 +48,13 @@ pub enum TenantState {
}

impl TenantState {
    pub fn attachment_status(&self) -> TenantAttachmentStatus {
        use TenantAttachmentStatus::*;
    pub fn has_in_progress_downloads(&self) -> bool {
        match self {
            // The attach procedure writes the marker file before adding the Attaching tenant to the tenants map.
            // So, technically, we can return Attached here.
            // However, as soon as Console observes Attached, it will proceed with the Postgres-level health check.
            // But, our attach task might still be fetching the remote timelines, etc.
            // So, return `Maybe` while Attaching, making Console wait for the attach task to finish.
            Self::Attaching => Maybe,
            // tenant mgr startup distinguishes attaching from loading via the marker file.
            // If it's loading, there is no attach marker file, i.e., attach had finished in the past.
            Self::Loading => Attached,
            // We only reach Active after a successful load / attach.
            // So, call the attachment status Attached.
            Self::Active => Attached,
            // If the (initial or resumed) attach procedure fails, the tenant becomes Broken.
            // However, it also becomes Broken if the regular load fails.
            // We would need a separate TenantState variant to distinguish these cases.
            // However, there's no practical difference from Console's perspective.
            // It will run a Postgres-level health check as soon as it observes Attached.
            // That will fail on Broken tenants.
            // Console can then roll back the attach, or wait for an operator to fix the Broken tenant.
            Self::Broken { .. } => Attached,
            // Why is Stopping a Maybe case? Because, during pageserver shutdown,
            // we set the Stopping state irrespective of whether the tenant
            // has finished attaching or not.
            Self::Stopping => Maybe,
            Self::Loading => true,
            Self::Attaching => true,
            Self::Active => false,
            Self::Stopping => false,
            Self::Broken { .. } => false,
        }
    }
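
To make the comments above concrete, this is roughly how a Console-like consumer would act on the two statuses. An illustrative sketch only, not Console's actual code; `schedule_next_poll` and `run_postgres_health_check` are hypothetical helpers.

// Illustrative consumer-side sketch: acting on the attachment status.
fn on_attachment_status(status: TenantAttachmentStatus) {
    match status {
        // Attach may still be running: keep polling, and do NOT attach the
        // tenant to another pageserver in the meantime (split-brain risk).
        TenantAttachmentStatus::Maybe => schedule_next_poll(),
        // Attach finished, successfully or not: a Postgres-level health
        // check is what distinguishes a healthy tenant from a Broken one.
        TenantAttachmentStatus::Attached => run_postgres_health_check(),
    }
}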

@@ -229,25 +209,16 @@ impl TenantConfigRequest {
    }
}

/// See [`TenantState::attachment_status`] and the OpenAPI docs for context.
#[derive(Serialize, Deserialize, Clone)]
#[serde(rename_all = "snake_case")]
pub enum TenantAttachmentStatus {
    Maybe,
    Attached,
}

#[serde_as]
#[derive(Serialize, Deserialize, Clone)]
pub struct TenantInfo {
    #[serde_as(as = "DisplayFromStr")]
    pub id: TenantId,
    // NB: intentionally not part of OpenAPI, we don't want to commit to a specific set of TenantState's
    pub state: TenantState,
    /// Sum of the size of all layer files.
    /// If a layer is present in both local FS and S3, it counts only once.
    pub current_physical_size: Option<u64>, // physical size is only included in `tenant_status` endpoint
    pub attachment_status: TenantAttachmentStatus,
    pub has_in_progress_downloads: Option<bool>,
}

/// This represents the output of the "timeline_detail" and "timeline_list" API calls.
@@ -720,7 +691,7 @@ mod tests {
            id: TenantId::generate(),
            state: TenantState::Active,
            current_physical_size: Some(42),
            attachment_status: TenantAttachmentStatus::Attached,
            has_in_progress_downloads: Some(false),
        };
        let expected_active = json!({
            "id": original_active.id.to_string(),
@@ -728,7 +699,7 @@ mod tests {
                "slug": "Active",
            },
            "current_physical_size": 42,
            "attachment_status": "attached",
            "has_in_progress_downloads": false,
        });

        let original_broken = TenantInfo {
@@ -738,7 +709,7 @@ mod tests {
                backtrace: "backtrace info".into(),
            },
            current_physical_size: Some(42),
            attachment_status: TenantAttachmentStatus::Attached,
            has_in_progress_downloads: Some(false),
        };
        let expected_broken = json!({
            "id": original_broken.id.to_string(),
@@ -750,7 +721,7 @@ mod tests {
            }
        },
            "current_physical_size": 42,
            "attachment_status": "attached",
            "has_in_progress_downloads": false,
        });

        assert_eq!(

@@ -146,10 +146,6 @@ pub const XLH_UPDATE_OLD_ALL_VISIBLE_CLEARED: u8 = (1 << 0) as u8;
pub const XLH_UPDATE_NEW_ALL_VISIBLE_CLEARED: u8 = (1 << 1) as u8;
pub const XLH_DELETE_ALL_VISIBLE_CLEARED: u8 = (1 << 0) as u8;

// From replication/message.h
pub const XLOG_LOGICAL_MESSAGE: u8 = 0x00;

// From rmgrlist.h
pub const RM_XLOG_ID: u8 = 0;
pub const RM_XACT_ID: u8 = 1;
pub const RM_SMGR_ID: u8 = 2;
@@ -161,7 +157,6 @@ pub const RM_RELMAP_ID: u8 = 7;
pub const RM_STANDBY_ID: u8 = 8;
pub const RM_HEAP2_ID: u8 = 9;
pub const RM_HEAP_ID: u8 = 10;
pub const RM_LOGICALMSG_ID: u8 = 21;

// from xlogreader.h
pub const XLR_INFO_MASK: u8 = 0x0F;
@@ -128,15 +128,6 @@ impl RemoteStorage for LocalFs {
        // We need this dance with a sort-of durable rename (without fsyncs)
        // to prevent partial uploads. This was really hit when pageserver shutdown
        // cancelled the upload and a partial file was left on the fs.
        // NOTE: Because the temp file suffix is always the same, this operation is racy.
        // Two concurrent operations can lead to the following sequence:
        // T1: write(temp)
        // T2: write(temp) -> overwrites the content
        // T1: rename(temp, dst) -> succeeds
        // T2: rename(temp, dst) -> fails, temp no longer exists
        // This can be solved by supplying a unique temp suffix every time, but this situation
        // is not normal in the first place, and the error can help (and helped at least once)
        // to discover bugs in upper-level synchronization.
        let temp_file_path =
            path_with_suffix_extension(&target_file_path, LOCAL_FS_TEMP_FILE_SUFFIX);
        let mut destination = io::BufWriter::new(
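
As a hedged illustration of the alternative the NOTE above mentions (not the crate's actual behavior), a per-upload unique suffix would remove the rename race:

// Sketch only: each concurrent upload gets its own temp file, so one
// writer can no longer clobber another's data before the rename.
use std::path::{Path, PathBuf};
use std::sync::atomic::{AtomicU64, Ordering};

static UPLOAD_SEQ: AtomicU64 = AtomicU64::new(0);

fn unique_temp_path(target: &Path) -> PathBuf {
    // e.g. "layerfile.temp_upload.42"
    let seq = UPLOAD_SEQ.fetch_add(1, Ordering::Relaxed);
    let mut name = target.file_name().unwrap_or_default().to_os_string();
    name.push(format!(".temp_upload.{seq}"));
    target.with_file_name(name)
}

As the comment says, the shared-suffix error is kept deliberately: a failed rename is a signal that higher-level synchronization is broken.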
@@ -1,21 +1,21 @@
#!/bin/bash

set -euxo pipefail

PG_BIN=$1
WAL_PATH=$2
DATA_DIR=$3
PORT=$4
SYSID=$(od -A n -j 24 -N 8 -t d8 "$WAL_PATH"/000000010000000000000002* | cut -c 3-)
rm -fr "$DATA_DIR"
env -i LD_LIBRARY_PATH="$PG_BIN"/../lib "$PG_BIN"/initdb -E utf8 -U cloud_admin -D "$DATA_DIR" --sysid="$SYSID"
echo port="$PORT" >> "$DATA_DIR"/postgresql.conf
REDO_POS=0x$("$PG_BIN"/pg_controldata -D "$DATA_DIR" | grep -F "REDO location"| cut -c 42-)
SYSID=`od -A n -j 24 -N 8 -t d8 $WAL_PATH/000000010000000000000002* | cut -c 3-`
rm -fr $DATA_DIR
env -i LD_LIBRARY_PATH=$PG_BIN/../lib $PG_BIN/initdb -E utf8 -U cloud_admin -D $DATA_DIR --sysid=$SYSID
echo port=$PORT >> $DATA_DIR/postgresql.conf
REDO_POS=0x`$PG_BIN/pg_controldata -D $DATA_DIR | fgrep "REDO location"| cut -c 42-`
declare -i WAL_SIZE=$REDO_POS+114
"$PG_BIN"/pg_ctl -D "$DATA_DIR" -l logfile start
"$PG_BIN"/pg_ctl -D "$DATA_DIR" -l logfile stop -m immediate
cp "$DATA_DIR"/pg_wal/000000010000000000000001 .
cp "$WAL_PATH"/* "$DATA_DIR"/pg_wal/
for partial in "$DATA_DIR"/pg_wal/*.partial ; do mv "$partial" "${partial%.partial}" ; done
dd if=000000010000000000000001 of="$DATA_DIR"/pg_wal/000000010000000000000001 bs=$WAL_SIZE count=1 conv=notrunc
$PG_BIN/pg_ctl -D $DATA_DIR -l logfile start
$PG_BIN/pg_ctl -D $DATA_DIR -l logfile stop -m immediate
cp $DATA_DIR/pg_wal/000000010000000000000001 .
cp $WAL_PATH/* $DATA_DIR/pg_wal/
if [ -f $DATA_DIR/pg_wal/*.partial ]
then
    (cd $DATA_DIR/pg_wal ; for partial in *.partial ; do mv $partial `basename $partial .partial` ; done)
fi
dd if=000000010000000000000001 of=$DATA_DIR/pg_wal/000000010000000000000001 bs=$WAL_SIZE count=1 conv=notrunc
rm -f 000000010000000000000001
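
A note on the `od -A n -j 24 -N 8 -t d8` invocation above: it reads the 8-byte system identifier that PostgreSQL's XLogLongPageHeaderData stores (as xlp_sysid) at offset 24 of a WAL segment's first page. A hedged Rust equivalent, assuming a little-endian host like the od command's native byte order:

// Sketch: read the PostgreSQL system identifier from a WAL segment file.
use std::fs::File;
use std::io::{Read, Seek, SeekFrom};

fn read_sysid(wal_segment: &str) -> std::io::Result<u64> {
    let mut f = File::open(wal_segment)?;
    // xlp_sysid lives at byte offset 24 of the segment's first page.
    f.seek(SeekFrom::Start(24))?;
    let mut buf = [0u8; 8];
    f.read_exact(&mut buf)?;
    Ok(u64::from_le_bytes(buf))
}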
20
libs/utils/scripts/restore_from_wal_archive.sh
Executable file
@@ -0,0 +1,20 @@
PG_BIN=$1
WAL_PATH=$2
DATA_DIR=$3
PORT=$4
SYSID=`od -A n -j 24 -N 8 -t d8 $WAL_PATH/000000010000000000000002* | cut -c 3-`
rm -fr $DATA_DIR /tmp/pg_wals
mkdir /tmp/pg_wals
env -i LD_LIBRARY_PATH=$PG_BIN/../lib $PG_BIN/initdb -E utf8 -U cloud_admin -D $DATA_DIR --sysid=$SYSID
echo port=$PORT >> $DATA_DIR/postgresql.conf
REDO_POS=0x`$PG_BIN/pg_controldata -D $DATA_DIR | fgrep "REDO location"| cut -c 42-`
declare -i WAL_SIZE=$REDO_POS+114
cp $WAL_PATH/* /tmp/pg_wals
if [ -f $DATA_DIR/pg_wal/*.partial ]
then
    (cd /tmp/pg_wals ; for partial in *.partial ; do mv $partial `basename $partial .partial` ; done)
fi
dd if=$DATA_DIR/pg_wal/000000010000000000000001 of=/tmp/pg_wals/000000010000000000000001 bs=$WAL_SIZE count=1 conv=notrunc
echo > $DATA_DIR/recovery.signal
rm -f $DATA_DIR/pg_wal/*
echo "restore_command = 'cp /tmp/pg_wals/%f %p'" >> $DATA_DIR/postgresql.conf
@@ -60,43 +60,24 @@ pub mod tracing_span_assert;

pub mod rate_limit;

mod failpoint_macro_helpers {

    /// use with fail::cfg("$name", "return(2000)")
    ///
    /// The effect is similar to a "sleep(2000)" action, i.e. we sleep for the
    /// specified time (in milliseconds). The main difference is that we use the async
    /// tokio sleep function. Another difference is that we print lines to the log,
    /// which can be useful in tests to check that the failpoint was hit.
    #[macro_export]
    macro_rules! failpoint_sleep_millis_async {
        ($name:literal) => {{
            // If the failpoint is used with a "return" action, set should_sleep to the
            // returned value (as string). Otherwise it's set to None.
            let should_sleep = (|| {
                ::fail::fail_point!($name, |x| x);
                ::std::option::Option::None
            })();

            // Sleep if the action was a returned value
            if let ::std::option::Option::Some(duration_str) = should_sleep {
                $crate::failpoint_sleep_helper($name, duration_str).await
            }
        }};
    }

    // Helper function used by the macro. (A function has nicer scoping so we
    // don't need to decorate everything with "::")
    pub async fn failpoint_sleep_helper(name: &'static str, duration_str: String) {
        let millis = duration_str.parse::<u64>().unwrap();
        let d = std::time::Duration::from_millis(millis);

        tracing::info!("failpoint {:?}: sleeping for {:?}", name, d);
        tokio::time::sleep(d).await;
        tracing::info!("failpoint {:?}: sleep done", name);
    }
/// use with fail::cfg("$name", "return(2000)")
#[macro_export]
macro_rules! failpoint_sleep_millis_async {
    ($name:literal) => {{
        let should_sleep: Option<std::time::Duration> = (|| {
            fail::fail_point!($name, |v: Option<_>| {
                let millis = v.unwrap().parse::<u64>().unwrap();
                Some(Duration::from_millis(millis))
            });
            None
        })();
        if let Some(d) = should_sleep {
            tracing::info!("failpoint {:?}: sleeping for {:?}", $name, d);
            tokio::time::sleep(d).await;
            tracing::info!("failpoint {:?}: sleep done", $name);
        }
    }};
}
pub use failpoint_macro_helpers::failpoint_sleep_helper;
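
To tie the doc comment to a usage site, this is how the macro is intended to be armed and hit. The failpoint name "before-upload" is made up for this sketch.

// Illustrative use of the macro above inside an async codepath.
async fn upload_layer() {
    // A test can arm the failpoint before triggering this code:
    //     fail::cfg("before-upload", "return(2000)").unwrap();
    // which makes this call sleep for 2000ms and log before/after lines;
    // with the failpoint unarmed it is effectively a no-op.
    failpoint_sleep_millis_async!("before-upload");

    // ... the actual upload work would go here ...
}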

/// This is a shortcut to embed git sha into binaries and avoid copying the same build script to all packages
///

@@ -5,7 +5,7 @@
//!
use crate::context::{DownloadBehavior, RequestContext};
use crate::task_mgr::{self, TaskKind, BACKGROUND_RUNTIME};
use crate::tenant::{mgr, LogicalSizeCalculationCause};
use crate::tenant::mgr;
use anyhow;
use chrono::Utc;
use consumption_metrics::{idempotency_key, Event, EventChunk, EventType, CHUNK_SIZE};
@@ -164,8 +164,7 @@ pub async fn collect_metrics_iteration(
            timeline_written_size,
        ));

        let span = info_span!("collect_metrics_iteration", tenant_id = %timeline.tenant_id, timeline_id = %timeline.timeline_id);
        match span.in_scope(|| timeline.get_current_logical_size(ctx)) {
        match timeline.get_current_logical_size(ctx) {
            // Only send timeline logical size when it is fully calculated.
            Ok((size, is_exact)) if is_exact => {
                current_metrics.push((
@@ -335,9 +334,7 @@ pub async fn calculate_synthetic_size_worker(

        if let Ok(tenant) = mgr::get_tenant(tenant_id, true).await
        {
            if let Err(e) = tenant.calculate_synthetic_size(
                LogicalSizeCalculationCause::ConsumptionMetricsSyntheticSize,
                ctx).await {
            if let Err(e) = tenant.calculate_synthetic_size(ctx).await {
                error!("failed to calculate synthetic size for tenant {}: {}", tenant_id, e);
            }
        }

@@ -346,23 +346,23 @@ paths:
        starts writing to the tenant's S3 state unless it receives one of the
        distinguished errors below that state otherwise.

        If a client receives a not-distinguished response, e.g., a network timeout,
        it MUST retry the /attach request and poll again for the tenant's
        attachment status.

        After the client has received a 202, it MUST poll the tenant's
        attachment status (field `attachment_status`) to reach state `attached`.
        If the `attachment_status` is missing, the client MUST retry the `/attach`
        request (goto previous paragraph). This is a robustness measure in case the tenant
        status endpoint is buggy, but the attach operation is ongoing.
        The method to identify whether a request has arrived at the pageserver, and
        whether it has succeeded, is to poll for the tenant status to reach "Active"
        or "Broken" state. These values are currently not explicitly documented in
        the API spec.
        Polling for `has_in_progress_downloads == false` is INCORRECT because that
        value can turn `false` during shutdown while the Attach operation is still
        unfinished.

        There is no way to cancel an in-flight request.

        In any case, the client
        * MUST NOT ASSUME that the /attach request has been lost in the network,
        * MUST NOT ASSUME that the request has been lost, based on the observation
          that a subsequent tenant status request returns 404. The request may
          still be in flight. It must be retried.
        If a client receives a not-distinguished response, e.g., a network timeout,
        it MUST retry the /attach request and poll again for tenant status.

        In any case, it must
        * NOT ASSUME that the /attach request has been lost in the network,
        * NOT ASSUME that the request has been lost based on a subsequent
          tenant status request returning 404. (The request may still be in flight!)
      responses:
        "202":
          description: Tenant attaching scheduled
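
A hedged sketch of a client obeying the new protocol text above: retry /attach on any non-distinguished error and poll `attachment_status` until `attached`. The `attach_tenant` / `get_tenant_info` helpers and the response type are hypothetical, not part of the pageserver API crate.

// Hypothetical client-side polling loop for the /attach protocol.
struct TenantInfoResponse {
    attachment_status: Option<String>,
}

async fn attach_and_wait(tenant_id: &str) -> anyhow::Result<()> {
    loop {
        // A timeout or other non-distinguished error never means "lost":
        // retry the /attach request itself.
        if attach_tenant(tenant_id).await.is_err() {
            continue;
        }
        loop {
            match get_tenant_info(tenant_id).await {
                // Missing attachment_status: fall back to retrying /attach.
                Ok(TenantInfoResponse { attachment_status: None }) => break,
                Ok(TenantInfoResponse { attachment_status: Some(s) }) if s == "attached" => {
                    // Attached (maybe successfully, maybe not): hand off to
                    // a Postgres-level health check.
                    return Ok(());
                }
                // "maybe", a 404, or a transient error: keep polling.
                _ => tokio::time::sleep(std::time::Duration::from_secs(1)).await,
            }
        }
    }
}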
@@ -888,27 +888,13 @@ components:
      type: object
      required:
        - id
        - attachment_status
      properties:
        id:
          type: string
        current_physical_size:
          type: integer
        attachment_status:
          description: |
            Status of this tenant's attachment to this pageserver.

            - `maybe` means almost nothing, don't read anything into it
              except for the fact that the pageserver _might_ be already
              writing to the tenant's S3 state, so, DO NOT ATTACH the
              tenant to any other pageserver, or we risk split-brain.
            - `attached` means that the attach operation has completed,
              maybe successfully, maybe not. Perform a health check at
              the Postgres level to determine healthiness of the tenant.

            See the tenant `/attach` endpoint for more information.
          type: string
          enum: [ "maybe", "attached" ]
        has_in_progress_downloads:
          type: boolean
    TenantCreateInfo:
      type: object
      properties:

@@ -25,7 +25,7 @@ use crate::tenant::config::TenantConfOpt;
use crate::tenant::mgr::{TenantMapInsertError, TenantStateError};
use crate::tenant::size::ModelInputs;
use crate::tenant::storage_layer::LayerAccessStatsReset;
use crate::tenant::{LogicalSizeCalculationCause, PageReconstructError, Timeline};
use crate::tenant::{PageReconstructError, Timeline};
use crate::{config::PageServerConf, tenant::mgr};
use utils::{
    auth::JwtAuth,
@@ -105,9 +105,6 @@ impl From<PageReconstructError> for ApiError {
        PageReconstructError::Cancelled => {
            ApiError::InternalServerError(anyhow::anyhow!("request was cancelled"))
        }
        PageReconstructError::AncestorStopping(_) => {
            ApiError::InternalServerError(anyhow::Error::new(pre))
        }
        PageReconstructError::WalRedo(pre) => {
            ApiError::InternalServerError(anyhow::Error::new(pre))
        }
@@ -172,8 +169,6 @@ async fn build_timeline_info(
    include_non_incremental_logical_size: bool,
    ctx: &RequestContext,
) -> anyhow::Result<TimelineInfo> {
    crate::tenant::debug_assert_current_span_has_tenant_and_timeline_id();

    let mut info = build_timeline_info_common(timeline, ctx)?;
    if include_non_incremental_logical_size {
        // XXX we should be using spawn_ondemand_logical_size_calculation here.
@@ -196,7 +191,6 @@ fn build_timeline_info_common(
    timeline: &Arc<Timeline>,
    ctx: &RequestContext,
) -> anyhow::Result<TimelineInfo> {
    crate::tenant::debug_assert_current_span_has_tenant_and_timeline_id();
    let last_record_lsn = timeline.get_last_record_lsn();
    let (wal_source_connstr, last_received_msg_lsn, last_received_msg_ts) = {
        let guard = timeline.last_received_wal.lock().unwrap();
@@ -269,28 +263,25 @@ async fn timeline_create_handler(mut request: Request<Body>) -> Result<Response<

    let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Error);

    async {
        let tenant = mgr::get_tenant(tenant_id, true).await?;
        match tenant.create_timeline(
            new_timeline_id,
            request_data.ancestor_timeline_id.map(TimelineId::from),
            request_data.ancestor_start_lsn,
            request_data.pg_version.unwrap_or(crate::DEFAULT_PG_VERSION),
            &ctx,
        )
        .await {
            Ok(Some(new_timeline)) => {
                // Created. Construct a TimelineInfo for it.
                let timeline_info = build_timeline_info_common(&new_timeline, &ctx)
                    .map_err(ApiError::InternalServerError)?;
                json_response(StatusCode::CREATED, timeline_info)
            }
            Ok(None) => json_response(StatusCode::CONFLICT, ()), // timeline already exists
            Err(err) => Err(ApiError::InternalServerError(err)),
        }
    }
    let tenant = mgr::get_tenant(tenant_id, true).await?;
    match tenant.create_timeline(
        new_timeline_id,
        request_data.ancestor_timeline_id.map(TimelineId::from),
        request_data.ancestor_start_lsn,
        request_data.pg_version.unwrap_or(crate::DEFAULT_PG_VERSION),
        &ctx,
    )
    .instrument(info_span!("timeline_create", tenant = %tenant_id, new_timeline = ?request_data.new_timeline_id, timeline_id = %new_timeline_id, lsn=?request_data.ancestor_start_lsn, pg_version=?request_data.pg_version))
    .await
    .await {
        Ok(Some(new_timeline)) => {
            // Created. Construct a TimelineInfo for it.
            let timeline_info = build_timeline_info_common(&new_timeline, &ctx)
                .map_err(ApiError::InternalServerError)?;
            json_response(StatusCode::CREATED, timeline_info)
        }
        Ok(None) => json_response(StatusCode::CONFLICT, ()), // timeline already exists
        Err(err) => Err(ApiError::InternalServerError(err)),
    }
}

async fn timeline_list_handler(request: Request<Body>) -> Result<Response<Body>, ApiError> {
@@ -312,7 +303,6 @@ async fn timeline_list_handler(request: Request<Body>) -> Result<Response<Body>,
|
||||
include_non_incremental_logical_size.unwrap_or(false),
|
||||
&ctx,
|
||||
)
|
||||
.instrument(info_span!("build_timeline_info", timeline_id = %timeline.timeline_id))
|
||||
.await
|
||||
.context("Failed to convert tenant timeline {timeline_id} into the local one: {e:?}")
|
||||
.map_err(ApiError::InternalServerError)?;
|
||||
@@ -477,7 +467,7 @@ async fn tenant_list_handler(request: Request<Body>) -> Result<Response<Body>, A
|
||||
id: *id,
|
||||
state: state.clone(),
|
||||
current_physical_size: None,
|
||||
attachment_status: state.attachment_status(),
|
||||
has_in_progress_downloads: Some(state.has_in_progress_downloads()),
|
||||
})
|
||||
.collect::<Vec<TenantInfo>>();
|
||||
|
||||
@@ -502,7 +492,7 @@ async fn tenant_status(request: Request<Body>) -> Result<Response<Body>, ApiErro
|
||||
id: tenant_id,
|
||||
state: state.clone(),
|
||||
current_physical_size: Some(current_physical_size),
|
||||
attachment_status: state.attachment_status(),
|
||||
has_in_progress_downloads: Some(state.has_in_progress_downloads()),
|
||||
})
|
||||
}
|
||||
.instrument(info_span!("tenant_status_handler", tenant = %tenant_id))
|
||||
@@ -537,11 +527,7 @@ async fn tenant_size_handler(request: Request<Body>) -> Result<Response<Body>, A
|
||||
|
||||
// this can be long operation
|
||||
let inputs = tenant
|
||||
.gather_size_inputs(
|
||||
retention_period,
|
||||
LogicalSizeCalculationCause::TenantSizeHandler,
|
||||
&ctx,
|
||||
)
|
||||
.gather_size_inputs(retention_period, &ctx)
|
||||
.await
|
||||
.map_err(ApiError::InternalServerError)?;
|
||||
|
||||
|
||||
@@ -5,7 +5,7 @@ use std::ops::Range;
///
/// Represents a set of Keys, in a compact form.
///
#[derive(Clone, Debug, Default)]
#[derive(Clone, Debug)]
pub struct KeySpace {
/// Contiguous ranges of keys that belong to the key space. In key order,
/// and with no overlap.
@@ -61,58 +61,6 @@ impl KeySpace {

KeyPartitioning { parts }
}

///
/// Calculate logical size of delta layers: total size of all blocks covered by it's key range
///
pub fn get_logical_size(&self, range: &Range<Key>) -> u64 {
let mut start_key = range.start;
let n_ranges = self.ranges.len();
let start_index = match self.ranges.binary_search_by_key(&start_key, |r| r.start) {
Ok(index) => index, // keyspace range starts with start_key
Err(index) => {
if index != 0 && self.ranges[index - 1].end > start_key {
index - 1 // previous keyspace range overlaps with specified
} else if index == n_ranges {
return 0; // no intersection with specified range
} else {
start_key = self.ranges[index].start;
index
}
}
};
let mut size = 0u64;
for i in start_index..n_ranges {
if self.ranges[i].start >= range.end {
break;
}
let end_key = if self.ranges[i].end < range.end {
self.ranges[i].end
} else {
range.end
};
let n_blocks = key_range_size(&(start_key..end_key));
if n_blocks != u32::MAX {
size += n_blocks as u64 * BLCKSZ as u64;
}
if i + 1 < n_ranges {
start_key = self.ranges[i + 1].start;
}
}
size
}

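For intuition, a worked trace of the block arithmetic in `get_logical_size` above (hypothetical numbers; this assumes `key_range_size` returns the block count of a contiguous range and `BLCKSZ` is the 8192-byte Postgres block size):

// Keyspace ranges: [0..10), [20..30); queried range: 5..25.
// binary_search_by_key(&5, |r| r.start) -> Err(1); ranges[0].end (10) > 5,
// so start_index = 0 and start_key stays 5.
// i = 0: end_key = min(10, 25) = 10 -> key_range_size(5..10)  = 5 blocks
// i = 1: start_key = 20, end_key = min(30, 25) = 25           = 5 blocks
// Total: 10 blocks * BLCKSZ (8192) = 81920 bytes.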
///
/// Check if key space contains overlapping range
///
pub fn overlaps(&self, range: &Range<Key>) -> bool {
match self.ranges.binary_search_by_key(&range.end, |r| r.start) {
Ok(0) => false,
Err(0) => false,
Ok(index) => self.ranges[index - 1].end > range.start,
Err(index) => self.ranges[index - 1].end > range.start,
}
}
}

///
@@ -181,226 +129,3 @@ impl KeySpaceAccum {
}
}
}

///
/// A helper object, to collect a set of keys and key ranges into a KeySpace
/// object. Key ranges may be inserted in any order and can overlap.
///
#[derive(Clone, Debug, Default)]
pub struct KeySpaceRandomAccum {
ranges: Vec<Range<Key>>,
}

impl KeySpaceRandomAccum {
pub fn new() -> Self {
Self { ranges: Vec::new() }
}

pub fn add_key(&mut self, key: Key) {
self.add_range(singleton_range(key))
}

pub fn add_range(&mut self, range: Range<Key>) {
self.ranges.push(range);
}

pub fn to_keyspace(mut self) -> KeySpace {
let mut ranges = Vec::new();
if !self.ranges.is_empty() {
self.ranges.sort_by_key(|r| r.start);
let mut start = self.ranges.first().unwrap().start;
let mut end = self.ranges.first().unwrap().end;
for r in self.ranges {
assert!(r.start >= start);
if r.start > end {
ranges.push(start..end);
start = r.start;
end = r.end;
} else if r.end > end {
end = r.end;
}
}
ranges.push(start..end);
}
KeySpace { ranges }
}
}

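To make the sort-and-merge in `to_keyspace` above concrete, a short trace with a hypothetical input:

// Inserted (any order): [20..30), [0..10), [5..15)
// After sort_by_key(|r| r.start): [0..10), [5..15), [20..30)
// r = 5..15:  5 <= end (10) and 15 > 10 -> extend current run to 0..15
// r = 20..30: 20 > end (15)             -> emit 0..15, start new run 20..30
// Result: KeySpace { ranges: [0..15), [20..30) }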
#[cfg(test)]
mod tests {
use super::*;
use std::fmt::Write;

// Helper function to create a key range.
//
// Make the tests below less verbose.
fn kr(irange: Range<i128>) -> Range<Key> {
Key::from_i128(irange.start)..Key::from_i128(irange.end)
}

#[allow(dead_code)]
fn dump_keyspace(ks: &KeySpace) {
for r in ks.ranges.iter() {
println!(" {}..{}", r.start.to_i128(), r.end.to_i128());
}
}

fn assert_ks_eq(actual: &KeySpace, expected: Vec<Range<Key>>) {
if actual.ranges != expected {
let mut msg = String::new();

writeln!(msg, "expected:").unwrap();
for r in &expected {
writeln!(msg, " {}..{}", r.start.to_i128(), r.end.to_i128()).unwrap();
}
writeln!(msg, "got:").unwrap();
for r in &actual.ranges {
writeln!(msg, " {}..{}", r.start.to_i128(), r.end.to_i128()).unwrap();
}
panic!("{}", msg);
}
}

#[test]
fn keyspace_add_range() {
// two separate ranges
//
// #####
// #####
let mut ks = KeySpaceRandomAccum::default();
ks.add_range(kr(0..10));
ks.add_range(kr(20..30));
assert_ks_eq(&ks.to_keyspace(), vec![kr(0..10), kr(20..30)]);

// two separate ranges, added in reverse order
//
// #####
// #####
let mut ks = KeySpaceRandomAccum::default();
ks.add_range(kr(20..30));
ks.add_range(kr(0..10));

// add range that is adjacent to the end of an existing range
//
// #####
// #####
ks.add_range(kr(0..10));
ks.add_range(kr(10..30));
assert_ks_eq(&ks.to_keyspace(), vec![kr(0..30)]);

// add range that is adjacent to the start of an existing range
//
// #####
// #####
let mut ks = KeySpaceRandomAccum::default();
ks.add_range(kr(10..30));
ks.add_range(kr(0..10));
assert_ks_eq(&ks.to_keyspace(), vec![kr(0..30)]);

// add range that overlaps with the end of an existing range
//
// #####
// #####
let mut ks = KeySpaceRandomAccum::default();
ks.add_range(kr(0..10));
ks.add_range(kr(5..30));
assert_ks_eq(&ks.to_keyspace(), vec![kr(0..30)]);

// add range that overlaps with the start of an existing range
//
// #####
// #####
let mut ks = KeySpaceRandomAccum::default();
ks.add_range(kr(5..30));
ks.add_range(kr(0..10));
assert_ks_eq(&ks.to_keyspace(), vec![kr(0..30)]);

// add range that is fully covered by an existing range
//
// #########
// #####
let mut ks = KeySpaceRandomAccum::default();
ks.add_range(kr(0..30));
ks.add_range(kr(10..20));
assert_ks_eq(&ks.to_keyspace(), vec![kr(0..30)]);

// add range that extends an existing range from both ends
//
// #####
// #########
let mut ks = KeySpaceRandomAccum::default();
ks.add_range(kr(10..20));
ks.add_range(kr(0..30));
assert_ks_eq(&ks.to_keyspace(), vec![kr(0..30)]);

// add a range that overlaps with two existing ranges, joining them
//
// ##### #####
// #######
let mut ks = KeySpaceRandomAccum::default();
ks.add_range(kr(0..10));
ks.add_range(kr(20..30));
ks.add_range(kr(5..25));
assert_ks_eq(&ks.to_keyspace(), vec![kr(0..30)]);
}

#[test]
fn keyspace_overlaps() {
let mut ks = KeySpaceRandomAccum::default();
ks.add_range(kr(10..20));
ks.add_range(kr(30..40));
let ks = ks.to_keyspace();

// ##### #####
// xxxx
assert!(!ks.overlaps(&kr(0..5)));

// ##### #####
// xxxx
assert!(!ks.overlaps(&kr(5..9)));

// ##### #####
// xxxx
assert!(!ks.overlaps(&kr(5..10)));

// ##### #####
// xxxx
assert!(ks.overlaps(&kr(5..11)));

// ##### #####
// xxxx
assert!(ks.overlaps(&kr(10..15)));

// ##### #####
// xxxx
assert!(ks.overlaps(&kr(15..20)));

// ##### #####
// xxxx
assert!(ks.overlaps(&kr(15..25)));

// ##### #####
// xxxx
assert!(!ks.overlaps(&kr(22..28)));

// ##### #####
// xxxx
assert!(!ks.overlaps(&kr(25..30)));

// ##### #####
// xxxx
assert!(ks.overlaps(&kr(35..35)));

// ##### #####
// xxxx
assert!(!ks.overlaps(&kr(40..45)));

// ##### #####
// xxxx
assert!(!ks.overlaps(&kr(45..50)));

// ##### #####
// xxxxxxxxxxx
assert!(ks.overlaps(&kr(0..30))); // XXXXX This fails currently!
}
}

@@ -30,7 +30,6 @@ const STORAGE_TIME_OPERATIONS: &[&str] = &[
"create images",
"init logical size",
"logical size",
"imitate logical size",
"load layer map",
"gc",
];
@@ -187,16 +186,6 @@ static PERSISTENT_BYTES_WRITTEN: Lazy<IntCounterVec> = Lazy::new(|| {
.expect("failed to define a metric")
});

pub(crate) static EVICTION_ITERATION_DURATION: Lazy<HistogramVec> = Lazy::new(|| {
register_histogram_vec!(
"pageserver_eviction_iteration_duration_seconds_global",
"Time spent on a single eviction iteration",
&["period_secs", "threshold_secs"],
STORAGE_OP_BUCKETS.into(),
)
.expect("failed to define a metric")
});

static EVICTIONS: Lazy<IntCounterVec> = Lazy::new(|| {
register_int_counter_vec!(
"pageserver_evictions",
@@ -489,15 +478,6 @@ pub static TENANT_TASK_EVENTS: Lazy<IntCounterVec> = Lazy::new(|| {
.expect("Failed to register tenant_task_events metric")
});

pub static BACKGROUND_LOOP_PERIOD_OVERRUN_COUNT: Lazy<IntCounterVec> = Lazy::new(|| {
register_int_counter_vec!(
"pageserver_background_loop_period_overrun_count",
"Incremented whenever warn_when_period_overrun() logs a warning.",
&["task", "period"],
)
.expect("failed to define a metric")
});

// walreceiver metrics

pub static WALRECEIVER_STARTED_CONNECTIONS: Lazy<IntCounter> = Lazy::new(|| {
@@ -708,7 +688,6 @@ pub struct TimelineMetrics {
pub compact_time_histo: StorageTimeMetrics,
pub create_images_time_histo: StorageTimeMetrics,
pub logical_size_histo: StorageTimeMetrics,
pub imitate_logical_size_histo: StorageTimeMetrics,
pub load_layer_map_histo: StorageTimeMetrics,
pub garbage_collect_histo: StorageTimeMetrics,
pub last_record_gauge: IntGauge,
@@ -741,8 +720,6 @@ impl TimelineMetrics {
let create_images_time_histo =
StorageTimeMetrics::new("create images", &tenant_id, &timeline_id);
let logical_size_histo = StorageTimeMetrics::new("logical size", &tenant_id, &timeline_id);
let imitate_logical_size_histo =
StorageTimeMetrics::new("imitate logical size", &tenant_id, &timeline_id);
let load_layer_map_histo =
StorageTimeMetrics::new("load layer map", &tenant_id, &timeline_id);
let garbage_collect_histo = StorageTimeMetrics::new("gc", &tenant_id, &timeline_id);
@@ -779,7 +756,6 @@ impl TimelineMetrics {
compact_time_histo,
create_images_time_histo,
logical_size_histo,
imitate_logical_size_histo,
garbage_collect_histo,
load_layer_map_histo,
last_record_gauge,
@@ -1240,7 +1216,4 @@ pub fn preinitialize_metrics() {
// Initialize it eagerly, so that our alert rule can distinguish absence of the metric from metric value 0.
assert_eq!(UNEXPECTED_ONDEMAND_DOWNLOADS.get(), 0);
UNEXPECTED_ONDEMAND_DOWNLOADS.reset();

// Same as above for this metric, but, it's a Vec-type metric for which we don't know all the labels.
BACKGROUND_LOOP_PERIOD_OVERRUN_COUNT.reset();
}

@@ -256,10 +256,7 @@ async fn page_service_conn_main(
//
// no write timeout is used, because the kernel is assumed to error writes after some time.
let mut socket = tokio_io_timeout::TimeoutReader::new(socket);

// timeout should be lower, but trying out multiple days for
// <https://github.com/neondatabase/neon/issues/4205>
socket.set_timeout(Some(std::time::Duration::from_secs(60 * 60 * 24 * 3)));
socket.set_timeout(Some(std::time::Duration::from_secs(60 * 10)));
let socket = std::pin::pin!(socket);

// XXX: pgbackend.run() should take the connection_ctx,

@@ -500,8 +500,6 @@ impl Timeline {
cancel: CancellationToken,
ctx: &RequestContext,
) -> Result<u64, CalculateLogicalSizeError> {
crate::tenant::debug_assert_current_span_has_tenant_and_timeline_id();

// Fetch list of database dirs and iterate them
let buf = self.get(DBDIR_KEY, lsn, ctx).await.context("read dbdir")?;
let dbdir = DbDirectory::des(&buf).context("deserialize db directory")?;

@@ -58,8 +58,6 @@ use crate::task_mgr::TaskKind;
use crate::tenant::config::TenantConfOpt;
use crate::tenant::metadata::load_metadata;
use crate::tenant::remote_timeline_client::index::IndexPart;
use crate::tenant::remote_timeline_client::MaybeDeletedIndexPart;
use crate::tenant::remote_timeline_client::PersistIndexPartWithDeletedFlagError;
use crate::tenant::storage_layer::DeltaLayer;
use crate::tenant::storage_layer::ImageLayer;
use crate::tenant::storage_layer::Layer;
@@ -97,10 +95,7 @@ mod timeline;

pub mod size;

pub(crate) use timeline::debug_assert_current_span_has_tenant_and_timeline_id;
pub use timeline::{
LocalLayerInfoForDiskUsageEviction, LogicalSizeCalculationCause, PageReconstructError, Timeline,
};
pub use timeline::{LocalLayerInfoForDiskUsageEviction, PageReconstructError, Timeline};

// re-export this function so that page_cache.rs can use it.
pub use crate::tenant::ephemeral_file::writeback as writeback_ephemeral_file;
@@ -596,19 +591,16 @@ impl Tenant {
/// finishes. You can use wait_until_active() to wait for the task to
/// complete.
///
pub(crate) fn spawn_attach(
pub fn spawn_attach(
conf: &'static PageServerConf,
tenant_id: TenantId,
remote_storage: GenericRemoteStorage,
ctx: &RequestContext,
) -> anyhow::Result<Arc<Tenant>> {
) -> Arc<Tenant> {
// XXX: Attach should provide the config, especially during tenant migration.
// See https://github.com/neondatabase/neon/issues/1555
let tenant_conf = TenantConfOpt::default();

Self::attach_idempotent_create_marker_file(conf, tenant_id)
.context("create attach marker file")?;

let wal_redo_manager = Arc::new(PostgresRedoManager::new(conf, tenant_id));
let tenant = Arc::new(Tenant::new(
TenantState::Attaching,
@@ -641,46 +633,7 @@ impl Tenant {
Ok(())
},
);
Ok(tenant)
}

fn attach_idempotent_create_marker_file(
conf: &'static PageServerConf,
tenant_id: TenantId,
) -> anyhow::Result<()> {
// Create directory with marker file to indicate attaching state.
// The load_local_tenants() function in tenant::mgr relies on the marker file
// to determine whether a tenant has finished attaching.
let tenant_dir = conf.tenant_path(&tenant_id);
let marker_file = conf.tenant_attaching_mark_file_path(&tenant_id);
debug_assert_eq!(marker_file.parent().unwrap(), tenant_dir);
// TODO: should use tokio::fs here, but
// 1. caller is not async, for good reason (it holds tenants map lock)
// 2. we'd need to think about cancel safety. Turns out dropping a tokio::fs future
// doesn't wait for the activity in the fs thread pool.
crashsafe::create_dir_all(&tenant_dir).context("create tenant directory")?;
match fs::OpenOptions::new()
.write(true)
.create_new(true)
.open(&marker_file)
{
Ok(_) => {}
Err(e) if e.kind() == std::io::ErrorKind::AlreadyExists => {
// Either this is a retry of attach or there is a concurrent task also doing attach for this tenant.
// We cannot distinguish this here.
// The caller is responsible for ensuring there's no concurrent attach for a tenant.
{} // fsync again, we don't know if that already happened
}
err => {
err.context("create tenant attaching marker file")?;
unreachable!("we covered the Ok() case above");
}
}
crashsafe::fsync_file_and_parent(&marker_file)
.context("fsync tenant attaching marker file and parent")?;
debug_assert!(tenant_dir.is_dir());
debug_assert!(marker_file.is_file());
Ok(())
tenant
}

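The marker-file dance above is the usual crash-safe "create, tolerate AlreadyExists, fsync file and parent" pattern. A standalone sketch of just that idiom using only std (Unix-specific, since fsyncing a directory requires opening it; the paths and helper name here are stand-ins, not the pageserver's own helpers):

use std::{fs, io, path::Path};

// Idempotently create a durable marker file: create_new(true) makes retries
// detectable, and the fsyncs make both the file and its directory entry durable.
fn create_marker(dir: &Path, marker: &Path) -> io::Result<()> {
    fs::create_dir_all(dir)?;
    match fs::OpenOptions::new().write(true).create_new(true).open(marker) {
        Ok(_) => {}
        // A retry (or concurrent attempt) already created it; fsync again
        // below, since we don't know whether that already happened.
        Err(e) if e.kind() == io::ErrorKind::AlreadyExists => {}
        Err(e) => return Err(e),
    }
    fs::File::open(marker)?.sync_all()?; // fsync the marker file
    fs::File::open(dir)?.sync_all()?; // fsync its parent directory
    Ok(())
}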
///
@@ -688,15 +641,26 @@ impl Tenant {
///
#[instrument(skip_all, fields(tenant_id=%self.tenant_id))]
async fn attach(self: &Arc<Tenant>, ctx: RequestContext) -> anyhow::Result<()> {
// Create directory with marker file to indicate attaching state.
// The load_local_tenants() function in tenant::mgr relies on the marker file
// to determine whether a tenant has finished attaching.
let tenant_dir = self.conf.tenant_path(&self.tenant_id);
let marker_file = self.conf.tenant_attaching_mark_file_path(&self.tenant_id);
if !tokio::fs::try_exists(&marker_file)
.await
.context("check for existence of marker file")?
{
anyhow::bail!(
"implementation error: marker file should exist at beginning of this function"
);
debug_assert_eq!(marker_file.parent().unwrap(), tenant_dir);
if tenant_dir.exists() {
if !marker_file.is_file() {
anyhow::bail!(
"calling Tenant::attach with a tenant directory that doesn't have the attaching marker file:\ntenant_dir: {}\nmarker_file: {}",
tenant_dir.display(), marker_file.display());
}
} else {
crashsafe::create_dir_all(&tenant_dir).context("create tenant directory")?;
fs::File::create(&marker_file).context("create tenant attaching marker file")?;
crashsafe::fsync_file_and_parent(&marker_file)
.context("fsync tenant attaching marker file and parent")?;
}
debug_assert!(tenant_dir.is_dir());
debug_assert!(marker_file.is_file());

// Get list of remote timelines
// download index files for every tenant timeline
@@ -734,9 +698,16 @@ impl Tenant {
.await
.context("download index file")?;

let remote_metadata = index_part.parse_metadata().context("parse metadata")?;

debug!("finished index part download");

Result::<_, anyhow::Error>::Ok((timeline_id, client, index_part))
Result::<_, anyhow::Error>::Ok((
timeline_id,
client,
index_part,
remote_metadata,
))
}
.map(move |res| {
res.with_context(|| format!("download index part for timeline {timeline_id}"))
@@ -745,26 +716,17 @@ impl Tenant {
);
}
// Wait for all the download tasks to complete & collect results.
let mut remote_index_and_client = HashMap::new();
let mut remote_clients = HashMap::new();
let mut index_parts = HashMap::new();
let mut timeline_ancestors = HashMap::new();
while let Some(result) = part_downloads.join_next().await {
// NB: we already added timeline_id as context to the error
let result: Result<_, anyhow::Error> = result.context("joinset task join")?;
let (timeline_id, client, index_part) = result?;
let (timeline_id, client, index_part, remote_metadata) = result?;
debug!("successfully downloaded index part for timeline {timeline_id}");
match index_part {
MaybeDeletedIndexPart::IndexPart(index_part) => {
timeline_ancestors.insert(
timeline_id,
index_part.parse_metadata().context("parse_metadata")?,
);
remote_index_and_client.insert(timeline_id, (index_part, client));
}
MaybeDeletedIndexPart::Deleted => {
info!("timeline {} is deleted, skipping", timeline_id);
continue;
}
}
timeline_ancestors.insert(timeline_id, remote_metadata);
index_parts.insert(timeline_id, index_part);
remote_clients.insert(timeline_id, client);
}

// For every timeline, download the metadata file, scan the local directory,
@@ -772,16 +734,12 @@ impl Tenant {
// layer file.
let sorted_timelines = tree_sort_timelines(timeline_ancestors)?;
for (timeline_id, remote_metadata) in sorted_timelines {
let (index_part, remote_client) = remote_index_and_client
.remove(&timeline_id)
.expect("just put it in above");

// TODO again handle early failure
self.load_remote_timeline(
timeline_id,
index_part,
index_parts.remove(&timeline_id).unwrap(),
remote_metadata,
remote_client,
remote_clients.remove(&timeline_id).unwrap(),
&ctx,
)
.await
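`tree_sort_timelines` above exists so that a timeline is always loaded after its ancestor. A hypothetical standalone version of that ordering, over plain ids instead of `TimelineMetadata`, to show the invariant:

use std::collections::HashMap;

// Order timelines so every parent precedes its children; panics on cycles
// or missing ancestors (the real code returns an error instead).
fn tree_sort(ancestors: HashMap<u32, Option<u32>>) -> Vec<u32> {
    let mut ordered = Vec::new();
    let mut remaining: Vec<_> = ancestors.into_iter().collect();
    while !remaining.is_empty() {
        let before = remaining.len();
        remaining.retain(|(id, parent)| match parent {
            None => { ordered.push(*id); false } // root timeline
            Some(p) if ordered.contains(p) => { ordered.push(*id); false }
            _ => true, // ancestor not placed yet, try again next pass
        });
        assert!(remaining.len() < before, "cycle or missing ancestor");
    }
    ordered
}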
@@ -873,15 +831,11 @@ impl Tenant {
}

/// Create a placeholder Tenant object for a broken tenant
pub fn create_broken_tenant(
conf: &'static PageServerConf,
tenant_id: TenantId,
reason: String,
) -> Arc<Tenant> {
pub fn create_broken_tenant(conf: &'static PageServerConf, tenant_id: TenantId) -> Arc<Tenant> {
let wal_redo_manager = Arc::new(PostgresRedoManager::new(conf, tenant_id));
Arc::new(Tenant::new(
TenantState::Broken {
reason,
reason: "create_broken_tenant".into(),
backtrace: String::new(),
},
conf,
@@ -914,7 +868,7 @@ impl Tenant {
Ok(conf) => conf,
Err(e) => {
error!("load tenant config failed: {:?}", e);
return Tenant::create_broken_tenant(conf, tenant_id, format!("{e:#}"));
return Tenant::create_broken_tenant(conf, tenant_id);
}
};

@@ -1091,13 +1045,21 @@ impl Tenant {
/// Subroutine of `load_tenant`, to load an individual timeline
///
/// NB: The parent is assumed to be already loaded!
#[instrument(skip_all, fields(timeline_id))]
#[instrument(skip(self, local_metadata, ctx), fields(timeline_id=%timeline_id))]
async fn load_local_timeline(
&self,
timeline_id: TimelineId,
local_metadata: TimelineMetadata,
ctx: &RequestContext,
) -> anyhow::Result<()> {
let ancestor = if let Some(ancestor_timeline_id) = local_metadata.ancestor_timeline() {
let ancestor_timeline = self.get_timeline(ancestor_timeline_id, false)
.with_context(|| anyhow::anyhow!("cannot find ancestor timeline {ancestor_timeline_id} for timeline {timeline_id}"))?;
Some(ancestor_timeline)
} else {
None
};

let remote_client = self.remote_storage.as_ref().map(|remote_storage| {
RemoteTimelineClient::new(
remote_storage.clone(),
@@ -1110,29 +1072,6 @@ impl Tenant {
let remote_startup_data = match &remote_client {
Some(remote_client) => match remote_client.download_index_file().await {
Ok(index_part) => {
let index_part = match index_part {
MaybeDeletedIndexPart::IndexPart(index_part) => index_part,
MaybeDeletedIndexPart::Deleted => {
// TODO: we won't reach here if remote storage gets de-configured after start of the deletion operation.
// Example:
// start deletion operation
// finishes upload of index part
// pageserver crashes
// remote storage gets de-configured
// pageserver starts
//
// We don't really anticipate remote storage to be de-configured, so, for now, this is fine.
// Also, maybe we'll remove that option entirely in the future, see https://github.com/neondatabase/neon/issues/4099.
info!("is_deleted is set on remote, resuming removal of local data originally done by timeline deletion handler");
std::fs::remove_dir_all(
self.conf.timeline_path(&timeline_id, &self.tenant_id),
)
.context("remove_dir_all")?;

return Ok(());
}
};

let remote_metadata = index_part.parse_metadata().context("parse_metadata")?;
Some(RemoteStartupData {
index_part,
@@ -1148,14 +1087,6 @@ impl Tenant {
None => None,
};

let ancestor = if let Some(ancestor_timeline_id) = local_metadata.ancestor_timeline() {
let ancestor_timeline = self.get_timeline(ancestor_timeline_id, false)
.with_context(|| anyhow::anyhow!("cannot find ancestor timeline {ancestor_timeline_id} for timeline {timeline_id}"))?;
Some(ancestor_timeline)
} else {
None
};

self.timeline_init_and_sync(
timeline_id,
remote_client,
@@ -1264,24 +1195,8 @@ impl Tenant {
"Cannot create timelines on inactive tenant"
);

if let Ok(existing) = self.get_timeline(new_timeline_id, false) {
if self.get_timeline(new_timeline_id, false).is_ok() {
debug!("timeline {new_timeline_id} already exists");

if let Some(remote_client) = existing.remote_client.as_ref() {
// Wait for uploads to complete, so that when we return Ok, the timeline
// is known to be durable on remote storage. Just like we do at the end of
// this function, after we have created the timeline ourselves.
//
// We only really care that the initial version of `index_part.json` has
// been uploaded. That's enough to remember that the timeline
// exists. However, there is no function to wait specifically for that so
// we just wait for all in-progress uploads to finish.
remote_client
.wait_completion()
.await
.context("wait for timeline uploads to complete")?;
}

return Ok(None);
}

@@ -1323,17 +1238,6 @@ impl Tenant {
}
};

if let Some(remote_client) = loaded_timeline.remote_client.as_ref() {
// Wait for the upload of the 'index_part.json` file to finish, so that when we return
// Ok, the timeline is durable in remote storage.
let kind = ancestor_timeline_id
.map(|_| "branched")
.unwrap_or("bootstrapped");
remote_client.wait_completion().await.with_context(|| {
format!("wait for {} timeline initial uploads to complete", kind)
})?;
}

Ok(Some(loaded_timeline))
}

@@ -1430,8 +1334,6 @@ impl Tenant {
timeline_id: TimelineId,
_ctx: &RequestContext,
) -> Result<(), DeleteTimelineError> {
timeline::debug_assert_current_span_has_tenant_and_timeline_id();

// Transition the timeline into TimelineState::Stopping.
// This should prevent new operations from starting.
let timeline = {
@@ -1472,44 +1374,9 @@ impl Tenant {
timeline.walreceiver.stop().await;
debug!("wal receiver shutdown confirmed");

// Prevent new uploads from starting.
if let Some(remote_client) = timeline.remote_client.as_ref() {
let res = remote_client.stop();
match res {
Ok(()) => {}
Err(e) => match e {
remote_timeline_client::StopError::QueueUninitialized => {
// This case shouldn't happen currently because the
// load and attach code bails out if _any_ of the timeline fails to fetch its IndexPart.
// That is, before we declare the Tenant as Active.
// But we only allow calls to delete_timeline on Active tenants.
return Err(DeleteTimelineError::Other(anyhow::anyhow!("upload queue is uninitialized, likely the timeline was in Broken state prior to this call because it failed to fetch IndexPart during load or attach, check the logs")));
}
},
}
}

// Stop & wait for the remaining timeline tasks, including upload tasks.
// NB: This and other delete_timeline calls do not run as a task_mgr task,
// so, they are not affected by this shutdown_tasks() call.
info!("waiting for timeline tasks to shutdown");
task_mgr::shutdown_tasks(None, Some(self.tenant_id), Some(timeline_id)).await;

// Mark timeline as deleted in S3 so we won't pick it up next time
// during attach or pageserver restart.
// See comment in persist_index_part_with_deleted_flag.
if let Some(remote_client) = timeline.remote_client.as_ref() {
match remote_client.persist_index_part_with_deleted_flag().await {
// If we (now, or already) marked it successfully as deleted, we can proceed
Ok(()) | Err(PersistIndexPartWithDeletedFlagError::AlreadyDeleted(_)) => (),
// Bail out otherwise
Err(e @ PersistIndexPartWithDeletedFlagError::AlreadyInProgress(_))
| Err(e @ PersistIndexPartWithDeletedFlagError::Other(_)) => {
return Err(DeleteTimelineError::Other(anyhow::anyhow!(e)));
}
}
}

{
// Grab the layer_removal_cs lock, and actually perform the deletion.
//
@@ -1533,54 +1400,19 @@ impl Tenant {
// by the caller.

let local_timeline_directory = self.conf.timeline_path(&timeline_id, &self.tenant_id);

fail::fail_point!("timeline-delete-before-rm", |_| {
Err(anyhow::anyhow!("failpoint: timeline-delete-before-rm"))?
});

// NB: This need not be atomic because the deleted flag in the IndexPart
// will be observed during tenant/timeline load. The deletion will be resumed there.
//
// For configurations without remote storage, we tolerate that we're not crash-safe here.
// The timeline may come up Active but with missing layer files, in such setups.
// See https://github.com/neondatabase/neon/pull/3919#issuecomment-1531726720
match std::fs::remove_dir_all(&local_timeline_directory) {
Err(e) if e.kind() == std::io::ErrorKind::NotFound => {
// This can happen if we're called a second time, e.g.,
// because of a previous failure/cancellation at/after
// failpoint timeline-delete-after-rm.
//
// It can also happen if we race with tenant detach, because,
// it doesn't grab the layer_removal_cs lock.
//
// For now, log and continue.
// warn! level is technically not appropriate for the
// first case because we should expect retries to happen.
// But the error is so rare, it seems better to get attention if it happens.
let tenant_state = self.current_state();
warn!(
timeline_dir=?local_timeline_directory,
?tenant_state,
"timeline directory not found, proceeding anyway"
);
// continue with the rest of the deletion
}
res => res.with_context(|| {
format!(
"Failed to remove local timeline directory '{}'",
local_timeline_directory.display()
)
})?,
}
// XXX make this atomic so that, if we crash-mid-way, the timeline won't be picked up
// with some layers missing.
std::fs::remove_dir_all(&local_timeline_directory).with_context(|| {
format!(
"Failed to remove local timeline directory '{}'",
local_timeline_directory.display()
)
})?;

info!("finished deleting layer files, releasing layer_removal_cs.lock()");
drop(layer_removal_guard);
}

fail::fail_point!("timeline-delete-after-rm", |_| {
Err(anyhow::anyhow!("failpoint: timeline-delete-after-rm"))?
});

// Remove the timeline from the map.
let mut timelines = self.timelines.lock().unwrap();
let children_exist = timelines
@@ -2195,7 +2027,7 @@ impl Tenant {
// made.
break;
}
let result = timeline.gc(ctx).await?;
let result = timeline.gc().await?;
totals += result;
}

@@ -2403,18 +2235,17 @@ impl Tenant {
src_timeline.initdb_lsn,
src_timeline.pg_version,
);

let new_timeline = {
let mut timelines = self.timelines.lock().unwrap();
self.prepare_timeline(
let mut timelines = self.timelines.lock().unwrap();
let new_timeline = self
.prepare_timeline(
dst_id,
&metadata,
timeline_uninit_mark,
false,
Some(Arc::clone(src_timeline)),
)?
.initialize_with_lock(ctx, &mut timelines, true, true)?
};
.initialize_with_lock(ctx, &mut timelines, true, true)?;
drop(timelines);

// Root timeline gets its layers during creation and uploads them along with the metadata.
// A branch timeline though, when created, can get no writes for some time, hence won't get any layers created.
@@ -2672,7 +2503,6 @@ impl Tenant {
// `max_retention_period` overrides the cutoff that is used to calculate the size
// (only if it is shorter than the real cutoff).
max_retention_period: Option<u64>,
cause: LogicalSizeCalculationCause,
ctx: &RequestContext,
) -> anyhow::Result<size::ModelInputs> {
let logical_sizes_at_once = self
@@ -2694,7 +2524,6 @@ impl Tenant {
logical_sizes_at_once,
max_retention_period,
&mut shared_cache,
cause,
ctx,
)
.await
@@ -2704,12 +2533,8 @@ impl Tenant {
/// This is periodically called by background worker.
/// result is cached in tenant struct
#[instrument(skip_all, fields(tenant_id=%self.tenant_id))]
pub async fn calculate_synthetic_size(
&self,
cause: LogicalSizeCalculationCause,
ctx: &RequestContext,
) -> anyhow::Result<u64> {
let inputs = self.gather_size_inputs(None, cause, ctx).await?;
pub async fn calculate_synthetic_size(&self, ctx: &RequestContext) -> anyhow::Result<u64> {
let inputs = self.gather_size_inputs(None, ctx).await?;

let size = inputs.calculate()?;

@@ -3469,26 +3294,14 @@ mod tests {
.gc_iteration(Some(TIMELINE_ID), 0x10, Duration::ZERO, &ctx)
.await?;

// The branchpoints should contain all timelines, even ones marked
// as Broken.
{
let branchpoints = &tline.gc_info.read().unwrap().retain_lsns;
assert_eq!(branchpoints.len(), 1);
assert_eq!(branchpoints[0], Lsn(0x40));
}

// You can read the key from the child branch even though the parent is
// Broken, as long as you don't need to access data from the parent.
assert_eq!(
newtline.get(*TEST_KEY, Lsn(0x70), &ctx).await?,
TEST_IMG(&format!("foo at {}", Lsn(0x70)))
newtline.get(*TEST_KEY, Lsn(0x50), &ctx).await?,
TEST_IMG(&format!("foo at {}", Lsn(0x40)))
);

// This needs to traverse to the parent, and fails.
let err = newtline.get(*TEST_KEY, Lsn(0x50), &ctx).await.unwrap_err();
assert!(err
.to_string()
.contains("will not become active. Current state: Broken"));
let branchpoints = &tline.gc_info.read().unwrap().retain_lsns;
assert_eq!(branchpoints.len(), 1);
assert_eq!(branchpoints[0], Lsn(0x40));

Ok(())
}
@@ -3754,7 +3567,7 @@ mod tests {
.await?;
tline.freeze_and_flush().await?;
tline.compact(&ctx).await?;
tline.gc(&ctx).await?;
tline.gc().await?;
}

Ok(())
@@ -3826,7 +3639,7 @@ mod tests {
.await?;
tline.freeze_and_flush().await?;
tline.compact(&ctx).await?;
tline.gc(&ctx).await?;
tline.gc().await?;
}

Ok(())
@@ -3910,7 +3723,7 @@ mod tests {
.await?;
tline.freeze_and_flush().await?;
tline.compact(&ctx).await?;
tline.gc(&ctx).await?;
tline.gc().await?;
}

Ok(())

@@ -292,93 +292,77 @@ fn bad_duration<'a>(field_name: &'static str, value: &'a str) -> impl 'a + Fn()
move || format!("Cannot parse `{field_name}` duration {value:?}")
}

impl TenantConfOpt {
#[allow(clippy::too_many_arguments)]
fn from_request(
checkpoint_distance: Option<u64>,
checkpoint_timeout: &Option<String>,
compaction_target_size: Option<u64>,
compaction_period: &Option<String>,
compaction_threshold: Option<usize>,
gc_horizon: Option<u64>,
gc_period: &Option<String>,
image_creation_threshold: Option<usize>,
pitr_interval: &Option<String>,
walreceiver_connect_timeout: &Option<String>,
lagging_wal_timeout: &Option<String>,
max_lsn_wal_lag: Option<NonZeroU64>,
trace_read_requests: Option<bool>,
eviction_policy: &Option<serde_json::Value>,
min_resident_size_override: Option<u64>,
evictions_low_residence_duration_metric_threshold: &Option<String>,
) -> Result<Self, anyhow::Error> {
impl TryFrom<&'_ TenantCreateRequest> for TenantConfOpt {
type Error = anyhow::Error;

fn try_from(request_data: &TenantCreateRequest) -> Result<Self, Self::Error> {
let mut tenant_conf = TenantConfOpt::default();

if let Some(gc_period) = &gc_period {
if let Some(gc_period) = &request_data.gc_period {
tenant_conf.gc_period = Some(
humantime::parse_duration(gc_period)
.with_context(bad_duration("gc_period", gc_period))?,
);
}
tenant_conf.gc_horizon = gc_horizon;
tenant_conf.image_creation_threshold = image_creation_threshold;
tenant_conf.gc_horizon = request_data.gc_horizon;
tenant_conf.image_creation_threshold = request_data.image_creation_threshold;

if let Some(pitr_interval) = &pitr_interval {
if let Some(pitr_interval) = &request_data.pitr_interval {
tenant_conf.pitr_interval = Some(
humantime::parse_duration(pitr_interval)
.with_context(bad_duration("pitr_interval", pitr_interval))?,
);
}

if let Some(walreceiver_connect_timeout) = &walreceiver_connect_timeout {
if let Some(walreceiver_connect_timeout) = &request_data.walreceiver_connect_timeout {
tenant_conf.walreceiver_connect_timeout = Some(
humantime::parse_duration(walreceiver_connect_timeout).with_context(
bad_duration("walreceiver_connect_timeout", walreceiver_connect_timeout),
)?,
);
}
if let Some(lagging_wal_timeout) = &lagging_wal_timeout {
if let Some(lagging_wal_timeout) = &request_data.lagging_wal_timeout {
tenant_conf.lagging_wal_timeout = Some(
humantime::parse_duration(lagging_wal_timeout)
.with_context(bad_duration("lagging_wal_timeout", lagging_wal_timeout))?,
);
}
if let Some(max_lsn_wal_lag) = max_lsn_wal_lag {
if let Some(max_lsn_wal_lag) = request_data.max_lsn_wal_lag {
tenant_conf.max_lsn_wal_lag = Some(max_lsn_wal_lag);
}
if let Some(trace_read_requests) = trace_read_requests {
if let Some(trace_read_requests) = request_data.trace_read_requests {
tenant_conf.trace_read_requests = Some(trace_read_requests);
}

tenant_conf.checkpoint_distance = checkpoint_distance;
if let Some(checkpoint_timeout) = &checkpoint_timeout {
tenant_conf.checkpoint_distance = request_data.checkpoint_distance;
if let Some(checkpoint_timeout) = &request_data.checkpoint_timeout {
tenant_conf.checkpoint_timeout = Some(
humantime::parse_duration(checkpoint_timeout)
.with_context(bad_duration("checkpoint_timeout", checkpoint_timeout))?,
);
}

tenant_conf.compaction_target_size = compaction_target_size;
tenant_conf.compaction_threshold = compaction_threshold;
tenant_conf.compaction_target_size = request_data.compaction_target_size;
tenant_conf.compaction_threshold = request_data.compaction_threshold;

if let Some(compaction_period) = &compaction_period {
if let Some(compaction_period) = &request_data.compaction_period {
tenant_conf.compaction_period = Some(
humantime::parse_duration(compaction_period)
.with_context(bad_duration("compaction_period", compaction_period))?,
);
}

if let Some(eviction_policy) = &eviction_policy {
if let Some(eviction_policy) = &request_data.eviction_policy {
tenant_conf.eviction_policy = Some(
serde::Deserialize::deserialize(eviction_policy)
.context("parse field `eviction_policy`")?,
);
}

tenant_conf.min_resident_size_override = min_resident_size_override;
tenant_conf.min_resident_size_override = request_data.min_resident_size_override;

if let Some(evictions_low_residence_duration_metric_threshold) =
&evictions_low_residence_duration_metric_threshold
&request_data.evictions_low_residence_duration_metric_threshold
{
tenant_conf.evictions_low_residence_duration_metric_threshold = Some(
humantime::parse_duration(evictions_low_residence_duration_metric_threshold)
@@ -393,53 +377,81 @@ impl TenantConfOpt {
}
}

impl TryFrom<&'_ TenantCreateRequest> for TenantConfOpt {
type Error = anyhow::Error;

fn try_from(request_data: &TenantCreateRequest) -> Result<Self, Self::Error> {
Self::from_request(
request_data.checkpoint_distance,
&request_data.checkpoint_timeout,
request_data.compaction_target_size,
&request_data.compaction_period,
request_data.compaction_threshold,
request_data.gc_horizon,
&request_data.gc_period,
request_data.image_creation_threshold,
&request_data.pitr_interval,
&request_data.walreceiver_connect_timeout,
&request_data.lagging_wal_timeout,
request_data.max_lsn_wal_lag,
request_data.trace_read_requests,
&request_data.eviction_policy,
request_data.min_resident_size_override,
&request_data.evictions_low_residence_duration_metric_threshold,
)
}
}

impl TryFrom<&'_ TenantConfigRequest> for TenantConfOpt {
type Error = anyhow::Error;

fn try_from(request_data: &TenantConfigRequest) -> Result<Self, Self::Error> {
Self::from_request(
request_data.checkpoint_distance,
&request_data.checkpoint_timeout,
request_data.compaction_target_size,
&request_data.compaction_period,
request_data.compaction_threshold,
request_data.gc_horizon,
&request_data.gc_period,
request_data.image_creation_threshold,
&request_data.pitr_interval,
&request_data.walreceiver_connect_timeout,
&request_data.lagging_wal_timeout,
request_data.max_lsn_wal_lag,
request_data.trace_read_requests,
&request_data.eviction_policy,
request_data.min_resident_size_override,
&request_data.evictions_low_residence_duration_metric_threshold,
)
let mut tenant_conf = TenantConfOpt::default();
if let Some(gc_period) = &request_data.gc_period {
tenant_conf.gc_period = Some(
humantime::parse_duration(gc_period)
.with_context(bad_duration("gc_period", gc_period))?,
);
}
tenant_conf.gc_horizon = request_data.gc_horizon;
tenant_conf.image_creation_threshold = request_data.image_creation_threshold;

if let Some(pitr_interval) = &request_data.pitr_interval {
tenant_conf.pitr_interval = Some(
humantime::parse_duration(pitr_interval)
.with_context(bad_duration("pitr_interval", pitr_interval))?,
);
}
if let Some(walreceiver_connect_timeout) = &request_data.walreceiver_connect_timeout {
tenant_conf.walreceiver_connect_timeout = Some(
humantime::parse_duration(walreceiver_connect_timeout).with_context(
bad_duration("walreceiver_connect_timeout", walreceiver_connect_timeout),
)?,
);
}
if let Some(lagging_wal_timeout) = &request_data.lagging_wal_timeout {
tenant_conf.lagging_wal_timeout = Some(
humantime::parse_duration(lagging_wal_timeout)
.with_context(bad_duration("lagging_wal_timeout", lagging_wal_timeout))?,
);
}
tenant_conf.max_lsn_wal_lag = request_data.max_lsn_wal_lag;
tenant_conf.trace_read_requests = request_data.trace_read_requests;

tenant_conf.checkpoint_distance = request_data.checkpoint_distance;
if let Some(checkpoint_timeout) = &request_data.checkpoint_timeout {
tenant_conf.checkpoint_timeout = Some(
humantime::parse_duration(checkpoint_timeout)
.with_context(bad_duration("checkpoint_timeout", checkpoint_timeout))?,
);
}
tenant_conf.compaction_target_size = request_data.compaction_target_size;
tenant_conf.compaction_threshold = request_data.compaction_threshold;

if let Some(compaction_period) = &request_data.compaction_period {
tenant_conf.compaction_period = Some(
humantime::parse_duration(compaction_period)
.with_context(bad_duration("compaction_period", compaction_period))?,
);
}

if let Some(eviction_policy) = &request_data.eviction_policy {
tenant_conf.eviction_policy = Some(
serde::Deserialize::deserialize(eviction_policy)
.context("parse field `eviction_policy`")?,
);
}

tenant_conf.min_resident_size_override = request_data.min_resident_size_override;

if let Some(evictions_low_residence_duration_metric_threshold) =
&request_data.evictions_low_residence_duration_metric_threshold
{
tenant_conf.evictions_low_residence_duration_metric_threshold = Some(
humantime::parse_duration(evictions_low_residence_duration_metric_threshold)
.with_context(bad_duration(
"evictions_low_residence_duration_metric_threshold",
evictions_low_residence_duration_metric_threshold,
))?,
);
}

Ok(tenant_conf)
}
}

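All of the duration fields above funnel through `humantime::parse_duration`, so the accepted syntax is whatever `humantime` accepts. A minimal sketch (assuming the `humantime` crate, the same dependency the code above uses):

use std::time::Duration;

fn main() {
    // The same parsing the conversions above rely on.
    assert_eq!(humantime::parse_duration("10m").unwrap(), Duration::from_secs(600));
    assert_eq!(humantime::parse_duration("24h").unwrap(), Duration::from_secs(86_400));
    // Bad input surfaces as an Err, which the code above wraps via bad_duration().
    assert!(humantime::parse_duration("not a duration").is_err());
}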
@@ -12,7 +12,6 @@ use std::io::Write;
use anyhow::{bail, ensure, Context};
use serde::{Deserialize, Serialize};
use tracing::info_span;
use utils::bin_ser::SerializeError;
use utils::{
bin_ser::BeSer,
id::{TenantId, TimelineId},
@@ -183,7 +182,7 @@ impl TimelineMetadata {
}
}

pub fn to_bytes(&self) -> Result<Vec<u8>, SerializeError> {
pub fn to_bytes(&self) -> anyhow::Result<Vec<u8>> {
let body_bytes = self.body.ser()?;
let metadata_size = METADATA_HDR_SIZE + body_bytes.len();
let hdr = TimelineMetadataHeader {

@@ -186,20 +186,10 @@ pub fn schedule_local_tenant_processing(
let tenant = if conf.tenant_attaching_mark_file_path(&tenant_id).exists() {
info!("tenant {tenant_id} has attaching mark file, resuming its attach operation");
if let Some(remote_storage) = remote_storage {
match Tenant::spawn_attach(conf, tenant_id, remote_storage, ctx) {
Ok(tenant) => tenant,
Err(e) => {
error!("Failed to spawn_attach tenant {tenant_id}, reason: {e:#}");
Tenant::create_broken_tenant(conf, tenant_id, format!("{e:#}"))
}
}
Tenant::spawn_attach(conf, tenant_id, remote_storage, ctx)
} else {
warn!("tenant {tenant_id} has attaching mark file, but pageserver has no remote storage configured");
Tenant::create_broken_tenant(
conf,
tenant_id,
"attaching mark file present but no remote storage configured".to_string(),
)
Tenant::create_broken_tenant(conf, tenant_id)
}
} else {
info!("tenant {tenant_id} is assumed to be loadable, starting load operation");
@@ -476,8 +466,7 @@ pub async fn attach_tenant(
"Cannot attach tenant {tenant_id}, local tenant directory already exists"
);

let tenant =
Tenant::spawn_attach(conf, tenant_id, remote_storage, ctx).context("spawn_attach")?;
let tenant = Tenant::spawn_attach(conf, tenant_id, remote_storage, ctx);
vacant_entry.insert(tenant);
Ok(())
})

@@ -204,11 +204,8 @@ mod download;
|
||||
pub mod index;
|
||||
mod upload;
|
||||
|
||||
use anyhow::Context;
|
||||
use chrono::{NaiveDateTime, Utc};
|
||||
// re-export these
|
||||
pub use download::{is_temp_download_file, list_remote_timelines};
|
||||
use scopeguard::ScopeGuard;
|
||||
|
||||
use std::sync::atomic::{AtomicU32, Ordering};
|
||||
use std::sync::{Arc, Mutex};
|
||||
@@ -216,7 +213,7 @@ use std::sync::{Arc, Mutex};
|
||||
use remote_storage::{DownloadError, GenericRemoteStorage};
|
||||
use std::ops::DerefMut;
|
||||
use tokio::runtime::Runtime;
|
||||
use tracing::{debug, error, info, warn};
|
||||
use tracing::{debug, info, warn};
|
||||
use tracing::{info_span, Instrument};
|
||||
use utils::lsn::Lsn;
|
||||
|
||||
@@ -243,7 +240,6 @@ use utils::id::{TenantId, TimelineId};
|
||||
use self::index::IndexPart;
|
||||
|
||||
use super::storage_layer::LayerFileName;
|
||||
use super::upload_queue::SetDeletedFlagProgress;
|
||||
|
||||
// Occasional network issues and such can cause remote operations to fail, and
|
||||
// that's expected. If a download fails, we log it at info-level, and retry.
|
||||
@@ -257,30 +253,6 @@ const FAILED_DOWNLOAD_RETRIES: u32 = 10;
|
||||
// retries. Uploads and deletions are retried forever, though.
|
||||
const FAILED_UPLOAD_WARN_THRESHOLD: u32 = 3;
|
||||
|
||||
pub enum MaybeDeletedIndexPart {
|
||||
IndexPart(IndexPart),
|
||||
Deleted,
|
||||
}
|
||||
|
||||
/// Errors that can arise when calling [`RemoteTimelineClient::stop`].
|
||||
#[derive(Debug, thiserror::Error)]
|
||||
pub enum StopError {
|
||||
/// Returned if the upload queue was never initialized.
|
||||
/// See [`RemoteTimelineClient::init_upload_queue`] and [`RemoteTimelineClient::init_upload_queue_for_empty_remote`].
|
||||
#[error("queue is not initialized")]
|
||||
QueueUninitialized,
|
||||
}
|
||||
|
||||
#[derive(Debug, thiserror::Error)]
|
||||
pub enum PersistIndexPartWithDeletedFlagError {
|
||||
#[error("another task is already setting the deleted_flag, started at {0:?}")]
|
||||
AlreadyInProgress(NaiveDateTime),
|
||||
#[error("the deleted_flag was already set, value is {0:?}")]
|
||||
AlreadyDeleted(NaiveDateTime),
|
||||
#[error(transparent)]
|
||||
Other(#[from] anyhow::Error),
|
||||
}
|
||||
|
||||
/// A client for accessing a timeline's data in remote storage.
|
||||
///
|
||||
/// This takes care of managing the number of connections, and balancing them
|
||||
@@ -395,7 +367,7 @@ impl RemoteTimelineClient {
|
||||
//
|
||||
|
||||
/// Download index file
|
||||
pub async fn download_index_file(&self) -> Result<MaybeDeletedIndexPart, DownloadError> {
|
||||
pub async fn download_index_file(&self) -> Result<IndexPart, DownloadError> {
|
||||
let _unfinished_gauge_guard = self.metrics.call_begin(
|
||||
&RemoteOpFileKind::Index,
|
||||
&RemoteOpKind::Download,
|
||||
@@ -404,7 +376,7 @@ impl RemoteTimelineClient {
|
||||
},
|
||||
);
|
||||
|
||||
let index_part = download::download_index_part(
|
||||
download::download_index_part(
|
||||
self.conf,
|
||||
&self.storage_impl,
|
||||
self.tenant_id,
|
||||
@@ -417,13 +389,7 @@ impl RemoteTimelineClient {
|
||||
RemoteOpKind::Download,
|
||||
Arc::clone(&self.metrics),
|
||||
)
|
||||
.await?;
|
||||
|
||||
if index_part.deleted_at.is_some() {
|
||||
Ok(MaybeDeletedIndexPart::Deleted)
|
||||
} else {
|
||||
Ok(MaybeDeletedIndexPart::IndexPart(index_part))
|
||||
}
|
||||
.await
|
||||
}
|
||||
|
||||
    /// Download a (layer) file from `path`, into local filesystem.
@@ -658,116 +624,6 @@ impl RemoteTimelineClient {
        Ok(())
    }

    /// Set the deleted_at field in the remote index file.
    ///
    /// This fails if the upload queue has not been `stop()`ed.
    ///
    /// The caller is responsible for calling `stop()` AND for waiting
    /// for any ongoing upload tasks to finish after `stop()` has succeeded.
    /// Check method [`RemoteTimelineClient::stop`] for details.
    pub(crate) async fn persist_index_part_with_deleted_flag(
        self: &Arc<Self>,
    ) -> Result<(), PersistIndexPartWithDeletedFlagError> {
        let index_part_with_deleted_at = {
            let mut locked = self.upload_queue.lock().unwrap();

            // We must be in stopped state because otherwise
            // we can have inprogress index part upload that can overwrite the file
            // with missing is_deleted flag that we going to set below
            let stopped = match &mut *locked {
                UploadQueue::Uninitialized => {
                    return Err(anyhow::anyhow!("is not Stopped but Uninitialized").into())
                }
                UploadQueue::Initialized(_) => {
                    return Err(anyhow::anyhow!("is not Stopped but Initialized").into())
                }
                UploadQueue::Stopped(stopped) => stopped,
            };

            match stopped.deleted_at {
                SetDeletedFlagProgress::NotRunning => (), // proceed
                SetDeletedFlagProgress::InProgress(at) => {
                    return Err(PersistIndexPartWithDeletedFlagError::AlreadyInProgress(at));
                }
                SetDeletedFlagProgress::Successful(at) => {
                    return Err(PersistIndexPartWithDeletedFlagError::AlreadyDeleted(at));
                }
            };
            let deleted_at = Utc::now().naive_utc();
            stopped.deleted_at = SetDeletedFlagProgress::InProgress(deleted_at);

            let mut index_part = IndexPart::new(
                stopped.latest_files.clone(),
                stopped.last_uploaded_consistent_lsn,
                stopped
                    .latest_metadata
                    .to_bytes()
                    .context("serialize metadata")?,
            );
            index_part.deleted_at = Some(deleted_at);
            index_part
        };

        let undo_deleted_at = scopeguard::guard(Arc::clone(self), |self_clone| {
            let mut locked = self_clone.upload_queue.lock().unwrap();
            let stopped = match &mut *locked {
                UploadQueue::Uninitialized | UploadQueue::Initialized(_) => unreachable!(
                    "there's no way out of Stopping, and we checked it's Stopping above: {:?}",
                    locked.as_str(),
                ),
                UploadQueue::Stopped(stopped) => stopped,
            };
            stopped.deleted_at = SetDeletedFlagProgress::NotRunning;
        });

        // Have a failpoint that can use the `pause` failpoint action.
        // We don't want to block the executor thread, hence, spawn_blocking + await.
        #[cfg(feature = "testing")]
        tokio::task::spawn_blocking({
            let current = tracing::Span::current();
            move || {
                let _entered = current.entered();
                tracing::info!(
                    "at failpoint persist_index_part_with_deleted_flag_after_set_before_upload_pause"
                );
                fail::fail_point!(
                    "persist_index_part_with_deleted_flag_after_set_before_upload_pause"
                );
            }
        })
        .await
        .expect("spawn_blocking");

        upload::upload_index_part(
            self.conf,
            &self.storage_impl,
            self.tenant_id,
            self.timeline_id,
            &index_part_with_deleted_at,
        )
        .await?;

        // all good, disarm the guard and mark as success
        ScopeGuard::into_inner(undo_deleted_at);
        {
            let mut locked = self.upload_queue.lock().unwrap();
            let stopped = match &mut *locked {
                UploadQueue::Uninitialized | UploadQueue::Initialized(_) => unreachable!(
                    "there's no way out of Stopping, and we checked it's Stopping above: {:?}",
                    locked.as_str(),
                ),
                UploadQueue::Stopped(stopped) => stopped,
            };
            stopped.deleted_at = SetDeletedFlagProgress::Successful(
                index_part_with_deleted_at
                    .deleted_at
                    .expect("we set it above"),
            );
        }

        Ok(())
    }

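The undo_deleted_at guard in the removed function uses the scopeguard crate: a rollback closure runs automatically on any early exit, and is explicitly disarmed once the upload succeeds. A stripped-down sketch of the same undo-on-error shape (names and state hypothetical):

    use scopeguard::ScopeGuard;

    fn commit_with_rollback(state: &mut Option<u64>) -> Result<(), ()> {
        *state = Some(42); // optimistic in-place change

        // If we exit before disarming, the closure restores the old state.
        let undo = scopeguard::guard(state, |state| {
            *state = None;
        });

        do_fallible_io()?; // on Err, `undo` drops here and rolls back

        // Success: disarm so the rollback never runs.
        let _state = ScopeGuard::into_inner(undo);
        Ok(())
    }

    fn do_fallible_io() -> Result<(), ()> {
        Ok(())
    }
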
    ///
    /// Pick next tasks from the queue, and start as many of them as possible without violating
    /// the ordering constraints.
@@ -885,13 +741,8 @@ impl RemoteTimelineClient {
        // upload finishes or times out soon enough.
        if task_mgr::is_shutdown_requested() {
            info!("upload task cancelled by shutdown request");
            match self.stop() {
                Ok(()) => {}
                Err(StopError::QueueUninitialized) => {
                    unreachable!("we never launch an upload task if the queue is uninitialized, and once it is initialized, we never go back")
                }
            }
            self.calls_unfinished_metric_end(&task.op);
            self.stop();
            return;
        }

@@ -1095,48 +946,32 @@ impl RemoteTimelineClient {
        self.metrics.call_end(&file_kind, &op_kind, track_bytes);
    }

    /// Close the upload queue for new operations and cancel queued operations.
    /// In-progress operations will still be running after this function returns.
    /// Use `task_mgr::shutdown_tasks(None, Some(self.tenant_id), Some(timeline_id))`
    /// to wait for them to complete, after calling this function.
    pub fn stop(&self) -> Result<(), StopError> {
    fn stop(&self) {
        // Whichever *task* for this RemoteTimelineClient grabs the mutex first will transition the queue
        // into stopped state, thereby dropping all off the queued *ops* which haven't become *tasks* yet.
        // The other *tasks* will come here and observe an already shut down queue and hence simply wrap up their business.
        let mut guard = self.upload_queue.lock().unwrap();
        match &mut *guard {
            UploadQueue::Uninitialized => Err(StopError::QueueUninitialized),
        match &*guard {
            UploadQueue::Uninitialized => panic!(
                "callers are responsible for ensuring this is only called on initialized queue"
            ),
            UploadQueue::Stopped(_) => {
                // nothing to do
                info!("another concurrent task already shut down the queue");
                Ok(())
            }
            UploadQueue::Initialized(UploadQueueInitialized {
                latest_files,
                latest_metadata,
                last_uploaded_consistent_lsn,
                ..
            }) => {
            UploadQueue::Initialized(qi) => {
                info!("shutting down upload queue");

                // Replace the queue with the Stopped state, taking ownership of the old
                // Initialized queue. We will do some checks on it, and then drop it.
                let qi = {
                    // take or clone what we need
                    let latest_files = std::mem::take(latest_files);
                    let last_uploaded_consistent_lsn = *last_uploaded_consistent_lsn;
                    // this could be Copy
                    let latest_metadata = latest_metadata.clone();

                    let stopped = UploadQueueStopped {
                        latest_files,
                        last_uploaded_consistent_lsn,
                        latest_metadata,
                        deleted_at: SetDeletedFlagProgress::NotRunning,
                    };

                    let upload_queue =
                        std::mem::replace(&mut *guard, UploadQueue::Stopped(stopped));
                let last_uploaded_consistent_lsn = qi.last_uploaded_consistent_lsn;
                let upload_queue = std::mem::replace(
                    &mut *guard,
                    UploadQueue::Stopped(UploadQueueStopped {
                        last_uploaded_consistent_lsn,
                    }),
                );
                    if let UploadQueue::Initialized(qi) = upload_queue {
                        qi
                    } else {
@@ -1144,8 +979,6 @@ impl RemoteTimelineClient {
                    }
                };

                assert!(qi.latest_files.is_empty(), "do not use this anymore");

                // consistency check
                assert_eq!(
                    qi.num_inprogress_layer_uploads
@@ -1169,7 +1002,6 @@ impl RemoteTimelineClient {

                // We're done.
                drop(guard);
                Ok(())
            }
        }
    }
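Both versions of stop() rely on the same trick: std::mem::replace swaps the Stopped state into the shared slot and returns the old Initialized value by ownership, all under one mutex guard. A self-contained sketch of that state-transition pattern (types hypothetical):

    enum State {
        Running { pending: Vec<String> },
        Stopped,
    }

    /// Transition to Stopped and take ownership of whatever was queued,
    /// so it can be inspected or dropped deliberately.
    fn stop(slot: &mut State) -> Vec<String> {
        match std::mem::replace(slot, State::Stopped) {
            State::Running { pending } => pending,
            // Idempotent: stopping twice yields nothing.
            State::Stopped => Vec::new(),
        }
    }
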
@@ -1408,11 +1240,7 @@ mod tests {
    }

        // Download back the index.json, and check that the list of files is correct
        let index_part = match runtime.block_on(client.download_index_file())? {
            MaybeDeletedIndexPart::IndexPart(index_part) => index_part,
            MaybeDeletedIndexPart::Deleted => panic!("unexpectedly got deleted index part"),
        };

        let index_part = runtime.block_on(client.download_index_file())?;
        assert_file_list(
            &index_part.timeline_layers,
            &[

@@ -4,7 +4,6 @@

use std::collections::{HashMap, HashSet};

use chrono::NaiveDateTime;
use serde::{Deserialize, Serialize};
use serde_with::{serde_as, DisplayFromStr};

@@ -56,10 +55,6 @@ pub struct IndexPart {
    #[serde(default)]
    version: usize,

    #[serde(default)]
    #[serde(skip_serializing_if = "Option::is_none")]
    pub deleted_at: Option<NaiveDateTime>,

    /// Layer names, which are stored on the remote storage.
    ///
    /// Additional metadata can might exist in `layer_metadata`.
@@ -83,7 +78,7 @@ impl IndexPart {
    /// used to understand later versions.
    ///
    /// Version is currently informative only.
    const LATEST_VERSION: usize = 2;
    const LATEST_VERSION: usize = 1;
    pub const FILE_NAME: &'static str = "index_part.json";

    pub fn new(
@@ -106,7 +101,6 @@ impl IndexPart {
            layer_metadata,
            disk_consistent_lsn,
            metadata_bytes,
            deleted_at: None,
        }
    }

@@ -162,7 +156,6 @@ mod tests {
            ]),
            disk_consistent_lsn: "0/16960E8".parse::<Lsn>().unwrap(),
            metadata_bytes: [113,11,159,210,0,54,0,4,0,0,0,0,1,105,96,232,1,0,0,0,0,1,105,96,112,0,0,0,0,0,0,0,0,0,0,0,0,0,1,105,96,112,0,0,0,0,1,105,96,112,0,0,0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0].to_vec(),
            deleted_at: None,
        };

        let part = serde_json::from_str::<IndexPart>(example).unwrap();
@@ -199,7 +192,6 @@ mod tests {
            ]),
            disk_consistent_lsn: "0/16960E8".parse::<Lsn>().unwrap(),
            metadata_bytes: [112,11,159,210,0,54,0,4,0,0,0,0,1,105,96,232,1,0,0,0,0,1,105,96,112,0,0,0,0,0,0,0,0,0,0,0,0,0,1,105,96,112,0,0,0,0,1,105,96,112,0,0,0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0].to_vec(),
            deleted_at: None,
        };

        let part = serde_json::from_str::<IndexPart>(example).unwrap();
@@ -244,7 +236,6 @@ mod tests {
                0, 0,
            ]
            .to_vec(),
            deleted_at: None,
        };

        let empty_layers_parsed = serde_json::from_str::<IndexPart>(empty_layers_json).unwrap();

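The deleted_at field removed above was declared so that index files written before or after the change still parse. A minimal sketch of that forward/backward-compatible shape with serde (struct and field names illustrative):

    use serde::{Deserialize, Serialize};

    #[derive(Serialize, Deserialize)]
    struct IndexExample {
        version: usize,
        // Absent in old files: defaults to None on read.
        // None at write time: the key is omitted, so old readers still parse.
        #[serde(default)]
        #[serde(skip_serializing_if = "Option::is_none")]
        deleted_at: Option<String>,
    }
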
@@ -19,12 +19,9 @@ pub(super) async fn upload_index_part<'a>(
    timeline_id: TimelineId,
    index_part: &'a IndexPart,
) -> anyhow::Result<()> {
    tracing::trace!("uploading new index part");

    fail_point!("before-upload-index", |_| {
        bail!("failpoint before-upload-index")
    });

    let index_part_bytes = serde_json::to_vec(&index_part)
        .context("Failed to serialize index part file into bytes")?;
    let index_part_size = index_part_bytes.len();
@@ -34,7 +31,6 @@ pub(super) async fn upload_index_part<'a>(
        .metadata_path(timeline_id, tenant_id)
        .with_file_name(IndexPart::FILE_NAME);
    let storage_path = conf.remote_path(&index_part_path)?;

    storage
        .upload_storage_object(Box::new(index_part_bytes), index_part_size, &storage_path)
        .await

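fail_point!, used above to abort before the index upload, comes from the fail crate: the macro is inert unless the crate's failpoints feature is enabled and a test arms the named point at runtime. A hedged sketch of how a test might trigger it (names hypothetical):

    use anyhow::bail;
    use fail::fail_point;

    fn upload_step() -> anyhow::Result<()> {
        // No-op in production; a test can arm it by name.
        fail_point!("before-upload", |_| {
            bail!("failpoint before-upload")
        });
        Ok(())
    }

    #[test]
    fn upload_fails_when_failpoint_armed() {
        fail::cfg("before-upload", "return").unwrap();
        assert!(upload_step().is_err());
        fail::remove("before-upload");
    }
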
@@ -11,7 +11,7 @@ use tokio_util::sync::CancellationToken;
use crate::context::RequestContext;
use crate::pgdatadir_mapping::CalculateLogicalSizeError;

use super::{LogicalSizeCalculationCause, Tenant};
use super::Tenant;
use crate::tenant::Timeline;
use utils::id::TimelineId;
use utils::lsn::Lsn;
@@ -126,7 +126,6 @@ pub(super) async fn gather_inputs(
    limit: &Arc<Semaphore>,
    max_retention_period: Option<u64>,
    logical_size_cache: &mut HashMap<(TimelineId, Lsn), u64>,
    cause: LogicalSizeCalculationCause,
    ctx: &RequestContext,
) -> anyhow::Result<ModelInputs> {
    // refresh is needed to update gc related pitr_cutoff and horizon_cutoff
@@ -319,15 +318,7 @@ pub(super) async fn gather_inputs(

    // We left the 'size' field empty in all of the Segments so far.
    // Now find logical sizes for all of the points that might need or benefit from them.
    fill_logical_sizes(
        &timelines,
        &mut segments,
        limit,
        logical_size_cache,
        cause,
        ctx,
    )
    .await?;
    fill_logical_sizes(&timelines, &mut segments, limit, logical_size_cache, ctx).await?;

    Ok(ModelInputs {
        segments,
@@ -345,7 +336,6 @@ async fn fill_logical_sizes(
    segments: &mut [SegmentMeta],
    limit: &Arc<Semaphore>,
    logical_size_cache: &mut HashMap<(TimelineId, Lsn), u64>,
    cause: LogicalSizeCalculationCause,
    ctx: &RequestContext,
) -> anyhow::Result<()> {
    let timeline_hash: HashMap<TimelineId, Arc<Timeline>> = HashMap::from_iter(
@@ -383,17 +373,13 @@ async fn fill_logical_sizes(
            let timeline = Arc::clone(timeline_hash.get(&timeline_id).unwrap());
            let parallel_size_calcs = Arc::clone(limit);
            let ctx = ctx.attached_child();
            joinset.spawn(
                calculate_logical_size(
                    parallel_size_calcs,
                    timeline,
                    lsn,
                    cause,
                    ctx,
                    cancel.child_token(),
                )
                .in_current_span(),
            );
            joinset.spawn(calculate_logical_size(
                parallel_size_calcs,
                timeline,
                lsn,
                ctx,
                cancel.child_token(),
            ));
        }
        e.insert(cached_size);
    }
@@ -496,7 +482,6 @@ async fn calculate_logical_size(
    limit: Arc<tokio::sync::Semaphore>,
    timeline: Arc<crate::tenant::Timeline>,
    lsn: utils::lsn::Lsn,
    cause: LogicalSizeCalculationCause,
    ctx: RequestContext,
    cancel: CancellationToken,
) -> Result<TimelineAtLsnSizeResult, RecvError> {
@@ -505,7 +490,7 @@ async fn calculate_logical_size(
        .expect("global semaphore should not had been closed");

    let size_res = timeline
        .spawn_ondemand_logical_size_calculation(lsn, cause, ctx, cancel)
        .spawn_ondemand_logical_size_calculation(lsn, ctx, cancel)
        .instrument(info_span!("spawn_ondemand_logical_size_calculation"))
        .await?;
    Ok(TimelineAtLsnSizeResult(timeline, lsn, size_res))

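fill_logical_sizes spawns one task per (timeline, lsn) point into a JoinSet while a shared Semaphore caps how many calculations actually run at once. A self-contained sketch of that bounded fan-out pattern (the doubling stands in for the real work):

    use std::sync::Arc;
    use tokio::sync::Semaphore;
    use tokio::task::JoinSet;

    async fn bounded_fan_out(items: Vec<u64>, max_parallel: usize) -> Vec<u64> {
        let limit = Arc::new(Semaphore::new(max_parallel));
        let mut joinset = JoinSet::new();

        for item in items {
            let limit = Arc::clone(&limit);
            joinset.spawn(async move {
                // Hold a permit for the duration of the work.
                let _permit = limit.acquire_owned().await.expect("semaphore open");
                item * 2 // placeholder for the real calculation
            });
        }

        let mut results = Vec::new();
        while let Some(res) = joinset.join_next().await {
            results.push(res.expect("task panicked"));
        }
        results
    }
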
@@ -259,7 +259,6 @@ pub(crate) async fn random_init_delay(
    }
}

/// Attention: the `task` and `period` beocme labels of a pageserver-wide prometheus metric.
pub(crate) fn warn_when_period_overrun(elapsed: Duration, period: Duration, task: &str) {
    // Duration::ZERO will happen because it's the "disable [bgtask]" value.
    if elapsed >= period && period != Duration::ZERO {
@@ -272,8 +271,5 @@ pub(crate) fn warn_when_period_overrun(elapsed: Duration, period: Duration, task
            task,
            "task iteration took longer than the configured period"
        );
        crate::metrics::BACKGROUND_LOOP_PERIOD_OVERRUN_COUNT
            .with_label_values(&[task, &format!("{}", period.as_secs())])
            .inc();
    }
}

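BACKGROUND_LOOP_PERIOD_OVERRUN_COUNT above is a counter vector keyed by task and period, so each overrun bumps one labeled series. A rough sketch of defining and bumping such a metric with the prometheus crate (metric name and labels are illustrative, not the pageserver's):

    use once_cell::sync::Lazy;
    use prometheus::{register_int_counter_vec, IntCounterVec};

    static PERIOD_OVERRUN_COUNT: Lazy<IntCounterVec> = Lazy::new(|| {
        register_int_counter_vec!(
            "background_loop_period_overruns_total",
            "Incremented when a background task iteration exceeds its period",
            &["task", "period_seconds"]
        )
        .expect("failed to register metric")
    });

    fn record_overrun(task: &str, period_secs: u64) {
        PERIOD_OVERRUN_COUNT
            .with_label_values(&[task, &period_secs.to_string()])
            .inc();
    }
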
@@ -22,7 +22,8 @@ use tracing::*;
use utils::id::TenantTimelineId;

use std::cmp::{max, min, Ordering};
use std::collections::{BinaryHeap, HashMap};
use std::collections::BinaryHeap;
use std::collections::HashMap;
use std::fs;
use std::ops::{Deref, Range};
use std::path::{Path, PathBuf};
@@ -47,7 +48,7 @@ use crate::tenant::{
};

use crate::config::PageServerConf;
use crate::keyspace::{KeyPartitioning, KeySpace, KeySpaceRandomAccum};
use crate::keyspace::{KeyPartitioning, KeySpace};
use crate::metrics::{TimelineMetrics, UNEXPECTED_ONDEMAND_DOWNLOADS};
use crate::pgdatadir_mapping::LsnForTimestamp;
use crate::pgdatadir_mapping::{is_rel_fsm_block_key, is_rel_vm_block_key};
@@ -122,17 +123,6 @@ pub struct Timeline {

    pub(super) layers: RwLock<LayerMap<dyn PersistentLayer>>,

    /// Set of key ranges which should be covered by image layers to
    /// allow GC to remove old layers. This set is created by GC and its cutoff LSN is also stored.
    /// It is used by compaction task when it checks if new image layer should be created.
    /// Newly created image layer doesn't help to remove the delta layer, until the
    /// newly created image layer falls off the PITR horizon. So on next GC cycle,
    /// gc_timeline may still want the new image layer to be created. To avoid redundant
    /// image layers creation we should check if image layer exists but beyond PITR horizon.
    /// This is why we need remember GC cutoff LSN.
    ///
    wanted_image_layers: Mutex<Option<(Lsn, KeySpace)>>,

    last_freeze_at: AtomicLsn,
    // Atomic would be more appropriate here.
    last_freeze_ts: RwLock<Instant>,
@@ -406,9 +396,6 @@ pub enum PageReconstructError {
    /// The operation was cancelled
    Cancelled,

    /// The ancestor of this is being stopped
    AncestorStopping(TimelineId),

    /// An error happened replaying WAL records
    #[error(transparent)]
    WalRedo(#[from] crate::walredo::WalRedoError),
@@ -427,9 +414,6 @@ impl std::fmt::Debug for PageReconstructError {
                )
            }
            Self::Cancelled => write!(f, "cancelled"),
            Self::AncestorStopping(timeline_id) => {
                write!(f, "ancestor timeline {timeline_id} is being stopped")
            }
            Self::WalRedo(err) => err.fmt(f),
        }
    }
@@ -448,22 +432,11 @@ impl std::fmt::Display for PageReconstructError {
                )
            }
            Self::Cancelled => write!(f, "cancelled"),
            Self::AncestorStopping(timeline_id) => {
                write!(f, "ancestor timeline {timeline_id} is being stopped")
            }
            Self::WalRedo(err) => err.fmt(f),
        }
    }
}

#[derive(Clone, Copy)]
pub enum LogicalSizeCalculationCause {
    Initial,
    ConsumptionMetricsSyntheticSize,
    EvictionTaskImitation,
    TenantSizeHandler,
}

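The wanted_image_layers field acts as a one-slot mailbox between tasks: GC publishes a (cutoff LSN, key set) wish, compaction consumes it and clears the slot. A reduced sketch of that handshake with a shared Mutex<Option<..>> (types simplified to integers and ranges):

    use std::ops::Range;
    use std::sync::Mutex;

    struct Handshake {
        // None = no outstanding request from GC.
        wanted: Mutex<Option<(u64, Vec<Range<u64>>)>>,
    }

    impl Handshake {
        // GC side: publish the ranges it wants covered, tagged with its cutoff.
        fn publish(&self, cutoff: u64, ranges: Vec<Range<u64>>) {
            *self.wanted.lock().unwrap() = Some((cutoff, ranges));
        }

        // Compaction side: take the request (if any) and clear the slot,
        // so the same wish is not acted on twice.
        fn take(&self) -> Option<(u64, Vec<Range<u64>>)> {
            self.wanted.lock().unwrap().take()
        }
    }
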
/// Public interface functions
impl Timeline {
    /// Get the LSN where this branch was created
@@ -953,31 +926,6 @@ impl Timeline {
        self.state.subscribe()
    }

    pub async fn wait_to_become_active(
        &self,
        _ctx: &RequestContext, /* Prepare for use by cancellation */
    ) -> Result<(), TimelineState> {
        let mut receiver = self.state.subscribe();
        loop {
            let current_state = *receiver.borrow_and_update();
            match current_state {
                TimelineState::Loading => {
                    receiver
                        .changed()
                        .await
                        .expect("holding a reference to self");
                }
                TimelineState::Active { .. } => {
                    return Ok(());
                }
                TimelineState::Broken { .. } | TimelineState::Stopping => {
                    // There's no chance the timeline can transition back into ::Active
                    return Err(current_state);
                }
            }
        }
    }

    pub fn layer_map_info(&self, reset: LayerAccessStatsReset) -> LayerMapInfo {
        let layer_map = self.layers.read().unwrap();
        let mut in_memory_layers = Vec::with_capacity(layer_map.frozen_layers.len() + 1);
@@ -1364,7 +1312,6 @@ impl Timeline {
            tenant_id,
            pg_version,
            layers: RwLock::new(LayerMap::default()),
            wanted_image_layers: Mutex::new(None),

            walredo_mgr,
            walreceiver,
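wait_to_become_active above is a standard tokio watch-channel loop: snapshot the state with borrow_and_update, return on a terminal state, otherwise await changed(). A minimal standalone sketch of the same loop (the state enum is illustrative):

    use tokio::sync::watch;

    #[derive(Clone, Copy, Debug, PartialEq)]
    enum State {
        Loading,
        Active,
        Broken,
    }

    async fn wait_until_active(mut rx: watch::Receiver<State>) -> Result<(), State> {
        loop {
            // Snapshot the current value and mark it as seen.
            let state = *rx.borrow_and_update();
            match state {
                State::Active => return Ok(()),
                // Terminal: Active is unreachable from here.
                State::Broken => return Err(state),
                // Still in flight: wait for the next update.
                State::Loading => rx.changed().await.map_err(|_| state)?,
            }
        }
    }
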
@@ -1892,31 +1839,18 @@ impl Timeline {
            // to spawn_ondemand_logical_size_calculation.
            let cancel = CancellationToken::new();
            let calculated_size = match self_clone
                .logical_size_calculation_task(lsn, LogicalSizeCalculationCause::Initial, &background_ctx, cancel)
                .logical_size_calculation_task(lsn, &background_ctx, cancel)
                .await
            {
                Ok(s) => s,
                Err(CalculateLogicalSizeError::Cancelled) => {
                    // Don't make noise, this is a common task.
                    // In the unlikely case that there is another call to this function, we'll retry
                    // In the unlikely case that there ihs another call to this function, we'll retry
                    // because initial_logical_size is still None.
                    info!("initial size calculation cancelled, likely timeline delete / tenant detach");
                    return Ok(());
                }
                Err(CalculateLogicalSizeError::Other(err)) => {
                    if let Some(e @ PageReconstructError::AncestorStopping(_)) =
                        err.root_cause().downcast_ref()
                    {
                        // This can happen if the timeline parent timeline switches to
                        // Stopping state while we're still calculating the initial
                        // timeline size for the child, for example if the tenant is
                        // being detached or the pageserver is shut down. Like with
                        // CalculateLogicalSizeError::Cancelled, don't make noise.
                        info!("initial size calculation failed because the timeline or its ancestor is Stopping, likely because the tenant is being detached: {e:#}");
                        return Ok(());
                    }
                    return Err(err.context("Failed to calculate logical size"));
                }
                x @ Err(_) => x.context("Failed to calculate logical size")?,
            };

            // we cannot query current_logical_size.current_size() to know the current
@@ -1952,14 +1886,13 @@ impl Timeline {
                // so that we prevent future callers from spawning this task
                permit.forget();
                Ok(())
            }.in_current_span(),
            },
        );
    }

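The removed branch above digs through an anyhow error chain with root_cause().downcast_ref() to special-case one concrete error type. A compact sketch of the technique (the error type is invented for the example):

    use anyhow::anyhow;

    #[derive(Debug, thiserror::Error)]
    #[error("ancestor {0} is stopping")]
    struct AncestorStopping(u32);

    fn classify(err: anyhow::Error) -> &'static str {
        // Walk to the innermost error and try to view it as a known type.
        if let Some(AncestorStopping(_)) = err.root_cause().downcast_ref() {
            "benign: ancestor shutting down"
        } else {
            "real failure"
        }
    }

    fn main() {
        let err = anyhow!(AncestorStopping(7)).context("initial size calculation failed");
        assert_eq!(classify(err), "benign: ancestor shutting down");
    }
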
    pub fn spawn_ondemand_logical_size_calculation(
        self: &Arc<Self>,
        lsn: Lsn,
        cause: LogicalSizeCalculationCause,
        ctx: RequestContext,
        cancel: CancellationToken,
    ) -> oneshot::Receiver<Result<u64, CalculateLogicalSizeError>> {
@@ -1982,26 +1915,22 @@ impl Timeline {
            false,
            async move {
                let res = self_clone
                    .logical_size_calculation_task(lsn, cause, &ctx, cancel)
                    .logical_size_calculation_task(lsn, &ctx, cancel)
                    .await;
                let _ = sender.send(res).ok();
                Ok(()) // Receiver is responsible for handling errors
            }
            .in_current_span(),
            },
        );
        receiver
    }

    #[instrument(skip_all)]
    #[instrument(skip_all, fields(tenant = %self.tenant_id, timeline = %self.timeline_id))]
    async fn logical_size_calculation_task(
        self: &Arc<Self>,
        lsn: Lsn,
        cause: LogicalSizeCalculationCause,
        ctx: &RequestContext,
        cancel: CancellationToken,
    ) -> Result<u64, CalculateLogicalSizeError> {
        debug_assert_current_span_has_tenant_and_timeline_id();

        let mut timeline_state_updates = self.subscribe_for_state_updates();
        let self_calculation = Arc::clone(self);

@@ -2009,7 +1938,7 @@ impl Timeline {
            let cancel = cancel.child_token();
            let ctx = ctx.attached_child();
            self_calculation
                .calculate_logical_size(lsn, cause, cancel, &ctx)
                .calculate_logical_size(lsn, cancel, &ctx)
                .await
        });
        let timeline_state_cancellation = async {
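The #[instrument] change above swaps an argument-free span for one with explicit fields, so every event inside the function carries the tenant and timeline ids. A tiny sketch of the attribute (ids hypothetical):

    use tracing::{info, instrument};

    struct Worker {
        tenant_id: u32,
        timeline_id: u32,
    }

    impl Worker {
        // skip_all: record no arguments; fields(..): attach ids to the span.
        #[instrument(skip_all, fields(tenant = %self.tenant_id, timeline = %self.timeline_id))]
        fn run(&self) {
            // Emitted inside the span, so it inherits the tenant/timeline fields.
            info!("starting work");
        }
    }
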
@@ -2064,7 +1993,6 @@ impl Timeline {
    pub async fn calculate_logical_size(
        &self,
        up_to_lsn: Lsn,
        cause: LogicalSizeCalculationCause,
        cancel: CancellationToken,
        ctx: &RequestContext,
    ) -> Result<u64, CalculateLogicalSizeError> {
@@ -2098,15 +2026,7 @@ impl Timeline {
        if let Some(size) = self.current_logical_size.initialized_size(up_to_lsn) {
            return Ok(size);
        }
        let storage_time_metrics = match cause {
            LogicalSizeCalculationCause::Initial
            | LogicalSizeCalculationCause::ConsumptionMetricsSyntheticSize
            | LogicalSizeCalculationCause::TenantSizeHandler => &self.metrics.logical_size_histo,
            LogicalSizeCalculationCause::EvictionTaskImitation => {
                &self.metrics.imitate_logical_size_histo
            }
        };
        let timer = storage_time_metrics.start_timer();
        let timer = self.metrics.logical_size_histo.start_timer();
        let logical_size = self
            .get_current_logical_size_non_incremental(up_to_lsn, cancel, ctx)
            .await?;
@@ -2298,46 +2218,6 @@ impl Timeline {
                Ok(timeline) => timeline,
                Err(e) => return Err(PageReconstructError::from(e)),
            };

            // It's possible that the ancestor timeline isn't active yet, or
            // is active but hasn't yet caught up to the branch point. Wait
            // for it.
            //
            // This cannot happen while the pageserver is running normally,
            // because you cannot create a branch from a point that isn't
            // present in the pageserver yet. However, we don't wait for the
            // branch point to be uploaded to cloud storage before creating
            // a branch. I.e., the branch LSN need not be remote consistent
            // for the branching operation to succeed.
            //
            // Hence, if we try to load a tenant in such a state where
            // 1. the existence of the branch was persisted (in IndexPart and/or locally)
            // 2. but the ancestor state is behind branch_lsn because it was not yet persisted
            // then we will need to wait for the ancestor timeline to
            // re-stream WAL up to branch_lsn before we access it.
            //
            // How can a tenant get in such a state?
            // - ungraceful pageserver process exit
            // - detach+attach => this is a bug, https://github.com/neondatabase/neon/issues/4219
            //
            // NB: this could be avoided by requiring
            //     branch_lsn >= remote_consistent_lsn
            // during branch creation.
            match ancestor.wait_to_become_active(ctx).await {
                Ok(()) => {}
                Err(state) if state == TimelineState::Stopping => {
                    return Err(PageReconstructError::AncestorStopping(ancestor.timeline_id));
                }
                Err(state) => {
                    return Err(PageReconstructError::Other(anyhow::anyhow!(
                        "Timeline {} will not become active. Current state: {:?}",
                        ancestor.timeline_id,
                        &state,
                    )));
                }
            }
            ancestor.wait_lsn(timeline.ancestor_lsn, ctx).await?;

            timeline_owned = ancestor;
            timeline = &*timeline_owned;
            prev_lsn = Lsn(u64::MAX);
@@ -2915,30 +2795,6 @@ impl Timeline {
        let layers = self.layers.read().unwrap();

        let mut max_deltas = 0;
        {
            let wanted_image_layers = self.wanted_image_layers.lock().unwrap();
            if let Some((cutoff_lsn, wanted)) = &*wanted_image_layers {
                let img_range =
                    partition.ranges.first().unwrap().start..partition.ranges.last().unwrap().end;
                if wanted.overlaps(&img_range) {
                    //
                    // gc_timeline only pays attention to image layers that are older than the GC cutoff,
                    // but create_image_layers creates image layers at last-record-lsn.
                    // So it's possible that gc_timeline wants a new image layer to be created for a key range,
                    // but the range is already covered by image layers at more recent LSNs. Before we
                    // create a new image layer, check if the range is already covered at more recent LSNs.
                    if !layers
                        .image_layer_exists(&img_range, &(Lsn::min(lsn, *cutoff_lsn)..lsn + 1))?
                    {
                        debug!(
                            "Force generation of layer {}-{} wanted by GC, cutoff={}, lsn={})",
                            img_range.start, img_range.end, cutoff_lsn, lsn
                        );
                        return Ok(true);
                    }
                }
            }
        }

        for part_range in &partition.ranges {
            let image_coverage = layers.image_coverage(part_range, lsn)?;
@@ -3058,12 +2914,6 @@ impl Timeline {
                image_layers.push(image_layer);
            }
        }
        // All layers that the GC wanted us to create have now been created.
        //
        // It's possible that another GC cycle happened while we were compacting, and added
        // something new to wanted_image_layers, and we now clear that before processing it.
        // That's OK, because the next GC iteration will put it back in.
        *self.wanted_image_layers.lock().unwrap() = None;

        // Sync the new layer to disk before adding it to the layer map, to make sure
        // we don't garbage collect something based on the new layer, before it has
@@ -3667,7 +3517,7 @@ impl Timeline {
    /// within a layer file. We can only remove the whole file if it's fully
    /// obsolete.
    ///
    pub(super) async fn gc(&self, ctx: &RequestContext) -> anyhow::Result<GcResult> {
    pub(super) async fn gc(&self) -> anyhow::Result<GcResult> {
        let timer = self.metrics.garbage_collect_histo.start_timer();

        fail_point!("before-timeline-gc");
@@ -3697,7 +3547,6 @@ impl Timeline {
                pitr_cutoff,
                retain_lsns,
                new_gc_cutoff,
                ctx,
            )
            .instrument(
                info_span!("gc_timeline", timeline = %self.timeline_id, cutoff = %new_gc_cutoff),
@@ -3717,7 +3566,6 @@ impl Timeline {
        pitr_cutoff: Lsn,
        retain_lsns: Vec<Lsn>,
        new_gc_cutoff: Lsn,
        ctx: &RequestContext,
    ) -> anyhow::Result<GcResult> {
        let now = SystemTime::now();
        let mut result: GcResult = GcResult::default();
@@ -3763,16 +3611,6 @@ impl Timeline {
        }

        let mut layers_to_remove = Vec::new();
        let mut wanted_image_layers = KeySpaceRandomAccum::default();
        // Do not collect keyspace for Unit tests
        let gc_keyspace = if ctx.task_kind() == TaskKind::GarbageCollector {
            Some(
                self.collect_keyspace(self.get_last_record_lsn(), ctx)
                    .await?,
            )
        } else {
            None
        };

        // Scan all layers in the timeline (remote or on-disk).
        //
@@ -3856,21 +3694,6 @@ impl Timeline {
                    "keeping {} because it is the latest layer",
                    l.filename().file_name()
                );
                // Collect delta key ranges that need image layers to allow garbage
                // collecting the layers.
                // It is not so obvious whether we need to propagate information only about
                // delta layers. Image layers can form "stairs" preventing old image from been deleted.
                // But image layers are in any case less sparse than delta layers. Also we need some
                // protection from replacing recent image layers with new one after each GC iteration.
                if l.is_incremental() && !LayerMap::is_l0(&*l) {
                    if let Some(keyspace) = &gc_keyspace {
                        let layer_logical_size = keyspace.get_logical_size(&l.get_key_range());
                        let layer_age = new_gc_cutoff.0 - l.get_lsn_range().start.0;
                        if layer_logical_size <= layer_age {
                            wanted_image_layers.add_range(l.get_key_range());
                        }
                    }
                }
                result.layers_not_updated += 1;
                continue 'outer;
            }
@@ -3883,10 +3706,6 @@ impl Timeline {
                );
                layers_to_remove.push(Arc::clone(&l));
            }
        self.wanted_image_layers
            .lock()
            .unwrap()
            .replace((new_gc_cutoff, wanted_image_layers.to_keyspace()));

        let mut updates = layers.batch_update();
        if !layers_to_remove.is_empty() {

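A note on the removed heuristic: layer_logical_size is bytes of live data under the layer's key range, while layer_age is an LSN delta, which is also a byte count of WAL written since the layer's range began. Requesting an image layer when size <= age therefore means materializing the data is judged no worse than the WAL that has accumulated. A small illustration with made-up numbers:

    // 64 KiB of data under a delta layer whose LSN range began 1 MiB of WAL
    // before the GC cutoff: 65_536 <= 1_048_576, so GC would want an image layer.
    fn wants_image_layer(layer_logical_size: u64, new_gc_cutoff: u64, layer_start_lsn: u64) -> bool {
        let layer_age = new_gc_cutoff - layer_start_lsn; // LSNs are byte offsets into the WAL
        layer_logical_size <= layer_age
    }

    fn main() {
        assert!(wants_image_layer(65_536, 2_097_152, 1_048_576));
    }
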
@@ -22,7 +22,7 @@ use std::{

use tokio::time::Instant;
use tokio_util::sync::CancellationToken;
use tracing::{debug, error, info, info_span, instrument, warn, Instrument};
use tracing::{debug, error, info, instrument, warn};

use crate::{
    context::{DownloadBehavior, RequestContext},
@@ -30,7 +30,7 @@ use crate::{
    tenant::{
        config::{EvictionPolicy, EvictionPolicyLayerAccessThreshold},
        storage_layer::PersistentLayer,
        LogicalSizeCalculationCause, Tenant,
        Tenant,
    },
};

@@ -120,13 +120,6 @@ impl Timeline {
            }
            let elapsed = start.elapsed();
            crate::tenant::tasks::warn_when_period_overrun(elapsed, p.period, "eviction");
            crate::metrics::EVICTION_ITERATION_DURATION
                .get_metric_with_label_values(&[
                    &format!("{}", p.period.as_secs()),
                    &format!("{}", p.threshold.as_secs()),
                ])
                .unwrap()
                .observe(elapsed.as_secs_f64());
            ControlFlow::Continue(start + p.period)
        }
    }
@@ -283,7 +276,6 @@ impl Timeline {
        ControlFlow::Continue(())
    }

    #[instrument(skip_all)]
    async fn imitate_layer_accesses(
        &self,
        p: &EvictionPolicyLayerAccessThreshold,
@@ -332,7 +324,6 @@ impl Timeline {
    }

    /// Recompute the values which would cause on-demand downloads during restart.
    #[instrument(skip_all)]
    async fn imitate_timeline_cached_layer_accesses(
        &self,
        cancel: &CancellationToken,
@@ -341,15 +332,7 @@ impl Timeline {
        let lsn = self.get_last_record_lsn();

        // imitiate on-restart initial logical size
        let size = self
            .calculate_logical_size(
                lsn,
                LogicalSizeCalculationCause::EvictionTaskImitation,
                cancel.clone(),
                ctx,
            )
            .instrument(info_span!("calculate_logical_size"))
            .await;
        let size = self.calculate_logical_size(lsn, cancel.clone(), ctx).await;

        match &size {
            Ok(_size) => {
@@ -364,11 +347,7 @@ impl Timeline {
        }

        // imitiate repartiting on first compactation
        if let Err(e) = self
            .collect_keyspace(lsn, ctx)
            .instrument(info_span!("collect_keyspace"))
            .await
        {
        if let Err(e) = self.collect_keyspace(lsn, ctx).await {
            // if this failed, we probably failed logical size because these use the same keys
            if size.is_err() {
                // ignore, see above comment
@@ -381,7 +360,6 @@ impl Timeline {
    }

    // Imitate the synthetic size calculation done by the consumption_metrics module.
    #[instrument(skip_all)]
    async fn imitate_synthetic_size_calculation_worker(
        &self,
        tenant: &Arc<Tenant>,
@@ -419,15 +397,8 @@ impl Timeline {
        .inner();

        let mut throwaway_cache = HashMap::new();
        let gather = crate::tenant::size::gather_inputs(
            tenant,
            limit,
            None,
            &mut throwaway_cache,
            LogicalSizeCalculationCause::EvictionTaskImitation,
            ctx,
        )
        .instrument(info_span!("gather_inputs"));
        let gather =
            crate::tenant::size::gather_inputs(tenant, limit, None, &mut throwaway_cache, ctx);

        tokio::select! {
            _ = cancel.cancelled() => {}

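The tokio::select! at the end races the gathered work against cancellation, so the eviction task can shut down without waiting for a slow synthetic-size run. A self-contained sketch of the pattern:

    use std::time::Duration;
    use tokio_util::sync::CancellationToken;

    async fn run_until_cancelled(cancel: CancellationToken) -> Option<u64> {
        let work = async {
            tokio::time::sleep(Duration::from_secs(60)).await; // stand-in for real work
            42
        };

        tokio::select! {
            // Cancellation wins: abandon the computation mid-flight.
            _ = cancel.cancelled() => None,
            result = work => Some(result),
        }
    }
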
@@ -7,7 +7,6 @@ use crate::tenant::remote_timeline_client::index::LayerFileMetadata;
use std::collections::{HashMap, VecDeque};
use std::fmt::Debug;

use chrono::NaiveDateTime;
use std::sync::Arc;
use tracing::info;

@@ -19,14 +18,14 @@ use utils::lsn::Lsn;
// that many upload queues in a running pageserver, and most of them are initialized
// anyway.
#[allow(clippy::large_enum_variant)]
pub(super) enum UploadQueue {
pub(crate) enum UploadQueue {
    Uninitialized,
    Initialized(UploadQueueInitialized),
    Stopped(UploadQueueStopped),
}

impl UploadQueue {
    pub fn as_str(&self) -> &'static str {
    fn as_str(&self) -> &'static str {
        match self {
            UploadQueue::Uninitialized => "Uninitialized",
            UploadQueue::Initialized(_) => "Initialized",
@@ -76,18 +75,8 @@ pub(crate) struct UploadQueueInitialized {
    pub(crate) queued_operations: VecDeque<UploadOp>,
}

#[derive(Clone, Copy)]
pub(super) enum SetDeletedFlagProgress {
    NotRunning,
    InProgress(NaiveDateTime),
    Successful(NaiveDateTime),
}

pub(super) struct UploadQueueStopped {
    pub(super) latest_files: HashMap<LayerFileName, LayerFileMetadata>,
    pub(super) last_uploaded_consistent_lsn: Lsn,
    pub(super) latest_metadata: TimelineMetadata,
    pub(super) deleted_at: SetDeletedFlagProgress,
pub(crate) struct UploadQueueStopped {
    pub(crate) last_uploaded_consistent_lsn: Lsn,
}

impl UploadQueue {

@@ -305,15 +305,6 @@ impl<'a> WalIngest<'a> {
                self.checkpoint_modified = true;
            }
        }
        } else if decoded.xl_rmid == pg_constants::RM_LOGICALMSG_ID {
            let info = decoded.xl_info & pg_constants::XLR_RMGR_INFO_MASK;
            if info == pg_constants::XLOG_LOGICAL_MESSAGE {
                // This is a convenient way to make the WAL ingestion pause at
                // particular point in the WAL. For more fine-grained control,
                // we could peek into the message and only pause if it contains
                // a particular string, for example, but this is enough for now.
                utils::failpoint_sleep_millis_async!("wal-ingest-logical-message-sleep");
            }
        }

        // Iterate through all the blocks that the record modifies, and

@@ -52,7 +52,7 @@ typedef struct
#define NEON_TAG "[NEON_SMGR] "
#define neon_log(tag, fmt, ...) ereport(tag, \
		(errmsg(NEON_TAG fmt, ##__VA_ARGS__), \
		 errhidestmt(true), errhidecontext(true), errposition(0), internalerrposition(0)))
		 errhidestmt(true), errhidecontext(true), internalerrposition(0)))

/*
 * supertype of all the Neon*Request structs below

@@ -108,9 +108,6 @@ struct Args {
    /// available to the system.
    #[arg(long)]
    wal_backup_threads: Option<usize>,
    /// Number of max parallel WAL segments to be offloaded to remote storage.
    #[arg(long, default_value = "5")]
    wal_backup_parallel_jobs: usize,
    /// Disable WAL backup to s3. When disabled, safekeeper removes WAL ignoring
    /// WAL backup horizon.
    #[arg(long)]
@@ -185,7 +182,6 @@ fn main() -> anyhow::Result<()> {
        max_offloader_lag_bytes: args.max_offloader_lag,
        backup_runtime_threads: args.wal_backup_threads,
        wal_backup_enabled: !args.disable_wal_backup,
        backup_parallel_jobs: args.wal_backup_parallel_jobs,
        auth,
    };


@@ -61,7 +61,6 @@ pub struct SafeKeeperConf {
    pub remote_storage: Option<RemoteStorageConfig>,
    pub max_offloader_lag_bytes: u64,
    pub backup_runtime_threads: Option<usize>,
    pub backup_parallel_jobs: usize,
    pub wal_backup_enabled: bool,
    pub auth: Option<Arc<JwtAuth>>,
}
@@ -94,7 +93,6 @@ impl SafeKeeperConf {
        broker_keepalive_interval: Duration::from_secs(5),
        backup_runtime_threads: None,
        wal_backup_enabled: true,
        backup_parallel_jobs: 1,
        auth: None,
        heartbeat_timeout: Duration::new(5, 0),
        max_offloader_lag_bytes: defaults::DEFAULT_MAX_OFFLOADER_LAG_BYTES,

@@ -1,7 +1,5 @@
use anyhow::{Context, Result};

use futures::stream::FuturesOrdered;
use futures::StreamExt;
use tokio::task::JoinHandle;
use utils::id::NodeId;

@@ -157,14 +155,8 @@ async fn update_task(
    let timeline_dir = conf.timeline_dir(&ttid);

    let handle = tokio::spawn(
        backup_task_main(
            ttid,
            timeline_dir,
            conf.workdir.clone(),
            conf.backup_parallel_jobs,
            shutdown_rx,
        )
        .instrument(info_span!("WAL backup task", ttid = %ttid)),
        backup_task_main(ttid, timeline_dir, conf.workdir.clone(), shutdown_rx)
            .instrument(info_span!("WAL backup task", ttid = %ttid)),
    );

    entry.handle = Some(WalBackupTaskHandle {
@@ -248,7 +240,6 @@ struct WalBackupTask {
    timeline_dir: PathBuf,
    workspace_dir: PathBuf,
    wal_seg_size: usize,
    parallel_jobs: usize,
    commit_lsn_watch_rx: watch::Receiver<Lsn>,
}

@@ -257,7 +248,6 @@ async fn backup_task_main(
    ttid: TenantTimelineId,
    timeline_dir: PathBuf,
    workspace_dir: PathBuf,
    parallel_jobs: usize,
    mut shutdown_rx: Receiver<()>,
) {
    info!("started");
@@ -274,7 +264,6 @@ async fn backup_task_main(
        timeline: tli,
        timeline_dir,
        workspace_dir,
        parallel_jobs,
    };

    // task is spinned up only when wal_seg_size already initialized
@@ -341,7 +330,6 @@ impl WalBackupTask {
                self.wal_seg_size,
                &self.timeline_dir,
                &self.workspace_dir,
                self.parallel_jobs,
            )
            .await
            {
@@ -368,49 +356,20 @@ pub async fn backup_lsn_range(
    wal_seg_size: usize,
    timeline_dir: &Path,
    workspace_dir: &Path,
    parallel_jobs: usize,
) -> Result<()> {
    if parallel_jobs < 1 {
        anyhow::bail!("parallel_jobs must be >= 1");
    }

    let start_lsn = *backup_lsn;
    let segments = get_segments(start_lsn, end_lsn, wal_seg_size);
    for s in &segments {
        backup_single_segment(s, timeline_dir, workspace_dir)
            .await
            .with_context(|| format!("offloading segno {}", s.seg_no))?;

    // Pool of concurrent upload tasks. We use `FuturesOrdered` to
    // preserve order of uploads, and update `backup_lsn` only after
    // all previous uploads are finished.
    let mut uploads = FuturesOrdered::new();
    let mut iter = segments.iter();

    loop {
        let added_task = match iter.next() {
            Some(s) => {
                uploads.push_back(backup_single_segment(s, timeline_dir, workspace_dir));
                true
            }
            None => false,
        };

        // Wait for the next segment to upload if we don't have any more segments,
        // or if we have too many concurrent uploads.
        if !added_task || uploads.len() >= parallel_jobs {
            let next = uploads.next().await;
            if let Some(res) = next {
                // next segment uploaded
                let segment = res?;
                let new_backup_lsn = segment.end_lsn;
                timeline
                    .set_wal_backup_lsn(new_backup_lsn)
                    .context("setting wal_backup_lsn")?;
                *backup_lsn = new_backup_lsn;
            } else {
                // no more segments to upload
                break;
            }
        }
        let new_backup_lsn = s.end_lsn;
        timeline
            .set_wal_backup_lsn(new_backup_lsn)
            .context("setting wal_backup_lsn")?;
        *backup_lsn = new_backup_lsn;
    }

    info!(
        "offloaded segnos {:?} up to {}, previous backup_lsn {}",
        segments.iter().map(|&s| s.seg_no).collect::<Vec<_>>(),
@@ -424,7 +383,7 @@ async fn backup_single_segment(
    seg: &Segment,
    timeline_dir: &Path,
    workspace_dir: &Path,
) -> Result<Segment> {
) -> Result<()> {
    let segment_file_path = seg.file_path(timeline_dir)?;
    let remote_segment_path = segment_file_path
        .strip_prefix(workspace_dir)
@@ -445,7 +404,7 @@ async fn backup_single_segment(
    res?;
    debug!("Backup of {} done", segment_file_path.display());

    Ok(*seg)
    Ok(())
}

#[derive(Debug, Copy, Clone)]

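The backup_lsn_range diff above replaces a strictly sequential upload loop with a FuturesOrdered pipeline: up to parallel_jobs uploads run concurrently, yet completions are observed in submission order, so backup_lsn only ever advances over a fully uploaded prefix. A self-contained sketch of that windowed, order-preserving pattern:

    use futures::stream::FuturesOrdered;
    use futures::StreamExt;

    async fn upload(seg_no: u64) -> u64 {
        seg_no // stand-in for the real segment upload
    }

    async fn pipeline(segments: Vec<u64>, parallel_jobs: usize) {
        let mut uploads = FuturesOrdered::new();
        let mut iter = segments.into_iter();
        let mut last_complete = None;

        loop {
            let added_task = match iter.next() {
                Some(s) => {
                    uploads.push_back(upload(s));
                    true
                }
                None => false,
            };

            // Drain when the window is full or there is nothing left to add.
            if !added_task || uploads.len() >= parallel_jobs {
                match uploads.next().await {
                    // FuturesOrdered yields in push order, so every earlier
                    // segment has already completed when this one arrives.
                    Some(done) => last_complete = Some(done),
                    None => break, // all uploads finished
                }
            }
        }
        println!("uploaded up to {:?}", last_complete);
    }
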
@@ -21,7 +21,6 @@ FLAKY_TESTS_QUERY = """
        jsonb_array_elements(jsonb_array_elements(data -> 'children') -> 'children') -> 'name' as suite,
        jsonb_array_elements(jsonb_array_elements(jsonb_array_elements(data -> 'children') -> 'children') -> 'children') -> 'name' as test,
        jsonb_array_elements(jsonb_array_elements(jsonb_array_elements(data -> 'children') -> 'children') -> 'children') -> 'status' as status,
        jsonb_array_elements(jsonb_array_elements(jsonb_array_elements(data -> 'children') -> 'children') -> 'children') -> 'retriesStatusChange' as retries_status_change,
        to_timestamp((jsonb_array_elements(jsonb_array_elements(jsonb_array_elements(data -> 'children') -> 'children') -> 'children') -> 'time' -> 'start')::bigint / 1000)::date as timestamp
    FROM
        regress_test_results
@@ -30,7 +29,7 @@ FLAKY_TESTS_QUERY = """
    ) data
    WHERE
        timestamp > CURRENT_DATE - INTERVAL '%s' day
        AND (status::text IN ('"failed"', '"broken"') OR retries_status_change::boolean)
        AND status::text IN ('"failed"', '"broken"')
    ;
"""


@@ -1,5 +1,6 @@
//
// The script parses Allure reports and posts a comment with a summary of the test results to the PR.
// It accepts an array of items and creates a comment with a summary for each one (for "release" and "debug", together or separately if any of them failed to be generated).
//
// The comment is updated on each run with the latest results.
//
@@ -12,37 +13,19 @@
//   github,
//   context,
//   fetch,
//   report: {
//     reportUrl: "...",
//     reportJsonUrl: "...",
//   },
//   reports: [{...}, ...], // each report is expected to have "buildType", "reportUrl", and "jsonUrl" properties
// })
//

// Analog of Python's defaultdict.
//
// const dm = new DefaultMap(() => new DefaultMap(() => []))
// dm["firstKey"]["secondKey"].push("value")
//
class DefaultMap extends Map {
    constructor(getDefaultValue) {
        return new Proxy({}, {
            get: (target, name) => name in target ? target[name] : (target[name] = getDefaultValue(name))
        })
    }
}

module.exports = async ({ github, context, fetch, report }) => {
module.exports = async ({ github, context, fetch, reports }) => {
    // Marker to find the comment in the subsequent runs
    const startMarker = `<!--AUTOMATIC COMMENT START #${context.payload.number}-->`
    // Let users know that the comment is updated automatically
    const autoupdateNotice = `<div align="right"><sub>The comment gets automatically updated with the latest test results :recycle:</sub></div>`
    // GitHub bot id taken from (https://api.github.com/users/github-actions[bot])
    const githubActionsBotId = 41898282
    // The latest commit in the PR URL
    const commitUrl = `${context.serverUrl}/${context.repo.owner}/${context.repo.repo}/pull/${context.payload.number}/commits/${context.payload.pull_request.head.sha}`
    // Commend body itself
    let commentBody = `${startMarker}\n`
    let commentBody = `${startMarker}\n### Test results for ${commitUrl}:\n___\n`

    // Common parameters for GitHub API requests
    const ownerRepoParams = {
@@ -50,119 +33,76 @@ module.exports = async ({ github, context, fetch, report }) => {
        repo: context.repo.repo,
    }

    const {reportUrl, reportJsonUrl} = report
    for (const report of reports) {
        const {buildType, reportUrl, jsonUrl} = report

    if (!reportUrl || !reportJsonUrl) {
        commentBody += `#### No tests were run or test report is not available\n`
        commentBody += autoupdateNotice
        return
    }
        if (!reportUrl || !jsonUrl) {
            commentBody += `#### ${buildType} build: no tests were run or test report is not available\n`
            continue
        }

    const suites = await (await fetch(reportJsonUrl)).json()
        const suites = await (await fetch(jsonUrl)).json()

    // Allure distinguishes "failed" (with an assertion error) and "broken" (with any other error) tests.
    // For this report it's ok to treat them in the same way (as failed).
    const failedTests = new DefaultMap(() => new DefaultMap(() => []))
    const passedTests = new DefaultMap(() => new DefaultMap(() => []))
    const skippedTests = new DefaultMap(() => new DefaultMap(() => []))
    const retriedTests = new DefaultMap(() => new DefaultMap(() => []))
    const flakyTests = new DefaultMap(() => new DefaultMap(() => []))
        // Allure distinguishes "failed" (with an assertion error) and "broken" (with any other error) tests.
        // For this report it's ok to treat them in the same way (as failed).
        failedTests = []
        passedTests = []
        skippedTests = []

    let failedTestsCount = 0
    let passedTestsCount = 0
    let skippedTestsCount = 0
    let flakyTestsCount = 0
        retriedTests = []
        retriedStatusChangedTests = []

    const pgVersions = new Set()
    const buildTypes = new Set()
        for (const parentSuite of suites.children) {
            for (const suite of parentSuite.children) {
                for (const test of suite.children) {
                    pytestName = `${parentSuite.name.replace(".", "/")}/${suite.name}.py::${test.name}`
                    test.pytestName = pytestName

    for (const parentSuite of suites.children) {
        for (const suite of parentSuite.children) {
            for (const test of suite.children) {
                let buildType, pgVersion
                const match = test.name.match(/[\[-](?<buildType>debug|release)-pg(?<pgVersion>\d+)[-\]]/)?.groups
                if (match) {
                    ({buildType, pgVersion} = match)
                } else {
                    // It's ok, we embed BUILD_TYPE and Postgres Version into the test name only for regress suite and do not for other suites (like performance).
                    console.info(`Cannot get BUILD_TYPE and Postgres Version from test name: "${test.name}", defaulting to "release" and "14"`)
                    if (test.status === "passed") {
                        passedTests.push(test);
                    } else if (test.status === "failed" || test.status === "broken") {
                        failedTests.push(test);
                    } else if (test.status === "skipped") {
                        skippedTests.push(test);
                    }

                    buildType = "release"
                    pgVersion = "14"
                }
                    if (test.retriesCount > 0) {
                        retriedTests.push(test);

                pgVersions.add(pgVersion)
                buildTypes.add(buildType)

                // Removing build type and PostgreSQL version from the test name to make it shorter
                const testName = test.name.replace(new RegExp(`${buildType}-pg${pgVersion}-?`), "").replace("[]", "")
                test.pytestName = `${parentSuite.name.replace(".", "/")}/${suite.name}.py::${testName}`

                if (test.status === "passed") {
                    passedTests[pgVersion][buildType].push(test)
                    passedTestsCount += 1
                } else if (test.status === "failed" || test.status === "broken") {
                    failedTests[pgVersion][buildType].push(test)
                    failedTestsCount += 1
                } else if (test.status === "skipped") {
                    skippedTests[pgVersion][buildType].push(test)
                    skippedTestsCount += 1
                }

                if (test.retriesCount > 0) {
                    retriedTests[pgVersion][buildType].push(test)

                    if (test.retriesStatusChange) {
                        flakyTests[pgVersion][buildType].push(test)
                        flakyTestsCount += 1
                        if (test.retriedStatusChangedTests) {
                            retriedStatusChangedTests.push(test);
                        }
                    }
                }
            }
        }
    }

    const totalTestsCount = failedTestsCount + passedTestsCount + skippedTestsCount
    commentBody += `### ${totalTestsCount} tests run: ${passedTestsCount} passed, ${failedTestsCount} failed, ${skippedTestsCount} skipped ([full report](${reportUrl}) for ${commitUrl})\n___\n`
        const totalTestsCount = failedTests.length + passedTests.length + skippedTests.length
        commentBody += `#### ${buildType} build: ${totalTestsCount} tests run: ${passedTests.length} passed, ${failedTests.length} failed, ${skippedTests.length} skipped ([full report](${reportUrl}))\n`
        if (failedTests.length > 0) {
            commentBody += `Failed tests:\n`
            for (const test of failedTests) {
                const allureLink = `${reportUrl}#suites/${test.parentUid}/${test.uid}`

    // Print test resuls from the newest to the oldest PostgreSQL version for release and debug builds.
    for (const pgVersion of Array.from(pgVersions).sort().reverse()) {
        for (const buildType of Array.from(buildTypes).sort().reverse()) {
            if (failedTests[pgVersion][buildType].length > 0) {
                commentBody += `#### PostgreSQL ${pgVersion} (${buildType} build)\n\n`
                commentBody += `Failed tests:\n`
                for (const test of failedTests[pgVersion][buildType]) {
                    const allureLink = `${reportUrl}#suites/${test.parentUid}/${test.uid}`

                    commentBody += `- [\`${test.pytestName}\`](${allureLink})`
                    if (test.retriesCount > 0) {
                        commentBody += ` (ran [${test.retriesCount + 1} times](${allureLink}/retries))`
                    }
                    commentBody += "\n"
                commentBody += `- [\`${test.pytestName}\`](${allureLink})`
                if (test.retriesCount > 0) {
                    commentBody += ` (ran [${test.retriesCount + 1} times](${allureLink}/retries))`
                }
                commentBody += "\n"
            }
            commentBody += "\n"
        }
    }

    if (flakyTestsCount > 0) {
        commentBody += "<details>\n<summary>Flaky tests</summary>\n\n"
        for (const pgVersion of Array.from(pgVersions).sort().reverse()) {
            for (const buildType of Array.from(buildTypes).sort().reverse()) {
                if (flakyTests[pgVersion][buildType].length > 0) {
                    commentBody += `#### PostgreSQL ${pgVersion} (${buildType} build)\n\n`
                    for (const test of flakyTests[pgVersion][buildType]) {
                        const status = test.status === "passed" ? ":white_check_mark:" : ":x:"
                        commentBody += `- ${status} [\`${test.pytestName}\`](${reportUrl}#suites/${test.parentUid}/${test.uid}/retries)\n`
                    }
                    commentBody += "\n"
                }
        if (retriedStatusChangedTests > 0) {
            commentBody += `Flaky tests:\n`
            for (const test of retriedStatusChangedTests) {
                const status = test.status === "passed" ? ":white_check_mark:" : ":x:"
                commentBody += `- ${status} [\`${test.pytestName}\`](${reportUrl}#suites/${test.parentUid}/${test.uid}/retries)\n`
            }
            commentBody += "\n"
        }
        commentBody += "\n</details>\n"
        commentBody += "___\n"
    }

    commentBody += autoupdateNotice

    const { data: comments } = await github.rest.issues.listComments({
        issue_number: context.payload.number,
        ...ownerRepoParams,

@@ -1,6 +1,5 @@
pytest_plugins = (
    "fixtures.pg_version",
    "fixtures.allure",
    "fixtures.neon_fixtures",
    "fixtures.benchmark_fixture",
    "fixtures.pg_stats",

@@ -1,25 +0,0 @@
import os

import pytest

from fixtures.pg_version import DEFAULT_VERSION, PgVersion

"""
Set of utilities to make Allure report more informative.

- It adds BUILD_TYPE and DEFAULT_PG_VERSION to the test names (only in test_runner/regress)
  to make tests distinguishable in Allure report.
"""


@pytest.fixture(scope="function", autouse=True)
def allure_noop():
    pass


def pytest_generate_tests(metafunc):
    if "test_runner/regress" in metafunc.definition._nodeid:
        build_type = os.environ.get("BUILD_TYPE", "DEBUG").lower()
        pg_version = PgVersion(os.environ.get("DEFAULT_PG_VERSION", DEFAULT_VERSION))

        metafunc.parametrize("allure_noop", [f"{build_type}-pg{pg_version}"])
@@ -451,17 +451,13 @@ def pytest_terminal_summary(
    revision = os.getenv("GITHUB_SHA", "local")
    platform = os.getenv("PLATFORM", "local")

    is_header_printed = False
    terminalreporter.section("Benchmark results", "-")

    result = []
    for test_report in terminalreporter.stats.get("passed", []):
        result_entry = []

        for _, recorded_property in test_report.user_properties:
            if not is_header_printed:
                terminalreporter.section("Benchmark results", "-")
                is_header_printed = True

            terminalreporter.write(
                "{}.{}: ".format(test_report.head_line, recorded_property["name"])
            )
@@ -489,6 +485,7 @@ def pytest_terminal_summary(

    out_dir = config.getoption("out_dir")
    if out_dir is None:
        warnings.warn("no out dir provided to store performance test results")
        return

    if not result:

@@ -272,7 +272,6 @@ class PageserverHttpClient(requests.Session):
        new_timeline_id: Optional[TimelineId] = None,
        ancestor_timeline_id: Optional[TimelineId] = None,
        ancestor_start_lsn: Optional[Lsn] = None,
        **kwargs,
    ) -> Dict[Any, Any]:
        body: Dict[str, Any] = {
            "new_timeline_id": str(new_timeline_id) if new_timeline_id else None,
@@ -282,9 +281,7 @@ class PageserverHttpClient(requests.Session):
        if pg_version != PgVersion.NOT_SET:
            body["pg_version"] = int(pg_version)

        res = self.post(
            f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline", json=body, **kwargs
        )
        res = self.post(f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline", json=body)
        self.verbose_error(res)
        if res.status_code == 409:
            raise Exception(f"could not create timeline: already exists for id {new_timeline_id}")
@@ -317,9 +314,9 @@ class PageserverHttpClient(requests.Session):
        assert isinstance(res_json, dict)
        return res_json

    def timeline_delete(self, tenant_id: TenantId, timeline_id: TimelineId, **kwargs):
    def timeline_delete(self, tenant_id: TenantId, timeline_id: TimelineId):
        res = self.delete(
            f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}", **kwargs
            f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}"
        )
        self.verbose_error(res)
        res_json = res.json()

@@ -87,9 +87,7 @@ def wait_until_tenant_state(

    time.sleep(period)

    raise Exception(
        f"Tenant {tenant_id} did not become {expected_state} within {iterations * period} seconds"
    )
    raise Exception(f"Tenant {tenant_id} did not become {expected_state} in {iterations} seconds")


def wait_until_tenant_active(

Some files were not shown because too many files have changed in this diff