mirror of
https://github.com/neondatabase/neon.git
synced 2026-06-03 13:30:38 +00:00
Compare commits
84 Commits
always-fai
...
remove-lay
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
db1ea8b430 | ||
|
|
2dad3a6445 | ||
|
|
d36fd4f141 | ||
|
|
a8a6603e66 | ||
|
|
87c82f5151 | ||
|
|
c9a2b1fd10 | ||
|
|
fdfa86b5b0 | ||
|
|
3eb85957df | ||
|
|
6b4a28bf7f | ||
|
|
22d6c1dda6 | ||
|
|
abfac6ef2a | ||
|
|
e51f2be3d0 | ||
|
|
4e4bc8fbde | ||
|
|
dd2a77c2ef | ||
|
|
ef95637c65 | ||
|
|
206b5d2ada | ||
|
|
77fea61fcc | ||
|
|
e3f4c0e4ac | ||
|
|
2302ecda04 | ||
|
|
5257bbe2b9 | ||
|
|
6b61ed5fab | ||
|
|
f28bf70596 | ||
|
|
dc9c33139b | ||
|
|
0260ee23b9 | ||
|
|
c2f5d011c7 | ||
|
|
1022b4b98f | ||
|
|
791eebefe2 | ||
|
|
50b686c3e4 | ||
|
|
39b10696e9 | ||
|
|
264b0ada9f | ||
|
|
78338f7b94 | ||
|
|
0d533ce840 | ||
|
|
978f1879b9 | ||
|
|
c863d679f8 | ||
|
|
28e9eb6539 | ||
|
|
d07a4d02bb | ||
|
|
c61731a31f | ||
|
|
5a55dce282 | ||
|
|
e6de1b0e8c | ||
|
|
58be279be1 | ||
|
|
d1b92e976a | ||
|
|
7d46c7c118 | ||
|
|
d6c1a9aa18 | ||
|
|
49f2eac934 | ||
|
|
5c8387aff1 | ||
|
|
3c4680b718 | ||
|
|
6b7cbec9b3 | ||
|
|
6de9a33c05 | ||
|
|
77f883c95c | ||
|
|
e6557f4f91 | ||
|
|
e8db20eb26 | ||
|
|
7552e2d25f | ||
|
|
dfb4160403 | ||
|
|
f9bdc030e8 | ||
|
|
e4c9b83a39 | ||
|
|
a78c16328e | ||
|
|
eed99b7251 | ||
|
|
a2fbd93e91 | ||
|
|
66f8f686a0 | ||
|
|
919f2b261a | ||
|
|
9d273c840a | ||
|
|
6600e1f896 | ||
|
|
348369414b | ||
|
|
3890acaf7f | ||
|
|
f537a7a873 | ||
|
|
71bc45a21b | ||
|
|
decef74503 | ||
|
|
8aed805933 | ||
|
|
b8488e70a9 | ||
|
|
16fdd104ac | ||
|
|
bb6dbd2f43 | ||
|
|
c4c4558736 | ||
|
|
bfdc09cf4a | ||
|
|
1839ce0545 | ||
|
|
8e04f0455e | ||
|
|
6839773538 | ||
|
|
2a96c4cfcd | ||
|
|
027cf22663 | ||
|
|
f4daa877b5 | ||
|
|
ed28ced3bc | ||
|
|
d7c120574b | ||
|
|
c9188ffa67 | ||
|
|
c631fa1f50 | ||
|
|
795c3ca131 |
@@ -11,3 +11,6 @@ opt-level = 3
|
||||
[profile.dev]
|
||||
# Turn on a small amount of optimization in Development mode.
|
||||
opt-level = 1
|
||||
|
||||
[alias]
|
||||
build_testing = ["build", "--features", "testing"]
|
||||
|
||||
39
.github/actions/allure-report/action.yml
vendored
39
.github/actions/allure-report/action.yml
vendored
@@ -32,8 +32,8 @@ runs:
|
||||
exit 2
|
||||
fi
|
||||
|
||||
- name: Calculate variables
|
||||
id: calculate-vars
|
||||
- name: Calculate key
|
||||
id: calculate-key
|
||||
shell: bash -euxo pipefail {0}
|
||||
run: |
|
||||
# TODO: for manually triggered workflows (via workflow_dispatch) we need to have a separate key
|
||||
@@ -41,22 +41,14 @@ runs:
|
||||
pr_number=$(jq --raw-output .pull_request.number "$GITHUB_EVENT_PATH" || true)
|
||||
if [ "${pr_number}" != "null" ]; then
|
||||
key=pr-${pr_number}
|
||||
elif [ "${GITHUB_REF_NAME}" = "main" ]; then
|
||||
elif [ "${GITHUB_REF}" = "refs/heads/main" ]; then
|
||||
# Shortcut for a special branch
|
||||
key=main
|
||||
elif [ "${GITHUB_REF_NAME}" = "release" ]; then
|
||||
# Shortcut for a special branch
|
||||
key=release
|
||||
else
|
||||
key=branch-$(printf "${GITHUB_REF_NAME}" | tr -c "[:alnum:]._-" "-")
|
||||
key=branch-$(echo ${GITHUB_REF#refs/heads/} | tr -c "[:alnum:]._-" "-")
|
||||
fi
|
||||
echo "KEY=${key}" >> $GITHUB_OUTPUT
|
||||
|
||||
# Sanitize test selection to remove `/` and any other special characters
|
||||
# Use printf instead of echo to avoid having `\n` at the end of the string
|
||||
test_selection=$(printf "${{ inputs.test_selection }}" | tr -c "[:alnum:]._-" "-" )
|
||||
echo "TEST_SELECTION=${test_selection}" >> $GITHUB_OUTPUT
|
||||
|
||||
- uses: actions/setup-java@v3
|
||||
if: ${{ inputs.action == 'generate' }}
|
||||
with:
|
||||
@@ -82,11 +74,10 @@ runs:
|
||||
- name: Upload Allure results
|
||||
if: ${{ inputs.action == 'store' }}
|
||||
env:
|
||||
REPORT_PREFIX: reports/${{ steps.calculate-vars.outputs.KEY }}/${{ inputs.build_type }}
|
||||
RAW_PREFIX: reports-raw/${{ steps.calculate-vars.outputs.KEY }}/${{ inputs.build_type }}
|
||||
REPORT_PREFIX: reports/${{ steps.calculate-key.outputs.KEY }}/${{ inputs.build_type }}
|
||||
RAW_PREFIX: reports-raw/${{ steps.calculate-key.outputs.KEY }}/${{ inputs.build_type }}
|
||||
TEST_OUTPUT: /tmp/test_output
|
||||
BUCKET: neon-github-public-dev
|
||||
TEST_SELECTION: ${{ steps.calculate-vars.outputs.TEST_SELECTION }}
|
||||
shell: bash -euxo pipefail {0}
|
||||
run: |
|
||||
# Add metadata
|
||||
@@ -107,7 +98,7 @@ runs:
|
||||
BUILD_TYPE=${{ inputs.build_type }}
|
||||
EOF
|
||||
|
||||
ARCHIVE="${GITHUB_RUN_ID}-${TEST_SELECTION}-${GITHUB_RUN_ATTEMPT}-$(date +%s).tar.zst"
|
||||
ARCHIVE="${GITHUB_RUN_ID}-${{ inputs.test_selection }}-${GITHUB_RUN_ATTEMPT}-$(date +%s).tar.zst"
|
||||
ZSTD_NBTHREADS=0
|
||||
|
||||
tar -C ${TEST_OUTPUT}/allure/results -cf ${ARCHIVE} --zstd .
|
||||
@@ -118,9 +109,8 @@ runs:
|
||||
if: ${{ inputs.action == 'generate' }}
|
||||
shell: bash -euxo pipefail {0}
|
||||
env:
|
||||
LOCK_FILE: reports/${{ steps.calculate-vars.outputs.KEY }}/lock.txt
|
||||
LOCK_FILE: reports/${{ steps.calculate-key.outputs.KEY }}/lock.txt
|
||||
BUCKET: neon-github-public-dev
|
||||
TEST_SELECTION: ${{ steps.calculate-vars.outputs.TEST_SELECTION }}
|
||||
run: |
|
||||
LOCK_TIMEOUT=300 # seconds
|
||||
|
||||
@@ -133,12 +123,12 @@ runs:
|
||||
fi
|
||||
sleep 1
|
||||
done
|
||||
echo "${GITHUB_RUN_ID}-${GITHUB_RUN_ATTEMPT}-${TEST_SELECTION}" > lock.txt
|
||||
echo "${GITHUB_RUN_ID}-${GITHUB_RUN_ATTEMPT}-${{ inputs.test_selection }}" > lock.txt
|
||||
aws s3 mv --only-show-errors lock.txt "s3://${BUCKET}/${LOCK_FILE}"
|
||||
|
||||
# A double-check that exactly WE have acquired the lock
|
||||
aws s3 cp --only-show-errors "s3://${BUCKET}/${LOCK_FILE}" ./lock.txt
|
||||
if [ "$(cat lock.txt)" = "${GITHUB_RUN_ID}-${GITHUB_RUN_ATTEMPT}-${TEST_SELECTION}" ]; then
|
||||
if [ "$(cat lock.txt)" = "${GITHUB_RUN_ID}-${GITHUB_RUN_ATTEMPT}-${{ inputs.test_selection }}" ]; then
|
||||
break
|
||||
fi
|
||||
done
|
||||
@@ -147,8 +137,8 @@ runs:
|
||||
if: ${{ inputs.action == 'generate' }}
|
||||
id: generate-report
|
||||
env:
|
||||
REPORT_PREFIX: reports/${{ steps.calculate-vars.outputs.KEY }}/${{ inputs.build_type }}
|
||||
RAW_PREFIX: reports-raw/${{ steps.calculate-vars.outputs.KEY }}/${{ inputs.build_type }}
|
||||
REPORT_PREFIX: reports/${{ steps.calculate-key.outputs.KEY }}/${{ inputs.build_type }}
|
||||
RAW_PREFIX: reports-raw/${{ steps.calculate-key.outputs.KEY }}/${{ inputs.build_type }}
|
||||
TEST_OUTPUT: /tmp/test_output
|
||||
BUCKET: neon-github-public-dev
|
||||
shell: bash -euxo pipefail {0}
|
||||
@@ -202,13 +192,12 @@ runs:
|
||||
if: ${{ inputs.action == 'generate' && always() }}
|
||||
shell: bash -euxo pipefail {0}
|
||||
env:
|
||||
LOCK_FILE: reports/${{ steps.calculate-vars.outputs.KEY }}/lock.txt
|
||||
LOCK_FILE: reports/${{ steps.calculate-key.outputs.KEY }}/lock.txt
|
||||
BUCKET: neon-github-public-dev
|
||||
TEST_SELECTION: ${{ steps.calculate-vars.outputs.TEST_SELECTION }}
|
||||
run: |
|
||||
aws s3 cp --only-show-errors "s3://${BUCKET}/${LOCK_FILE}" ./lock.txt || exit 0
|
||||
|
||||
if [ "$(cat lock.txt)" = "${GITHUB_RUN_ID}-${GITHUB_RUN_ATTEMPT}-${TEST_SELECTION}" ]; then
|
||||
if [ "$(cat lock.txt)" = "${GITHUB_RUN_ID}-${GITHUB_RUN_ATTEMPT}-${{ inputs.test_selection }}" ]; then
|
||||
aws s3 rm "s3://${BUCKET}/${LOCK_FILE}"
|
||||
fi
|
||||
|
||||
|
||||
154
.github/actions/neon-branch-create/action.yml
vendored
154
.github/actions/neon-branch-create/action.yml
vendored
@@ -1,154 +0,0 @@
|
||||
name: 'Create Branch'
|
||||
description: 'Create Branch using API'
|
||||
|
||||
inputs:
|
||||
api_key:
|
||||
desctiption: 'Neon API key'
|
||||
required: true
|
||||
environment:
|
||||
desctiption: 'dev (aka captest) or staging'
|
||||
required: true
|
||||
project_id:
|
||||
desctiption: 'ID of the Project to create Branch in'
|
||||
required: true
|
||||
outputs:
|
||||
dsn:
|
||||
description: 'Created Branch DSN (for main database)'
|
||||
value: ${{ steps.change-password.outputs.dsn }}
|
||||
branch_id:
|
||||
description: 'Created Branch ID'
|
||||
value: ${{ steps.create-branch.outputs.branch_id }}
|
||||
|
||||
runs:
|
||||
using: "composite"
|
||||
steps:
|
||||
- name: Parse Input
|
||||
id: parse-input
|
||||
shell: bash -euxo pipefail {0}
|
||||
run: |
|
||||
case "${ENVIRONMENT}" in
|
||||
dev)
|
||||
API_HOST=console.dev.neon.tech
|
||||
;;
|
||||
staging)
|
||||
API_HOST=console.stage.neon.tech
|
||||
;;
|
||||
*)
|
||||
echo 2>&1 "Unknown environment=${ENVIRONMENT}. Allowed 'dev' or 'staging' only"
|
||||
exit 1
|
||||
;;
|
||||
esac
|
||||
|
||||
echo "api_host=${API_HOST}" >> $GITHUB_OUTPUT
|
||||
env:
|
||||
ENVIRONMENT: ${{ inputs.environment }}
|
||||
|
||||
- name: Create New Branch
|
||||
id: create-branch
|
||||
shell: bash -euxo pipefail {0}
|
||||
run: |
|
||||
for i in $(seq 1 10); do
|
||||
branch=$(curl \
|
||||
"https://${API_HOST}/api/v2/projects/${PROJECT_ID}/branches" \
|
||||
--header "Accept: application/json" \
|
||||
--header "Content-Type: application/json" \
|
||||
--header "Authorization: Bearer ${API_KEY}" \
|
||||
--data "{
|
||||
\"branch\": {
|
||||
\"name\": \"Created by actions/neon-branch-create; GITHUB_RUN_ID=${GITHUB_RUN_ID} at $(date +%s)\"
|
||||
}
|
||||
}")
|
||||
|
||||
if [ -z "${branch}" ]; then
|
||||
sleep 1
|
||||
continue
|
||||
fi
|
||||
|
||||
branch_id=$(echo $branch | jq --raw-output '.branch.id')
|
||||
if [ "${branch_id}" == "null" ]; then
|
||||
sleep 1
|
||||
continue
|
||||
fi
|
||||
|
||||
break
|
||||
done
|
||||
|
||||
if [ -z "${branch_id}" ] || [ "${branch_id}" == "null" ]; then
|
||||
echo 2>&1 "Failed to create branch after 10 attempts, the latest response was: ${branch}"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
branch_id=$(echo $branch | jq --raw-output '.branch.id')
|
||||
echo "branch_id=${branch_id}" >> $GITHUB_OUTPUT
|
||||
|
||||
host=$(echo $branch | jq --raw-output '.endpoints[0].host')
|
||||
echo "host=${host}" >> $GITHUB_OUTPUT
|
||||
env:
|
||||
API_KEY: ${{ inputs.api_key }}
|
||||
API_HOST: ${{ steps.parse-input.outputs.api_host }}
|
||||
PROJECT_ID: ${{ inputs.project_id }}
|
||||
|
||||
- name: Get Role name
|
||||
id: role-name
|
||||
shell: bash -euxo pipefail {0}
|
||||
run: |
|
||||
roles=$(curl \
|
||||
"https://${API_HOST}/api/v2/projects/${PROJECT_ID}/branches/${BRANCH_ID}/roles" \
|
||||
--fail \
|
||||
--header "Accept: application/json" \
|
||||
--header "Content-Type: application/json" \
|
||||
--header "Authorization: Bearer ${API_KEY}"
|
||||
)
|
||||
|
||||
role_name=$(echo $roles | jq --raw-output '.roles[] | select(.protected == false) | .name')
|
||||
echo "role_name=${role_name}" >> $GITHUB_OUTPUT
|
||||
env:
|
||||
API_KEY: ${{ inputs.api_key }}
|
||||
API_HOST: ${{ steps.parse-input.outputs.api_host }}
|
||||
PROJECT_ID: ${{ inputs.project_id }}
|
||||
BRANCH_ID: ${{ steps.create-branch.outputs.branch_id }}
|
||||
|
||||
- name: Change Password
|
||||
id: change-password
|
||||
# A shell without `set -x` to not to expose password/dsn in logs
|
||||
shell: bash -euo pipefail {0}
|
||||
run: |
|
||||
for i in $(seq 1 10); do
|
||||
reset_password=$(curl \
|
||||
"https://${API_HOST}/api/v2/projects/${PROJECT_ID}/branches/${BRANCH_ID}/roles/${ROLE_NAME}/reset_password" \
|
||||
--request POST \
|
||||
--header "Accept: application/json" \
|
||||
--header "Content-Type: application/json" \
|
||||
--header "Authorization: Bearer ${API_KEY}"
|
||||
)
|
||||
|
||||
if [ -z "${reset_password}" ]; then
|
||||
sleep 1
|
||||
continue
|
||||
fi
|
||||
|
||||
password=$(echo $reset_password | jq --raw-output '.role.password')
|
||||
if [ "${password}" == "null" ]; then
|
||||
sleep 1
|
||||
continue
|
||||
fi
|
||||
|
||||
echo "::add-mask::${password}"
|
||||
break
|
||||
done
|
||||
|
||||
if [ -z "${password}" ] || [ "${password}" == "null" ]; then
|
||||
echo 2>&1 "Failed to reset password after 10 attempts, the latest response was: ${reset_password}"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
dsn="postgres://${ROLE_NAME}:${password}@${HOST}/neondb"
|
||||
echo "::add-mask::${dsn}"
|
||||
echo "dsn=${dsn}" >> $GITHUB_OUTPUT
|
||||
env:
|
||||
API_KEY: ${{ inputs.api_key }}
|
||||
API_HOST: ${{ steps.parse-input.outputs.api_host }}
|
||||
PROJECT_ID: ${{ inputs.project_id }}
|
||||
BRANCH_ID: ${{ steps.create-branch.outputs.branch_id }}
|
||||
ROLE_NAME: ${{ steps.role-name.outputs.role_name }}
|
||||
HOST: ${{ steps.create-branch.outputs.host }}
|
||||
79
.github/actions/neon-branch-delete/action.yml
vendored
79
.github/actions/neon-branch-delete/action.yml
vendored
@@ -1,79 +0,0 @@
|
||||
name: 'Delete Branch'
|
||||
description: 'Delete Branch using API'
|
||||
|
||||
inputs:
|
||||
api_key:
|
||||
desctiption: 'Neon API key'
|
||||
required: true
|
||||
environment:
|
||||
desctiption: 'dev (aka captest) or staging'
|
||||
required: true
|
||||
project_id:
|
||||
desctiption: 'ID of the Project which should be deleted'
|
||||
required: true
|
||||
branch_id:
|
||||
desctiption: 'ID of the branch to delete'
|
||||
required: true
|
||||
|
||||
runs:
|
||||
using: "composite"
|
||||
steps:
|
||||
- name: Parse Input
|
||||
id: parse-input
|
||||
shell: bash -euxo pipefail {0}
|
||||
run: |
|
||||
case "${ENVIRONMENT}" in
|
||||
dev)
|
||||
API_HOST=console.dev.neon.tech
|
||||
;;
|
||||
staging)
|
||||
API_HOST=console.stage.neon.tech
|
||||
;;
|
||||
*)
|
||||
echo 2>&1 "Unknown environment=${ENVIRONMENT}. Allowed 'dev' or 'staging' only"
|
||||
exit 1
|
||||
;;
|
||||
esac
|
||||
|
||||
echo "api_host=${API_HOST}" >> $GITHUB_OUTPUT
|
||||
env:
|
||||
ENVIRONMENT: ${{ inputs.environment }}
|
||||
|
||||
- name: Delete Branch
|
||||
# Do not try to delete a branch if .github/actions/neon-project-create
|
||||
# or .github/actions/neon-branch-create failed before
|
||||
if: ${{ inputs.project_id != '' && inputs.branch_id != '' }}
|
||||
shell: bash -euxo pipefail {0}
|
||||
run: |
|
||||
for i in $(seq 1 10); do
|
||||
deleted_branch=$(curl \
|
||||
"https://${API_HOST}/api/v2/projects/${PROJECT_ID}/branches/${BRANCH_ID}" \
|
||||
--request DELETE \
|
||||
--header "Accept: application/json" \
|
||||
--header "Content-Type: application/json" \
|
||||
--header "Authorization: Bearer ${API_KEY}"
|
||||
)
|
||||
|
||||
if [ -z "${deleted_branch}" ]; then
|
||||
sleep 1
|
||||
continue
|
||||
fi
|
||||
|
||||
branch_id=$(echo $deleted_branch | jq --raw-output '.branch.id')
|
||||
if [ "${branch_id}" == "null" ]; then
|
||||
sleep 1
|
||||
continue
|
||||
fi
|
||||
|
||||
break
|
||||
done
|
||||
|
||||
if [ -z "${branch_id}" ] || [ "${branch_id}" == "null" ]; then
|
||||
echo 2>&1 "Failed to delete branch after 10 attempts, the latest response was: ${deleted_branch}"
|
||||
exit 1
|
||||
fi
|
||||
env:
|
||||
API_KEY: ${{ inputs.api_key }}
|
||||
PROJECT_ID: ${{ inputs.project_id }}
|
||||
BRANCH_ID: ${{ inputs.branch_id }}
|
||||
API_HOST: ${{ steps.parse-input.outputs.api_host }}
|
||||
13
.github/actions/neon-project-create/action.yml
vendored
13
.github/actions/neon-project-create/action.yml
vendored
@@ -6,7 +6,7 @@ inputs:
|
||||
desctiption: 'Neon API key'
|
||||
required: true
|
||||
environment:
|
||||
desctiption: 'dev (aka captest) or staging'
|
||||
desctiption: 'dev (aka captest) or stage'
|
||||
required: true
|
||||
region_id:
|
||||
desctiption: 'Region ID, if not set the project will be created in the default region'
|
||||
@@ -29,11 +29,11 @@ runs:
|
||||
case "${ENVIRONMENT}" in
|
||||
dev)
|
||||
API_HOST=console.dev.neon.tech
|
||||
REGION_ID=${REGION_ID:-aws-eu-west-1}
|
||||
REGION_ID=${REGION_ID:-eu-west-1}
|
||||
;;
|
||||
staging)
|
||||
API_HOST=console.stage.neon.tech
|
||||
REGION_ID=${REGION_ID:-aws-us-east-2}
|
||||
REGION_ID=${REGION_ID:-us-east-1}
|
||||
;;
|
||||
*)
|
||||
echo 2>&1 "Unknown environment=${ENVIRONMENT}. Allowed 'dev' or 'staging' only"
|
||||
@@ -53,7 +53,7 @@ runs:
|
||||
shell: bash -euo pipefail {0}
|
||||
run: |
|
||||
project=$(curl \
|
||||
"https://${API_HOST}/api/v2/projects" \
|
||||
"https://${API_HOST}/api/v1/projects" \
|
||||
--fail \
|
||||
--header "Accept: application/json" \
|
||||
--header "Content-Type: application/json" \
|
||||
@@ -61,6 +61,7 @@ runs:
|
||||
--data "{
|
||||
\"project\": {
|
||||
\"name\": \"Created by actions/neon-project-create; GITHUB_RUN_ID=${GITHUB_RUN_ID}\",
|
||||
\"platform_id\": \"aws\",
|
||||
\"region_id\": \"${REGION_ID}\",
|
||||
\"settings\": { }
|
||||
}
|
||||
@@ -69,11 +70,11 @@ runs:
|
||||
# Mask password
|
||||
echo "::add-mask::$(echo $project | jq --raw-output '.roles[] | select(.name != "web_access") | .password')"
|
||||
|
||||
dsn=$(echo $project | jq --raw-output '.connection_uris[0].connection_uri')
|
||||
dsn=$(echo $project | jq --raw-output '.roles[] | select(.name != "web_access") | .dsn')/main
|
||||
echo "::add-mask::${dsn}"
|
||||
echo "dsn=${dsn}" >> $GITHUB_OUTPUT
|
||||
|
||||
project_id=$(echo $project | jq --raw-output '.project.id')
|
||||
project_id=$(echo $project | jq --raw-output '.id')
|
||||
echo "project_id=${project_id}" >> $GITHUB_OUTPUT
|
||||
env:
|
||||
API_KEY: ${{ inputs.api_key }}
|
||||
|
||||
20
.github/actions/neon-project-delete/action.yml
vendored
20
.github/actions/neon-project-delete/action.yml
vendored
@@ -6,7 +6,7 @@ inputs:
|
||||
desctiption: 'Neon API key'
|
||||
required: true
|
||||
environment:
|
||||
desctiption: 'dev (aka captest) or staging'
|
||||
desctiption: 'dev (aka captest) or stage'
|
||||
required: true
|
||||
project_id:
|
||||
desctiption: 'ID of the Project to delete'
|
||||
@@ -37,17 +37,17 @@ runs:
|
||||
ENVIRONMENT: ${{ inputs.environment }}
|
||||
|
||||
- name: Delete Neon Project
|
||||
# Do not try to delete a project if .github/actions/neon-project-create failed before
|
||||
if: ${{ inputs.project_id != '' }}
|
||||
shell: bash -euxo pipefail {0}
|
||||
run: |
|
||||
curl \
|
||||
"https://${API_HOST}/api/v2/projects/${PROJECT_ID}" \
|
||||
--fail \
|
||||
--request DELETE \
|
||||
--header "Accept: application/json" \
|
||||
--header "Content-Type: application/json" \
|
||||
--header "Authorization: Bearer ${API_KEY}"
|
||||
# Allow PROJECT_ID to be empty/null for cases when .github/actions/neon-project-create failed
|
||||
if [ -n "${PROJECT_ID}" ]; then
|
||||
curl -X "POST" \
|
||||
"https://${API_HOST}/api/v1/projects/${PROJECT_ID}/delete" \
|
||||
--fail \
|
||||
--header "Accept: application/json" \
|
||||
--header "Content-Type: application/json" \
|
||||
--header "Authorization: Bearer ${API_KEY}"
|
||||
fi
|
||||
env:
|
||||
API_KEY: ${{ inputs.api_key }}
|
||||
PROJECT_ID: ${{ inputs.project_id }}
|
||||
|
||||
6
.github/ansible/scripts/init_pageserver.sh
vendored
6
.github/ansible/scripts/init_pageserver.sh
vendored
@@ -1,8 +1,7 @@
|
||||
#!/bin/sh
|
||||
|
||||
# fetch params from meta-data service
|
||||
# get instance id from meta-data service
|
||||
INSTANCE_ID=$(curl -s http://169.254.169.254/latest/meta-data/instance-id)
|
||||
AZ_ID=$(curl -s http://169.254.169.254/latest/meta-data/placement/availability-zone)
|
||||
|
||||
# store fqdn hostname in var
|
||||
HOST=$(hostname -f)
|
||||
@@ -17,8 +16,7 @@ cat <<EOF | tee /tmp/payload
|
||||
"instance_id": "${INSTANCE_ID}",
|
||||
"http_host": "${HOST}",
|
||||
"http_port": 9898,
|
||||
"active": false,
|
||||
"availability_zone_id": "${AZ_ID}"
|
||||
"active": false
|
||||
}
|
||||
EOF
|
||||
|
||||
|
||||
2
.github/ansible/systemd/pageserver.service
vendored
2
.github/ansible/systemd/pageserver.service
vendored
@@ -5,7 +5,7 @@ After=network.target auditd.service
|
||||
[Service]
|
||||
Type=simple
|
||||
User=pageserver
|
||||
Environment=RUST_BACKTRACE=1 NEON_REPO_DIR=/storage/pageserver LD_LIBRARY_PATH=/usr/local/v14/lib SENTRY_DSN={{ SENTRY_URL_PAGESERVER }}
|
||||
Environment=RUST_BACKTRACE=1 NEON_REPO_DIR=/storage/pageserver LD_LIBRARY_PATH=/usr/local/v14/lib
|
||||
ExecStart=/usr/local/bin/pageserver -c "pg_distrib_dir='/usr/local'" -c "listen_pg_addr='0.0.0.0:6400'" -c "listen_http_addr='0.0.0.0:9898'" -c "broker_endpoints=['{{ etcd_endpoints }}']" -D /storage/pageserver/data
|
||||
ExecReload=/bin/kill -HUP $MAINPID
|
||||
KillMode=mixed
|
||||
|
||||
2
.github/ansible/systemd/safekeeper.service
vendored
2
.github/ansible/systemd/safekeeper.service
vendored
@@ -5,7 +5,7 @@ After=network.target auditd.service
|
||||
[Service]
|
||||
Type=simple
|
||||
User=safekeeper
|
||||
Environment=RUST_BACKTRACE=1 NEON_REPO_DIR=/storage/safekeeper/data LD_LIBRARY_PATH=/usr/local/v14/lib SENTRY_DSN={{ SENTRY_URL_SAFEKEEPER }}
|
||||
Environment=RUST_BACKTRACE=1 NEON_REPO_DIR=/storage/safekeeper/data LD_LIBRARY_PATH=/usr/local/v14/lib
|
||||
ExecStart=/usr/local/bin/safekeeper -l {{ inventory_hostname }}{{ hostname_suffix }}:6500 --listen-http {{ inventory_hostname }}{{ hostname_suffix }}:7676 -D /storage/safekeeper/data --broker-endpoints={{ etcd_endpoints }} --remote-storage='{bucket_name="{{bucket_name}}", bucket_region="{{bucket_region}}", prefix_in_bucket="{{ safekeeper_s3_prefix }}"}'
|
||||
ExecReload=/bin/kill -HUP $MAINPID
|
||||
KillMode=mixed
|
||||
|
||||
@@ -1,53 +0,0 @@
|
||||
# Helm chart values for neon-storage-broker
|
||||
podLabels:
|
||||
neon_env: staging
|
||||
neon_service: storage-broker
|
||||
|
||||
ingress:
|
||||
enabled: true
|
||||
annotations:
|
||||
kubernetes.io/ingress.class: nginx-internal
|
||||
nginx.ingress.kubernetes.io/backend-protocol: "GRPC"
|
||||
nginx.ingress.kubernetes.io/ssl-redirect: "true"
|
||||
nginx.ingress.kubernetes.io/force-ssl-redirect: "true"
|
||||
cert-manager.io/cluster-issuer: "cert-manager-clusterissuer"
|
||||
|
||||
hosts:
|
||||
- host: storage-broker-zeta.eu-west-1.aws.neon.build
|
||||
paths:
|
||||
- path: /
|
||||
pathType: Prefix
|
||||
tls:
|
||||
- hosts:
|
||||
- storage-broker-zeta.eu-west-1.aws.neon.build
|
||||
secretName: storage-broker-tls
|
||||
|
||||
|
||||
metrics:
|
||||
enabled: false
|
||||
|
||||
extraManifests:
|
||||
- apiVersion: operator.victoriametrics.com/v1beta1
|
||||
kind: VMServiceScrape
|
||||
metadata:
|
||||
name: "{{ include \"neon-storage-broker.fullname\" . }}"
|
||||
labels:
|
||||
helm.sh/chart: neon-storage-broker-{{ .Chart.Version }}
|
||||
app.kubernetes.io/name: neon-storage-broker
|
||||
app.kubernetes.io/instance: neon-storage-broker
|
||||
app.kubernetes.io/version: "{{ .Chart.AppVersion }}"
|
||||
app.kubernetes.io/managed-by: Helm
|
||||
namespace: "{{ .Release.Namespace }}"
|
||||
spec:
|
||||
selector:
|
||||
matchLabels:
|
||||
app.kubernetes.io/name: "neon-storage-broker"
|
||||
endpoints:
|
||||
- port: broker
|
||||
path: /metrics
|
||||
interval: 10s
|
||||
scrapeTimeout: 10s
|
||||
namespaceSelector:
|
||||
matchNames:
|
||||
- "{{ .Release.Namespace }}"
|
||||
|
||||
@@ -1,31 +0,0 @@
|
||||
# Helm chart values for neon-proxy-scram.
|
||||
# This is a YAML-formatted file.
|
||||
|
||||
image:
|
||||
repository: neondatabase/neon
|
||||
|
||||
settings:
|
||||
authBackend: "console"
|
||||
authEndpoint: "http://console-staging.local/management/api/v2"
|
||||
domain: "*.cloud.stage.neon.tech"
|
||||
|
||||
# -- Additional labels for neon-proxy pods
|
||||
podLabels:
|
||||
zenith_service: proxy-scram-legacy
|
||||
zenith_env: dev
|
||||
zenith_region: us-east-2
|
||||
zenith_region_slug: us-east-2
|
||||
|
||||
exposedService:
|
||||
annotations:
|
||||
service.beta.kubernetes.io/aws-load-balancer-type: external
|
||||
service.beta.kubernetes.io/aws-load-balancer-nlb-target-type: ip
|
||||
service.beta.kubernetes.io/aws-load-balancer-scheme: internet-facing
|
||||
external-dns.alpha.kubernetes.io/hostname: neon-proxy-scram-legacy.beta.us-east-2.aws.neon.build
|
||||
|
||||
#metrics:
|
||||
# enabled: true
|
||||
# serviceMonitor:
|
||||
# enabled: true
|
||||
# selector:
|
||||
# release: kube-prometheus-stack
|
||||
@@ -1,53 +0,0 @@
|
||||
# Helm chart values for neon-storage-broker
|
||||
podLabels:
|
||||
neon_env: staging
|
||||
neon_service: storage-broker
|
||||
|
||||
ingress:
|
||||
enabled: true
|
||||
annotations:
|
||||
kubernetes.io/ingress.class: nginx-internal
|
||||
nginx.ingress.kubernetes.io/backend-protocol: "GRPC"
|
||||
nginx.ingress.kubernetes.io/ssl-redirect: "true"
|
||||
nginx.ingress.kubernetes.io/force-ssl-redirect: "true"
|
||||
cert-manager.io/cluster-issuer: "cert-manager-clusterissuer"
|
||||
|
||||
hosts:
|
||||
- host: storage-broker-beta.us-east-2.aws.neon.build
|
||||
paths:
|
||||
- path: /
|
||||
pathType: Prefix
|
||||
tls:
|
||||
- hosts:
|
||||
- storage-broker-beta.us-east-2.aws.neon.build
|
||||
secretName: storage-broker-tls
|
||||
|
||||
|
||||
metrics:
|
||||
enabled: false
|
||||
|
||||
extraManifests:
|
||||
- apiVersion: operator.victoriametrics.com/v1beta1
|
||||
kind: VMServiceScrape
|
||||
metadata:
|
||||
name: "{{ include \"neon-storage-broker.fullname\" . }}"
|
||||
labels:
|
||||
helm.sh/chart: neon-storage-broker-{{ .Chart.Version }}
|
||||
app.kubernetes.io/name: neon-storage-broker
|
||||
app.kubernetes.io/instance: neon-storage-broker
|
||||
app.kubernetes.io/version: "{{ .Chart.AppVersion }}"
|
||||
app.kubernetes.io/managed-by: Helm
|
||||
namespace: "{{ .Release.Namespace }}"
|
||||
spec:
|
||||
selector:
|
||||
matchLabels:
|
||||
app.kubernetes.io/name: "neon-storage-broker"
|
||||
endpoints:
|
||||
- port: broker
|
||||
path: /metrics
|
||||
interval: 10s
|
||||
scrapeTimeout: 10s
|
||||
namespaceSelector:
|
||||
matchNames:
|
||||
- "{{ .Release.Namespace }}"
|
||||
|
||||
@@ -1,54 +0,0 @@
|
||||
# Helm chart values for neon-storage-broker
|
||||
podLabels:
|
||||
neon_env: neon-stress
|
||||
neon_service: storage-broker
|
||||
|
||||
ingress:
|
||||
enabled: true
|
||||
annotations:
|
||||
kubernetes.io/ingress.class: alb
|
||||
alb.ingress.kubernetes.io/healthcheck-path: /status
|
||||
alb.ingress.kubernetes.io/listen-ports: '[{"HTTPS":443}]'
|
||||
alb.ingress.kubernetes.io/scheme: "internal"
|
||||
alb.ingress.kubernetes.io/target-type: "ip"
|
||||
alb.ingress.kubernetes.io/ssl-redirect: "443"
|
||||
alb.ingress.kubernetes.io/backend-protocol-version: "GRPC"
|
||||
|
||||
hosts:
|
||||
- host: storage-broker-stress.stage.neon.tech
|
||||
paths:
|
||||
- path: /
|
||||
pathType: Prefix
|
||||
|
||||
metrics:
|
||||
enabled: true
|
||||
serviceMonitor:
|
||||
enabled: true
|
||||
selector:
|
||||
release: kube-prometheus-stack
|
||||
|
||||
extraManifests:
|
||||
- apiVersion: operator.victoriametrics.com/v1beta1
|
||||
kind: VMServiceScrape
|
||||
metadata:
|
||||
name: "{{ include \"neon-storage-broker.fullname\" . }}"
|
||||
labels:
|
||||
helm.sh/chart: neon-storage-broker-{{ .Chart.Version }}
|
||||
app.kubernetes.io/name: neon-storage-broker
|
||||
app.kubernetes.io/instance: neon-storage-broker
|
||||
app.kubernetes.io/version: "{{ .Chart.AppVersion }}"
|
||||
app.kubernetes.io/managed-by: Helm
|
||||
namespace: "{{ .Release.Namespace }}"
|
||||
spec:
|
||||
selector:
|
||||
matchLabels:
|
||||
app.kubernetes.io/name: "neon-storage-broker"
|
||||
endpoints:
|
||||
- port: broker
|
||||
path: /metrics
|
||||
interval: 10s
|
||||
scrapeTimeout: 10s
|
||||
namespaceSelector:
|
||||
matchNames:
|
||||
- "{{ .Release.Namespace }}"
|
||||
|
||||
@@ -1,53 +0,0 @@
|
||||
# Helm chart values for neon-storage-broker
|
||||
podLabels:
|
||||
neon_env: production
|
||||
neon_service: storage-broker
|
||||
|
||||
ingress:
|
||||
enabled: true
|
||||
annotations:
|
||||
kubernetes.io/ingress.class: nginx-internal
|
||||
nginx.ingress.kubernetes.io/backend-protocol: "GRPC"
|
||||
nginx.ingress.kubernetes.io/ssl-redirect: "true"
|
||||
nginx.ingress.kubernetes.io/force-ssl-redirect: "true"
|
||||
cert-manager.io/cluster-issuer: "cert-manager-clusterissuer"
|
||||
|
||||
hosts:
|
||||
- host: storage-broker-epsilon.ap-southeast-1.aws.neon.tech
|
||||
paths:
|
||||
- path: /
|
||||
pathType: Prefix
|
||||
tls:
|
||||
- hosts:
|
||||
- storage-broker-epsilon.ap-southeast-1.aws.neon.tech
|
||||
secretName: storage-broker-tls
|
||||
|
||||
|
||||
metrics:
|
||||
enabled: false
|
||||
|
||||
extraManifests:
|
||||
- apiVersion: operator.victoriametrics.com/v1beta1
|
||||
kind: VMServiceScrape
|
||||
metadata:
|
||||
name: "{{ include \"neon-storage-broker.fullname\" . }}"
|
||||
labels:
|
||||
helm.sh/chart: neon-storage-broker-{{ .Chart.Version }}
|
||||
app.kubernetes.io/name: neon-storage-broker
|
||||
app.kubernetes.io/instance: neon-storage-broker
|
||||
app.kubernetes.io/version: "{{ .Chart.AppVersion }}"
|
||||
app.kubernetes.io/managed-by: Helm
|
||||
namespace: "{{ .Release.Namespace }}"
|
||||
spec:
|
||||
selector:
|
||||
matchLabels:
|
||||
app.kubernetes.io/name: "neon-storage-broker"
|
||||
endpoints:
|
||||
- port: broker
|
||||
path: /metrics
|
||||
interval: 10s
|
||||
scrapeTimeout: 10s
|
||||
namespaceSelector:
|
||||
matchNames:
|
||||
- "{{ .Release.Namespace }}"
|
||||
|
||||
@@ -1,53 +0,0 @@
|
||||
# Helm chart values for neon-storage-broker
|
||||
podLabels:
|
||||
neon_env: production
|
||||
neon_service: storage-broker
|
||||
|
||||
ingress:
|
||||
enabled: true
|
||||
annotations:
|
||||
kubernetes.io/ingress.class: nginx-internal
|
||||
nginx.ingress.kubernetes.io/backend-protocol: "GRPC"
|
||||
nginx.ingress.kubernetes.io/ssl-redirect: "true"
|
||||
nginx.ingress.kubernetes.io/force-ssl-redirect: "true"
|
||||
cert-manager.io/cluster-issuer: "cert-manager-clusterissuer"
|
||||
|
||||
hosts:
|
||||
- host: storage-broker-gamma.eu-central-1.aws.neon.tech
|
||||
paths:
|
||||
- path: /
|
||||
pathType: Prefix
|
||||
tls:
|
||||
- hosts:
|
||||
- storage-broker-gamma.eu-central-1.aws.neon.tech
|
||||
secretName: storage-broker-tls
|
||||
|
||||
|
||||
metrics:
|
||||
enabled: false
|
||||
|
||||
extraManifests:
|
||||
- apiVersion: operator.victoriametrics.com/v1beta1
|
||||
kind: VMServiceScrape
|
||||
metadata:
|
||||
name: "{{ include \"neon-storage-broker.fullname\" . }}"
|
||||
labels:
|
||||
helm.sh/chart: neon-storage-broker-{{ .Chart.Version }}
|
||||
app.kubernetes.io/name: neon-storage-broker
|
||||
app.kubernetes.io/instance: neon-storage-broker
|
||||
app.kubernetes.io/version: "{{ .Chart.AppVersion }}"
|
||||
app.kubernetes.io/managed-by: Helm
|
||||
namespace: "{{ .Release.Namespace }}"
|
||||
spec:
|
||||
selector:
|
||||
matchLabels:
|
||||
app.kubernetes.io/name: "neon-storage-broker"
|
||||
endpoints:
|
||||
- port: broker
|
||||
path: /metrics
|
||||
interval: 10s
|
||||
scrapeTimeout: 10s
|
||||
namespaceSelector:
|
||||
matchNames:
|
||||
- "{{ .Release.Namespace }}"
|
||||
|
||||
@@ -1,53 +0,0 @@
|
||||
# Helm chart values for neon-storage-broker
|
||||
podLabels:
|
||||
neon_env: production
|
||||
neon_service: storage-broker
|
||||
|
||||
ingress:
|
||||
enabled: true
|
||||
annotations:
|
||||
kubernetes.io/ingress.class: nginx-internal
|
||||
nginx.ingress.kubernetes.io/backend-protocol: "GRPC"
|
||||
nginx.ingress.kubernetes.io/ssl-redirect: "true"
|
||||
nginx.ingress.kubernetes.io/force-ssl-redirect: "true"
|
||||
cert-manager.io/cluster-issuer: "cert-manager-clusterissuer"
|
||||
|
||||
hosts:
|
||||
- host: storage-broker-delta.us-east-2.aws.neon.tech
|
||||
paths:
|
||||
- path: /
|
||||
pathType: Prefix
|
||||
tls:
|
||||
- hosts:
|
||||
- storage-broker-delta.us-east-2.aws.neon.tech
|
||||
secretName: storage-broker-tls
|
||||
|
||||
|
||||
metrics:
|
||||
enabled: false
|
||||
|
||||
extraManifests:
|
||||
- apiVersion: operator.victoriametrics.com/v1beta1
|
||||
kind: VMServiceScrape
|
||||
metadata:
|
||||
name: "{{ include \"neon-storage-broker.fullname\" . }}"
|
||||
labels:
|
||||
helm.sh/chart: neon-storage-broker-{{ .Chart.Version }}
|
||||
app.kubernetes.io/name: neon-storage-broker
|
||||
app.kubernetes.io/instance: neon-storage-broker
|
||||
app.kubernetes.io/version: "{{ .Chart.AppVersion }}"
|
||||
app.kubernetes.io/managed-by: Helm
|
||||
namespace: "{{ .Release.Namespace }}"
|
||||
spec:
|
||||
selector:
|
||||
matchLabels:
|
||||
app.kubernetes.io/name: "neon-storage-broker"
|
||||
endpoints:
|
||||
- port: broker
|
||||
path: /metrics
|
||||
interval: 10s
|
||||
scrapeTimeout: 10s
|
||||
namespaceSelector:
|
||||
matchNames:
|
||||
- "{{ .Release.Namespace }}"
|
||||
|
||||
@@ -1,31 +0,0 @@
|
||||
# Helm chart values for neon-proxy-scram.
|
||||
# This is a YAML-formatted file.
|
||||
|
||||
image:
|
||||
repository: neondatabase/neon
|
||||
|
||||
settings:
|
||||
authBackend: "console"
|
||||
authEndpoint: "http://console-release.local/management/api/v2"
|
||||
domain: "*.us-west-2.aws.neon.tech"
|
||||
|
||||
# -- Additional labels for neon-proxy pods
|
||||
podLabels:
|
||||
zenith_service: proxy-scram
|
||||
zenith_env: prod
|
||||
zenith_region: us-west-2
|
||||
zenith_region_slug: us-west-2
|
||||
|
||||
exposedService:
|
||||
annotations:
|
||||
service.beta.kubernetes.io/aws-load-balancer-type: external
|
||||
service.beta.kubernetes.io/aws-load-balancer-nlb-target-type: ip
|
||||
service.beta.kubernetes.io/aws-load-balancer-scheme: internet-facing
|
||||
external-dns.alpha.kubernetes.io/hostname: us-west-2.aws.neon.tech
|
||||
|
||||
#metrics:
|
||||
# enabled: true
|
||||
# serviceMonitor:
|
||||
# enabled: true
|
||||
# selector:
|
||||
# release: kube-prometheus-stack
|
||||
@@ -1,53 +0,0 @@
|
||||
# Helm chart values for neon-storage-broker
|
||||
podLabels:
|
||||
neon_env: production
|
||||
neon_service: storage-broker
|
||||
|
||||
ingress:
|
||||
enabled: true
|
||||
annotations:
|
||||
kubernetes.io/ingress.class: nginx-internal
|
||||
nginx.ingress.kubernetes.io/backend-protocol: "GRPC"
|
||||
nginx.ingress.kubernetes.io/ssl-redirect: "true"
|
||||
nginx.ingress.kubernetes.io/force-ssl-redirect: "true"
|
||||
cert-manager.io/cluster-issuer: "cert-manager-clusterissuer"
|
||||
|
||||
hosts:
|
||||
- host: storage-broker-eta.us-west-2.aws.neon.tech
|
||||
paths:
|
||||
- path: /
|
||||
pathType: Prefix
|
||||
tls:
|
||||
- hosts:
|
||||
- storage-broker-eta.us-west-2.aws.neon.tech
|
||||
secretName: storage-broker-tls
|
||||
|
||||
|
||||
metrics:
|
||||
enabled: false
|
||||
|
||||
extraManifests:
|
||||
- apiVersion: operator.victoriametrics.com/v1beta1
|
||||
kind: VMServiceScrape
|
||||
metadata:
|
||||
name: "{{ include \"neon-storage-broker.fullname\" . }}"
|
||||
labels:
|
||||
helm.sh/chart: neon-storage-broker-{{ .Chart.Version }}
|
||||
app.kubernetes.io/name: neon-storage-broker
|
||||
app.kubernetes.io/instance: neon-storage-broker
|
||||
app.kubernetes.io/version: "{{ .Chart.AppVersion }}"
|
||||
app.kubernetes.io/managed-by: Helm
|
||||
namespace: "{{ .Release.Namespace }}"
|
||||
spec:
|
||||
selector:
|
||||
matchLabels:
|
||||
app.kubernetes.io/name: "neon-storage-broker"
|
||||
endpoints:
|
||||
- port: broker
|
||||
path: /metrics
|
||||
interval: 10s
|
||||
scrapeTimeout: 10s
|
||||
namespaceSelector:
|
||||
matchNames:
|
||||
- "{{ .Release.Namespace }}"
|
||||
|
||||
@@ -1,54 +0,0 @@
|
||||
# Helm chart values for neon-storage-broker
|
||||
podLabels:
|
||||
neon_env: production
|
||||
neon_service: storage-broker
|
||||
|
||||
ingress:
|
||||
enabled: true
|
||||
annotations:
|
||||
kubernetes.io/ingress.class: alb
|
||||
alb.ingress.kubernetes.io/healthcheck-path: /status
|
||||
alb.ingress.kubernetes.io/listen-ports: '[{"HTTPS":443}]'
|
||||
alb.ingress.kubernetes.io/scheme: "internal"
|
||||
alb.ingress.kubernetes.io/target-type: "ip"
|
||||
alb.ingress.kubernetes.io/ssl-redirect: "443"
|
||||
alb.ingress.kubernetes.io/backend-protocol-version: "GRPC"
|
||||
|
||||
hosts:
|
||||
- host: storage-broker.neon.tech
|
||||
paths:
|
||||
- path: /
|
||||
pathType: Prefix
|
||||
|
||||
metrics:
|
||||
enabled: true
|
||||
serviceMonitor:
|
||||
enabled: true
|
||||
selector:
|
||||
release: kube-prometheus-stack
|
||||
|
||||
extraManifests:
|
||||
- apiVersion: operator.victoriametrics.com/v1beta1
|
||||
kind: VMServiceScrape
|
||||
metadata:
|
||||
name: "{{ include \"neon-storage-broker.fullname\" . }}"
|
||||
labels:
|
||||
helm.sh/chart: neon-storage-broker-{{ .Chart.Version }}
|
||||
app.kubernetes.io/name: neon-storage-broker
|
||||
app.kubernetes.io/instance: neon-storage-broker
|
||||
app.kubernetes.io/version: "{{ .Chart.AppVersion }}"
|
||||
app.kubernetes.io/managed-by: Helm
|
||||
namespace: "{{ .Release.Namespace }}"
|
||||
spec:
|
||||
selector:
|
||||
matchLabels:
|
||||
app.kubernetes.io/name: "neon-storage-broker"
|
||||
endpoints:
|
||||
- port: broker
|
||||
path: /metrics
|
||||
interval: 10s
|
||||
scrapeTimeout: 10s
|
||||
namespaceSelector:
|
||||
matchNames:
|
||||
- "{{ .Release.Namespace }}"
|
||||
|
||||
@@ -1,54 +0,0 @@
|
||||
# Helm chart values for neon-storage-broker
|
||||
podLabels:
|
||||
neon_env: staging
|
||||
neon_service: storage-broker
|
||||
|
||||
ingress:
|
||||
enabled: true
|
||||
annotations:
|
||||
kubernetes.io/ingress.class: alb
|
||||
alb.ingress.kubernetes.io/healthcheck-path: /status
|
||||
alb.ingress.kubernetes.io/listen-ports: '[{"HTTPS":443}]'
|
||||
alb.ingress.kubernetes.io/scheme: "internal"
|
||||
alb.ingress.kubernetes.io/target-type: "ip"
|
||||
alb.ingress.kubernetes.io/ssl-redirect: "443"
|
||||
alb.ingress.kubernetes.io/backend-protocol-version: "GRPC"
|
||||
|
||||
hosts:
|
||||
- host: storage-broker.stage.neon.tech
|
||||
paths:
|
||||
- path: /
|
||||
pathType: Prefix
|
||||
|
||||
metrics:
|
||||
enabled: true
|
||||
serviceMonitor:
|
||||
enabled: true
|
||||
selector:
|
||||
release: kube-prometheus-stack
|
||||
|
||||
extraManifests:
|
||||
- apiVersion: operator.victoriametrics.com/v1beta1
|
||||
kind: VMServiceScrape
|
||||
metadata:
|
||||
name: "{{ include \"neon-storage-broker.fullname\" . }}"
|
||||
labels:
|
||||
helm.sh/chart: neon-storage-broker-{{ .Chart.Version }}
|
||||
app.kubernetes.io/name: neon-storage-broker
|
||||
app.kubernetes.io/instance: neon-storage-broker
|
||||
app.kubernetes.io/version: "{{ .Chart.AppVersion }}"
|
||||
app.kubernetes.io/managed-by: Helm
|
||||
namespace: "{{ .Release.Namespace }}"
|
||||
spec:
|
||||
selector:
|
||||
matchLabels:
|
||||
app.kubernetes.io/name: "neon-storage-broker"
|
||||
endpoints:
|
||||
- port: broker
|
||||
path: /metrics
|
||||
interval: 10s
|
||||
scrapeTimeout: 10s
|
||||
namespaceSelector:
|
||||
matchNames:
|
||||
- "{{ .Release.Namespace }}"
|
||||
|
||||
131
.github/workflows/benchmarking.yml
vendored
131
.github/workflows/benchmarking.yml
vendored
@@ -110,14 +110,8 @@ jobs:
|
||||
rm -rf perf-report-staging
|
||||
mkdir -p perf-report-staging
|
||||
# Set the --sparse-ordering option of the pytest-order plugin to ensure tests run in the order they appear in the file,
|
||||
# it's important for test_perf_pgbench.py::test_pgbench_remote_* tests.
|
||||
# Do not run tests from test_runner/performance/test_perf_olap.py because they require a prepared DB. We run them separately in `clickbench-compare` job.
|
||||
./scripts/pytest test_runner/performance/ -v \
|
||||
-m "remote_cluster" \
|
||||
--sparse-ordering \
|
||||
--out-dir perf-report-staging \
|
||||
--timeout 5400 \
|
||||
--ignore test_runner/performance/test_perf_olap.py
|
||||
# it's important for test_perf_pgbench.py::test_pgbench_remote_* tests
|
||||
./scripts/pytest test_runner/performance/ -v -m "remote_cluster" --sparse-ordering --out-dir perf-report-staging --timeout 5400
|
||||
|
||||
- name: Submit result
|
||||
env:
|
||||
@@ -213,7 +207,7 @@ jobs:
|
||||
CONNSTR=${{ steps.create-neon-project.outputs.dsn }}
|
||||
;;
|
||||
rds-aurora)
|
||||
CONNSTR=${{ secrets.BENCHMARK_RDS_AURORA_CONNSTR }}
|
||||
CONNSTR=${{ secrets.BENCHMARK_RDS_CONNSTR }}
|
||||
;;
|
||||
rds-postgres)
|
||||
CONNSTR=${{ secrets.BENCHMARK_RDS_POSTGRES_CONNSTR }}
|
||||
@@ -231,11 +225,8 @@ jobs:
|
||||
- name: Set database options
|
||||
if: matrix.platform == 'neon-captest-prefetch'
|
||||
run: |
|
||||
DB_NAME=$(psql ${BENCHMARK_CONNSTR} --no-align --quiet -t -c "SELECT current_database()")
|
||||
|
||||
psql ${BENCHMARK_CONNSTR} -c "ALTER DATABASE ${DB_NAME} SET enable_seqscan_prefetch=on"
|
||||
psql ${BENCHMARK_CONNSTR} -c "ALTER DATABASE ${DB_NAME} SET effective_io_concurrency=32"
|
||||
psql ${BENCHMARK_CONNSTR} -c "ALTER DATABASE ${DB_NAME} SET maintenance_io_concurrency=32"
|
||||
psql ${BENCHMARK_CONNSTR} -c "ALTER DATABASE main SET enable_seqscan_prefetch=on"
|
||||
psql ${BENCHMARK_CONNSTR} -c "ALTER DATABASE main SET seqscan_prefetch_buffers=10"
|
||||
env:
|
||||
BENCHMARK_CONNSTR: ${{ steps.set-up-connstr.outputs.connstr }}
|
||||
|
||||
@@ -301,115 +292,3 @@ jobs:
|
||||
slack-message: "Periodic perf testing ${{ matrix.platform }}: ${{ job.status }}\n${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}"
|
||||
env:
|
||||
SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }}
|
||||
|
||||
clickbench-compare:
|
||||
# ClickBench DBs for rds-aurora and rds-postgres are deployed to the same clusters
|
||||
# we use for performance testing in pgbench-compare.
|
||||
# Run this job only when pgbench-compare is finished to avoid the intersection.
|
||||
# We might change it after https://github.com/neondatabase/neon/issues/2900.
|
||||
#
|
||||
# *_CLICKBENCH_CONNSTR: Genuine ClickBench DB with ~100M rows
|
||||
# *_CLICKBENCH_10M_CONNSTR: DB with the first 10M rows of ClickBench DB
|
||||
if: success() || failure()
|
||||
needs: [ pgbench-compare ]
|
||||
|
||||
strategy:
|
||||
fail-fast: false
|
||||
matrix:
|
||||
# neon-captest-prefetch: We have pre-created projects with prefetch enabled
|
||||
# rds-aurora: Aurora Postgres Serverless v2 with autoscaling from 0.5 to 2 ACUs
|
||||
# rds-postgres: RDS Postgres db.m5.large instance (2 vCPU, 8 GiB) with gp3 EBS storage
|
||||
platform: [ neon-captest-prefetch, rds-postgres, rds-aurora ]
|
||||
|
||||
env:
|
||||
POSTGRES_DISTRIB_DIR: /tmp/neon/pg_install
|
||||
DEFAULT_PG_VERSION: 14
|
||||
TEST_OUTPUT: /tmp/test_output
|
||||
BUILD_TYPE: remote
|
||||
SAVE_PERF_REPORT: ${{ github.event.inputs.save_perf_report || ( github.ref == 'refs/heads/main' ) }}
|
||||
PLATFORM: ${{ matrix.platform }}
|
||||
|
||||
runs-on: [ self-hosted, dev, x64 ]
|
||||
container:
|
||||
image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rustlegacy:pinned
|
||||
options: --init
|
||||
|
||||
timeout-minutes: 360 # 6h
|
||||
|
||||
steps:
|
||||
- uses: actions/checkout@v3
|
||||
|
||||
- name: Download Neon artifact
|
||||
uses: ./.github/actions/download
|
||||
with:
|
||||
name: neon-${{ runner.os }}-release-artifact
|
||||
path: /tmp/neon/
|
||||
prefix: latest
|
||||
|
||||
- name: Add Postgres binaries to PATH
|
||||
run: |
|
||||
${POSTGRES_DISTRIB_DIR}/v${DEFAULT_PG_VERSION}/bin/pgbench --version
|
||||
echo "${POSTGRES_DISTRIB_DIR}/v${DEFAULT_PG_VERSION}/bin" >> $GITHUB_PATH
|
||||
|
||||
- name: Set up Connection String
|
||||
id: set-up-connstr
|
||||
run: |
|
||||
case "${PLATFORM}" in
|
||||
neon-captest-prefetch)
|
||||
CONNSTR=${{ secrets.BENCHMARK_CAPTEST_CLICKBENCH_10M_CONNSTR }}
|
||||
;;
|
||||
rds-aurora)
|
||||
CONNSTR=${{ secrets.BENCHMARK_RDS_AURORA_CLICKBENCH_10M_CONNSTR }}
|
||||
;;
|
||||
rds-postgres)
|
||||
CONNSTR=${{ secrets.BENCHMARK_RDS_POSTGRES_CLICKBENCH_10M_CONNSTR }}
|
||||
;;
|
||||
*)
|
||||
echo 2>&1 "Unknown PLATFORM=${PLATFORM}. Allowed only 'neon-captest-prefetch', 'rds-aurora', or 'rds-postgres'"
|
||||
exit 1
|
||||
;;
|
||||
esac
|
||||
|
||||
echo "connstr=${CONNSTR}" >> $GITHUB_OUTPUT
|
||||
|
||||
psql ${CONNSTR} -c "SELECT version();"
|
||||
|
||||
- name: Set database options
|
||||
if: matrix.platform == 'neon-captest-prefetch'
|
||||
run: |
|
||||
DB_NAME=$(psql ${BENCHMARK_CONNSTR} --no-align --quiet -t -c "SELECT current_database()")
|
||||
|
||||
psql ${BENCHMARK_CONNSTR} -c "ALTER DATABASE ${DB_NAME} SET enable_seqscan_prefetch=on"
|
||||
psql ${BENCHMARK_CONNSTR} -c "ALTER DATABASE ${DB_NAME} SET effective_io_concurrency=32"
|
||||
psql ${BENCHMARK_CONNSTR} -c "ALTER DATABASE ${DB_NAME} SET maintenance_io_concurrency=32"
|
||||
env:
|
||||
BENCHMARK_CONNSTR: ${{ steps.set-up-connstr.outputs.connstr }}
|
||||
|
||||
- name: Benchmark clickbench
|
||||
uses: ./.github/actions/run-python-test-set
|
||||
with:
|
||||
build_type: ${{ env.BUILD_TYPE }}
|
||||
test_selection: performance/test_perf_olap.py
|
||||
run_in_parallel: false
|
||||
save_perf_report: ${{ env.SAVE_PERF_REPORT }}
|
||||
extra_params: -m remote_cluster --timeout 21600 -k test_clickbench
|
||||
env:
|
||||
VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}"
|
||||
PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}"
|
||||
BENCHMARK_CONNSTR: ${{ steps.set-up-connstr.outputs.connstr }}
|
||||
|
||||
- name: Create Allure report
|
||||
if: success() || failure()
|
||||
uses: ./.github/actions/allure-report
|
||||
with:
|
||||
action: generate
|
||||
build_type: ${{ env.BUILD_TYPE }}
|
||||
|
||||
- name: Post to a Slack channel
|
||||
if: ${{ github.event.schedule && failure() }}
|
||||
uses: slackapi/slack-github-action@v1
|
||||
with:
|
||||
channel-id: "C033QLM5P7D" # dev-staging-stream
|
||||
slack-message: "Periodic OLAP perf testing ${{ matrix.platform }}: ${{ job.status }}\n${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}"
|
||||
env:
|
||||
SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }}
|
||||
|
||||
162
.github/workflows/build_and_test.yml
vendored
162
.github/workflows/build_and_test.yml
vendored
@@ -100,11 +100,11 @@ jobs:
|
||||
run: |
|
||||
if [[ $BUILD_TYPE == "debug" ]]; then
|
||||
cov_prefix="scripts/coverage --profraw-prefix=$GITHUB_JOB --dir=/tmp/coverage run"
|
||||
CARGO_FEATURES=""
|
||||
CARGO_FEATURES="--features testing"
|
||||
CARGO_FLAGS="--locked --timings $CARGO_FEATURES"
|
||||
elif [[ $BUILD_TYPE == "release" ]]; then
|
||||
cov_prefix=""
|
||||
CARGO_FEATURES="--features profiling"
|
||||
CARGO_FEATURES="--features testing,profiling"
|
||||
CARGO_FLAGS="--locked --timings --release $CARGO_FEATURES"
|
||||
fi
|
||||
echo "cov_prefix=${cov_prefix}" >> $GITHUB_ENV
|
||||
@@ -539,9 +539,9 @@ jobs:
|
||||
# `neondatabase/neon` contains multiple binaries, all of them use the same input for the version into the same version formatting library.
|
||||
# Pick pageserver as currently the only binary with extra "version" features printed in the string to verify.
|
||||
# Regular pageserver version string looks like
|
||||
# Neon page server git-env:32d14403bd6ab4f4520a94cbfd81a6acef7a526c features: []
|
||||
# Neon page server git-env:32d14403bd6ab4f4520a94cbfd81a6acef7a526c failpoints: true, features: []
|
||||
# Bad versions might look like:
|
||||
# Neon page server git-env:local features: [""]
|
||||
# Neon page server git-env:local failpoints: true, features: ["testing"]
|
||||
# Ensure that we don't have bad versions.
|
||||
- name: Verify image versions
|
||||
shell: bash # ensure no set -e for better error messages
|
||||
@@ -555,6 +555,11 @@ jobs:
|
||||
exit 1
|
||||
fi
|
||||
|
||||
if ! echo "$pageserver_version" | grep -qv '"testing"' ; then
|
||||
echo "Pageserver version should have no testing feature enabled"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
- name: Verify docker-compose example
|
||||
run: env REPOSITORY=369495373322.dkr.ecr.eu-central-1.amazonaws.com TAG=${{needs.tag.outputs.build-tag}} ./docker-compose/docker_compose_test.sh
|
||||
|
||||
@@ -663,11 +668,11 @@ jobs:
|
||||
- id: set-matrix
|
||||
run: |
|
||||
if [[ "$GITHUB_REF_NAME" == "main" ]]; then
|
||||
STAGING='{"env_name": "staging", "proxy_job": "neon-proxy", "proxy_config": "staging.proxy", "storage_broker_ns": "neon-storage-broker", "storage_broker_config": "staging.neon-storage-broker", "kubeconfig_secret": "STAGING_KUBECONFIG_DATA", "console_api_key_secret": "NEON_STAGING_API_KEY"}'
|
||||
NEON_STRESS='{"env_name": "neon-stress", "proxy_job": "neon-stress-proxy", "proxy_config": "neon-stress.proxy", "storage_broker_ns": "neon-stress-storage-broker", "storage_broker_config": "neon-stress.neon-storage-broker", "kubeconfig_secret": "NEON_STRESS_KUBECONFIG_DATA", "console_api_key_secret": "NEON_CAPTEST_API_KEY", storage_broker_config: }'
|
||||
STAGING='{"env_name": "staging", "proxy_job": "neon-proxy", "proxy_config": "staging.proxy", "kubeconfig_secret": "STAGING_KUBECONFIG_DATA", "console_api_key_secret": "NEON_STAGING_API_KEY"}'
|
||||
NEON_STRESS='{"env_name": "neon-stress", "proxy_job": "neon-stress-proxy", "proxy_config": "neon-stress.proxy", "kubeconfig_secret": "NEON_STRESS_KUBECONFIG_DATA", "console_api_key_secret": "NEON_CAPTEST_API_KEY"}'
|
||||
echo "include=[$STAGING, $NEON_STRESS]" >> $GITHUB_OUTPUT
|
||||
elif [[ "$GITHUB_REF_NAME" == "release" ]]; then
|
||||
PRODUCTION='{"env_name": "production", "proxy_job": "neon-proxy", "proxy_config": "production.proxy", "storage_broker_ns": "neon-storage-broker", "storage_broker_config": "production.neon-storage-broker", "kubeconfig_secret": "PRODUCTION_KUBECONFIG_DATA", "console_api_key_secret": "NEON_PRODUCTION_API_KEY"}'
|
||||
PRODUCTION='{"env_name": "production", "proxy_job": "neon-proxy", "proxy_config": "production.proxy", "kubeconfig_secret": "PRODUCTION_KUBECONFIG_DATA", "console_api_key_secret": "NEON_PRODUCTION_API_KEY"}'
|
||||
echo "include=[$PRODUCTION]" >> $GITHUB_OUTPUT
|
||||
else
|
||||
echo "GITHUB_REF_NAME (value '$GITHUB_REF_NAME') is not set to either 'main' or 'release'"
|
||||
@@ -727,7 +732,7 @@ jobs:
|
||||
ssh-add ssh-key
|
||||
rm -f ssh-key ssh-key-cert.pub
|
||||
ansible-galaxy collection install sivel.toiletwater
|
||||
ansible-playbook deploy.yaml -i ${{ matrix.env_name }}.hosts.yaml -e CONSOLE_API_TOKEN=${{ secrets[matrix.console_api_key_secret] }} -e SENTRY_URL_PAGESERVER=${{ secrets.SENTRY_URL_PAGESERVER }} -e SENTRY_URL_SAFEKEEPER=${{ secrets.SENTRY_URL_SAFEKEEPER }}
|
||||
ansible-playbook deploy.yaml -i ${{ matrix.env_name }}.hosts.yaml -e CONSOLE_API_TOKEN=${{ secrets[matrix.console_api_key_secret] }}
|
||||
rm -f neon_install.tar.gz .neon_current_version
|
||||
|
||||
deploy-new:
|
||||
@@ -765,7 +770,7 @@ jobs:
|
||||
exit 1
|
||||
fi
|
||||
ansible-galaxy collection install sivel.toiletwater
|
||||
ansible-playbook deploy.yaml -i staging.${{ matrix.target_region }}.hosts.yaml -e @ssm_config -e CONSOLE_API_TOKEN=${{ secrets.NEON_STAGING_API_KEY }} -e SENTRY_URL_PAGESERVER=${{ secrets.SENTRY_URL_PAGESERVER }} -e SENTRY_URL_SAFEKEEPER=${{ secrets.SENTRY_URL_SAFEKEEPER }}
|
||||
ansible-playbook deploy.yaml -i staging.${{ matrix.target_region }}.hosts.yaml -e @ssm_config -e CONSOLE_API_TOKEN=${{secrets.NEON_STAGING_API_KEY}}
|
||||
rm -f neon_install.tar.gz .neon_current_version
|
||||
|
||||
deploy-pr-test-new:
|
||||
@@ -798,7 +803,7 @@ jobs:
|
||||
./get_binaries.sh
|
||||
|
||||
ansible-galaxy collection install sivel.toiletwater
|
||||
ansible-playbook deploy.yaml -i staging.${{ matrix.target_region }}.hosts.yaml -e @ssm_config -e CONSOLE_API_TOKEN=${{ secrets.NEON_STAGING_API_KEY }} -e SENTRY_URL_PAGESERVER=${{ secrets.SENTRY_URL_PAGESERVER }} -e SENTRY_URL_SAFEKEEPER=${{ secrets.SENTRY_URL_SAFEKEEPER }}
|
||||
ansible-playbook deploy.yaml -i staging.${{ matrix.target_region }}.hosts.yaml -e @ssm_config -e CONSOLE_API_TOKEN=${{secrets.NEON_STAGING_API_KEY}}
|
||||
rm -f neon_install.tar.gz .neon_current_version
|
||||
|
||||
deploy-prod-new:
|
||||
@@ -838,7 +843,7 @@ jobs:
|
||||
fi
|
||||
|
||||
ansible-galaxy collection install sivel.toiletwater
|
||||
ansible-playbook deploy.yaml -i prod.${{ matrix.target_region }}.hosts.yaml -e @ssm_config -e CONSOLE_API_TOKEN=${{ secrets.NEON_PRODUCTION_API_KEY }} -e SENTRY_URL_PAGESERVER=${{ secrets.SENTRY_URL_PAGESERVER }} -e SENTRY_URL_SAFEKEEPER=${{ secrets.SENTRY_URL_SAFEKEEPER }}
|
||||
ansible-playbook deploy.yaml -i prod.${{ matrix.target_region }}.hosts.yaml -e @ssm_config -e CONSOLE_API_TOKEN=${{secrets.NEON_PRODUCTION_API_KEY}}
|
||||
rm -f neon_install.tar.gz .neon_current_version
|
||||
|
||||
deploy-proxy:
|
||||
@@ -880,49 +885,8 @@ jobs:
|
||||
- name: Re-deploy proxy
|
||||
run: |
|
||||
DOCKER_TAG=${{needs.tag.outputs.build-tag}}
|
||||
helm upgrade ${{ matrix.proxy_job }} neondatabase/neon-proxy --namespace neon-proxy --install -f .github/helm-values/${{ matrix.proxy_config }}.yaml --set image.tag=${DOCKER_TAG} --set settings.sentryUrl=${{ secrets.SENTRY_URL_PROXY }} --wait --timeout 15m0s
|
||||
helm upgrade ${{ matrix.proxy_job }}-scram neondatabase/neon-proxy --namespace neon-proxy --install -f .github/helm-values/${{ matrix.proxy_config }}-scram.yaml --set image.tag=${DOCKER_TAG} --set settings.sentryUrl=${{ secrets.SENTRY_URL_PROXY }} --wait --timeout 15m0s
|
||||
|
||||
deploy-storage-broker-staging:
|
||||
runs-on: [ self-hosted, dev, x64 ]
|
||||
container: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/base:latest
|
||||
# Compute image isn't strictly required for proxy deploy, but let's still wait for it to run all deploy jobs consistently.
|
||||
needs: [ push-docker-hub, calculate-deploy-targets, tag, regress-tests ]
|
||||
if: |
|
||||
(github.ref_name == 'main' || github.ref_name == 'release') &&
|
||||
github.event_name != 'workflow_dispatch'
|
||||
defaults:
|
||||
run:
|
||||
shell: bash
|
||||
strategy:
|
||||
matrix:
|
||||
include: ${{fromJSON(needs.calculate-deploy-targets.outputs.matrix-include)}}
|
||||
env:
|
||||
KUBECONFIG: .kubeconfig
|
||||
steps:
|
||||
- name: Checkout
|
||||
uses: actions/checkout@v3
|
||||
with:
|
||||
submodules: true
|
||||
fetch-depth: 0
|
||||
|
||||
- name: Add curl
|
||||
run: apt update && apt install curl -y
|
||||
|
||||
- name: Store kubeconfig file
|
||||
run: |
|
||||
echo "${{ secrets[matrix.kubeconfig_secret] }}" | base64 --decode > ${KUBECONFIG}
|
||||
chmod 0600 ${KUBECONFIG}
|
||||
|
||||
- name: Setup helm v3
|
||||
run: |
|
||||
curl -s https://raw.githubusercontent.com/helm/helm/main/scripts/get-helm-3 | bash
|
||||
helm repo add neondatabase https://neondatabase.github.io/helm-charts
|
||||
|
||||
- name: Deploy storage-broker
|
||||
run:
|
||||
DOCKER_TAG=${{ needs.tag.outputs.build-tag }}
|
||||
helm upgrade neon-storage-broker neondatabase/neon-storage-broker --namespace ${{ matrix.storage_broker_ns }} --create-namespace --install -f .github/helm-values/${{ matrix.storage_broker_config }}.yaml --set image.tag=${DOCKER_TAG} --wait --timeout 15m0s
|
||||
helm upgrade ${{ matrix.proxy_job }} neondatabase/neon-proxy --namespace neon-proxy --install -f .github/helm-values/${{ matrix.proxy_config }}.yaml --set image.tag=${DOCKER_TAG} --wait --timeout 15m0s
|
||||
helm upgrade ${{ matrix.proxy_job }}-scram neondatabase/neon-proxy --namespace neon-proxy --install -f .github/helm-values/${{ matrix.proxy_config }}-scram.yaml --set image.tag=${DOCKER_TAG} --wait --timeout 15m0s
|
||||
|
||||
deploy-proxy-new:
|
||||
runs-on: [ self-hosted, dev, x64 ]
|
||||
@@ -941,11 +905,9 @@ jobs:
|
||||
- target_region: us-east-2
|
||||
target_cluster: dev-us-east-2-beta
|
||||
deploy_link_proxy: true
|
||||
deploy_legacy_scram_proxy: true
|
||||
- target_region: eu-west-1
|
||||
target_cluster: dev-eu-west-1-zeta
|
||||
deploy_link_proxy: false
|
||||
deploy_legacy_scram_proxy: false
|
||||
steps:
|
||||
- name: Checkout
|
||||
uses: actions/checkout@v3
|
||||
@@ -961,54 +923,13 @@ jobs:
|
||||
- name: Re-deploy scram proxy
|
||||
run: |
|
||||
DOCKER_TAG=${{needs.tag.outputs.build-tag}}
|
||||
helm upgrade neon-proxy-scram neondatabase/neon-proxy --namespace neon-proxy --create-namespace --install -f .github/helm-values/${{ matrix.target_cluster }}.neon-proxy-scram.yaml --set image.tag=${DOCKER_TAG} --set settings.sentryUrl=${{ secrets.SENTRY_URL_PROXY }} --wait --timeout 15m0s
|
||||
helm upgrade neon-proxy-scram neondatabase/neon-proxy --namespace neon-proxy --create-namespace --install -f .github/helm-values/${{ matrix.target_cluster }}.neon-proxy-scram.yaml --set image.tag=${DOCKER_TAG} --wait --timeout 15m0s
|
||||
|
||||
- name: Re-deploy link proxy
|
||||
if: matrix.deploy_link_proxy
|
||||
run: |
|
||||
DOCKER_TAG=${{needs.tag.outputs.build-tag}}
|
||||
helm upgrade neon-proxy-link neondatabase/neon-proxy --namespace neon-proxy --create-namespace --install -f .github/helm-values/${{ matrix.target_cluster }}.neon-proxy-link.yaml --set image.tag=${DOCKER_TAG} --set settings.sentryUrl=${{ secrets.SENTRY_URL_PROXY }} --wait --timeout 15m0s
|
||||
|
||||
- name: Re-deploy legacy scram proxy
|
||||
if: matrix.deploy_legacy_scram_proxy
|
||||
run: |
|
||||
DOCKER_TAG=${{needs.tag.outputs.build-tag}}
|
||||
helm upgrade neon-proxy-scram-legacy neondatabase/neon-proxy --namespace neon-proxy --create-namespace --install -f .github/helm-values/${{ matrix.target_cluster }}.neon-proxy-scram-legacy.yaml --set image.tag=${DOCKER_TAG} --set settings.sentryUrl=${{ secrets.SENTRY_URL_PROXY }} --wait --timeout 15m0s
|
||||
|
||||
deploy-storage-broker-dev-new:
|
||||
runs-on: [ self-hosted, dev, x64 ]
|
||||
container: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/ansible:pinned
|
||||
# Compute image isn't strictly required for proxy deploy, but let's still wait for it to run all deploy jobs consistently.
|
||||
needs: [ push-docker-hub, tag, regress-tests ]
|
||||
if: |
|
||||
(github.ref_name == 'main') &&
|
||||
github.event_name != 'workflow_dispatch'
|
||||
defaults:
|
||||
run:
|
||||
shell: bash
|
||||
strategy:
|
||||
matrix:
|
||||
include:
|
||||
- target_region: us-east-2
|
||||
target_cluster: dev-us-east-2-beta
|
||||
- target_region: eu-west-1
|
||||
target_cluster: dev-eu-west-1-zeta
|
||||
steps:
|
||||
- name: Checkout
|
||||
uses: actions/checkout@v3
|
||||
with:
|
||||
submodules: true
|
||||
fetch-depth: 0
|
||||
|
||||
- name: Configure environment
|
||||
run: |
|
||||
helm repo add neondatabase https://neondatabase.github.io/helm-charts
|
||||
aws --region ${{ matrix.target_region }} eks update-kubeconfig --name ${{ matrix.target_cluster }}
|
||||
|
||||
- name: Deploy storage-broker
|
||||
run:
|
||||
DOCKER_TAG=${{ needs.tag.outputs.build-tag }}
|
||||
helm upgrade neon-storage-broker neondatabase/neon-storage-broker --namespace neon-storage-broker --create-namespace --install -f .github/helm-values/${{ matrix.target_cluster }}.neon-storage-broker.yaml --set image.tag=${DOCKER_TAG} --wait --timeout 15m0s
|
||||
helm upgrade neon-proxy-link neondatabase/neon-proxy --namespace neon-proxy --create-namespace --install -f .github/helm-values/${{ matrix.target_cluster }}.neon-proxy-link.yaml --set image.tag=${DOCKER_TAG} --wait --timeout 15m0s
|
||||
|
||||
deploy-proxy-prod-new:
|
||||
runs-on: prod
|
||||
@@ -1026,8 +947,6 @@ jobs:
|
||||
include:
|
||||
- target_region: us-east-2
|
||||
target_cluster: prod-us-east-2-delta
|
||||
- target_region: us-west-2
|
||||
target_cluster: prod-us-west-2-eta
|
||||
- target_region: eu-central-1
|
||||
target_cluster: prod-eu-central-1-gamma
|
||||
- target_region: ap-southeast-1
|
||||
@@ -1047,46 +966,7 @@ jobs:
|
||||
- name: Re-deploy proxy
|
||||
run: |
|
||||
DOCKER_TAG=${{needs.tag.outputs.build-tag}}
|
||||
helm upgrade neon-proxy-scram neondatabase/neon-proxy --namespace neon-proxy --create-namespace --install -f .github/helm-values/${{ matrix.target_cluster }}.neon-proxy-scram.yaml --set image.tag=${DOCKER_TAG} --set settings.sentryUrl=${{ secrets.SENTRY_URL_PROXY }} --wait --timeout 15m0s
|
||||
|
||||
deploy-storage-broker-prod-new:
|
||||
runs-on: prod
|
||||
container: 093970136003.dkr.ecr.eu-central-1.amazonaws.com/ansible:latest
|
||||
# Compute image isn't strictly required for proxy deploy, but let's still wait for it to run all deploy jobs consistently.
|
||||
needs: [ push-docker-hub, tag, regress-tests ]
|
||||
if: |
|
||||
(github.ref_name == 'release') &&
|
||||
github.event_name != 'workflow_dispatch'
|
||||
defaults:
|
||||
run:
|
||||
shell: bash
|
||||
strategy:
|
||||
matrix:
|
||||
include:
|
||||
- target_region: us-east-2
|
||||
target_cluster: prod-us-east-2-delta
|
||||
- target_region: us-west-2
|
||||
target_cluster: prod-us-west-2-eta
|
||||
- target_region: eu-central-1
|
||||
target_cluster: prod-eu-central-1-gamma
|
||||
- target_region: ap-southeast-1
|
||||
target_cluster: prod-ap-southeast-1-epsilon
|
||||
steps:
|
||||
- name: Checkout
|
||||
uses: actions/checkout@v3
|
||||
with:
|
||||
submodules: true
|
||||
fetch-depth: 0
|
||||
|
||||
- name: Configure environment
|
||||
run: |
|
||||
helm repo add neondatabase https://neondatabase.github.io/helm-charts
|
||||
aws --region ${{ matrix.target_region }} eks update-kubeconfig --name ${{ matrix.target_cluster }}
|
||||
|
||||
- name: Deploy storage-broker
|
||||
run:
|
||||
DOCKER_TAG=${{ needs.tag.outputs.build-tag }}
|
||||
helm upgrade neon-storage-broker neondatabase/neon-storage-broker --namespace neon-storage-broker --create-namespace --install -f .github/helm-values/${{ matrix.target_cluster }}.neon-storage-broker.yaml --set image.tag=${DOCKER_TAG} --wait --timeout 15m0s
|
||||
helm upgrade neon-proxy-scram neondatabase/neon-proxy --namespace neon-proxy --create-namespace --install -f .github/helm-values/${{ matrix.target_cluster }}.neon-proxy-scram.yaml --set image.tag=${DOCKER_TAG} --wait --timeout 15m0s
|
||||
|
||||
promote-compatibility-data:
|
||||
runs-on: [ self-hosted, dev, x64 ]
|
||||
|
||||
787
Cargo.lock
generated
787
Cargo.lock
generated
File diff suppressed because it is too large
Load Diff
29
README.md
29
README.md
@@ -2,20 +2,29 @@
|
||||
|
||||
Neon is a serverless open-source alternative to AWS Aurora Postgres. It separates storage and compute and substitutes the PostgreSQL storage layer by redistributing data across a cluster of nodes.
|
||||
|
||||
The project used to be called "Zenith". Many of the commands and code comments
|
||||
still refer to "zenith", but we are in the process of renaming things.
|
||||
|
||||
## Quick start
|
||||
Try the [Neon Free Tier](https://neon.tech/docs/introduction/technical-preview-free-tier/) to create a serverless Postgres instance. Then connect to it with your preferred Postgres client (psql, dbeaver, etc) or use the online [SQL Editor](https://neon.tech/docs/get-started-with-neon/query-with-neon-sql-editor/). See [Connect from any application](https://neon.tech/docs/connect/connect-from-any-app/) for connection instructions.
|
||||
[Join the waitlist](https://neon.tech/) for our free tier to receive your serverless postgres instance. Then connect to it with your preferred postgres client (psql, dbeaver, etc) or use the online SQL editor.
|
||||
|
||||
Alternatively, compile and run the project [locally](#running-local-installation).
|
||||
|
||||
## Architecture overview
|
||||
|
||||
A Neon installation consists of compute nodes and the Neon storage engine. Compute nodes are stateless PostgreSQL nodes backed by the Neon storage engine.
|
||||
A Neon installation consists of compute nodes and a Neon storage engine.
|
||||
|
||||
Compute nodes are stateless PostgreSQL nodes backed by the Neon storage engine.
|
||||
|
||||
The Neon storage engine consists of two major components:
|
||||
- Pageserver. Scalable storage backend for the compute nodes.
|
||||
- Safekeepers. The safekeepers form a redundant WAL service that receives WAL from the compute node and stores it durably until it has been processed by the pageserver and uploaded to cloud storage.
|
||||
- WAL service. The service receives WAL from the compute node and ensures that it is stored durably.
|
||||
|
||||
See developer documentation in [/docs/SUMMARY.md](/docs/SUMMARY.md) for more information.
|
||||
Pageserver consists of:
|
||||
- Repository - Neon storage implementation.
|
||||
- WAL receiver - service that receives WAL from WAL service and stores it in the repository.
|
||||
- Page service - service that communicates with compute nodes and responds with pages from the repository.
|
||||
- WAL redo - service that builds pages from base images and WAL records on Page service request
|
||||
|
||||
## Running local installation
|
||||
|
||||
@@ -213,27 +222,19 @@ Ensure your dependencies are installed as described [here](https://github.com/ne
|
||||
```sh
|
||||
git clone --recursive https://github.com/neondatabase/neon.git
|
||||
|
||||
make
|
||||
CARGO_BUILD_FLAGS="--features=testing" make
|
||||
|
||||
./scripts/pytest
|
||||
```
|
||||
|
||||
## Documentation
|
||||
|
||||
[/docs/](/docs/) Contains a top-level overview of all available markdown documentation.
|
||||
We use README files to cover design ideas and the overall architecture of each module, along with `rustdoc`-style documentation comments. See also [/docs/](/docs/) for a top-level overview of all available markdown documentation.
|
||||
|
||||
- [/docs/sourcetree.md](/docs/sourcetree.md) contains an overview of the source tree layout.
|
||||
|
||||
To view your `rustdoc` documentation in a browser, try running `cargo doc --no-deps --open`
|
||||
|
||||
See also README files in some source directories, and `rustdoc` style documentation comments.
|
||||
|
||||
Other resources:
|
||||
|
||||
- [SELECT 'Hello, World'](https://neon.tech/blog/hello-world/): Blog post by Nikita Shamgunov on the high level architecture
|
||||
- [Architecture decisions in Neon](https://neon.tech/blog/architecture-decisions-in-neon/): Blog post by Heikki Linnakangas
|
||||
- [Neon: Serverless PostgreSQL!](https://www.youtube.com/watch?v=rES0yzeERns): Presentation on storage system by Heikki Linnakangas in the CMU Database Group seminar series
|
||||
|
||||
### Postgres-specific terms
|
||||
|
||||
Due to Neon's very close relation with PostgreSQL internals, numerous specific terms are used.
|
||||
|
||||
@@ -5,7 +5,7 @@ edition = "2021"
|
||||
|
||||
[dependencies]
|
||||
anyhow = "1.0"
|
||||
chrono = { version = "0.4", default-features = false, features = ["clock"] }
|
||||
chrono = "0.4"
|
||||
clap = "4.0"
|
||||
env_logger = "0.9"
|
||||
futures = "0.3.13"
|
||||
|
||||
@@ -14,19 +14,17 @@
|
||||
|
||||
use std::ffi::OsStr;
|
||||
use std::io::Write;
|
||||
use std::os::unix::prelude::AsRawFd;
|
||||
use std::os::unix::process::CommandExt;
|
||||
use std::path::{Path, PathBuf};
|
||||
use std::path::Path;
|
||||
use std::process::{Child, Command};
|
||||
use std::time::Duration;
|
||||
use std::{fs, io, thread};
|
||||
|
||||
use anyhow::Context;
|
||||
use anyhow::{anyhow, bail, Context, Result};
|
||||
use nix::errno::Errno;
|
||||
use nix::fcntl::{FcntlArg, FdFlag};
|
||||
use nix::sys::signal::{kill, Signal};
|
||||
use nix::unistd::Pid;
|
||||
use utils::pid_file::{self, PidFileRead};
|
||||
|
||||
use utils::lock_file;
|
||||
|
||||
// These constants control the loop used to poll for process start / stop.
|
||||
//
|
||||
@@ -88,14 +86,6 @@ where
|
||||
let filled_cmd = fill_aws_secrets_vars(fill_rust_env_vars(background_command));
|
||||
filled_cmd.envs(envs);
|
||||
|
||||
let pid_file_to_check = match initial_pid_file {
|
||||
InitialPidFile::Create(path) => {
|
||||
pre_exec_create_pidfile(filled_cmd, path);
|
||||
path
|
||||
}
|
||||
InitialPidFile::Expect(path) => path,
|
||||
};
|
||||
|
||||
let mut spawned_process = filled_cmd.spawn().with_context(|| {
|
||||
format!("Could not spawn {process_name}, see console output and log files for details.")
|
||||
})?;
|
||||
@@ -105,8 +95,29 @@ where
|
||||
.with_context(|| format!("Subprocess {process_name} has invalid pid {pid}"))?,
|
||||
);
|
||||
|
||||
let pid_file_to_check = match initial_pid_file {
|
||||
InitialPidFile::Create(target_pid_file_path) => {
|
||||
match lock_file::create_lock_file(target_pid_file_path, pid.to_string()) {
|
||||
lock_file::LockCreationResult::Created { .. } => {
|
||||
// We use "lock" file here only to create the pid file. The lock on the pidfile will be dropped as soon
|
||||
// as this CLI invocation exits, so it's a bit useless, but doesn't do any harm either.
|
||||
}
|
||||
lock_file::LockCreationResult::AlreadyLocked { .. } => {
|
||||
anyhow::bail!("Cannot write pid file for {process_name} at path {target_pid_file_path:?}: file is already locked by another process")
|
||||
}
|
||||
lock_file::LockCreationResult::CreationFailed(e) => {
|
||||
return Err(e.context(format!(
|
||||
"Failed to create pid file for {process_name} at path {target_pid_file_path:?}"
|
||||
)))
|
||||
}
|
||||
}
|
||||
None
|
||||
}
|
||||
InitialPidFile::Expect(pid_file_path) => Some(pid_file_path),
|
||||
};
|
||||
|
||||
for retries in 0..RETRIES {
|
||||
match process_started(pid, Some(pid_file_to_check), &process_status_check) {
|
||||
match process_started(pid, pid_file_to_check, &process_status_check) {
|
||||
Ok(true) => {
|
||||
println!("\n{process_name} started, pid: {pid}");
|
||||
return Ok(spawned_process);
|
||||
@@ -136,45 +147,14 @@ where
|
||||
anyhow::bail!("{process_name} did not start in {RETRY_UNTIL_SECS} seconds");
|
||||
}
|
||||
|
||||
/// Send SIGTERM to child process
|
||||
pub fn send_stop_child_process(child: &std::process::Child) -> anyhow::Result<()> {
|
||||
let pid = child.id();
|
||||
match kill(
|
||||
nix::unistd::Pid::from_raw(pid.try_into().unwrap()),
|
||||
Signal::SIGTERM,
|
||||
) {
|
||||
Ok(()) => Ok(()),
|
||||
Err(Errno::ESRCH) => {
|
||||
println!("child process with pid {pid} does not exist");
|
||||
Ok(())
|
||||
}
|
||||
Err(e) => anyhow::bail!("Failed to send signal to child process with pid {pid}: {e}"),
|
||||
}
|
||||
}
|
||||
|
||||
/// Stops the process, using the pid file given. Returns Ok also if the process is already not running.
|
||||
pub fn stop_process(immediate: bool, process_name: &str, pid_file: &Path) -> anyhow::Result<()> {
|
||||
let pid = match pid_file::read(pid_file)
|
||||
.with_context(|| format!("read pid_file {pid_file:?}"))?
|
||||
{
|
||||
PidFileRead::NotExist => {
|
||||
println!("{process_name} is already stopped: no pid file present at {pid_file:?}");
|
||||
return Ok(());
|
||||
}
|
||||
PidFileRead::NotHeldByAnyProcess(_) => {
|
||||
// Don't try to kill according to file contents because the pid might have been re-used by another process.
|
||||
// Don't delete the file either, it can race with new pid file creation.
|
||||
// Read `pid_file` module comment for details.
|
||||
println!(
|
||||
"No process is holding the pidfile. The process must have already exited. Leave in place to avoid race conditions: {pid_file:?}"
|
||||
);
|
||||
return Ok(());
|
||||
}
|
||||
PidFileRead::LockedByOtherProcess(pid) => pid,
|
||||
};
|
||||
// XXX the pid could become invalid (and recycled) at any time before the kill() below.
|
||||
if !pid_file.exists() {
|
||||
println!("{process_name} is already stopped: no pid file {pid_file:?} is present");
|
||||
return Ok(());
|
||||
}
|
||||
let pid = read_pidfile(pid_file)?;
|
||||
|
||||
// send signal
|
||||
let sig = if immediate {
|
||||
print!("Stopping {process_name} with pid {pid} immediately..");
|
||||
Signal::SIGQUIT
|
||||
@@ -186,9 +166,8 @@ pub fn stop_process(immediate: bool, process_name: &str, pid_file: &Path) -> any
|
||||
match kill(pid, sig) {
|
||||
Ok(()) => (),
|
||||
Err(Errno::ESRCH) => {
|
||||
// Again, don't delete the pid file. The unlink can race with a new pid file being created.
|
||||
println!(
|
||||
"{process_name} with pid {pid} does not exist, but a pid file {pid_file:?} was found. Likely the pid got recycled. Lucky we didn't harm anyone."
|
||||
"{process_name} with pid {pid} does not exist, but a pid file {pid_file:?} was found"
|
||||
);
|
||||
return Ok(());
|
||||
}
|
||||
@@ -200,6 +179,11 @@ pub fn stop_process(immediate: bool, process_name: &str, pid_file: &Path) -> any
|
||||
match process_has_stopped(pid) {
|
||||
Ok(true) => {
|
||||
println!("\n{process_name} stopped");
|
||||
if let Err(e) = fs::remove_file(pid_file) {
|
||||
if e.kind() != io::ErrorKind::NotFound {
|
||||
eprintln!("Failed to remove pid file {pid_file:?} after stopping the process: {e:#}");
|
||||
}
|
||||
}
|
||||
return Ok(());
|
||||
}
|
||||
Ok(false) => {
|
||||
@@ -225,14 +209,7 @@ pub fn stop_process(immediate: bool, process_name: &str, pid_file: &Path) -> any
|
||||
}
|
||||
|
||||
fn fill_rust_env_vars(cmd: &mut Command) -> &mut Command {
|
||||
// If RUST_BACKTRACE is set, pass it through. But if it's not set, default
|
||||
// to RUST_BACKTRACE=1.
|
||||
let backtrace_setting = std::env::var_os("RUST_BACKTRACE");
|
||||
let backtrace_setting = backtrace_setting
|
||||
.as_deref()
|
||||
.unwrap_or_else(|| OsStr::new("1"));
|
||||
|
||||
let mut filled_cmd = cmd.env_clear().env("RUST_BACKTRACE", backtrace_setting);
|
||||
let mut filled_cmd = cmd.env_clear().env("RUST_BACKTRACE", "1");
|
||||
|
||||
// Pass through these environment variables to the command
|
||||
for var in ["LLVM_PROFILE_FILE", "FAILPOINTS", "RUST_LOG"] {
|
||||
@@ -257,69 +234,6 @@ fn fill_aws_secrets_vars(mut cmd: &mut Command) -> &mut Command {
|
||||
cmd
|
||||
}
|
||||
|
||||
/// Add a `pre_exec` to the cmd that, in between fork() and exec(),
|
||||
/// 1. Claims a pidfile with a fcntl lock on it and
|
||||
/// 2. Sets up the pidfile's file descriptor so that it (and the lock)
|
||||
/// will remain held until the cmd exits.
|
||||
fn pre_exec_create_pidfile<P>(cmd: &mut Command, path: P) -> &mut Command
|
||||
where
|
||||
P: Into<PathBuf>,
|
||||
{
|
||||
let path: PathBuf = path.into();
|
||||
// SAFETY
|
||||
// pre_exec is marked unsafe because it runs between fork and exec.
|
||||
// Why is that dangerous in various ways?
|
||||
// Long answer: https://github.com/rust-lang/rust/issues/39575
|
||||
// Short answer: in a multi-threaded program, other threads may have
|
||||
// been inside of critical sections at the time of fork. In the
|
||||
// original process, that was all right, assuming they protected
|
||||
// the critical sections appropriately, e.g., through locks.
|
||||
// Fork adds another process to the mix that
|
||||
// 1. Has a single thread T
|
||||
// 2. In an exact copy of the address space at the time of fork.
|
||||
// A variety of problems can occur now:
|
||||
// 1. T tries to grab a lock that was locked at the time of fork.
|
||||
// It will wait forever since in its address space, the lock
|
||||
// is in state 'taken' but the thread that would unlock it is
|
||||
// not there.
|
||||
// 2. A rust object that represented some external resource in the
|
||||
// parent now got implicitly copied by the fork, even though
|
||||
// the object's type is not `Copy`. The parent program may use
|
||||
// non-copyability as way to enforce unique ownership of an
|
||||
// external resource in the typesystem. The fork breaks that
|
||||
// assumption, as now both parent and child process have an
|
||||
// owned instance of the object that represents the same
|
||||
// underlying resource.
|
||||
// While these seem like niche problems, (1) in particular is
|
||||
// highly relevant. For example, `malloc()` may grab a mutex internally,
|
||||
// and so, if we forked while another thread was in the middle of malloc and our
|
||||
// pre_exec closure allocates as well, it will block on the malloc
|
||||
// mutex forever
|
||||
//
|
||||
// The proper solution is to only use C library functions that are marked
|
||||
// "async-signal-safe": https://man7.org/linux/man-pages/man7/signal-safety.7.html
|
||||
//
|
||||
// With this specific pre_exec() closure, the non-error path doesn't allocate.
|
||||
// The error path uses `anyhow`, and hence does allocate.
|
||||
// We take our chances there, hoping that any potential disaster is constrained
|
||||
// to the child process (e.g., malloc has no state outside of the child process).
|
||||
// Last, `expect` prints to stderr, and stdio is not async-signal-safe.
|
||||
// Again, we take our chances, making the same assumptions as for malloc.
|
||||
unsafe {
|
||||
cmd.pre_exec(move || {
|
||||
let file = pid_file::claim_for_current_process(&path).expect("claim pid file");
|
||||
// Remove the FD_CLOEXEC flag on the pidfile descriptor so that the pidfile
|
||||
// remains locked after exec.
|
||||
nix::fcntl::fcntl(file.as_raw_fd(), FcntlArg::F_SETFD(FdFlag::empty()))
|
||||
.expect("remove FD_CLOEXEC");
|
||||
// Don't run drop(file), it would close the file before we actually exec.
|
||||
std::mem::forget(file);
|
||||
Ok(())
|
||||
});
|
||||
}
|
||||
cmd
|
||||
}
|
||||
|
||||
fn process_started<F>(
|
||||
pid: Pid,
|
||||
pid_file_to_check: Option<&Path>,
|
||||
@@ -330,11 +244,14 @@ where
|
||||
{
|
||||
match status_check() {
|
||||
Ok(true) => match pid_file_to_check {
|
||||
Some(pid_file_path) => match pid_file::read(pid_file_path)? {
|
||||
PidFileRead::NotExist => Ok(false),
|
||||
PidFileRead::LockedByOtherProcess(pid_in_file) => Ok(pid_in_file == pid),
|
||||
PidFileRead::NotHeldByAnyProcess(_) => Ok(false),
|
||||
},
|
||||
Some(pid_file_path) => {
|
||||
if pid_file_path.exists() {
|
||||
let pid_in_file = read_pidfile(pid_file_path)?;
|
||||
Ok(pid_in_file == pid)
|
||||
} else {
|
||||
Ok(false)
|
||||
}
|
||||
}
|
||||
None => Ok(true),
|
||||
},
|
||||
Ok(false) => Ok(false),
|
||||
@@ -342,6 +259,21 @@ where
|
||||
}
|
||||
}
|
||||
|
||||
/// Read a PID file
|
||||
///
|
||||
/// We expect a file that contains a single integer.
|
||||
fn read_pidfile(pidfile: &Path) -> Result<Pid> {
|
||||
let pid_str = fs::read_to_string(pidfile)
|
||||
.with_context(|| format!("failed to read pidfile {pidfile:?}"))?;
|
||||
let pid: i32 = pid_str
|
||||
.parse()
|
||||
.map_err(|_| anyhow!("failed to parse pidfile {pidfile:?}"))?;
|
||||
if pid < 1 {
|
||||
bail!("pidfile {pidfile:?} contained bad value '{pid}'");
|
||||
}
|
||||
Ok(Pid::from_raw(pid))
|
||||
}
|
||||
|
||||
fn process_has_stopped(pid: Pid) -> anyhow::Result<bool> {
|
||||
match kill(pid, None) {
|
||||
// Process exists, keep waiting
|
||||
|
||||
@@ -324,7 +324,7 @@ fn handle_init(init_match: &ArgMatches) -> anyhow::Result<LocalEnv> {
|
||||
pg_version,
|
||||
)
|
||||
.unwrap_or_else(|e| {
|
||||
eprintln!("pageserver init failed: {e:?}");
|
||||
eprintln!("pageserver init failed: {e}");
|
||||
exit(1);
|
||||
});
|
||||
|
||||
|
||||
@@ -156,8 +156,6 @@ pub struct PageServerConf {
|
||||
|
||||
// jwt auth token used for communication with pageserver
|
||||
pub auth_token: String,
|
||||
|
||||
pub testing_mode: bool,
|
||||
}
|
||||
|
||||
impl Default for PageServerConf {
|
||||
@@ -168,7 +166,6 @@ impl Default for PageServerConf {
|
||||
listen_http_addr: String::new(),
|
||||
auth_type: AuthType::Trust,
|
||||
auth_token: String::new(),
|
||||
testing_mode: false,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1,12 +1,12 @@
|
||||
use std::collections::HashMap;
|
||||
use std::fs::File;
|
||||
use std::fs::{self, File};
|
||||
use std::io::{BufReader, Write};
|
||||
use std::num::NonZeroU64;
|
||||
use std::path::{Path, PathBuf};
|
||||
use std::process::Child;
|
||||
use std::{io, result};
|
||||
|
||||
use anyhow::{bail, ensure, Context};
|
||||
use anyhow::{bail, Context};
|
||||
use pageserver_api::models::{
|
||||
TenantConfigRequest, TenantCreateRequest, TenantInfo, TimelineCreateRequest, TimelineInfo,
|
||||
};
|
||||
@@ -141,9 +141,6 @@ impl PageServerNode {
|
||||
init_config_overrides.push(&listen_http_addr_param);
|
||||
init_config_overrides.push(&listen_pg_addr_param);
|
||||
init_config_overrides.push(&broker_endpoints_param);
|
||||
if self.env.pageserver.testing_mode {
|
||||
init_config_overrides.push("testing_mode=true");
|
||||
}
|
||||
|
||||
if let Some(broker_etcd_prefix_param) = broker_etcd_prefix_param.as_deref() {
|
||||
init_config_overrides.push(broker_etcd_prefix_param);
|
||||
@@ -171,21 +168,29 @@ impl PageServerNode {
|
||||
}
|
||||
Err(e) => eprintln!("{e:#}"),
|
||||
}
|
||||
background_process::send_stop_child_process(&pageserver_process)?;
|
||||
|
||||
let exit_code = pageserver_process.wait()?;
|
||||
ensure!(
|
||||
exit_code.success(),
|
||||
format!(
|
||||
"pageserver init failed with exit code {:?}",
|
||||
exit_code.code()
|
||||
)
|
||||
);
|
||||
println!(
|
||||
"Stopped pageserver {} process with pid {}",
|
||||
self.env.pageserver.id,
|
||||
pageserver_process.id(),
|
||||
);
|
||||
match pageserver_process.kill() {
|
||||
Err(e) => {
|
||||
eprintln!(
|
||||
"Failed to stop pageserver {} process with pid {}: {e:#}",
|
||||
self.env.pageserver.id,
|
||||
pageserver_process.id(),
|
||||
)
|
||||
}
|
||||
Ok(()) => {
|
||||
println!(
|
||||
"Stopped pageserver {} process with pid {}",
|
||||
self.env.pageserver.id,
|
||||
pageserver_process.id(),
|
||||
);
|
||||
// cleanup after pageserver startup, since we do not call regular `stop_process` during init
|
||||
let pid_file = self.pid_file();
|
||||
if let Err(e) = fs::remove_file(&pid_file) {
|
||||
if e.kind() != io::ErrorKind::NotFound {
|
||||
eprintln!("Failed to remove pid file {pid_file:?} after stopping the process: {e:#}");
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
init_result
|
||||
}
|
||||
|
||||
|
||||
@@ -45,9 +45,9 @@ and create new databases and accounts (control plane API in our case).
|
||||
|
||||
Integration tests, written in Python using the `pytest` framework.
|
||||
|
||||
`/vendor/postgres-v14` and `/vendor/postgres-v15`:
|
||||
`/vendor/postgres-v14`:
|
||||
|
||||
PostgreSQL source tree per version, with the modifications needed for Neon.
|
||||
PostgreSQL source tree, with the modifications needed for Neon.
|
||||
|
||||
`/pgxn/neon`:
|
||||
|
||||
|
||||
@@ -23,7 +23,7 @@ pub enum TenantState {
|
||||
Active,
|
||||
/// A tenant is recognized by pageserver, but it is being detached or the
|
||||
/// system is being shut down.
|
||||
Stopping,
|
||||
Paused,
|
||||
/// A tenant is recognized by the pageserver, but can no longer be used for
|
||||
/// any operations, because it failed to be activated.
|
||||
Broken,
|
||||
@@ -35,7 +35,7 @@ impl TenantState {
|
||||
Self::Loading => true,
|
||||
Self::Attaching => true,
|
||||
Self::Active => false,
|
||||
Self::Stopping => false,
|
||||
Self::Paused => false,
|
||||
Self::Broken => false,
|
||||
}
|
||||
}
|
||||
@@ -53,7 +53,7 @@ pub enum TimelineState {
|
||||
Suspended,
|
||||
/// A timeline is recognized by pageserver, but not yet ready to operate and not allowed to
|
||||
/// automatically become Active after certain events: only a management call can change this status.
|
||||
Stopping,
|
||||
Paused,
|
||||
/// A timeline is recognized by the pageserver, but can no longer be used for
|
||||
/// any operations, because it failed to be activated.
|
||||
Broken,
|
||||
@@ -201,6 +201,8 @@ pub struct TimelineInfo {
|
||||
pub last_received_msg_ts: Option<u128>,
|
||||
pub pg_version: u32,
|
||||
|
||||
pub awaits_download: bool,
|
||||
|
||||
pub state: TimelineState,
|
||||
|
||||
// Some of the above fields are duplicated in 'local' and 'remote', for backwards-
|
||||
|
||||
@@ -9,11 +9,8 @@ async-trait = "0.1"
|
||||
metrics = { version = "0.1", path = "../metrics" }
|
||||
utils = { version = "0.1", path = "../utils" }
|
||||
once_cell = "1.13.0"
|
||||
aws-smithy-http = "0.51.0"
|
||||
aws-types = "0.51.0"
|
||||
aws-config = { version = "0.51.0", default-features = false, features=["rustls"] }
|
||||
aws-sdk-s3 = "0.21.0"
|
||||
hyper = { version = "0.14", features = ["stream"] }
|
||||
rusoto_core = "0.48"
|
||||
rusoto_s3 = "0.48"
|
||||
serde = { version = "1.0", features = ["derive"] }
|
||||
serde_json = "1"
|
||||
tokio = { version = "1.17", features = ["sync", "macros", "fs", "io-util"] }
|
||||
|
||||
@@ -10,7 +10,7 @@ mod s3_bucket;
|
||||
|
||||
use std::{
|
||||
collections::HashMap,
|
||||
fmt::Debug,
|
||||
fmt::{Debug, Display},
|
||||
num::{NonZeroU32, NonZeroUsize},
|
||||
ops::Deref,
|
||||
path::{Path, PathBuf},
|
||||
@@ -41,27 +41,44 @@ pub const DEFAULT_REMOTE_STORAGE_S3_CONCURRENCY_LIMIT: usize = 100;
|
||||
|
||||
const REMOTE_STORAGE_PREFIX_SEPARATOR: char = '/';
|
||||
|
||||
/// Path on the remote storage, relative to some inner prefix.
|
||||
/// The prefix is an implementation detail, that allows representing local paths
|
||||
/// as the remote ones, stripping the local storage prefix away.
|
||||
#[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord, Hash)]
|
||||
pub struct RemotePath(PathBuf);
|
||||
|
||||
impl RemotePath {
|
||||
pub fn new(relative_path: &Path) -> anyhow::Result<Self> {
|
||||
anyhow::ensure!(
|
||||
relative_path.is_relative(),
|
||||
"Path {relative_path:?} is not relative"
|
||||
);
|
||||
Ok(Self(relative_path.to_path_buf()))
|
||||
}
|
||||
|
||||
pub fn with_base(&self, base_path: &Path) -> PathBuf {
|
||||
base_path.join(&self.0)
|
||||
}
|
||||
#[derive(Clone, PartialEq, Eq)]
|
||||
pub struct RemoteObjectId(String);
|
||||
|
||||
///
|
||||
/// A key that refers to an object in remote storage. It works much like a Path,
|
||||
/// but it's a separate datatype so that you don't accidentally mix local paths
|
||||
/// and remote keys.
|
||||
///
|
||||
impl RemoteObjectId {
|
||||
// Needed to retrieve last component for RemoteObjectId.
|
||||
// In other words a file name
|
||||
/// Turn a/b/c or a/b/c/ into c
|
||||
pub fn object_name(&self) -> Option<&str> {
|
||||
self.0.file_name().and_then(|os_str| os_str.to_str())
|
||||
// corner case, char::to_string is not const, that's why this is more verbose than it needs to be
|
||||
// see https://github.com/rust-lang/rust/issues/88674
|
||||
if self.0.len() == 1 && self.0.chars().next().unwrap() == REMOTE_STORAGE_PREFIX_SEPARATOR {
|
||||
return None;
|
||||
}
|
||||
|
||||
if self.0.ends_with(REMOTE_STORAGE_PREFIX_SEPARATOR) {
|
||||
self.0.rsplit(REMOTE_STORAGE_PREFIX_SEPARATOR).nth(1)
|
||||
} else {
|
||||
self.0
|
||||
.rsplit_once(REMOTE_STORAGE_PREFIX_SEPARATOR)
|
||||
.map(|(_, last)| last)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl Debug for RemoteObjectId {
|
||||
fn fmt(&self, fmt: &mut std::fmt::Formatter<'_>) -> Result<(), std::fmt::Error> {
|
||||
Debug::fmt(&self.0, fmt)
|
||||
}
|
||||
}
|
||||
|
||||
impl Display for RemoteObjectId {
|
||||
fn fmt(&self, fmt: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||
Display::fmt(&self.0, fmt)
|
||||
}
|
||||
}
|
||||
|
||||
@@ -70,40 +87,49 @@ impl RemotePath {
|
||||
/// providing basic CRUD operations for storage files.
|
||||
#[async_trait::async_trait]
|
||||
pub trait RemoteStorage: Send + Sync + 'static {
|
||||
/// Attempts to derive the storage path out of the local path, if the latter is correct.
|
||||
fn remote_object_id(&self, local_path: &Path) -> anyhow::Result<RemoteObjectId>;
|
||||
|
||||
/// Gets the download path of the given storage file.
|
||||
fn local_path(&self, remote_object_id: &RemoteObjectId) -> anyhow::Result<PathBuf>;
|
||||
|
||||
/// Lists all items the storage has right now.
|
||||
async fn list(&self) -> anyhow::Result<Vec<RemotePath>>;
|
||||
async fn list(&self) -> anyhow::Result<Vec<RemoteObjectId>>;
|
||||
|
||||
/// Lists all top level subdirectories for a given prefix
|
||||
/// Note: here we assume that if the prefix is passed it was obtained via remote_object_id
|
||||
/// which already takes into account any kind of global prefix (prefix_in_bucket for S3 or storage_root for LocalFS)
|
||||
/// so this method doesn't need to.
|
||||
async fn list_prefixes(&self, prefix: Option<&RemotePath>) -> anyhow::Result<Vec<RemotePath>>;
|
||||
async fn list_prefixes(
|
||||
&self,
|
||||
prefix: Option<&RemoteObjectId>,
|
||||
) -> anyhow::Result<Vec<RemoteObjectId>>;
|
||||
|
||||
/// Streams the local file contents into the remote storage entry.
|
||||
async fn upload(
|
||||
&self,
|
||||
data: Box<(dyn io::AsyncRead + Unpin + Send + Sync + 'static)>,
|
||||
from: Box<(dyn io::AsyncRead + Unpin + Send + Sync + 'static)>,
|
||||
// S3 PUT request requires the content length to be specified,
|
||||
// otherwise it starts to fail with the concurrent connection count increasing.
|
||||
data_size_bytes: usize,
|
||||
to: &RemotePath,
|
||||
from_size_bytes: usize,
|
||||
to: &RemoteObjectId,
|
||||
metadata: Option<StorageMetadata>,
|
||||
) -> anyhow::Result<()>;
|
||||
|
||||
/// Streams the remote storage entry contents into the buffered writer given, returns the filled writer.
|
||||
/// Returns the metadata, if any was stored with the file previously.
|
||||
async fn download(&self, from: &RemotePath) -> Result<Download, DownloadError>;
|
||||
async fn download(&self, from: &RemoteObjectId) -> Result<Download, DownloadError>;
|
||||
|
||||
/// Streams a given byte range of the remote storage entry contents into the buffered writer given, returns the filled writer.
|
||||
/// Returns the metadata, if any was stored with the file previously.
|
||||
async fn download_byte_range(
|
||||
&self,
|
||||
from: &RemotePath,
|
||||
from: &RemoteObjectId,
|
||||
start_inclusive: u64,
|
||||
end_exclusive: Option<u64>,
|
||||
) -> Result<Download, DownloadError>;
|
||||
|
||||
async fn delete(&self, path: &RemotePath) -> anyhow::Result<()>;
|
||||
async fn delete(&self, path: &RemoteObjectId) -> anyhow::Result<()>;
|
||||
|
||||
/// Downcast to LocalFs implementation. For tests.
|
||||
fn as_local(&self) -> Option<&LocalFs> {
|
||||
@@ -152,35 +178,34 @@ impl std::error::Error for DownloadError {}
|
||||
/// Every storage, currently supported.
|
||||
/// Serves as a simple way to pass around the [`RemoteStorage`] without dealing with generics.
|
||||
#[derive(Clone)]
|
||||
pub enum GenericRemoteStorage {
|
||||
LocalFs(LocalFs),
|
||||
AwsS3(Arc<S3Bucket>),
|
||||
}
|
||||
pub struct GenericRemoteStorage(Arc<dyn RemoteStorage>);
|
||||
|
||||
impl Deref for GenericRemoteStorage {
|
||||
type Target = dyn RemoteStorage;
|
||||
|
||||
fn deref(&self) -> &Self::Target {
|
||||
match self {
|
||||
GenericRemoteStorage::LocalFs(local_fs) => local_fs,
|
||||
GenericRemoteStorage::AwsS3(s3_bucket) => s3_bucket.as_ref(),
|
||||
}
|
||||
self.0.as_ref()
|
||||
}
|
||||
}
|
||||
|
||||
impl GenericRemoteStorage {
|
||||
pub fn new(storage: impl RemoteStorage) -> Self {
|
||||
Self(Arc::new(storage))
|
||||
}
|
||||
|
||||
pub fn from_config(
|
||||
working_directory: PathBuf,
|
||||
storage_config: &RemoteStorageConfig,
|
||||
) -> anyhow::Result<GenericRemoteStorage> {
|
||||
Ok(match &storage_config.storage {
|
||||
RemoteStorageKind::LocalFs(root) => {
|
||||
info!("Using fs root '{}' as a remote storage", root.display());
|
||||
GenericRemoteStorage::LocalFs(LocalFs::new(root.clone())?)
|
||||
GenericRemoteStorage::new(LocalFs::new(root.clone(), working_directory)?)
|
||||
}
|
||||
RemoteStorageKind::AwsS3(s3_config) => {
|
||||
info!("Using s3 bucket '{}' in region '{}' as a remote storage, prefix in bucket: '{:?}', bucket endpoint: '{:?}'",
|
||||
s3_config.bucket_name, s3_config.bucket_region, s3_config.prefix_in_bucket, s3_config.endpoint);
|
||||
GenericRemoteStorage::AwsS3(Arc::new(S3Bucket::new(s3_config)?))
|
||||
GenericRemoteStorage::new(S3Bucket::new(s3_config, working_directory)?)
|
||||
}
|
||||
})
|
||||
}
|
||||
@@ -194,12 +219,23 @@ impl GenericRemoteStorage {
|
||||
&self,
|
||||
from: Box<dyn tokio::io::AsyncRead + Unpin + Send + Sync + 'static>,
|
||||
from_size_bytes: usize,
|
||||
to: &RemotePath,
|
||||
from_path: &Path,
|
||||
) -> anyhow::Result<()> {
|
||||
self.upload(from, from_size_bytes, to, None)
|
||||
let target_storage_path = self.remote_object_id(from_path).with_context(|| {
|
||||
format!(
|
||||
"Failed to get the storage path for source local path '{}'",
|
||||
from_path.display()
|
||||
)
|
||||
})?;
|
||||
|
||||
self.upload(from, from_size_bytes, &target_storage_path, None)
|
||||
.await
|
||||
.with_context(|| {
|
||||
format!("Failed to upload data of length {from_size_bytes} to storage path {to:?}")
|
||||
format!(
|
||||
"Failed to upload from '{}' to storage path '{:?}'",
|
||||
from_path.display(),
|
||||
target_storage_path
|
||||
)
|
||||
})
|
||||
}
|
||||
|
||||
@@ -208,11 +244,24 @@ impl GenericRemoteStorage {
|
||||
pub async fn download_storage_object(
|
||||
&self,
|
||||
byte_range: Option<(u64, Option<u64>)>,
|
||||
from: &RemotePath,
|
||||
to_path: &Path,
|
||||
) -> Result<Download, DownloadError> {
|
||||
let remote_object_path = self
|
||||
.remote_object_id(to_path)
|
||||
.with_context(|| {
|
||||
format!(
|
||||
"Failed to get the storage path for target local path '{}'",
|
||||
to_path.display()
|
||||
)
|
||||
})
|
||||
.map_err(DownloadError::BadInput)?;
|
||||
|
||||
match byte_range {
|
||||
Some((start, end)) => self.download_byte_range(from, start, end).await,
|
||||
None => self.download(from).await,
|
||||
Some((start, end)) => {
|
||||
self.download_byte_range(&remote_object_path, start, end)
|
||||
.await
|
||||
}
|
||||
None => self.download(&remote_object_path).await,
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -222,6 +271,23 @@ impl GenericRemoteStorage {
|
||||
#[derive(Debug, Clone, PartialEq, Eq)]
|
||||
pub struct StorageMetadata(HashMap<String, String>);
|
||||
|
||||
fn strip_path_prefix<'a>(prefix: &'a Path, path: &'a Path) -> anyhow::Result<&'a Path> {
|
||||
if prefix == path {
|
||||
anyhow::bail!(
|
||||
"Prefix and the path are equal, cannot strip: '{}'",
|
||||
prefix.display()
|
||||
)
|
||||
} else {
|
||||
path.strip_prefix(prefix).with_context(|| {
|
||||
format!(
|
||||
"Path '{}' is not prefixed with '{}'",
|
||||
path.display(),
|
||||
prefix.display(),
|
||||
)
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
/// External backup storage configuration, enough for creating a client for that storage.
|
||||
#[derive(Debug, Clone, PartialEq, Eq)]
|
||||
pub struct RemoteStorageConfig {
|
||||
@@ -365,24 +431,21 @@ mod tests {
|
||||
use super::*;
|
||||
|
||||
#[test]
|
||||
fn test_object_name() {
|
||||
let k = RemotePath::new(Path::new("a/b/c")).unwrap();
|
||||
fn object_name() {
|
||||
let k = RemoteObjectId("a/b/c".to_owned());
|
||||
assert_eq!(k.object_name(), Some("c"));
|
||||
|
||||
let k = RemotePath::new(Path::new("a/b/c/")).unwrap();
|
||||
let k = RemoteObjectId("a/b/c/".to_owned());
|
||||
assert_eq!(k.object_name(), Some("c"));
|
||||
|
||||
let k = RemotePath::new(Path::new("a/")).unwrap();
|
||||
let k = RemoteObjectId("a/".to_owned());
|
||||
assert_eq!(k.object_name(), Some("a"));
|
||||
|
||||
// XXX is it impossible to have an empty key?
|
||||
let k = RemotePath::new(Path::new("")).unwrap();
|
||||
let k = RemoteObjectId("".to_owned());
|
||||
assert_eq!(k.object_name(), None);
|
||||
|
||||
let k = RemoteObjectId("/".to_owned());
|
||||
assert_eq!(k.object_name(), None);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn rempte_path_cannot_be_created_from_absolute_ones() {
|
||||
let err = RemotePath::new(Path::new("/")).expect_err("Should fail on absolute paths");
|
||||
assert_eq!(err.to_string(), "Path \"/\" is not relative");
|
||||
}
|
||||
}
|
||||
|
||||
@@ -5,7 +5,6 @@
|
||||
//! volume is mounted to the local FS.
|
||||
|
||||
use std::{
|
||||
borrow::Cow,
|
||||
future::Future,
|
||||
path::{Path, PathBuf},
|
||||
pin::Pin,
|
||||
@@ -19,33 +18,60 @@ use tokio::{
|
||||
use tracing::*;
|
||||
use utils::crashsafe::path_with_suffix_extension;
|
||||
|
||||
use crate::{Download, DownloadError, RemotePath};
|
||||
use crate::{Download, DownloadError, RemoteObjectId};
|
||||
|
||||
use super::{RemoteStorage, StorageMetadata};
|
||||
use super::{strip_path_prefix, RemoteStorage, StorageMetadata};
|
||||
|
||||
const LOCAL_FS_TEMP_FILE_SUFFIX: &str = "___temp";
|
||||
|
||||
#[derive(Debug, Clone)]
|
||||
/// Convert a Path in the remote storage into a RemoteObjectId
|
||||
fn remote_object_id_from_path(path: &Path) -> anyhow::Result<RemoteObjectId> {
|
||||
Ok(RemoteObjectId(
|
||||
path.to_str()
|
||||
.ok_or_else(|| anyhow::anyhow!("unexpected characters found in path"))?
|
||||
.to_string(),
|
||||
))
|
||||
}
|
||||
|
||||
pub struct LocalFs {
|
||||
working_directory: PathBuf,
|
||||
storage_root: PathBuf,
|
||||
}
|
||||
|
||||
impl LocalFs {
|
||||
/// Attempts to create local FS storage, along with its root directory.
|
||||
/// Storage root will be created (if does not exist) and transformed into an absolute path (if passed as relative).
|
||||
pub fn new(mut storage_root: PathBuf) -> anyhow::Result<Self> {
|
||||
if !storage_root.exists() {
|
||||
std::fs::create_dir_all(&storage_root).with_context(|| {
|
||||
format!("Failed to create all directories in the given root path {storage_root:?}")
|
||||
})?;
|
||||
}
|
||||
if !storage_root.is_absolute() {
|
||||
storage_root = storage_root.canonicalize().with_context(|| {
|
||||
format!("Failed to represent path {storage_root:?} as an absolute path")
|
||||
pub fn new(root: PathBuf, working_directory: PathBuf) -> anyhow::Result<Self> {
|
||||
if !root.exists() {
|
||||
std::fs::create_dir_all(&root).with_context(|| {
|
||||
format!(
|
||||
"Failed to create all directories in the given root path '{}'",
|
||||
root.display(),
|
||||
)
|
||||
})?;
|
||||
}
|
||||
Ok(Self {
|
||||
working_directory,
|
||||
storage_root: root,
|
||||
})
|
||||
}
|
||||
|
||||
Ok(Self { storage_root })
|
||||
///
|
||||
/// Get the absolute path in the local filesystem to given remote object.
|
||||
///
|
||||
/// This is public so that it can be used in tests. Should not be used elsewhere.
|
||||
///
|
||||
pub fn resolve_in_storage(&self, remote_object_id: &RemoteObjectId) -> anyhow::Result<PathBuf> {
|
||||
let path = PathBuf::from(&remote_object_id.0);
|
||||
if path.is_relative() {
|
||||
Ok(self.storage_root.join(path))
|
||||
} else if path.starts_with(&self.storage_root) {
|
||||
Ok(path)
|
||||
} else {
|
||||
bail!(
|
||||
"Path '{}' does not belong to the current storage",
|
||||
path.display()
|
||||
)
|
||||
}
|
||||
}
|
||||
|
||||
async fn read_storage_metadata(
|
||||
@@ -77,48 +103,45 @@ impl LocalFs {
|
||||
|
||||
#[async_trait::async_trait]
|
||||
impl RemoteStorage for LocalFs {
|
||||
async fn list(&self) -> anyhow::Result<Vec<RemotePath>> {
|
||||
Ok(get_all_files(&self.storage_root, true)
|
||||
.await?
|
||||
.into_iter()
|
||||
.map(|path| {
|
||||
path.strip_prefix(&self.storage_root)
|
||||
.context("Failed to strip storage root prefix")
|
||||
.and_then(RemotePath::new)
|
||||
.expect(
|
||||
"We list files for storage root, hence should be able to remote the prefix",
|
||||
)
|
||||
})
|
||||
.collect())
|
||||
/// Convert a "local" path into a "remote path"
|
||||
fn remote_object_id(&self, local_path: &Path) -> anyhow::Result<RemoteObjectId> {
|
||||
let path = self.storage_root.join(
|
||||
strip_path_prefix(&self.working_directory, local_path)
|
||||
.context("local path does not belong to this storage")?,
|
||||
);
|
||||
remote_object_id_from_path(&path)
|
||||
}
|
||||
|
||||
async fn list_prefixes(&self, prefix: Option<&RemotePath>) -> anyhow::Result<Vec<RemotePath>> {
|
||||
fn local_path(&self, remote_object_id: &RemoteObjectId) -> anyhow::Result<PathBuf> {
|
||||
let storage_path = PathBuf::from(&remote_object_id.0);
|
||||
let relative_path = strip_path_prefix(&self.storage_root, &storage_path)
|
||||
.context("local path does not belong to this storage")?;
|
||||
Ok(self.working_directory.join(relative_path))
|
||||
}
|
||||
|
||||
async fn list(&self) -> anyhow::Result<Vec<RemoteObjectId>> {
|
||||
get_all_files(&self.storage_root, true).await
|
||||
}
|
||||
|
||||
async fn list_prefixes(
|
||||
&self,
|
||||
prefix: Option<&RemoteObjectId>,
|
||||
) -> anyhow::Result<Vec<RemoteObjectId>> {
|
||||
let path = match prefix {
|
||||
Some(prefix) => Cow::Owned(prefix.with_base(&self.storage_root)),
|
||||
None => Cow::Borrowed(&self.storage_root),
|
||||
Some(prefix) => Path::new(&prefix.0),
|
||||
None => &self.storage_root,
|
||||
};
|
||||
Ok(get_all_files(path.as_ref(), false)
|
||||
.await?
|
||||
.into_iter()
|
||||
.map(|path| {
|
||||
path.strip_prefix(&self.storage_root)
|
||||
.context("Failed to strip preifix")
|
||||
.and_then(RemotePath::new)
|
||||
.expect(
|
||||
"We list files for storage root, hence should be able to remote the prefix",
|
||||
)
|
||||
})
|
||||
.collect())
|
||||
get_all_files(path, false).await
|
||||
}
|
||||
|
||||
async fn upload(
|
||||
&self,
|
||||
data: Box<(dyn io::AsyncRead + Unpin + Send + Sync + 'static)>,
|
||||
data_size_bytes: usize,
|
||||
to: &RemotePath,
|
||||
from: Box<(dyn io::AsyncRead + Unpin + Send + Sync + 'static)>,
|
||||
from_size_bytes: usize,
|
||||
to: &RemoteObjectId,
|
||||
metadata: Option<StorageMetadata>,
|
||||
) -> anyhow::Result<()> {
|
||||
let target_file_path = to.with_base(&self.storage_root);
|
||||
let target_file_path = self.resolve_in_storage(to)?;
|
||||
create_target_directory(&target_file_path).await?;
|
||||
// We need this dance with sort of durable rename (without fsyncs)
|
||||
// to prevent partial uploads. This was really hit when pageserver shutdown
|
||||
@@ -139,8 +162,8 @@ impl RemoteStorage for LocalFs {
|
||||
})?,
|
||||
);
|
||||
|
||||
let from_size_bytes = data_size_bytes as u64;
|
||||
let mut buffer_to_read = data.take(from_size_bytes);
|
||||
let from_size_bytes = from_size_bytes as u64;
|
||||
let mut buffer_to_read = from.take(from_size_bytes);
|
||||
|
||||
let bytes_read = io::copy(&mut buffer_to_read, &mut destination)
|
||||
.await
|
||||
@@ -197,22 +220,27 @@ impl RemoteStorage for LocalFs {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
async fn download(&self, from: &RemotePath) -> Result<Download, DownloadError> {
|
||||
let target_path = from.with_base(&self.storage_root);
|
||||
if file_exists(&target_path).map_err(DownloadError::BadInput)? {
|
||||
async fn download(&self, from: &RemoteObjectId) -> Result<Download, DownloadError> {
|
||||
let file_path = self
|
||||
.resolve_in_storage(from)
|
||||
.map_err(DownloadError::BadInput)?;
|
||||
if file_exists(&file_path).map_err(DownloadError::BadInput)? {
|
||||
let source = io::BufReader::new(
|
||||
fs::OpenOptions::new()
|
||||
.read(true)
|
||||
.open(&target_path)
|
||||
.open(&file_path)
|
||||
.await
|
||||
.with_context(|| {
|
||||
format!("Failed to open source file {target_path:?} to use in the download")
|
||||
format!(
|
||||
"Failed to open source file '{}' to use in the download",
|
||||
file_path.display()
|
||||
)
|
||||
})
|
||||
.map_err(DownloadError::Other)?,
|
||||
);
|
||||
|
||||
let metadata = self
|
||||
.read_storage_metadata(&target_path)
|
||||
.read_storage_metadata(&file_path)
|
||||
.await
|
||||
.map_err(DownloadError::Other)?;
|
||||
Ok(Download {
|
||||
@@ -226,7 +254,7 @@ impl RemoteStorage for LocalFs {
|
||||
|
||||
async fn download_byte_range(
|
||||
&self,
|
||||
from: &RemotePath,
|
||||
from: &RemoteObjectId,
|
||||
start_inclusive: u64,
|
||||
end_exclusive: Option<u64>,
|
||||
) -> Result<Download, DownloadError> {
|
||||
@@ -238,15 +266,20 @@ impl RemoteStorage for LocalFs {
|
||||
return Err(DownloadError::Other(anyhow::anyhow!("Invalid range, start ({start_inclusive}) and end_exclusive ({end_exclusive:?}) difference is zero bytes")));
|
||||
}
|
||||
}
|
||||
let target_path = from.with_base(&self.storage_root);
|
||||
if file_exists(&target_path).map_err(DownloadError::BadInput)? {
|
||||
let file_path = self
|
||||
.resolve_in_storage(from)
|
||||
.map_err(DownloadError::BadInput)?;
|
||||
if file_exists(&file_path).map_err(DownloadError::BadInput)? {
|
||||
let mut source = io::BufReader::new(
|
||||
fs::OpenOptions::new()
|
||||
.read(true)
|
||||
.open(&target_path)
|
||||
.open(&file_path)
|
||||
.await
|
||||
.with_context(|| {
|
||||
format!("Failed to open source file {target_path:?} to use in the download")
|
||||
format!(
|
||||
"Failed to open source file '{}' to use in the download",
|
||||
file_path.display()
|
||||
)
|
||||
})
|
||||
.map_err(DownloadError::Other)?,
|
||||
);
|
||||
@@ -256,7 +289,7 @@ impl RemoteStorage for LocalFs {
|
||||
.context("Failed to seek to the range start in a local storage file")
|
||||
.map_err(DownloadError::Other)?;
|
||||
let metadata = self
|
||||
.read_storage_metadata(&target_path)
|
||||
.read_storage_metadata(&file_path)
|
||||
.await
|
||||
.map_err(DownloadError::Other)?;
|
||||
|
||||
@@ -275,12 +308,15 @@ impl RemoteStorage for LocalFs {
|
||||
}
|
||||
}
|
||||
|
||||
async fn delete(&self, path: &RemotePath) -> anyhow::Result<()> {
|
||||
let file_path = path.with_base(&self.storage_root);
|
||||
async fn delete(&self, path: &RemoteObjectId) -> anyhow::Result<()> {
|
||||
let file_path = self.resolve_in_storage(path)?;
|
||||
if file_path.exists() && file_path.is_file() {
|
||||
Ok(fs::remove_file(file_path).await?)
|
||||
} else {
|
||||
bail!("File {file_path:?} either does not exist or is not a file")
|
||||
bail!(
|
||||
"File '{}' either does not exist or is not a file",
|
||||
file_path.display()
|
||||
)
|
||||
}
|
||||
}
|
||||
|
||||
@@ -296,7 +332,7 @@ fn storage_metadata_path(original_path: &Path) -> PathBuf {
|
||||
fn get_all_files<'a, P>(
|
||||
directory_path: P,
|
||||
recursive: bool,
|
||||
) -> Pin<Box<dyn Future<Output = anyhow::Result<Vec<PathBuf>>> + Send + Sync + 'a>>
|
||||
) -> Pin<Box<dyn Future<Output = anyhow::Result<Vec<RemoteObjectId>>> + Send + Sync + 'a>>
|
||||
where
|
||||
P: AsRef<Path> + Send + Sync + 'a,
|
||||
{
|
||||
@@ -310,20 +346,20 @@ where
|
||||
let file_type = dir_entry.file_type().await?;
|
||||
let entry_path = dir_entry.path();
|
||||
if file_type.is_symlink() {
|
||||
debug!("{entry_path:?} us a symlink, skipping")
|
||||
debug!("{:?} us a symlink, skipping", entry_path)
|
||||
} else if file_type.is_dir() {
|
||||
if recursive {
|
||||
paths.extend(get_all_files(&entry_path, true).await?.into_iter())
|
||||
} else {
|
||||
paths.push(entry_path)
|
||||
paths.push(remote_object_id_from_path(&dir_entry.path())?)
|
||||
}
|
||||
} else {
|
||||
paths.push(entry_path);
|
||||
paths.push(remote_object_id_from_path(&dir_entry.path())?);
|
||||
}
|
||||
}
|
||||
Ok(paths)
|
||||
} else {
|
||||
bail!("Path {directory_path:?} is not a directory")
|
||||
bail!("Path '{}' is not a directory", directory_path.display())
|
||||
}
|
||||
} else {
|
||||
Ok(Vec::new())
|
||||
@@ -358,6 +394,173 @@ fn file_exists(file_path: &Path) -> anyhow::Result<bool> {
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod pure_tests {
|
||||
use tempfile::tempdir;
|
||||
|
||||
use super::*;
|
||||
|
||||
#[test]
|
||||
fn storage_path_positive() -> anyhow::Result<()> {
|
||||
let workdir = tempdir()?.path().to_owned();
|
||||
|
||||
let storage_root = PathBuf::from("somewhere").join("else");
|
||||
let storage = LocalFs {
|
||||
working_directory: workdir.clone(),
|
||||
storage_root: storage_root.clone(),
|
||||
};
|
||||
|
||||
let local_path = workdir
|
||||
.join("timelines")
|
||||
.join("some_timeline")
|
||||
.join("file_name");
|
||||
let expected_path = storage_root.join(local_path.strip_prefix(&workdir)?);
|
||||
|
||||
let actual_path = PathBuf::from(
|
||||
storage
|
||||
.remote_object_id(&local_path)
|
||||
.expect("Matching path should map to storage path normally")
|
||||
.0,
|
||||
);
|
||||
assert_eq!(
|
||||
expected_path,
|
||||
actual_path,
|
||||
"File paths from workdir should be stored in local fs storage with the same path they have relative to the workdir"
|
||||
);
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn storage_path_negatives() -> anyhow::Result<()> {
|
||||
#[track_caller]
|
||||
fn storage_path_error(storage: &LocalFs, mismatching_path: &Path) -> String {
|
||||
match storage.remote_object_id(mismatching_path) {
|
||||
Ok(wrong_path) => panic!(
|
||||
"Expected path '{}' to error, but got storage path: {:?}",
|
||||
mismatching_path.display(),
|
||||
wrong_path,
|
||||
),
|
||||
Err(e) => format!("{:?}", e),
|
||||
}
|
||||
}
|
||||
|
||||
let workdir = tempdir()?.path().to_owned();
|
||||
let storage_root = PathBuf::from("somewhere").join("else");
|
||||
let storage = LocalFs {
|
||||
working_directory: workdir.clone(),
|
||||
storage_root,
|
||||
};
|
||||
|
||||
let error_string = storage_path_error(&storage, &workdir);
|
||||
assert!(error_string.contains("does not belong to this storage"));
|
||||
assert!(error_string.contains(workdir.to_str().unwrap()));
|
||||
|
||||
let mismatching_path_str = "/something/else";
|
||||
let error_message = storage_path_error(&storage, Path::new(mismatching_path_str));
|
||||
assert!(
|
||||
error_message.contains(mismatching_path_str),
|
||||
"Error should mention wrong path"
|
||||
);
|
||||
assert!(
|
||||
error_message.contains(workdir.to_str().unwrap()),
|
||||
"Error should mention server workdir"
|
||||
);
|
||||
assert!(error_message.contains("does not belong to this storage"));
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn local_path_positive() -> anyhow::Result<()> {
|
||||
let workdir = tempdir()?.path().to_owned();
|
||||
let storage_root = PathBuf::from("somewhere").join("else");
|
||||
let storage = LocalFs {
|
||||
working_directory: workdir.clone(),
|
||||
storage_root: storage_root.clone(),
|
||||
};
|
||||
|
||||
let name = "not a metadata";
|
||||
let local_path = workdir.join("timelines").join("some_timeline").join(name);
|
||||
assert_eq!(
|
||||
local_path,
|
||||
storage
|
||||
.local_path(&remote_object_id_from_path(
|
||||
&storage_root.join(local_path.strip_prefix(&workdir)?)
|
||||
)?)
|
||||
.expect("For a valid input, valid local path should be parsed"),
|
||||
"Should be able to parse metadata out of the correctly named remote delta file"
|
||||
);
|
||||
|
||||
let local_metadata_path = workdir
|
||||
.join("timelines")
|
||||
.join("some_timeline")
|
||||
.join("metadata");
|
||||
let remote_metadata_path = storage.remote_object_id(&local_metadata_path)?;
|
||||
assert_eq!(
|
||||
local_metadata_path,
|
||||
storage
|
||||
.local_path(&remote_metadata_path)
|
||||
.expect("For a valid input, valid local path should be parsed"),
|
||||
"Should be able to parse metadata out of the correctly named remote metadata file"
|
||||
);
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn local_path_negatives() -> anyhow::Result<()> {
|
||||
#[track_caller]
|
||||
fn local_path_error(storage: &LocalFs, storage_path: &RemoteObjectId) -> String {
|
||||
match storage.local_path(storage_path) {
|
||||
Ok(wrong_path) => panic!(
|
||||
"Expected local path input {:?} to cause an error, but got file path: {:?}",
|
||||
storage_path, wrong_path,
|
||||
),
|
||||
Err(e) => format!("{:?}", e),
|
||||
}
|
||||
}
|
||||
|
||||
let storage_root = PathBuf::from("somewhere").join("else");
|
||||
let storage = LocalFs {
|
||||
working_directory: tempdir()?.path().to_owned(),
|
||||
storage_root,
|
||||
};
|
||||
|
||||
let totally_wrong_path = "wrong_wrong_wrong";
|
||||
let error_message =
|
||||
local_path_error(&storage, &RemoteObjectId(totally_wrong_path.to_string()));
|
||||
assert!(error_message.contains(totally_wrong_path));
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn download_destination_matches_original_path() -> anyhow::Result<()> {
|
||||
let workdir = tempdir()?.path().to_owned();
|
||||
let original_path = workdir
|
||||
.join("timelines")
|
||||
.join("some_timeline")
|
||||
.join("some name");
|
||||
|
||||
let storage_root = PathBuf::from("somewhere").join("else");
|
||||
let dummy_storage = LocalFs {
|
||||
working_directory: workdir,
|
||||
storage_root,
|
||||
};
|
||||
|
||||
let storage_path = dummy_storage.remote_object_id(&original_path)?;
|
||||
let download_destination = dummy_storage.local_path(&storage_path)?;
|
||||
|
||||
assert_eq!(
|
||||
original_path, download_destination,
|
||||
"'original path -> storage path -> matching fs path' transformation should produce the same path as the input one for the correct path"
|
||||
);
|
||||
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod fs_tests {
|
||||
use super::*;
|
||||
@@ -369,7 +572,7 @@ mod fs_tests {
|
||||
storage: &LocalFs,
|
||||
#[allow(clippy::ptr_arg)]
|
||||
// have to use &PathBuf due to `storage.local_path` parameter requirements
|
||||
remote_storage_path: &RemotePath,
|
||||
remote_storage_path: &RemoteObjectId,
|
||||
expected_metadata: Option<&StorageMetadata>,
|
||||
) -> anyhow::Result<String> {
|
||||
let mut download = storage
|
||||
@@ -392,16 +595,41 @@ mod fs_tests {
|
||||
|
||||
#[tokio::test]
|
||||
async fn upload_file() -> anyhow::Result<()> {
|
||||
let workdir = tempdir()?.path().to_owned();
|
||||
let storage = create_storage()?;
|
||||
|
||||
let target_path_1 = upload_dummy_file(&storage, "upload_1", None).await?;
|
||||
let (file, size) = create_file_for_upload(
|
||||
&storage.working_directory.join("whatever"),
|
||||
"whatever_contents",
|
||||
)
|
||||
.await?;
|
||||
let target_path = "/somewhere/else";
|
||||
match storage
|
||||
.upload(
|
||||
Box::new(file),
|
||||
size,
|
||||
&RemoteObjectId(target_path.to_string()),
|
||||
None,
|
||||
)
|
||||
.await
|
||||
{
|
||||
Ok(()) => panic!("Should not allow storing files with wrong target path"),
|
||||
Err(e) => {
|
||||
let message = format!("{:?}", e);
|
||||
assert!(message.contains(target_path));
|
||||
assert!(message.contains("does not belong to the current storage"));
|
||||
}
|
||||
}
|
||||
assert!(storage.list().await?.is_empty());
|
||||
|
||||
let target_path_1 = upload_dummy_file(&workdir, &storage, "upload_1", None).await?;
|
||||
assert_eq!(
|
||||
storage.list().await?,
|
||||
vec![target_path_1.clone()],
|
||||
"Should list a single file after first upload"
|
||||
);
|
||||
|
||||
let target_path_2 = upload_dummy_file(&storage, "upload_2", None).await?;
|
||||
let target_path_2 = upload_dummy_file(&workdir, &storage, "upload_2", None).await?;
|
||||
assert_eq!(
|
||||
list_files_sorted(&storage).await?,
|
||||
vec![target_path_1.clone(), target_path_2.clone()],
|
||||
@@ -415,7 +643,7 @@ mod fs_tests {
|
||||
async fn upload_file_negatives() -> anyhow::Result<()> {
|
||||
let storage = create_storage()?;
|
||||
|
||||
let id = RemotePath::new(Path::new("dummy"))?;
|
||||
let id = storage.remote_object_id(&storage.working_directory.join("dummy"))?;
|
||||
let content = std::io::Cursor::new(b"12345");
|
||||
|
||||
// Check that you get an error if the size parameter doesn't match the actual
|
||||
@@ -440,14 +668,16 @@ mod fs_tests {
|
||||
}
|
||||
|
||||
fn create_storage() -> anyhow::Result<LocalFs> {
|
||||
LocalFs::new(tempdir()?.path().to_owned())
|
||||
LocalFs::new(tempdir()?.path().to_owned(), tempdir()?.path().to_owned())
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn download_file() -> anyhow::Result<()> {
|
||||
let workdir = tempdir()?.path().to_owned();
|
||||
|
||||
let storage = create_storage()?;
|
||||
let upload_name = "upload_1";
|
||||
let upload_target = upload_dummy_file(&storage, upload_name, None).await?;
|
||||
let upload_target = upload_dummy_file(&workdir, &storage, upload_name, None).await?;
|
||||
|
||||
let contents = read_and_assert_remote_file_contents(&storage, &upload_target, None).await?;
|
||||
assert_eq!(
|
||||
@@ -457,7 +687,7 @@ mod fs_tests {
|
||||
);
|
||||
|
||||
let non_existing_path = "somewhere/else";
|
||||
match storage.download(&RemotePath::new(Path::new(non_existing_path))?).await {
|
||||
match storage.download(&RemoteObjectId(non_existing_path.to_string())).await {
|
||||
Err(DownloadError::NotFound) => {} // Should get NotFound for non existing keys
|
||||
other => panic!("Should get a NotFound error when downloading non-existing storage files, but got: {other:?}"),
|
||||
}
|
||||
@@ -466,9 +696,11 @@ mod fs_tests {
|
||||
|
||||
#[tokio::test]
|
||||
async fn download_file_range_positive() -> anyhow::Result<()> {
|
||||
let workdir = tempdir()?.path().to_owned();
|
||||
|
||||
let storage = create_storage()?;
|
||||
let upload_name = "upload_1";
|
||||
let upload_target = upload_dummy_file(&storage, upload_name, None).await?;
|
||||
let upload_target = upload_dummy_file(&workdir, &storage, upload_name, None).await?;
|
||||
|
||||
let full_range_download_contents =
|
||||
read_and_assert_remote_file_contents(&storage, &upload_target, None).await?;
|
||||
@@ -534,9 +766,11 @@ mod fs_tests {
|
||||
|
||||
#[tokio::test]
|
||||
async fn download_file_range_negative() -> anyhow::Result<()> {
|
||||
let workdir = tempdir()?.path().to_owned();
|
||||
|
||||
let storage = create_storage()?;
|
||||
let upload_name = "upload_1";
|
||||
let upload_target = upload_dummy_file(&storage, upload_name, None).await?;
|
||||
let upload_target = upload_dummy_file(&workdir, &storage, upload_name, None).await?;
|
||||
|
||||
let start = 1_000_000_000;
|
||||
let end = start + 1;
|
||||
@@ -578,9 +812,11 @@ mod fs_tests {
|
||||
|
||||
#[tokio::test]
|
||||
async fn delete_file() -> anyhow::Result<()> {
|
||||
let workdir = tempdir()?.path().to_owned();
|
||||
|
||||
let storage = create_storage()?;
|
||||
let upload_name = "upload_1";
|
||||
let upload_target = upload_dummy_file(&storage, upload_name, None).await?;
|
||||
let upload_target = upload_dummy_file(&workdir, &storage, upload_name, None).await?;
|
||||
|
||||
storage.delete(&upload_target).await?;
|
||||
assert!(storage.list().await?.is_empty());
|
||||
@@ -590,8 +826,7 @@ mod fs_tests {
|
||||
Err(e) => {
|
||||
let error_string = e.to_string();
|
||||
assert!(error_string.contains("does not exist"));
|
||||
let expected_path = upload_target.with_base(&storage.storage_root);
|
||||
assert!(error_string.contains(expected_path.to_str().unwrap()));
|
||||
assert!(error_string.contains(&upload_target.0));
|
||||
}
|
||||
}
|
||||
Ok(())
|
||||
@@ -599,6 +834,8 @@ mod fs_tests {
|
||||
|
||||
#[tokio::test]
|
||||
async fn file_with_metadata() -> anyhow::Result<()> {
|
||||
let workdir = tempdir()?.path().to_owned();
|
||||
|
||||
let storage = create_storage()?;
|
||||
let upload_name = "upload_1";
|
||||
let metadata = StorageMetadata(HashMap::from([
|
||||
@@ -606,7 +843,7 @@ mod fs_tests {
|
||||
("two".to_string(), "2".to_string()),
|
||||
]));
|
||||
let upload_target =
|
||||
upload_dummy_file(&storage, upload_name, Some(metadata.clone())).await?;
|
||||
upload_dummy_file(&workdir, &storage, upload_name, Some(metadata.clone())).await?;
|
||||
|
||||
let full_range_download_contents =
|
||||
read_and_assert_remote_file_contents(&storage, &upload_target, Some(&metadata)).await?;
|
||||
@@ -646,32 +883,23 @@ mod fs_tests {
|
||||
}
|
||||
|
||||
async fn upload_dummy_file(
|
||||
workdir: &Path,
|
||||
storage: &LocalFs,
|
||||
name: &str,
|
||||
metadata: Option<StorageMetadata>,
|
||||
) -> anyhow::Result<RemotePath> {
|
||||
let from_path = storage
|
||||
.storage_root
|
||||
.join("timelines")
|
||||
.join("some_timeline")
|
||||
.join(name);
|
||||
) -> anyhow::Result<RemoteObjectId> {
|
||||
let timeline_path = workdir.join("timelines").join("some_timeline");
|
||||
let relative_timeline_path = timeline_path.strip_prefix(&workdir)?;
|
||||
let storage_path = storage.storage_root.join(relative_timeline_path).join(name);
|
||||
let remote_object_id = RemoteObjectId(storage_path.to_str().unwrap().to_string());
|
||||
|
||||
let from_path = storage.working_directory.join(name);
|
||||
let (file, size) = create_file_for_upload(&from_path, &dummy_contents(name)).await?;
|
||||
|
||||
let relative_path = from_path
|
||||
.strip_prefix(&storage.storage_root)
|
||||
.context("Failed to strip storage root prefix")
|
||||
.and_then(RemotePath::new)
|
||||
.with_context(|| {
|
||||
format!(
|
||||
"Failed to resolve remote part of path {:?} for base {:?}",
|
||||
from_path, storage.storage_root
|
||||
)
|
||||
})?;
|
||||
|
||||
storage
|
||||
.upload(Box::new(file), size, &relative_path, metadata)
|
||||
.upload(Box::new(file), size, &remote_object_id, metadata)
|
||||
.await?;
|
||||
Ok(relative_path)
|
||||
remote_object_id_from_path(&storage_path)
|
||||
}
|
||||
|
||||
async fn create_file_for_upload(
|
||||
@@ -696,7 +924,7 @@ mod fs_tests {
|
||||
format!("contents for {name}")
|
||||
}
|
||||
|
||||
async fn list_files_sorted(storage: &LocalFs) -> anyhow::Result<Vec<RemotePath>> {
|
||||
async fn list_files_sorted(storage: &LocalFs) -> anyhow::Result<Vec<RemoteObjectId>> {
|
||||
let mut files = storage.list().await?;
|
||||
files.sort_by(|a, b| a.0.cmp(&b.0));
|
||||
Ok(files)
|
||||
|
||||
@@ -4,34 +4,27 @@
|
||||
//! allowing multiple api users to independently work with the same S3 bucket, if
|
||||
//! their bucket prefixes are both specified and different.
|
||||
|
||||
use std::env::var;
|
||||
use std::sync::Arc;
|
||||
use std::time::Duration;
|
||||
use std::path::{Path, PathBuf};
|
||||
|
||||
use anyhow::Context;
|
||||
use aws_config::{
|
||||
environment::credentials::EnvironmentVariableCredentialsProvider, imds,
|
||||
imds::credentials::ImdsCredentialsProvider, meta::credentials::provide_credentials_fn,
|
||||
use rusoto_core::{
|
||||
credential::{InstanceMetadataProvider, StaticProvider},
|
||||
HttpClient, Region, RusotoError,
|
||||
};
|
||||
use aws_sdk_s3::{
|
||||
config::Config,
|
||||
error::{GetObjectError, GetObjectErrorKind},
|
||||
types::{ByteStream, SdkError},
|
||||
Client, Endpoint, Region,
|
||||
use rusoto_s3::{
|
||||
DeleteObjectRequest, GetObjectError, GetObjectRequest, ListObjectsV2Request, PutObjectRequest,
|
||||
S3Client, StreamingBody, S3,
|
||||
};
|
||||
use aws_smithy_http::body::SdkBody;
|
||||
use aws_types::credentials::{CredentialsError, ProvideCredentials};
|
||||
use hyper::Body;
|
||||
use tokio::{io, sync::Semaphore};
|
||||
use tokio_util::io::ReaderStream;
|
||||
use tracing::debug;
|
||||
|
||||
use super::StorageMetadata;
|
||||
use crate::{
|
||||
Download, DownloadError, RemotePath, RemoteStorage, S3Config, REMOTE_STORAGE_PREFIX_SEPARATOR,
|
||||
strip_path_prefix, Download, DownloadError, RemoteObjectId, RemoteStorage, S3Config,
|
||||
REMOTE_STORAGE_PREFIX_SEPARATOR,
|
||||
};
|
||||
|
||||
const DEFAULT_IMDS_TIMEOUT: Duration = Duration::from_secs(10);
|
||||
use super::StorageMetadata;
|
||||
|
||||
pub(super) mod metrics {
|
||||
use metrics::{register_int_counter_vec, IntCounterVec};
|
||||
@@ -98,9 +91,32 @@ pub(super) mod metrics {
|
||||
}
|
||||
}
|
||||
|
||||
fn download_destination(
|
||||
id: &RemoteObjectId,
|
||||
workdir: &Path,
|
||||
prefix_to_strip: Option<&str>,
|
||||
) -> PathBuf {
|
||||
let path_without_prefix = match prefix_to_strip {
|
||||
Some(prefix) => id.0.strip_prefix(prefix).unwrap_or_else(|| {
|
||||
panic!(
|
||||
"Could not strip prefix '{}' from S3 object key '{}'",
|
||||
prefix, id.0
|
||||
)
|
||||
}),
|
||||
None => &id.0,
|
||||
};
|
||||
|
||||
workdir.join(
|
||||
path_without_prefix
|
||||
.split(REMOTE_STORAGE_PREFIX_SEPARATOR)
|
||||
.collect::<PathBuf>(),
|
||||
)
|
||||
}
|
||||
|
||||
/// AWS S3 storage.
|
||||
pub struct S3Bucket {
|
||||
client: Client,
|
||||
workdir: PathBuf,
|
||||
client: S3Client,
|
||||
bucket_name: String,
|
||||
prefix_in_bucket: Option<String>,
|
||||
// Every request to S3 can be throttled or cancelled, if a certain number of requests per second is exceeded.
|
||||
@@ -109,53 +125,50 @@ pub struct S3Bucket {
|
||||
concurrency_limiter: Semaphore,
|
||||
}
|
||||
|
||||
#[derive(Default)]
|
||||
struct GetObjectRequest {
|
||||
bucket: String,
|
||||
key: String,
|
||||
range: Option<String>,
|
||||
}
|
||||
impl S3Bucket {
|
||||
/// Creates the S3 storage, errors if incorrect AWS S3 configuration provided.
|
||||
pub fn new(aws_config: &S3Config) -> anyhow::Result<Self> {
|
||||
pub fn new(aws_config: &S3Config, workdir: PathBuf) -> anyhow::Result<Self> {
|
||||
debug!(
|
||||
"Creating s3 remote storage for S3 bucket {}",
|
||||
aws_config.bucket_name
|
||||
);
|
||||
let mut config_builder = Config::builder()
|
||||
.region(Region::new(aws_config.bucket_region.clone()))
|
||||
.credentials_provider(provide_credentials_fn(|| async {
|
||||
match var("AWS_ACCESS_KEY_ID").is_ok() && var("AWS_SECRET_ACCESS_KEY").is_ok() {
|
||||
true => {
|
||||
EnvironmentVariableCredentialsProvider::new()
|
||||
.provide_credentials()
|
||||
.await
|
||||
}
|
||||
false => {
|
||||
let imds_client = imds::Client::builder()
|
||||
.connect_timeout(DEFAULT_IMDS_TIMEOUT)
|
||||
.read_timeout(DEFAULT_IMDS_TIMEOUT)
|
||||
.build()
|
||||
.await
|
||||
.map_err(CredentialsError::unhandled)?;
|
||||
ImdsCredentialsProvider::builder()
|
||||
.imds_client(imds_client)
|
||||
.build()
|
||||
.provide_credentials()
|
||||
.await
|
||||
}
|
||||
}
|
||||
}));
|
||||
let region = match aws_config.endpoint.clone() {
|
||||
Some(custom_endpoint) => Region::Custom {
|
||||
name: aws_config.bucket_region.clone(),
|
||||
endpoint: custom_endpoint,
|
||||
},
|
||||
None => aws_config
|
||||
.bucket_region
|
||||
.parse::<Region>()
|
||||
.context("Failed to parse the s3 region from config")?,
|
||||
};
|
||||
let request_dispatcher = HttpClient::new().context("Failed to create S3 http client")?;
|
||||
|
||||
if let Some(custom_endpoint) = aws_config.endpoint.clone() {
|
||||
let endpoint = Endpoint::immutable(
|
||||
custom_endpoint
|
||||
.parse()
|
||||
.expect("Failed to parse S3 custom endpoint"),
|
||||
let access_key_id = std::env::var("AWS_ACCESS_KEY_ID").ok();
|
||||
let secret_access_key = std::env::var("AWS_SECRET_ACCESS_KEY").ok();
|
||||
// session token is used when authorizing through sso
|
||||
// which is typically the case when testing locally on developer machine
|
||||
let session_token = std::env::var("AWS_SESSION_TOKEN").ok();
|
||||
|
||||
let client = if access_key_id.is_none() && secret_access_key.is_none() {
|
||||
debug!("Using IAM-based AWS access");
|
||||
S3Client::new_with(request_dispatcher, InstanceMetadataProvider::new(), region)
|
||||
} else {
|
||||
debug!(
|
||||
"Using credentials-based AWS access. Session token is set: {}",
|
||||
session_token.is_some()
|
||||
);
|
||||
config_builder.set_endpoint_resolver(Some(Arc::new(endpoint)));
|
||||
}
|
||||
let client = Client::from_conf(config_builder.build());
|
||||
S3Client::new_with(
|
||||
request_dispatcher,
|
||||
StaticProvider::new(
|
||||
access_key_id.unwrap_or_default(),
|
||||
secret_access_key.unwrap_or_default(),
|
||||
session_token,
|
||||
None,
|
||||
),
|
||||
region,
|
||||
)
|
||||
};
|
||||
|
||||
let prefix_in_bucket = aws_config.prefix_in_bucket.as_deref().map(|prefix| {
|
||||
let mut prefix = prefix;
|
||||
@@ -169,41 +182,16 @@ impl S3Bucket {
|
||||
}
|
||||
prefix
|
||||
});
|
||||
|
||||
Ok(Self {
|
||||
client,
|
||||
workdir,
|
||||
bucket_name: aws_config.bucket_name.clone(),
|
||||
prefix_in_bucket,
|
||||
concurrency_limiter: Semaphore::new(aws_config.concurrency_limit.get()),
|
||||
})
|
||||
}
|
||||
|
||||
fn s3_object_to_relative_path(&self, key: &str) -> RemotePath {
|
||||
let relative_path =
|
||||
match key.strip_prefix(self.prefix_in_bucket.as_deref().unwrap_or_default()) {
|
||||
Some(stripped) => stripped,
|
||||
// we rely on AWS to return properly prefixed paths
|
||||
// for requests with a certain prefix
|
||||
None => panic!(
|
||||
"Key {} does not start with bucket prefix {:?}",
|
||||
key, self.prefix_in_bucket
|
||||
),
|
||||
};
|
||||
RemotePath(
|
||||
relative_path
|
||||
.split(REMOTE_STORAGE_PREFIX_SEPARATOR)
|
||||
.collect(),
|
||||
)
|
||||
}
|
||||
|
||||
fn relative_path_to_s3_object(&self, path: &RemotePath) -> String {
|
||||
let mut full_path = self.prefix_in_bucket.clone().unwrap_or_default();
|
||||
for segment in path.0.iter() {
|
||||
full_path.push(REMOTE_STORAGE_PREFIX_SEPARATOR);
|
||||
full_path.push_str(segment.to_str().unwrap_or_default());
|
||||
}
|
||||
full_path
|
||||
}
|
||||
|
||||
async fn download_object(&self, request: GetObjectRequest) -> Result<Download, DownloadError> {
|
||||
let _guard = self
|
||||
.concurrency_limiter
|
||||
@@ -214,33 +202,20 @@ impl S3Bucket {
|
||||
|
||||
metrics::inc_get_object();
|
||||
|
||||
let get_object = self
|
||||
.client
|
||||
.get_object()
|
||||
.bucket(request.bucket)
|
||||
.key(request.key)
|
||||
.set_range(request.range)
|
||||
.send()
|
||||
.await;
|
||||
|
||||
match get_object {
|
||||
Ok(object_output) => {
|
||||
let metadata = object_output.metadata().cloned().map(StorageMetadata);
|
||||
Ok(Download {
|
||||
metadata,
|
||||
download_stream: Box::pin(io::BufReader::new(
|
||||
object_output.body.into_async_read(),
|
||||
)),
|
||||
})
|
||||
}
|
||||
Err(SdkError::ServiceError {
|
||||
err:
|
||||
GetObjectError {
|
||||
kind: GetObjectErrorKind::NoSuchKey(..),
|
||||
..
|
||||
},
|
||||
..
|
||||
}) => Err(DownloadError::NotFound),
|
||||
match self.client.get_object(request).await {
|
||||
Ok(object_output) => match object_output.body {
|
||||
None => {
|
||||
metrics::inc_get_object_fail();
|
||||
Err(DownloadError::Other(anyhow::anyhow!(
|
||||
"Got no body for the S3 object given"
|
||||
)))
|
||||
}
|
||||
Some(body) => Ok(Download {
|
||||
metadata: object_output.metadata.map(StorageMetadata),
|
||||
download_stream: Box::pin(io::BufReader::new(body.into_async_read())),
|
||||
}),
|
||||
},
|
||||
Err(RusotoError::Service(GetObjectError::NoSuchKey(_))) => Err(DownloadError::NotFound),
|
||||
Err(e) => {
|
||||
metrics::inc_get_object_fail();
|
||||
Err(DownloadError::Other(anyhow::anyhow!(
|
||||
@@ -253,7 +228,25 @@ impl S3Bucket {
|
||||
|
||||
#[async_trait::async_trait]
|
||||
impl RemoteStorage for S3Bucket {
|
||||
async fn list(&self) -> anyhow::Result<Vec<RemotePath>> {
|
||||
fn remote_object_id(&self, local_path: &Path) -> anyhow::Result<RemoteObjectId> {
|
||||
let relative_path = strip_path_prefix(&self.workdir, local_path)?;
|
||||
let mut key = self.prefix_in_bucket.clone().unwrap_or_default();
|
||||
for segment in relative_path {
|
||||
key.push(REMOTE_STORAGE_PREFIX_SEPARATOR);
|
||||
key.push_str(&segment.to_string_lossy());
|
||||
}
|
||||
Ok(RemoteObjectId(key))
|
||||
}
|
||||
|
||||
fn local_path(&self, storage_path: &RemoteObjectId) -> anyhow::Result<PathBuf> {
|
||||
Ok(download_destination(
|
||||
storage_path,
|
||||
&self.workdir,
|
||||
self.prefix_in_bucket.as_deref(),
|
||||
))
|
||||
}
|
||||
|
||||
async fn list(&self) -> anyhow::Result<Vec<RemoteObjectId>> {
|
||||
let mut document_keys = Vec::new();
|
||||
|
||||
let mut continuation_token = None;
|
||||
@@ -268,11 +261,12 @@ impl RemoteStorage for S3Bucket {
|
||||
|
||||
let fetch_response = self
|
||||
.client
|
||||
.list_objects_v2()
|
||||
.bucket(self.bucket_name.clone())
|
||||
.set_prefix(self.prefix_in_bucket.clone())
|
||||
.set_continuation_token(continuation_token)
|
||||
.send()
|
||||
.list_objects_v2(ListObjectsV2Request {
|
||||
bucket: self.bucket_name.clone(),
|
||||
prefix: self.prefix_in_bucket.clone(),
|
||||
continuation_token,
|
||||
..ListObjectsV2Request::default()
|
||||
})
|
||||
.await
|
||||
.map_err(|e| {
|
||||
metrics::inc_list_objects_fail();
|
||||
@@ -283,7 +277,7 @@ impl RemoteStorage for S3Bucket {
|
||||
.contents
|
||||
.unwrap_or_default()
|
||||
.into_iter()
|
||||
.filter_map(|o| Some(self.s3_object_to_relative_path(o.key()?))),
|
||||
.filter_map(|o| Some(RemoteObjectId(o.key?))),
|
||||
);
|
||||
|
||||
match fetch_response.continuation_token {
|
||||
@@ -297,10 +291,13 @@ impl RemoteStorage for S3Bucket {
|
||||
|
||||
/// See the doc for `RemoteStorage::list_prefixes`
|
||||
/// Note: it won't include empty "directories"
|
||||
async fn list_prefixes(&self, prefix: Option<&RemotePath>) -> anyhow::Result<Vec<RemotePath>> {
|
||||
async fn list_prefixes(
|
||||
&self,
|
||||
prefix: Option<&RemoteObjectId>,
|
||||
) -> anyhow::Result<Vec<RemoteObjectId>> {
|
||||
// get the passed prefix or if it is not set use prefix_in_bucket value
|
||||
let list_prefix = prefix
|
||||
.map(|p| self.relative_path_to_s3_object(p))
|
||||
.map(|p| p.0.clone())
|
||||
.or_else(|| self.prefix_in_bucket.clone())
|
||||
.map(|mut p| {
|
||||
// required to end with a separator
|
||||
@@ -325,12 +322,13 @@ impl RemoteStorage for S3Bucket {
|
||||
|
||||
let fetch_response = self
|
||||
.client
|
||||
.list_objects_v2()
|
||||
.bucket(self.bucket_name.clone())
|
||||
.set_prefix(list_prefix.clone())
|
||||
.set_continuation_token(continuation_token)
|
||||
.delimiter(REMOTE_STORAGE_PREFIX_SEPARATOR.to_string())
|
||||
.send()
|
||||
.list_objects_v2(ListObjectsV2Request {
|
||||
bucket: self.bucket_name.clone(),
|
||||
prefix: list_prefix.clone(),
|
||||
continuation_token,
|
||||
delimiter: Some(REMOTE_STORAGE_PREFIX_SEPARATOR.to_string()),
|
||||
..ListObjectsV2Request::default()
|
||||
})
|
||||
.await
|
||||
.map_err(|e| {
|
||||
metrics::inc_list_objects_fail();
|
||||
@@ -342,7 +340,7 @@ impl RemoteStorage for S3Bucket {
|
||||
.common_prefixes
|
||||
.unwrap_or_default()
|
||||
.into_iter()
|
||||
.filter_map(|o| Some(self.s3_object_to_relative_path(o.prefix()?))),
|
||||
.filter_map(|o| Some(RemoteObjectId(o.prefix?))),
|
||||
);
|
||||
|
||||
match fetch_response.continuation_token {
|
||||
@@ -358,7 +356,7 @@ impl RemoteStorage for S3Bucket {
|
||||
&self,
|
||||
from: Box<(dyn io::AsyncRead + Unpin + Send + Sync + 'static)>,
|
||||
from_size_bytes: usize,
|
||||
to: &RemotePath,
|
||||
to: &RemoteObjectId,
|
||||
metadata: Option<StorageMetadata>,
|
||||
) -> anyhow::Result<()> {
|
||||
let _guard = self
|
||||
@@ -368,18 +366,17 @@ impl RemoteStorage for S3Bucket {
|
||||
.context("Concurrency limiter semaphore got closed during S3 upload")?;
|
||||
|
||||
metrics::inc_put_object();
|
||||
|
||||
let body = Body::wrap_stream(ReaderStream::new(from));
|
||||
let bytes_stream = ByteStream::new(SdkBody::from(body));
|
||||
|
||||
self.client
|
||||
.put_object()
|
||||
.bucket(self.bucket_name.clone())
|
||||
.key(self.relative_path_to_s3_object(to))
|
||||
.set_metadata(metadata.map(|m| m.0))
|
||||
.content_length(from_size_bytes.try_into()?)
|
||||
.body(bytes_stream)
|
||||
.send()
|
||||
.put_object(PutObjectRequest {
|
||||
body: Some(StreamingBody::new_with_size(
|
||||
ReaderStream::new(from),
|
||||
from_size_bytes,
|
||||
)),
|
||||
bucket: self.bucket_name.clone(),
|
||||
key: to.0.to_owned(),
|
||||
metadata: metadata.map(|m| m.0),
|
||||
..PutObjectRequest::default()
|
||||
})
|
||||
.await
|
||||
.map_err(|e| {
|
||||
metrics::inc_put_object_fail();
|
||||
@@ -388,10 +385,10 @@ impl RemoteStorage for S3Bucket {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
async fn download(&self, from: &RemotePath) -> Result<Download, DownloadError> {
|
||||
async fn download(&self, from: &RemoteObjectId) -> Result<Download, DownloadError> {
|
||||
self.download_object(GetObjectRequest {
|
||||
bucket: self.bucket_name.clone(),
|
||||
key: self.relative_path_to_s3_object(from),
|
||||
key: from.0.to_owned(),
|
||||
..GetObjectRequest::default()
|
||||
})
|
||||
.await
|
||||
@@ -399,7 +396,7 @@ impl RemoteStorage for S3Bucket {
|
||||
|
||||
async fn download_byte_range(
|
||||
&self,
|
||||
from: &RemotePath,
|
||||
from: &RemoteObjectId,
|
||||
start_inclusive: u64,
|
||||
end_exclusive: Option<u64>,
|
||||
) -> Result<Download, DownloadError> {
|
||||
@@ -407,19 +404,20 @@ impl RemoteStorage for S3Bucket {
|
||||
// and needs both ends to be inclusive
|
||||
let end_inclusive = end_exclusive.map(|end| end.saturating_sub(1));
|
||||
let range = Some(match end_inclusive {
|
||||
Some(end_inclusive) => format!("bytes={start_inclusive}-{end_inclusive}"),
|
||||
None => format!("bytes={start_inclusive}-"),
|
||||
Some(end_inclusive) => format!("bytes={}-{}", start_inclusive, end_inclusive),
|
||||
None => format!("bytes={}-", start_inclusive),
|
||||
});
|
||||
|
||||
self.download_object(GetObjectRequest {
|
||||
bucket: self.bucket_name.clone(),
|
||||
key: self.relative_path_to_s3_object(from),
|
||||
key: from.0.to_owned(),
|
||||
range,
|
||||
..GetObjectRequest::default()
|
||||
})
|
||||
.await
|
||||
}
|
||||
|
||||
async fn delete(&self, path: &RemotePath) -> anyhow::Result<()> {
|
||||
async fn delete(&self, remote_object_id: &RemoteObjectId) -> anyhow::Result<()> {
|
||||
let _guard = self
|
||||
.concurrency_limiter
|
||||
.acquire()
|
||||
@@ -429,10 +427,11 @@ impl RemoteStorage for S3Bucket {
|
||||
metrics::inc_delete_object();
|
||||
|
||||
self.client
|
||||
.delete_object()
|
||||
.bucket(self.bucket_name.clone())
|
||||
.key(self.relative_path_to_s3_object(path))
|
||||
.send()
|
||||
.delete_object(DeleteObjectRequest {
|
||||
bucket: self.bucket_name.clone(),
|
||||
key: remote_object_id.0.to_owned(),
|
||||
..DeleteObjectRequest::default()
|
||||
})
|
||||
.await
|
||||
.map_err(|e| {
|
||||
metrics::inc_delete_object_fail();
|
||||
@@ -441,3 +440,181 @@ impl RemoteStorage for S3Bucket {
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use tempfile::tempdir;
|
||||
|
||||
use super::*;
|
||||
|
||||
#[test]
|
||||
fn test_download_destination() -> anyhow::Result<()> {
|
||||
let workdir = tempdir()?.path().to_owned();
|
||||
let local_path = workdir.join("one").join("two").join("test_name");
|
||||
let relative_path = local_path.strip_prefix(&workdir)?;
|
||||
|
||||
let key = RemoteObjectId(format!(
|
||||
"{}{}",
|
||||
REMOTE_STORAGE_PREFIX_SEPARATOR,
|
||||
relative_path
|
||||
.iter()
|
||||
.map(|segment| segment.to_str().unwrap())
|
||||
.collect::<Vec<_>>()
|
||||
.join(&REMOTE_STORAGE_PREFIX_SEPARATOR.to_string()),
|
||||
));
|
||||
|
||||
assert_eq!(
|
||||
local_path,
|
||||
download_destination(&key, &workdir, None),
|
||||
"Download destination should consist of s3 path joined with the workdir prefix"
|
||||
);
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn storage_path_positive() -> anyhow::Result<()> {
|
||||
let workdir = tempdir()?.path().to_owned();
|
||||
|
||||
let segment_1 = "matching";
|
||||
let segment_2 = "file";
|
||||
let local_path = &workdir.join(segment_1).join(segment_2);
|
||||
|
||||
let storage = dummy_storage(workdir);
|
||||
|
||||
let expected_key = RemoteObjectId(format!(
|
||||
"{}{REMOTE_STORAGE_PREFIX_SEPARATOR}{segment_1}{REMOTE_STORAGE_PREFIX_SEPARATOR}{segment_2}",
|
||||
storage.prefix_in_bucket.as_deref().unwrap_or_default(),
|
||||
));
|
||||
|
||||
let actual_key = storage
|
||||
.remote_object_id(local_path)
|
||||
.expect("Matching path should map to S3 path normally");
|
||||
assert_eq!(
|
||||
expected_key,
|
||||
actual_key,
|
||||
"S3 key from the matching path should contain all segments after the workspace prefix, separated with S3 separator"
|
||||
);
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn storage_path_negatives() -> anyhow::Result<()> {
|
||||
#[track_caller]
|
||||
fn storage_path_error(storage: &S3Bucket, mismatching_path: &Path) -> String {
|
||||
match storage.remote_object_id(mismatching_path) {
|
||||
Ok(wrong_key) => panic!(
|
||||
"Expected path '{}' to error, but got S3 key: {:?}",
|
||||
mismatching_path.display(),
|
||||
wrong_key,
|
||||
),
|
||||
Err(e) => e.to_string(),
|
||||
}
|
||||
}
|
||||
|
||||
let workdir = tempdir()?.path().to_owned();
|
||||
let storage = dummy_storage(workdir.clone());
|
||||
|
||||
let error_message = storage_path_error(&storage, &workdir);
|
||||
assert!(
|
||||
error_message.contains("Prefix and the path are equal"),
|
||||
"Message '{}' does not contain the required string",
|
||||
error_message
|
||||
);
|
||||
|
||||
let mismatching_path = PathBuf::from("somewhere").join("else");
|
||||
let error_message = storage_path_error(&storage, &mismatching_path);
|
||||
assert!(
|
||||
error_message.contains(mismatching_path.to_str().unwrap()),
|
||||
"Error should mention wrong path"
|
||||
);
|
||||
assert!(
|
||||
error_message.contains(workdir.to_str().unwrap()),
|
||||
"Error should mention server workdir"
|
||||
);
|
||||
assert!(
|
||||
error_message.contains("is not prefixed with"),
|
||||
"Message '{}' does not contain a required string",
|
||||
error_message
|
||||
);
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn local_path_positive() -> anyhow::Result<()> {
|
||||
let workdir = tempdir()?.path().to_owned();
|
||||
let storage = dummy_storage(workdir.clone());
|
||||
let timeline_dir = workdir.join("timelines").join("test_timeline");
|
||||
let relative_timeline_path = timeline_dir.strip_prefix(&workdir)?;
|
||||
|
||||
let s3_key = create_s3_key(
|
||||
&relative_timeline_path.join("not a metadata"),
|
||||
storage.prefix_in_bucket.as_deref(),
|
||||
);
|
||||
assert_eq!(
|
||||
download_destination(&s3_key, &workdir, storage.prefix_in_bucket.as_deref()),
|
||||
storage
|
||||
.local_path(&s3_key)
|
||||
.expect("For a valid input, valid S3 info should be parsed"),
|
||||
"Should be able to parse metadata out of the correctly named remote delta file"
|
||||
);
|
||||
|
||||
let s3_key = create_s3_key(
|
||||
&relative_timeline_path.join("metadata"),
|
||||
storage.prefix_in_bucket.as_deref(),
|
||||
);
|
||||
assert_eq!(
|
||||
download_destination(&s3_key, &workdir, storage.prefix_in_bucket.as_deref()),
|
||||
storage
|
||||
.local_path(&s3_key)
|
||||
.expect("For a valid input, valid S3 info should be parsed"),
|
||||
"Should be able to parse metadata out of the correctly named remote metadata file"
|
||||
);
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn download_destination_matches_original_path() -> anyhow::Result<()> {
|
||||
let workdir = tempdir()?.path().to_owned();
|
||||
let original_path = workdir
|
||||
.join("timelines")
|
||||
.join("some_timeline")
|
||||
.join("some name");
|
||||
|
||||
let dummy_storage = dummy_storage(workdir);
|
||||
|
||||
let key = dummy_storage.remote_object_id(&original_path)?;
|
||||
let download_destination = dummy_storage.local_path(&key)?;
|
||||
|
||||
assert_eq!(
|
||||
original_path, download_destination,
|
||||
"'original path -> storage key -> matching fs path' transformation should produce the same path as the input one for the correct path"
|
||||
);
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn dummy_storage(workdir: PathBuf) -> S3Bucket {
|
||||
S3Bucket {
|
||||
workdir,
|
||||
client: S3Client::new("us-east-1".parse().unwrap()),
|
||||
bucket_name: "dummy-bucket".to_string(),
|
||||
prefix_in_bucket: Some("dummy_prefix/".to_string()),
|
||||
concurrency_limiter: Semaphore::new(1),
|
||||
}
|
||||
}
|
||||
|
||||
fn create_s3_key(relative_file_path: &Path, prefix: Option<&str>) -> RemoteObjectId {
|
||||
RemoteObjectId(relative_file_path.iter().fold(
|
||||
prefix.unwrap_or_default().to_string(),
|
||||
|mut path_string, segment| {
|
||||
path_string.push(REMOTE_STORAGE_PREFIX_SEPARATOR);
|
||||
path_string.push_str(segment.to_str().unwrap());
|
||||
path_string
|
||||
},
|
||||
))
|
||||
}
|
||||
}
|
||||
|
||||
@@ -4,7 +4,6 @@ version = "0.1.0"
|
||||
edition = "2021"
|
||||
|
||||
[dependencies]
|
||||
sentry = "0.29.0"
|
||||
async-trait = "0.1"
|
||||
anyhow = "1.0"
|
||||
bincode = "1.3"
|
||||
|
||||
@@ -34,7 +34,6 @@ pub mod sock_split;
|
||||
pub mod logging;
|
||||
|
||||
pub mod lock_file;
|
||||
pub mod pid_file;
|
||||
|
||||
// Misc
|
||||
pub mod accum;
|
||||
@@ -47,7 +46,6 @@ pub mod tcp_listener;
|
||||
pub mod nonblock;
|
||||
|
||||
// Default signal handling
|
||||
pub mod sentry_init;
|
||||
pub mod signals;
|
||||
|
||||
pub mod fs_ext;
|
||||
|
||||
@@ -1,133 +1,81 @@
|
||||
//! A module to create and read lock files.
|
||||
//! A module to create and read lock files. A lock file ensures that only one
|
||||
//! process is running at a time, in a particular directory.
|
||||
//!
|
||||
//! File locking is done using [`fcntl::flock`] exclusive locks.
|
||||
//! The only consumer of this module is currently [`pid_file`].
|
||||
//! See the module-level comment there for potential pitfalls
|
||||
//! with lock files that are used to store PIDs (pidfiles).
|
||||
//! File locking is done using [`fcntl::flock`], which means that holding the
|
||||
//! lock on file only prevents acquiring another lock on it; all other
|
||||
//! operations are still possible on files. Other processes can still open, read,
|
||||
//! write, or remove the file, for example.
|
||||
//! If the file is removed while a process is holding a lock on it,
|
||||
//! the process that holds the lock does not get any error or notification.
|
||||
//! Furthermore, you can create a new file with the same name and lock the new file,
|
||||
//! while the old process is still running.
|
||||
//! Deleting the lock file while the locking process is still running is a bad idea!
|
||||
|
||||
use std::{
|
||||
fs,
|
||||
io::{Read, Write},
|
||||
ops::Deref,
|
||||
os::unix::prelude::AsRawFd,
|
||||
path::{Path, PathBuf},
|
||||
};
|
||||
use std::{fs, os::unix::prelude::AsRawFd, path::Path};
|
||||
|
||||
use anyhow::Context;
|
||||
use nix::{errno::Errno::EAGAIN, fcntl};
|
||||
use nix::fcntl;
|
||||
|
||||
use crate::crashsafe;
|
||||
|
||||
/// A handle to an open and unlocked, but not-yet-written lock file.
|
||||
/// Returned by [`create_exclusive`].
|
||||
#[must_use]
|
||||
pub struct UnwrittenLockFile {
|
||||
path: PathBuf,
|
||||
file: fs::File,
|
||||
pub enum LockCreationResult {
|
||||
Created {
|
||||
new_lock_contents: String,
|
||||
file: fs::File,
|
||||
},
|
||||
AlreadyLocked {
|
||||
existing_lock_contents: String,
|
||||
},
|
||||
CreationFailed(anyhow::Error),
|
||||
}
|
||||
|
||||
/// Returned by [`UnwrittenLockFile::write_content`].
|
||||
#[must_use]
|
||||
pub struct LockFileGuard(fs::File);
|
||||
|
||||
impl Deref for LockFileGuard {
|
||||
type Target = fs::File;
|
||||
|
||||
fn deref(&self) -> &Self::Target {
|
||||
&self.0
|
||||
}
|
||||
}
|
||||
|
||||
impl UnwrittenLockFile {
|
||||
/// Replace the content of this lock file with the byte representation of `contents`.
|
||||
pub fn write_content(mut self, contents: String) -> anyhow::Result<LockFileGuard> {
|
||||
self.file
|
||||
.set_len(0)
|
||||
.context("Failed to truncate lockfile")?;
|
||||
self.file
|
||||
.write_all(contents.as_bytes())
|
||||
.with_context(|| format!("Failed to write '{contents}' contents into lockfile"))?;
|
||||
crashsafe::fsync_file_and_parent(&self.path).context("fsync lockfile")?;
|
||||
Ok(LockFileGuard(self.file))
|
||||
}
|
||||
}
|
||||
|
||||
/// Creates and opens a lock file in the path, grabs an exclusive flock on it, and returns
|
||||
/// a handle that allows overwriting the locked file's content.
|
||||
///
|
||||
/// The exclusive lock is released when dropping the returned handle.
|
||||
///
|
||||
/// It is not an error if the file already exists.
|
||||
/// It is an error if the file is already locked.
|
||||
pub fn create_exclusive(lock_file_path: &Path) -> anyhow::Result<UnwrittenLockFile> {
|
||||
let lock_file = fs::OpenOptions::new()
|
||||
/// Creates a lock file in the path given and writes the given contents into the file.
|
||||
/// Note: The lock is automatically released when the file is closed. You might want to use Box::leak to make sure it lives until the end of the program.
|
||||
pub fn create_lock_file(lock_file_path: &Path, contents: String) -> LockCreationResult {
|
||||
let lock_file = match fs::OpenOptions::new()
|
||||
.create(true) // O_CREAT
|
||||
.write(true)
|
||||
.open(lock_file_path)
|
||||
.context("open lock file")?;
|
||||
|
||||
let res = fcntl::flock(
|
||||
lock_file.as_raw_fd(),
|
||||
fcntl::FlockArg::LockExclusiveNonblock,
|
||||
);
|
||||
match res {
|
||||
Ok(()) => Ok(UnwrittenLockFile {
|
||||
path: lock_file_path.to_owned(),
|
||||
file: lock_file,
|
||||
}),
|
||||
Err(EAGAIN) => anyhow::bail!("file is already locked"),
|
||||
Err(e) => Err(e).context("flock error"),
|
||||
}
|
||||
}
|
||||
|
||||
/// Returned by [`read_and_hold_lock_file`].
|
||||
/// Check out the [`pid_file`] module for what the variants mean
|
||||
/// and potential caveats if the lock files are used to store PIDs.
|
||||
pub enum LockFileRead {
|
||||
/// No file exists at the given path.
|
||||
NotExist,
|
||||
/// No other process held the lock file, so we grabbed an flock
|
||||
/// on it and read its contents.
|
||||
/// Release the flock by dropping the [`LockFileGuard`].
|
||||
NotHeldByAnyProcess(LockFileGuard, String),
|
||||
/// The file exists but another process was holding an flock on it.
|
||||
LockedByOtherProcess {
|
||||
not_locked_file: fs::File,
|
||||
content: String,
|
||||
},
|
||||
}
|
||||
|
||||
/// Open & try to lock the lock file at the given `path`, returning a [handle][`LockFileRead`] to
|
||||
/// inspect its content. It is not an `Err(...)` if the file does not exist or is already locked.
|
||||
/// Check the [`LockFileRead`] variants for details.
|
||||
pub fn read_and_hold_lock_file(path: &Path) -> anyhow::Result<LockFileRead> {
|
||||
let res = fs::OpenOptions::new().read(true).open(path);
|
||||
let mut lock_file = match res {
|
||||
Ok(f) => f,
|
||||
Err(e) => match e.kind() {
|
||||
std::io::ErrorKind::NotFound => return Ok(LockFileRead::NotExist),
|
||||
_ => return Err(e).context("open lock file"),
|
||||
},
|
||||
.context("Failed to open lock file")
|
||||
{
|
||||
Ok(file) => file,
|
||||
Err(e) => return LockCreationResult::CreationFailed(e),
|
||||
};
|
||||
let res = fcntl::flock(
|
||||
|
||||
match fcntl::flock(
|
||||
lock_file.as_raw_fd(),
|
||||
fcntl::FlockArg::LockExclusiveNonblock,
|
||||
);
|
||||
// We need the content regardless of lock success / failure.
|
||||
// But, read it after flock so that, if it succeeded, the content is consistent.
|
||||
let mut content = String::new();
|
||||
lock_file
|
||||
.read_to_string(&mut content)
|
||||
.context("read lock file")?;
|
||||
match res {
|
||||
Ok(()) => Ok(LockFileRead::NotHeldByAnyProcess(
|
||||
LockFileGuard(lock_file),
|
||||
content,
|
||||
)),
|
||||
Err(EAGAIN) => Ok(LockFileRead::LockedByOtherProcess {
|
||||
not_locked_file: lock_file,
|
||||
content,
|
||||
}),
|
||||
Err(e) => Err(e).context("flock error"),
|
||||
) {
|
||||
Ok(()) => {
|
||||
match lock_file
|
||||
.set_len(0)
|
||||
.context("Failed to truncate lockfile")
|
||||
.and_then(|()| {
|
||||
fs::write(lock_file_path, &contents).with_context(|| {
|
||||
format!("Failed to write '{contents}' contents into lockfile")
|
||||
})
|
||||
})
|
||||
.and_then(|()| {
|
||||
crashsafe::fsync_file_and_parent(lock_file_path)
|
||||
.context("Failed to fsync lockfile")
|
||||
}) {
|
||||
Ok(()) => LockCreationResult::Created {
|
||||
new_lock_contents: contents,
|
||||
file: lock_file,
|
||||
},
|
||||
Err(e) => LockCreationResult::CreationFailed(e),
|
||||
}
|
||||
}
|
||||
Err(nix::errno::Errno::EAGAIN) => {
|
||||
match fs::read_to_string(lock_file_path).context("Failed to read lockfile contents") {
|
||||
Ok(existing_lock_contents) => LockCreationResult::AlreadyLocked {
|
||||
existing_lock_contents,
|
||||
},
|
||||
Err(e) => LockCreationResult::CreationFailed(e),
|
||||
}
|
||||
}
|
||||
Err(e) => {
|
||||
LockCreationResult::CreationFailed(anyhow::anyhow!("Failed to lock lockfile: {e}"))
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1,165 +0,0 @@
|
||||
//! Abstraction to create & read pidfiles.
|
||||
//!
|
||||
//! A pidfile is a file in the filesystem that stores a process's PID.
|
||||
//! Its purpose is to implement a singleton behavior where only
|
||||
//! one process of some "kind" is supposed to be running at a given time.
|
||||
//! The "kind" is identified by the pidfile.
|
||||
//!
|
||||
//! During process startup, the process that is supposed to be a singleton
|
||||
//! must [claim][`claim_for_current_process`] the pidfile first.
|
||||
//! If that is unsuccessful, the process must not act as the singleton, i.e.,
|
||||
//! it must not access any of the resources that only the singleton may access.
|
||||
//!
|
||||
//! A common need is to signal a running singleton process, e.g., to make
|
||||
//! it shut down and exit.
|
||||
//! For that, we have to [`read`] the pidfile. The result of the `read` operation
|
||||
//! tells us if there is any singleton process, and if so, what PID it has.
|
||||
//! We can then proceed to signal it, although some caveats still apply.
|
||||
//! Read the function-level documentation of [`read`] for that.
|
||||
//!
|
||||
//! ## Never Remove Pidfiles
|
||||
//!
|
||||
//! It would be natural to assume that the process who claimed the pidfile
|
||||
//! should remove it upon exit to avoid leaving a stale pidfile in place.
|
||||
//! However, we already have a reliable way to detect staleness of the pidfile,
|
||||
//! i.e., the `flock` that [claiming][`claim_for_current_process`] puts on it.
|
||||
//!
|
||||
//! And further, removing pidfiles would introduce a **catastrophic race condition**
|
||||
//! where two processes are running that are supposed to be singletons.
|
||||
//! Suppose we were to remove our pidfile during process shutdown.
|
||||
//! Here is how the race plays out:
|
||||
//! - Suppose we have a service called `myservice` with pidfile `myservice.pid`.
|
||||
//! - Process `A` starts to shut down.
|
||||
//! - Process `B` is just starting up
|
||||
//! - It `open("myservice.pid", O_WRONLY|O_CREAT)` the file
|
||||
//! - It blocks on `flock`
|
||||
//! - Process `A` removes the pidfile as the last step of its shutdown procedure
|
||||
//! - `unlink("myservice.pid")`
|
||||
//! - Process `A` exits
|
||||
//! - This releases its `flock` and unblocks `B`
|
||||
//! - Process `B` still has the file descriptor for `myservice.pid` open
|
||||
//! - Process `B` writes its PID into `myservice.pid`.
|
||||
//! - But the `myservice.pid` file has been unlinked, so, there is no `myservice.pid`
|
||||
//! in the directory.
|
||||
//! - Process `C` starts
|
||||
//! - It `open("myservice.pid", O_WRONLY|O_CREAT)` which creates a new file (new inode)
|
||||
//! - It `flock`s the file, which, since it's a different file, does not block
|
||||
//! - It writes its PID into the file
|
||||
//!
|
||||
//! At this point, `B` and `C` are running, which is hazardous.
|
||||
//! Moral of the story: don't unlink pidfiles, ever.
|
||||
|
||||
use std::{ops::Deref, path::Path};
|
||||
|
||||
use anyhow::Context;
|
||||
use nix::unistd::Pid;
|
||||
|
||||
use crate::lock_file::{self, LockFileRead};
|
||||
|
||||
/// Keeps a claim on a pidfile alive until it is dropped.
|
||||
/// Returned by [`claim_for_current_process`].
|
||||
#[must_use]
|
||||
pub struct PidFileGuard(lock_file::LockFileGuard);
|
||||
|
||||
impl Deref for PidFileGuard {
|
||||
type Target = lock_file::LockFileGuard;
|
||||
|
||||
fn deref(&self) -> &Self::Target {
|
||||
&self.0
|
||||
}
|
||||
}
|
||||
|
||||
/// Try to claim `path` as a pidfile for the current process.
|
||||
///
|
||||
/// If another process has already claimed the pidfile, and it is still running,
|
||||
/// this function returns an error.
|
||||
/// Otherwise, the function `flock`s the file and updates its contents to the
|
||||
/// current process's PID.
|
||||
/// If the update fails, the flock is released and an error returned.
|
||||
/// On success, the function returns a [`PidFileGuard`] to keep the flock alive.
|
||||
///
|
||||
/// ### Maintaining A Claim
|
||||
///
|
||||
/// It is the caller's responsibility to maintain the claim.
|
||||
/// The claim ends as soon as the returned guard object is dropped.
|
||||
/// To maintain the claim for the remaining lifetime of the current process,
|
||||
/// use [`std::mem::forget`] or similar.
|
||||
pub fn claim_for_current_process(path: &Path) -> anyhow::Result<PidFileGuard> {
|
||||
let unwritten_lock_file = lock_file::create_exclusive(path).context("lock file")?;
|
||||
// if any of the next steps fail, we drop the file descriptor and thereby release the lock
|
||||
let guard = unwritten_lock_file
|
||||
.write_content(Pid::this().to_string())
|
||||
.context("write pid to lock file")?;
|
||||
Ok(PidFileGuard(guard))
|
||||
}
|
||||
|
||||
/// Returned by [`read`].
|
||||
pub enum PidFileRead {
|
||||
/// No file exists at the given path.
|
||||
NotExist,
|
||||
/// The given pidfile is currently not claimed by any process.
|
||||
/// To determine this, the [`read`] operation acquired
|
||||
/// an exclusive flock on the file. The lock is still held and responsibility
|
||||
/// to release it is returned through the guard object.
|
||||
/// Before releasing it, other [`claim_for_current_process`] or [`read`] calls
|
||||
/// will fail.
|
||||
///
|
||||
/// ### Caveats
|
||||
///
|
||||
/// Do not unlink the pidfile from the filesystem. See module-comment for why.
|
||||
NotHeldByAnyProcess(PidFileGuard),
|
||||
/// The given pidfile is still claimed by another process whose PID is given
|
||||
/// as part of this variant.
|
||||
///
|
||||
/// ### Caveats
|
||||
///
|
||||
/// 1. The other process might exit at any time, turning the given PID stale.
|
||||
/// 2. There is a small window in which `claim_for_current_process` has already
|
||||
/// locked the file but not yet updated its contents. [`read`] will return
|
||||
/// this variant here, but with the old file contents, i.e., a stale PID.
|
||||
///
|
||||
/// The kernel is free to recycle a PID once it has been `wait(2)`ed upon by
|
||||
/// its creator. Thus, acting upon a stale PID, e.g., by issuing a `kill`
|
||||
/// system call on it, bears the risk of killing an unrelated process.
|
||||
/// This is an inherent limitation of using pidfiles.
|
||||
/// The only race-free solution is to have a supervisor-process with a lifetime
|
||||
/// that exceeds that of all of its child-processes (e.g., `runit`, `supervisord`).
|
||||
LockedByOtherProcess(Pid),
|
||||
}
|
||||
|
||||
/// Try to read the file at the given path as a pidfile that was previously created
|
||||
/// through [`claim_for_current_process`].
|
||||
///
|
||||
/// On success, this function returns a [`PidFileRead`].
|
||||
/// Check its docs for a description of the meaning of its different variants.
|
||||
pub fn read(pidfile: &Path) -> anyhow::Result<PidFileRead> {
|
||||
let res = lock_file::read_and_hold_lock_file(pidfile).context("read and hold pid file")?;
|
||||
let ret = match res {
|
||||
LockFileRead::NotExist => PidFileRead::NotExist,
|
||||
LockFileRead::NotHeldByAnyProcess(guard, _) => {
|
||||
PidFileRead::NotHeldByAnyProcess(PidFileGuard(guard))
|
||||
}
|
||||
LockFileRead::LockedByOtherProcess {
|
||||
not_locked_file: _not_locked_file,
|
||||
content,
|
||||
} => {
|
||||
// XXX the read races with the write in claim_for_current_process().
|
||||
// But pids are smaller than a page, so the kernel page cache will lock for us.
|
||||
// The only problem is that we might get the old contents here.
|
||||
// Can only fix that by implementing some scheme that downgrades the
|
||||
// exclusive lock to shared lock in claim_for_current_process().
|
||||
PidFileRead::LockedByOtherProcess(parse_pidfile_content(&content)?)
|
||||
}
|
||||
};
|
||||
Ok(ret)
|
||||
}
|
||||
|
||||
fn parse_pidfile_content(content: &str) -> anyhow::Result<Pid> {
|
||||
let pid: i32 = content
|
||||
.parse()
|
||||
.map_err(|_| anyhow::anyhow!("parse pidfile content to PID"))?;
|
||||
if pid < 1 {
|
||||
anyhow::bail!("bad value in pidfile '{pid}'");
|
||||
}
|
||||
Ok(Pid::from_raw(pid))
|
||||
}
|
||||
@@ -1,27 +0,0 @@
|
||||
use sentry::ClientInitGuard;
|
||||
use std::borrow::Cow;
|
||||
use std::env;
|
||||
|
||||
pub use sentry::release_name;
|
||||
|
||||
#[must_use]
|
||||
pub fn init_sentry(
|
||||
release_name: Option<Cow<'static, str>>,
|
||||
extra_options: &[(&str, &str)],
|
||||
) -> Option<ClientInitGuard> {
|
||||
let dsn = env::var("SENTRY_DSN").ok()?;
|
||||
|
||||
let guard = sentry::init((
|
||||
dsn,
|
||||
sentry::ClientOptions {
|
||||
release: release_name,
|
||||
..Default::default()
|
||||
},
|
||||
));
|
||||
sentry::configure_scope(|scope| {
|
||||
for &(key, value) in extra_options {
|
||||
scope.set_extra(key, value.into());
|
||||
}
|
||||
});
|
||||
Some(guard)
|
||||
}
|
||||
@@ -5,6 +5,10 @@ edition = "2021"
|
||||
|
||||
[features]
|
||||
default = []
|
||||
# Enables test-only APIs, including failpoints. In particular, enables the `fail_point!` macro,
|
||||
# which adds some runtime cost to run tests on outage conditions
|
||||
testing = ["fail/failpoints"]
|
||||
|
||||
profiling = ["pprof"]
|
||||
|
||||
[dependencies]
|
||||
@@ -14,13 +18,13 @@ async-stream = "0.3"
|
||||
async-trait = "0.1"
|
||||
byteorder = "1.4.3"
|
||||
bytes = "1.0.1"
|
||||
chrono = { version = "0.4.23", default-features = false, features = ["clock"] }
|
||||
chrono = "0.4.19"
|
||||
clap = { version = "4.0", features = ["string"] }
|
||||
close_fds = "0.3.2"
|
||||
const_format = "0.2.21"
|
||||
crc32c = "0.6.0"
|
||||
crossbeam-utils = "0.8.5"
|
||||
fail = { version = "0.5", default-features = false, features = ["failpoints"] }
|
||||
fail = "0.5.0"
|
||||
futures = "0.3.13"
|
||||
git-version = "0.3.5"
|
||||
hex = "0.4.3"
|
||||
|
||||
@@ -1,12 +0,0 @@
|
||||
## Pageserver Benchmarks
|
||||
|
||||
# How to run
|
||||
|
||||
To run all benchmarks:
|
||||
`cargo bench`
|
||||
|
||||
To run a specific file:
|
||||
`cargo bench --bench bench_layer_map`
|
||||
|
||||
To run a specific function:
|
||||
`cargo bench --bench bench_layer_map -- real_map_uniform_queries`
|
||||
File diff suppressed because it is too large
Load Diff
@@ -431,7 +431,7 @@ fn pg_record(will_init: bool, bytes: &'static [u8]) -> NeonWalRecord {
|
||||
struct Request {
|
||||
key: Key,
|
||||
lsn: Lsn,
|
||||
base_img: Option<(Lsn, Bytes)>,
|
||||
base_img: Option<Bytes>,
|
||||
records: Vec<(Lsn, NeonWalRecord)>,
|
||||
pg_version: u32,
|
||||
}
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
@@ -12,6 +12,7 @@
|
||||
//!
|
||||
use anyhow::{anyhow, bail, ensure, Context, Result};
|
||||
use bytes::{BufMut, BytesMut};
|
||||
use fail::fail_point;
|
||||
use itertools::Itertools;
|
||||
use std::fmt::Write as FmtWrite;
|
||||
use std::io;
|
||||
@@ -21,7 +22,6 @@ use std::time::SystemTime;
|
||||
use tar::{Builder, EntryType, Header};
|
||||
use tracing::*;
|
||||
|
||||
use crate::fail_point;
|
||||
use crate::tenant::Timeline;
|
||||
use pageserver_api::reltag::{RelTag, SlruKind};
|
||||
|
||||
|
||||
@@ -11,8 +11,8 @@
|
||||
//!
|
||||
//! Example use:
|
||||
//! ```
|
||||
//! $ ls test_output/test_pgbench\[neon-45-684\]/repo/tenants/$TENANT/timelines/$TIMELINE | \
|
||||
//! $ grep "__" | cargo run --release --bin draw_timeline_dir > out.svg
|
||||
//! $ cd test_output/test_pgbench\[neon-45-684\]/repo/tenants/$TENANT/timelines/$TIMELINE
|
||||
//! $ ls | grep "__" | cargo run --release --bin draw_timeline_dir > out.svg
|
||||
//! $ firefox out.svg
|
||||
//! ```
|
||||
//!
|
||||
@@ -25,8 +25,6 @@ use anyhow::Result;
|
||||
use pageserver::repository::Key;
|
||||
use std::cmp::Ordering;
|
||||
use std::io::{self, BufRead};
|
||||
use std::path::PathBuf;
|
||||
use std::str::FromStr;
|
||||
use std::{
|
||||
collections::{BTreeMap, BTreeSet},
|
||||
ops::Range,
|
||||
@@ -67,11 +65,7 @@ fn main() -> Result<()> {
|
||||
let mut ranges: Vec<(Range<Key>, Range<Lsn>)> = vec![];
|
||||
let stdin = io::stdin();
|
||||
for line in stdin.lock().lines() {
|
||||
let line = line.unwrap();
|
||||
let line = PathBuf::from_str(&line).unwrap();
|
||||
let filename = line.file_name().unwrap();
|
||||
let filename = filename.to_str().unwrap();
|
||||
let range = parse_filename(filename);
|
||||
let range = parse_filename(&line.unwrap());
|
||||
ranges.push(range);
|
||||
}
|
||||
|
||||
|
||||
@@ -7,6 +7,7 @@ use std::{env, ops::ControlFlow, path::Path, str::FromStr};
|
||||
use anyhow::{anyhow, Context};
|
||||
use clap::{Arg, ArgAction, Command};
|
||||
use fail::FailScenario;
|
||||
use nix::unistd::Pid;
|
||||
use tracing::*;
|
||||
|
||||
use metrics::set_build_info_metric;
|
||||
@@ -22,10 +23,9 @@ use pageserver::{
|
||||
use remote_storage::GenericRemoteStorage;
|
||||
use utils::{
|
||||
auth::JwtAuth,
|
||||
logging,
|
||||
lock_file, logging,
|
||||
postgres_backend::AuthType,
|
||||
project_git_version,
|
||||
sentry_init::{init_sentry, release_name},
|
||||
signals::{self, Signal},
|
||||
tcp_listener,
|
||||
};
|
||||
@@ -35,6 +35,10 @@ project_git_version!(GIT_VERSION);
|
||||
const PID_FILE_NAME: &str = "pageserver.pid";
|
||||
|
||||
const FEATURES: &[&str] = &[
|
||||
#[cfg(feature = "testing")]
|
||||
"testing",
|
||||
#[cfg(feature = "fail/failpoints")]
|
||||
"fail/failpoints",
|
||||
#[cfg(feature = "profiling")]
|
||||
"profiling",
|
||||
];
|
||||
@@ -81,9 +85,6 @@ fn main() -> anyhow::Result<()> {
|
||||
}
|
||||
};
|
||||
|
||||
// initialize sentry if SENTRY_DSN is provided
|
||||
let _sentry_guard = init_sentry(release_name!(), &[("node_id", &conf.id.to_string())]);
|
||||
|
||||
let tenants_path = conf.tenants_path();
|
||||
if !tenants_path.exists() {
|
||||
utils::crashsafe::create_dir_all(conf.tenants_path()).with_context(|| {
|
||||
@@ -174,10 +175,6 @@ fn initialize_config(
|
||||
let conf = PageServerConf::parse_and_validate(&toml, workdir)
|
||||
.context("Failed to parse pageserver configuration")?;
|
||||
|
||||
if pageserver::TESTING_MODE.set(conf.testing_mode).is_err() {
|
||||
anyhow::bail!("testing_mode was already initialized");
|
||||
}
|
||||
|
||||
if update_config {
|
||||
info!("Writing pageserver config to '{}'", cfg_file_path.display());
|
||||
|
||||
@@ -206,32 +203,41 @@ fn start_pageserver(conf: &'static PageServerConf) -> anyhow::Result<()> {
|
||||
|
||||
// If any failpoints were set from FAILPOINTS environment variable,
|
||||
// print them to the log for debugging purposes
|
||||
if *pageserver::TESTING_MODE.get().unwrap() {
|
||||
let failpoints = fail::list();
|
||||
if !failpoints.is_empty() {
|
||||
info!(
|
||||
"started with testing mode enabled, failpoints: {}",
|
||||
failpoints
|
||||
.iter()
|
||||
.map(|(name, actions)| format!("{name}={actions}"))
|
||||
.collect::<Vec<String>>()
|
||||
.join(";")
|
||||
)
|
||||
} else {
|
||||
info!("started with testing mode enabled");
|
||||
}
|
||||
} else {
|
||||
info!("started with testing mode disabled");
|
||||
let failpoints = fail::list();
|
||||
if !failpoints.is_empty() {
|
||||
info!(
|
||||
"started with failpoints: {}",
|
||||
failpoints
|
||||
.iter()
|
||||
.map(|(name, actions)| format!("{name}={actions}"))
|
||||
.collect::<Vec<String>>()
|
||||
.join(";")
|
||||
)
|
||||
}
|
||||
|
||||
let lock_file_path = conf.workdir.join(PID_FILE_NAME);
|
||||
let lock_file =
|
||||
utils::pid_file::claim_for_current_process(&lock_file_path).context("claim pid file")?;
|
||||
info!("Claimed pid file at {lock_file_path:?}");
|
||||
|
||||
let lock_file = match lock_file::create_lock_file(&lock_file_path, Pid::this().to_string()) {
|
||||
lock_file::LockCreationResult::Created {
|
||||
new_lock_contents,
|
||||
file,
|
||||
} => {
|
||||
info!("Created lock file at {lock_file_path:?} with contenst {new_lock_contents}");
|
||||
file
|
||||
}
|
||||
lock_file::LockCreationResult::AlreadyLocked {
|
||||
existing_lock_contents,
|
||||
} => anyhow::bail!(
|
||||
"Could not lock pid file; pageserver is already running in {:?} with PID {}",
|
||||
conf.workdir,
|
||||
existing_lock_contents
|
||||
),
|
||||
lock_file::LockCreationResult::CreationFailed(e) => {
|
||||
return Err(e.context(format!("Failed to create lock file at {lock_file_path:?}")))
|
||||
}
|
||||
};
|
||||
// ensure that the lock file is held even if the main thread of the process panics
|
||||
// we need to release the lock file only when the current process is gone
|
||||
std::mem::forget(lock_file);
|
||||
let _ = Box::leak(Box::new(lock_file));
|
||||
|
||||
// TODO: Check that it looks like a valid repository before going further
|
||||
|
||||
@@ -286,23 +292,15 @@ fn start_pageserver(conf: &'static PageServerConf) -> anyhow::Result<()> {
|
||||
let remote_storage = conf
|
||||
.remote_storage_config
|
||||
.as_ref()
|
||||
.map(GenericRemoteStorage::from_config)
|
||||
.map(|storage_config| {
|
||||
GenericRemoteStorage::from_config(conf.workdir.clone(), storage_config)
|
||||
})
|
||||
.transpose()
|
||||
.context("Failed to init generic remote storage")?;
|
||||
|
||||
let (init_result_sender, init_result_receiver) =
|
||||
std::sync::mpsc::channel::<anyhow::Result<()>>();
|
||||
let storage_for_spawn = remote_storage.clone();
|
||||
let _handler = BACKGROUND_RUNTIME.spawn(async move {
|
||||
let result = tenant_mgr::init_tenant_mgr(conf, storage_for_spawn).await;
|
||||
init_result_sender.send(result)
|
||||
});
|
||||
match init_result_receiver.recv() {
|
||||
Ok(init_result) => init_result.context("Failed to init tenant_mgr")?,
|
||||
Err(_sender_dropped_err) => {
|
||||
anyhow::bail!("Failed to init tenant_mgr: no init status was returned");
|
||||
}
|
||||
}
|
||||
{
|
||||
let _rt_guard = BACKGROUND_RUNTIME.enter();
|
||||
tenant_mgr::init_tenant_mgr(conf, remote_storage.clone())?
|
||||
};
|
||||
|
||||
// Spawn all HTTP related tasks in the MGMT_REQUEST_RUNTIME.
|
||||
// bind before launching separate thread so the error reported before startup exits
|
||||
|
||||
@@ -5,7 +5,7 @@
|
||||
//! See also `settings.md` for better description on every parameter.
|
||||
|
||||
use anyhow::{anyhow, bail, ensure, Context, Result};
|
||||
use remote_storage::{RemotePath, RemoteStorageConfig};
|
||||
use remote_storage::RemoteStorageConfig;
|
||||
use std::env;
|
||||
use utils::crashsafe::path_with_suffix_extension;
|
||||
use utils::id::ConnectionId;
|
||||
@@ -27,9 +27,7 @@ use utils::{
|
||||
|
||||
use crate::tenant::{TENANT_ATTACHING_MARKER_FILENAME, TIMELINES_SEGMENT_NAME};
|
||||
use crate::tenant_config::{TenantConf, TenantConfOpt};
|
||||
use crate::{
|
||||
IGNORED_TENANT_FILE_NAME, METADATA_FILE_NAME, TENANT_CONFIG_NAME, TIMELINE_UNINIT_MARK_SUFFIX,
|
||||
};
|
||||
use crate::{METADATA_FILE_NAME, TENANT_CONFIG_NAME, TIMELINE_UNINIT_MARK_SUFFIX};
|
||||
|
||||
pub mod defaults {
|
||||
use crate::tenant_config::defaults::*;
|
||||
@@ -53,8 +51,6 @@ pub mod defaults {
|
||||
pub const DEFAULT_CONCURRENT_TENANT_SIZE_LOGICAL_SIZE_QUERIES: usize =
|
||||
super::ConfigurableSemaphore::DEFAULT_INITIAL.get();
|
||||
|
||||
pub const DEFAULT_TESTING_MODE: bool = false;
|
||||
|
||||
///
|
||||
/// Default built-in configuration file.
|
||||
///
|
||||
@@ -77,8 +73,6 @@ pub mod defaults {
|
||||
|
||||
#concurrent_tenant_size_logical_size_queries = '{DEFAULT_CONCURRENT_TENANT_SIZE_LOGICAL_SIZE_QUERIES}'
|
||||
|
||||
testing_mode = false
|
||||
|
||||
# [tenant_config]
|
||||
#checkpoint_distance = {DEFAULT_CHECKPOINT_DISTANCE} # in bytes
|
||||
#checkpoint_timeout = {DEFAULT_CHECKPOINT_TIMEOUT}
|
||||
@@ -147,9 +141,6 @@ pub struct PageServerConf {
|
||||
|
||||
/// Number of concurrent [`Tenant::gather_size_inputs`] allowed.
|
||||
pub concurrent_tenant_size_logical_size_queries: ConfigurableSemaphore,
|
||||
|
||||
/// Enables failpoint support and extra mgmt APIs useful for testing.
|
||||
pub testing_mode: bool,
|
||||
}
|
||||
|
||||
/// We do not want to store this in a PageServerConf because the latter may be logged
|
||||
@@ -229,8 +220,6 @@ struct PageServerConfigBuilder {
|
||||
log_format: BuilderValue<LogFormat>,
|
||||
|
||||
concurrent_tenant_size_logical_size_queries: BuilderValue<ConfigurableSemaphore>,
|
||||
|
||||
testing_mode: BuilderValue<bool>,
|
||||
}
|
||||
|
||||
impl Default for PageServerConfigBuilder {
|
||||
@@ -261,8 +250,6 @@ impl Default for PageServerConfigBuilder {
|
||||
log_format: Set(LogFormat::from_str(DEFAULT_LOG_FORMAT).unwrap()),
|
||||
|
||||
concurrent_tenant_size_logical_size_queries: Set(ConfigurableSemaphore::default()),
|
||||
|
||||
testing_mode: Set(DEFAULT_TESTING_MODE),
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -343,11 +330,11 @@ impl PageServerConfigBuilder {
|
||||
self.concurrent_tenant_size_logical_size_queries = BuilderValue::Set(u);
|
||||
}
|
||||
|
||||
pub fn testing_mode(&mut self, testing_mode: bool) {
|
||||
self.testing_mode = BuilderValue::Set(testing_mode);
|
||||
}
|
||||
|
||||
pub fn build(self) -> anyhow::Result<PageServerConf> {
|
||||
let broker_endpoints = self
|
||||
.broker_endpoints
|
||||
.ok_or(anyhow!("No broker endpoints provided"))?;
|
||||
|
||||
Ok(PageServerConf {
|
||||
listen_pg_addr: self
|
||||
.listen_pg_addr
|
||||
@@ -383,9 +370,7 @@ impl PageServerConfigBuilder {
|
||||
profiling: self.profiling.ok_or(anyhow!("missing profiling"))?,
|
||||
// TenantConf is handled separately
|
||||
default_tenant_conf: TenantConf::default(),
|
||||
broker_endpoints: self
|
||||
.broker_endpoints
|
||||
.ok_or(anyhow!("No broker endpoints provided"))?,
|
||||
broker_endpoints,
|
||||
broker_etcd_prefix: self
|
||||
.broker_etcd_prefix
|
||||
.ok_or(anyhow!("missing broker_etcd_prefix"))?,
|
||||
@@ -395,7 +380,6 @@ impl PageServerConfigBuilder {
|
||||
.ok_or(anyhow!(
|
||||
"missing concurrent_tenant_size_logical_size_queries"
|
||||
))?,
|
||||
testing_mode: self.testing_mode.ok_or(anyhow!("missing testing_mode"))?,
|
||||
})
|
||||
}
|
||||
}
|
||||
@@ -418,10 +402,6 @@ impl PageServerConf {
|
||||
.join(TENANT_ATTACHING_MARKER_FILENAME)
|
||||
}
|
||||
|
||||
pub fn tenant_ignore_mark_file_path(&self, tenant_id: TenantId) -> PathBuf {
|
||||
self.tenant_path(&tenant_id).join(IGNORED_TENANT_FILE_NAME)
|
||||
}
|
||||
|
||||
/// Points to a place in pageserver's local directory,
|
||||
/// where certain tenant's tenantconf file should be located.
|
||||
pub fn tenant_config_path(&self, tenant_id: TenantId) -> PathBuf {
|
||||
@@ -470,28 +450,6 @@ impl PageServerConf {
|
||||
.join(METADATA_FILE_NAME)
|
||||
}
|
||||
|
||||
/// Files on the remote storage are stored with paths, relative to the workdir.
|
||||
/// That path includes in itself both tenant and timeline ids, allowing to have a unique remote storage path.
|
||||
///
|
||||
/// Errors if the path provided does not start from pageserver's workdir.
|
||||
pub fn remote_path(&self, local_path: &Path) -> anyhow::Result<RemotePath> {
|
||||
local_path
|
||||
.strip_prefix(&self.workdir)
|
||||
.context("Failed to strip workdir prefix")
|
||||
.and_then(RemotePath::new)
|
||||
.with_context(|| {
|
||||
format!(
|
||||
"Failed to resolve remote part of path {:?} for base {:?}",
|
||||
local_path, self.workdir
|
||||
)
|
||||
})
|
||||
}
|
||||
|
||||
/// Turns storage remote path of a file into its local path.
|
||||
pub fn local_path(&self, remote_path: &RemotePath) -> PathBuf {
|
||||
remote_path.with_base(&self.workdir)
|
||||
}
|
||||
|
||||
//
|
||||
// Postgres distribution paths
|
||||
//
|
||||
@@ -528,7 +486,7 @@ impl PageServerConf {
|
||||
let mut builder = PageServerConfigBuilder::default();
|
||||
builder.workdir(workdir.to_owned());
|
||||
|
||||
let mut t_conf = TenantConfOpt::default();
|
||||
let mut t_conf: TenantConfOpt = Default::default();
|
||||
|
||||
for (key, item) in toml.iter() {
|
||||
match key {
|
||||
@@ -576,7 +534,6 @@ impl PageServerConf {
|
||||
let permits = NonZeroUsize::new(permits).context("initial semaphore permits out of range: 0, use other configuration to disable a feature")?;
|
||||
ConfigurableSemaphore::new(permits)
|
||||
}),
|
||||
"testing_mode" => builder.testing_mode(parse_toml_bool(key, item)?),
|
||||
_ => bail!("unrecognized pageserver option '{key}'"),
|
||||
}
|
||||
}
|
||||
@@ -660,10 +617,6 @@ impl PageServerConf {
|
||||
if let Some(max_lsn_wal_lag) = item.get("max_lsn_wal_lag") {
|
||||
t_conf.max_lsn_wal_lag = Some(parse_toml_from_str("max_lsn_wal_lag", max_lsn_wal_lag)?);
|
||||
}
|
||||
if let Some(trace_read_requests) = item.get("trace_read_requests") {
|
||||
t_conf.trace_read_requests =
|
||||
Some(parse_toml_bool("trace_read_requests", trace_read_requests)?);
|
||||
}
|
||||
|
||||
Ok(t_conf)
|
||||
}
|
||||
@@ -696,7 +649,6 @@ impl PageServerConf {
|
||||
broker_etcd_prefix: etcd_broker::DEFAULT_NEON_BROKER_ETCD_PREFIX.to_string(),
|
||||
log_format: LogFormat::from_str(defaults::DEFAULT_LOG_FORMAT).unwrap(),
|
||||
concurrent_tenant_size_logical_size_queries: ConfigurableSemaphore::default(),
|
||||
testing_mode: true,
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -710,11 +662,6 @@ fn parse_toml_string(name: &str, item: &Item) -> Result<String> {
|
||||
Ok(s.to_string())
|
||||
}
|
||||
|
||||
fn parse_toml_bool(name: &str, item: &Item) -> Result<bool> {
|
||||
item.as_bool()
|
||||
.with_context(|| format!("configure option {name} is not a boolean"))
|
||||
}
|
||||
|
||||
fn parse_toml_u64(name: &str, item: &Item) -> Result<u64> {
|
||||
// A toml integer is signed, so it cannot represent the full range of an u64. That's OK
|
||||
// for our use, though.
|
||||
@@ -891,7 +838,6 @@ log_format = 'json'
|
||||
broker_etcd_prefix: etcd_broker::DEFAULT_NEON_BROKER_ETCD_PREFIX.to_string(),
|
||||
log_format: LogFormat::from_str(defaults::DEFAULT_LOG_FORMAT).unwrap(),
|
||||
concurrent_tenant_size_logical_size_queries: ConfigurableSemaphore::default(),
|
||||
testing_mode: defaults::DEFAULT_TESTING_MODE,
|
||||
},
|
||||
"Correct defaults should be used when no config values are provided"
|
||||
);
|
||||
@@ -938,7 +884,6 @@ log_format = 'json'
|
||||
broker_etcd_prefix: etcd_broker::DEFAULT_NEON_BROKER_ETCD_PREFIX.to_string(),
|
||||
log_format: LogFormat::Json,
|
||||
concurrent_tenant_size_logical_size_queries: ConfigurableSemaphore::default(),
|
||||
testing_mode: defaults::DEFAULT_TESTING_MODE,
|
||||
},
|
||||
"Should be able to parse all basic config values correctly"
|
||||
);
|
||||
@@ -1071,35 +1016,6 @@ broker_endpoints = ['{broker_endpoint}']
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn parse_tenant_config() -> anyhow::Result<()> {
|
||||
let tempdir = tempdir()?;
|
||||
let (workdir, pg_distrib_dir) = prepare_fs(&tempdir)?;
|
||||
|
||||
let broker_endpoint = "http://127.0.0.1:7777";
|
||||
let trace_read_requests = true;
|
||||
|
||||
let config_string = format!(
|
||||
r#"{ALL_BASE_VALUES_TOML}
|
||||
pg_distrib_dir='{}'
|
||||
broker_endpoints = ['{broker_endpoint}']
|
||||
|
||||
[tenant_config]
|
||||
trace_read_requests = {trace_read_requests}"#,
|
||||
pg_distrib_dir.display(),
|
||||
);
|
||||
|
||||
let toml = config_string.parse()?;
|
||||
|
||||
let conf = PageServerConf::parse_and_validate(&toml, &workdir)?;
|
||||
assert_eq!(
|
||||
conf.default_tenant_conf.trace_read_requests, trace_read_requests,
|
||||
"Tenant config from pageserver config file should be parsed and udpated values used as defaults for all tenants",
|
||||
);
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn prepare_fs(tempdir: &TempDir) -> anyhow::Result<(PathBuf, PathBuf)> {
|
||||
let tempdir_path = tempdir.path();
|
||||
|
||||
|
||||
@@ -274,7 +274,6 @@ paths:
|
||||
schema:
|
||||
type: string
|
||||
format: hex
|
||||
|
||||
post:
|
||||
description: Schedules attach operation to happen in the background for given tenant
|
||||
responses:
|
||||
@@ -326,9 +325,7 @@ paths:
|
||||
type: string
|
||||
format: hex
|
||||
post:
|
||||
description: |
|
||||
Remove tenant data (including all corresponding timelines) from pageserver's memory and file system.
|
||||
Files on the remote storage are not affected.
|
||||
description: Detach local tenant
|
||||
responses:
|
||||
"200":
|
||||
description: Tenant detached
|
||||
@@ -357,92 +354,6 @@ paths:
|
||||
schema:
|
||||
$ref: "#/components/schemas/Error"
|
||||
|
||||
/v1/tenant/{tenant_id}/ignore:
|
||||
parameters:
|
||||
- name: tenant_id
|
||||
in: path
|
||||
required: true
|
||||
schema:
|
||||
type: string
|
||||
format: hex
|
||||
post:
|
||||
description: |
|
||||
Remove tenant data (including all corresponding timelines) from pageserver's memory.
|
||||
Files on local disk and remote storage are not affected.
|
||||
|
||||
Future pageserver restarts won't load the data back until `load` is called on such tenant.
|
||||
responses:
|
||||
"200":
|
||||
description: Tenant ignored
|
||||
"400":
|
||||
description: Error when no tenant id found in path parameters
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/Error"
|
||||
"401":
|
||||
description: Unauthorized Error
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/UnauthorizedError"
|
||||
"403":
|
||||
description: Forbidden Error
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/ForbiddenError"
|
||||
"500":
|
||||
description: Generic operation error
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/Error"
|
||||
|
||||
/v1/tenant/{tenant_id}/load:
|
||||
parameters:
|
||||
- name: tenant_id
|
||||
in: path
|
||||
required: true
|
||||
schema:
|
||||
type: string
|
||||
format: hex
|
||||
post:
|
||||
description: |
|
||||
Schedules an operation that attempts to load a tenant from the local disk and
|
||||
synchronise it with the remote storage (if enabled), repeating pageserver's restart logic for tenant load.
|
||||
If the tenant was ignored before, removes the ignore mark and continues with load scheduling.
|
||||
|
||||
Errors if the tenant is absent on disk, already present in memory or fails to schedule its load.
|
||||
Scheduling a load does not mean that the tenant would load successfully, check tenant status to ensure load correctness.
|
||||
responses:
|
||||
"202":
|
||||
description: Tenant scheduled to load successfully
|
||||
"400":
|
||||
description: Error when no tenant id found in path parameters
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/Error"
|
||||
"401":
|
||||
description: Unauthorized Error
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/UnauthorizedError"
|
||||
"403":
|
||||
description: Forbidden Error
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/ForbiddenError"
|
||||
"500":
|
||||
description: Generic operation error
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/Error"
|
||||
|
||||
/v1/tenant/{tenant_id}/size:
|
||||
parameters:
|
||||
- name: tenant_id
|
||||
@@ -748,6 +659,7 @@ components:
|
||||
- tenant_id
|
||||
- last_record_lsn
|
||||
- disk_consistent_lsn
|
||||
- awaits_download
|
||||
- state
|
||||
- latest_gc_cutoff_lsn
|
||||
properties:
|
||||
@@ -790,6 +702,8 @@ components:
|
||||
format: hex
|
||||
last_received_msg_ts:
|
||||
type: integer
|
||||
awaits_download:
|
||||
type: boolean
|
||||
state:
|
||||
type: string
|
||||
latest_gc_cutoff_lsn:
|
||||
|
||||
@@ -3,18 +3,18 @@ use std::sync::Arc;
|
||||
use anyhow::{anyhow, Context, Result};
|
||||
use hyper::StatusCode;
|
||||
use hyper::{Body, Request, Response, Uri};
|
||||
use pageserver_api::models::TenantState;
|
||||
use remote_storage::GenericRemoteStorage;
|
||||
use tokio::task::JoinError;
|
||||
use tracing::*;
|
||||
|
||||
use super::models::{
|
||||
ConfigureFailpointsRequest, LocalTimelineInfo, RemoteTimelineInfo, StatusResponse,
|
||||
TenantConfigRequest, TenantCreateRequest, TenantCreateResponse, TenantInfo,
|
||||
TimelineCreateRequest, TimelineGcRequest, TimelineInfo,
|
||||
LocalTimelineInfo, RemoteTimelineInfo, StatusResponse, TenantConfigRequest,
|
||||
TenantCreateRequest, TenantCreateResponse, TenantInfo, TimelineCreateRequest, TimelineInfo,
|
||||
};
|
||||
use crate::pgdatadir_mapping::LsnForTimestamp;
|
||||
use crate::tenant::Timeline;
|
||||
use crate::tenant_config::TenantConfOpt;
|
||||
use crate::CheckpointConfig;
|
||||
use crate::{config::PageServerConf, tenant_mgr};
|
||||
use utils::{
|
||||
auth::JwtAuth,
|
||||
@@ -29,6 +29,12 @@ use utils::{
|
||||
lsn::Lsn,
|
||||
};
|
||||
|
||||
// Imports only used for testing APIs
|
||||
#[cfg(feature = "testing")]
|
||||
use super::models::{ConfigureFailpointsRequest, TimelineGcRequest};
|
||||
#[cfg(feature = "testing")]
|
||||
use crate::CheckpointConfig;
|
||||
|
||||
struct State {
|
||||
conf: &'static PageServerConf,
|
||||
auth: Option<Arc<JwtAuth>>,
|
||||
@@ -76,11 +82,12 @@ fn check_permission(request: &Request<Body>, tenant_id: Option<TenantId>) -> Res
|
||||
|
||||
// Helper function to construct a TimelineInfo struct for a timeline
|
||||
fn build_timeline_info(
|
||||
tenant_state: TenantState,
|
||||
timeline: &Arc<Timeline>,
|
||||
include_non_incremental_logical_size: bool,
|
||||
include_non_incremental_physical_size: bool,
|
||||
) -> anyhow::Result<TimelineInfo> {
|
||||
let mut info = build_timeline_info_common(timeline)?;
|
||||
let mut info = build_timeline_info_common(tenant_state, timeline)?;
|
||||
if include_non_incremental_logical_size {
|
||||
info.current_logical_size_non_incremental =
|
||||
Some(timeline.get_current_logical_size_non_incremental(info.last_record_lsn)?);
|
||||
@@ -92,7 +99,10 @@ fn build_timeline_info(
|
||||
Ok(info)
|
||||
}
|
||||
|
||||
fn build_timeline_info_common(timeline: &Arc<Timeline>) -> anyhow::Result<TimelineInfo> {
|
||||
fn build_timeline_info_common(
|
||||
tenant_state: TenantState,
|
||||
timeline: &Arc<Timeline>,
|
||||
) -> anyhow::Result<TimelineInfo> {
|
||||
let last_record_lsn = timeline.get_last_record_lsn();
|
||||
let (wal_source_connstr, last_received_msg_lsn, last_received_msg_ts) = {
|
||||
let guard = timeline.last_received_wal.lock().unwrap();
|
||||
@@ -144,6 +154,10 @@ fn build_timeline_info_common(timeline: &Arc<Timeline>) -> anyhow::Result<Timeli
|
||||
|
||||
state,
|
||||
|
||||
// XXX bring back tracking of downloads per timeline, or, introduce
|
||||
// an 'Attaching' state for the timeline and get rid of this field.
|
||||
awaits_download: tenant_state == TenantState::Attaching,
|
||||
|
||||
// Duplicate some fields in 'local' and 'remote' fields, for backwards-compatibility
|
||||
// with the control plane.
|
||||
local: LocalTimelineInfo {
|
||||
@@ -175,9 +189,7 @@ async fn timeline_create_handler(mut request: Request<Body>) -> Result<Response<
|
||||
.new_timeline_id
|
||||
.unwrap_or_else(TimelineId::generate);
|
||||
|
||||
let tenant = tenant_mgr::get_tenant(tenant_id, true)
|
||||
.await
|
||||
.map_err(ApiError::NotFound)?;
|
||||
let tenant = tenant_mgr::get_tenant(tenant_id, true).map_err(ApiError::NotFound)?;
|
||||
match tenant.create_timeline(
|
||||
new_timeline_id,
|
||||
request_data.ancestor_timeline_id.map(TimelineId::from),
|
||||
@@ -188,7 +200,7 @@ async fn timeline_create_handler(mut request: Request<Body>) -> Result<Response<
|
||||
.await {
|
||||
Ok(Some(new_timeline)) => {
|
||||
// Created. Construct a TimelineInfo for it.
|
||||
let timeline_info = build_timeline_info_common(&new_timeline)
|
||||
let timeline_info = build_timeline_info_common(tenant.current_state(), &new_timeline)
|
||||
.map_err(ApiError::InternalServerError)?;
|
||||
json_response(StatusCode::CREATED, timeline_info)
|
||||
}
|
||||
@@ -205,29 +217,26 @@ async fn timeline_list_handler(request: Request<Body>) -> Result<Response<Body>,
|
||||
query_param_present(&request, "include-non-incremental-physical-size");
|
||||
check_permission(&request, Some(tenant_id))?;
|
||||
|
||||
let response_data = async {
|
||||
let tenant = tenant_mgr::get_tenant(tenant_id, true)
|
||||
.await
|
||||
.map_err(ApiError::NotFound)?;
|
||||
let timelines = tenant.list_timelines();
|
||||
let _entered = info_span!("timeline_list", tenant = %tenant_id).entered();
|
||||
|
||||
let mut response_data = Vec::with_capacity(timelines.len());
|
||||
for timeline in timelines {
|
||||
let timeline_info = build_timeline_info(
|
||||
&timeline,
|
||||
include_non_incremental_logical_size,
|
||||
include_non_incremental_physical_size,
|
||||
)
|
||||
.context("Failed to convert tenant timeline {timeline_id} into the local one: {e:?}")
|
||||
.map_err(ApiError::InternalServerError)?;
|
||||
let (tenant_state, timelines) = {
|
||||
let tenant = tenant_mgr::get_tenant(tenant_id, true).map_err(ApiError::NotFound)?;
|
||||
(tenant.current_state(), tenant.list_timelines())
|
||||
};
|
||||
|
||||
response_data.push(timeline_info);
|
||||
}
|
||||
let mut response_data = Vec::with_capacity(timelines.len());
|
||||
for timeline in timelines {
|
||||
let timeline_info = build_timeline_info(
|
||||
tenant_state,
|
||||
&timeline,
|
||||
include_non_incremental_logical_size,
|
||||
include_non_incremental_physical_size,
|
||||
)
|
||||
.context("Failed to convert tenant timeline {timeline_id} into the local one: {e:?}")
|
||||
.map_err(ApiError::InternalServerError)?;
|
||||
|
||||
Ok(response_data)
|
||||
response_data.push(timeline_info);
|
||||
}
|
||||
.instrument(info_span!("timeline_list", tenant = %tenant_id))
|
||||
.await?;
|
||||
|
||||
json_response(StatusCode::OK, response_data)
|
||||
}
|
||||
@@ -272,15 +281,20 @@ async fn timeline_detail_handler(request: Request<Body>) -> Result<Response<Body
|
||||
check_permission(&request, Some(tenant_id))?;
|
||||
|
||||
let timeline_info = async {
|
||||
let tenant = tenant_mgr::get_tenant(tenant_id, true)
|
||||
.await
|
||||
.map_err(ApiError::NotFound)?;
|
||||
let (tenant_state, timeline) = tokio::task::spawn_blocking(move || {
|
||||
let tenant = tenant_mgr::get_tenant(tenant_id, true).map_err(ApiError::NotFound)?;
|
||||
Ok((
|
||||
tenant.current_state(),
|
||||
tenant.get_timeline(timeline_id, false),
|
||||
))
|
||||
})
|
||||
.await
|
||||
.map_err(|e: JoinError| ApiError::InternalServerError(e.into()))??;
|
||||
|
||||
let timeline = tenant
|
||||
.get_timeline(timeline_id, false)
|
||||
.map_err(ApiError::NotFound)?;
|
||||
let timeline = timeline.map_err(ApiError::NotFound)?;
|
||||
|
||||
let timeline_info = build_timeline_info(
|
||||
tenant_state,
|
||||
&timeline,
|
||||
include_non_incremental_logical_size,
|
||||
include_non_incremental_physical_size,
|
||||
@@ -308,7 +322,6 @@ async fn get_lsn_by_timestamp_handler(request: Request<Body>) -> Result<Response
|
||||
let timestamp_pg = postgres_ffi::to_pg_timestamp(timestamp);
|
||||
|
||||
let timeline = tenant_mgr::get_tenant(tenant_id, true)
|
||||
.await
|
||||
.and_then(|tenant| tenant.get_timeline(timeline_id, true))
|
||||
.map_err(ApiError::NotFound)?;
|
||||
let result = match timeline
|
||||
@@ -334,13 +347,13 @@ async fn tenant_attach_handler(request: Request<Body>) -> Result<Response<Body>,
|
||||
|
||||
if let Some(remote_storage) = &state.remote_storage {
|
||||
// FIXME: distinguish between "Tenant already exists" and other errors
|
||||
tenant_mgr::attach_tenant(state.conf, tenant_id, remote_storage.clone())
|
||||
tenant_mgr::attach_tenant(state.conf, tenant_id, remote_storage)
|
||||
.instrument(info_span!("tenant_attach", tenant = %tenant_id))
|
||||
.await
|
||||
.map_err(ApiError::InternalServerError)?;
|
||||
} else {
|
||||
return Err(ApiError::BadRequest(anyhow!(
|
||||
"attach_tenant is not possible because pageserver was configured without remote storage"
|
||||
"attach_tenant is possible because pageserver was configured without remote storage"
|
||||
)));
|
||||
}
|
||||
|
||||
@@ -379,49 +392,23 @@ async fn tenant_detach_handler(request: Request<Body>) -> Result<Response<Body>,
|
||||
json_response(StatusCode::OK, ())
|
||||
}
|
||||
|
||||
async fn tenant_load_handler(request: Request<Body>) -> Result<Response<Body>, ApiError> {
|
||||
let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?;
|
||||
check_permission(&request, Some(tenant_id))?;
|
||||
|
||||
let state = get_state(&request);
|
||||
tenant_mgr::load_tenant(state.conf, tenant_id, state.remote_storage.clone())
|
||||
.instrument(info_span!("load", tenant = %tenant_id))
|
||||
.await
|
||||
.map_err(ApiError::InternalServerError)?;
|
||||
|
||||
json_response(StatusCode::ACCEPTED, ())
|
||||
}
|
||||
|
||||
async fn tenant_ignore_handler(request: Request<Body>) -> Result<Response<Body>, ApiError> {
|
||||
let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?;
|
||||
check_permission(&request, Some(tenant_id))?;
|
||||
|
||||
let state = get_state(&request);
|
||||
let conf = state.conf;
|
||||
tenant_mgr::ignore_tenant(conf, tenant_id)
|
||||
.instrument(info_span!("ignore_tenant", tenant = %tenant_id))
|
||||
.await
|
||||
// FIXME: Errors from `ignore_tenant` can be caused by both user and internal errors.
|
||||
// Replace this with better handling once the error type permits it.
|
||||
.map_err(ApiError::InternalServerError)?;
|
||||
|
||||
json_response(StatusCode::OK, ())
|
||||
}
|
||||
|
||||
async fn tenant_list_handler(request: Request<Body>) -> Result<Response<Body>, ApiError> {
|
||||
check_permission(&request, None)?;
|
||||
|
||||
let response_data = tenant_mgr::list_tenants()
|
||||
.instrument(info_span!("tenant_list"))
|
||||
.await
|
||||
.iter()
|
||||
.map(|(id, state)| TenantInfo {
|
||||
id: *id,
|
||||
state: *state,
|
||||
current_physical_size: None,
|
||||
has_in_progress_downloads: Some(state.has_in_progress_downloads()),
|
||||
})
|
||||
.collect::<Vec<TenantInfo>>();
|
||||
let response_data = tokio::task::spawn_blocking(move || {
|
||||
let _enter = info_span!("tenant_list").entered();
|
||||
tenant_mgr::list_tenants()
|
||||
.iter()
|
||||
.map(|(id, state)| TenantInfo {
|
||||
id: *id,
|
||||
state: *state,
|
||||
current_physical_size: None,
|
||||
has_in_progress_downloads: Some(state.has_in_progress_downloads()),
|
||||
})
|
||||
.collect::<Vec<TenantInfo>>()
|
||||
})
|
||||
.await
|
||||
.map_err(|e: JoinError| ApiError::InternalServerError(e.into()))?;
|
||||
|
||||
json_response(StatusCode::OK, response_data)
|
||||
}
|
||||
@@ -430,8 +417,9 @@ async fn tenant_status(request: Request<Body>) -> Result<Response<Body>, ApiErro
|
||||
let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?;
|
||||
check_permission(&request, Some(tenant_id))?;
|
||||
|
||||
let tenant_info = async {
|
||||
let tenant = tenant_mgr::get_tenant(tenant_id, false).await?;
|
||||
let tenant_info = tokio::task::spawn_blocking(move || {
|
||||
let _enter = info_span!("tenant_status_handler", tenant = %tenant_id).entered();
|
||||
let tenant = tenant_mgr::get_tenant(tenant_id, false)?;
|
||||
|
||||
// Calculate total physical size of all timelines
|
||||
let mut current_physical_size = 0;
|
||||
@@ -440,15 +428,17 @@ async fn tenant_status(request: Request<Body>) -> Result<Response<Body>, ApiErro
|
||||
}
|
||||
|
||||
let state = tenant.current_state();
|
||||
Ok(TenantInfo {
|
||||
let tenant_info = TenantInfo {
|
||||
id: tenant_id,
|
||||
state,
|
||||
current_physical_size: Some(current_physical_size),
|
||||
has_in_progress_downloads: Some(state.has_in_progress_downloads()),
|
||||
})
|
||||
}
|
||||
.instrument(info_span!("tenant_status_handler", tenant = %tenant_id))
|
||||
};
|
||||
|
||||
Ok::<_, anyhow::Error>(tenant_info)
|
||||
})
|
||||
.await
|
||||
.map_err(|e: JoinError| ApiError::InternalServerError(e.into()))?
|
||||
.map_err(ApiError::InternalServerError)?;
|
||||
|
||||
json_response(StatusCode::OK, tenant_info)
|
||||
@@ -458,9 +448,7 @@ async fn tenant_size_handler(request: Request<Body>) -> Result<Response<Body>, A
|
||||
let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?;
|
||||
check_permission(&request, Some(tenant_id))?;
|
||||
|
||||
let tenant = tenant_mgr::get_tenant(tenant_id, true)
|
||||
.await
|
||||
.map_err(ApiError::InternalServerError)?;
|
||||
let tenant = tenant_mgr::get_tenant(tenant_id, true).map_err(ApiError::InternalServerError)?;
|
||||
|
||||
// this can be long operation, it currently is not backed by any request coalescing or similar
|
||||
let inputs = tenant
|
||||
@@ -577,19 +565,22 @@ async fn tenant_create_handler(mut request: Request<Body>) -> Result<Response<Bo
|
||||
.map(TenantId::from)
|
||||
.unwrap_or_else(TenantId::generate);
|
||||
|
||||
let state = get_state(&request);
|
||||
let new_tenant = tokio::task::spawn_blocking(move || {
|
||||
let _enter = info_span!("tenant_create", tenant = ?target_tenant_id).entered();
|
||||
let state = get_state(&request);
|
||||
|
||||
let new_tenant = tenant_mgr::create_tenant(
|
||||
state.conf,
|
||||
tenant_conf,
|
||||
target_tenant_id,
|
||||
state.remote_storage.clone(),
|
||||
)
|
||||
.instrument(info_span!("tenant_create", tenant = ?target_tenant_id))
|
||||
tenant_mgr::create_tenant(
|
||||
state.conf,
|
||||
tenant_conf,
|
||||
target_tenant_id,
|
||||
state.remote_storage.clone(),
|
||||
)
|
||||
// FIXME: `create_tenant` can fail from both user and internal errors. Replace this
|
||||
// with better error handling once the type permits it
|
||||
.map_err(ApiError::InternalServerError)
|
||||
})
|
||||
.await
|
||||
// FIXME: `create_tenant` can fail from both user and internal errors. Replace this
|
||||
// with better error handling once the type permits it
|
||||
.map_err(ApiError::InternalServerError)?;
|
||||
.map_err(|e: JoinError| ApiError::InternalServerError(e.into()))??;
|
||||
|
||||
Ok(match new_tenant {
|
||||
Some(tenant) => {
|
||||
@@ -680,17 +671,22 @@ async fn tenant_config_handler(mut request: Request<Body>) -> Result<Response<Bo
|
||||
);
|
||||
}
|
||||
|
||||
let state = get_state(&request);
|
||||
tenant_mgr::update_tenant_config(state.conf, tenant_conf, tenant_id)
|
||||
.instrument(info_span!("tenant_config", tenant = ?tenant_id))
|
||||
.await
|
||||
// FIXME: `update_tenant_config` can fail because of both user and internal errors.
|
||||
// Replace this `map_err` with better error handling once the type permits it
|
||||
.map_err(ApiError::InternalServerError)?;
|
||||
tokio::task::spawn_blocking(move || {
|
||||
let _enter = info_span!("tenant_config", tenant = ?tenant_id).entered();
|
||||
|
||||
let state = get_state(&request);
|
||||
tenant_mgr::update_tenant_config(state.conf, tenant_conf, tenant_id)
|
||||
// FIXME: `update_tenant_config` can fail because of both user and internal errors.
|
||||
// Replace this `map_err` with better error handling once the type permits it
|
||||
.map_err(ApiError::InternalServerError)
|
||||
})
|
||||
.await
|
||||
.map_err(|e: JoinError| ApiError::InternalServerError(e.into()))??;
|
||||
|
||||
json_response(StatusCode::OK, ())
|
||||
}
|
||||
|
||||
#[cfg(feature = "testing")]
|
||||
async fn failpoints_handler(mut request: Request<Body>) -> Result<Response<Body>, ApiError> {
|
||||
if !fail::has_failpoints() {
|
||||
return Err(ApiError::BadRequest(anyhow!(
|
||||
@@ -724,6 +720,7 @@ async fn failpoints_handler(mut request: Request<Body>) -> Result<Response<Body>
|
||||
}
|
||||
|
||||
// Run GC immediately on given timeline.
|
||||
#[cfg(feature = "testing")]
|
||||
async fn timeline_gc_handler(mut request: Request<Body>) -> Result<Response<Body>, ApiError> {
|
||||
let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?;
|
||||
let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?;
|
||||
@@ -731,7 +728,7 @@ async fn timeline_gc_handler(mut request: Request<Body>) -> Result<Response<Body
|
||||
|
||||
let gc_req: TimelineGcRequest = json_request(&mut request).await?;
|
||||
|
||||
let wait_task_done = tenant_mgr::immediate_gc(tenant_id, timeline_id, gc_req).await?;
|
||||
let wait_task_done = tenant_mgr::immediate_gc(tenant_id, timeline_id, gc_req)?;
|
||||
let gc_result = wait_task_done
|
||||
.await
|
||||
.context("wait for gc task")
|
||||
@@ -742,14 +739,13 @@ async fn timeline_gc_handler(mut request: Request<Body>) -> Result<Response<Body
|
||||
}
|
||||
|
||||
// Run compaction immediately on given timeline.
|
||||
#[cfg(feature = "testing")]
|
||||
async fn timeline_compact_handler(request: Request<Body>) -> Result<Response<Body>, ApiError> {
|
||||
let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?;
|
||||
let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?;
|
||||
check_permission(&request, Some(tenant_id))?;
|
||||
|
||||
let tenant = tenant_mgr::get_tenant(tenant_id, true)
|
||||
.await
|
||||
.map_err(ApiError::NotFound)?;
|
||||
let tenant = tenant_mgr::get_tenant(tenant_id, true).map_err(ApiError::NotFound)?;
|
||||
let timeline = tenant
|
||||
.get_timeline(timeline_id, true)
|
||||
.map_err(ApiError::NotFound)?;
|
||||
@@ -762,14 +758,13 @@ async fn timeline_compact_handler(request: Request<Body>) -> Result<Response<Bod
|
||||
}
|
||||
|
||||
// Run checkpoint immediately on given timeline.
|
||||
#[cfg(feature = "testing")]
|
||||
async fn timeline_checkpoint_handler(request: Request<Body>) -> Result<Response<Body>, ApiError> {
|
||||
let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?;
|
||||
let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?;
|
||||
check_permission(&request, Some(tenant_id))?;
|
||||
|
||||
let tenant = tenant_mgr::get_tenant(tenant_id, true)
|
||||
.await
|
||||
.map_err(ApiError::NotFound)?;
|
||||
let tenant = tenant_mgr::get_tenant(tenant_id, true).map_err(ApiError::NotFound)?;
|
||||
let timeline = tenant
|
||||
.get_timeline(timeline_id, true)
|
||||
.map_err(ApiError::NotFound)?;
|
||||
@@ -806,26 +801,22 @@ pub fn make_router(
|
||||
}))
|
||||
}
|
||||
|
||||
// A wrapper around a handler function that returns an error if the server
|
||||
// was not configured with testing_mode enabled. This is used to gate API
|
||||
// functions that should only be used in tests, never in production.
|
||||
macro_rules! testing_api {
|
||||
($handler_desc:literal, $handler:path $(,)?) => {{
|
||||
use futures::FutureExt;
|
||||
|req: Request<Body>| {
|
||||
if conf.testing_mode {
|
||||
$handler(req).left_future()
|
||||
} else {
|
||||
async {
|
||||
Err(ApiError::BadRequest(anyhow!(concat!(
|
||||
"Cannot ",
|
||||
$handler_desc,
|
||||
" because pageserver was configured without testing APIs",
|
||||
))))
|
||||
}
|
||||
.right_future()
|
||||
}
|
||||
#[cfg(not(feature = "testing"))]
|
||||
async fn cfg_disabled(_req: Request<Body>) -> Result<Response<Body>, ApiError> {
|
||||
Err(ApiError::BadRequest(anyhow!(concat!(
|
||||
"Cannot ",
|
||||
$handler_desc,
|
||||
" because pageserver was compiled without testing APIs",
|
||||
))))
|
||||
}
|
||||
|
||||
#[cfg(feature = "testing")]
|
||||
let handler = $handler;
|
||||
#[cfg(not(feature = "testing"))]
|
||||
let handler = cfg_disabled;
|
||||
handler
|
||||
}};
|
||||
}
|
||||
|
||||
@@ -847,8 +838,6 @@ pub fn make_router(
|
||||
.post("/v1/tenant/:tenant_id/timeline", timeline_create_handler)
|
||||
.post("/v1/tenant/:tenant_id/attach", tenant_attach_handler)
|
||||
.post("/v1/tenant/:tenant_id/detach", tenant_detach_handler)
|
||||
.post("/v1/tenant/:tenant_id/load", tenant_load_handler)
|
||||
.post("/v1/tenant/:tenant_id/ignore", tenant_ignore_handler)
|
||||
.get(
|
||||
"/v1/tenant/:tenant_id/timeline/:timeline_id",
|
||||
timeline_detail_handler,
|
||||
|
||||
@@ -10,8 +10,7 @@ pub mod page_service;
|
||||
pub mod pgdatadir_mapping;
|
||||
pub mod profiling;
|
||||
pub mod repository;
|
||||
pub mod storage_sync2;
|
||||
pub use storage_sync2 as storage_sync;
|
||||
pub mod storage_sync;
|
||||
pub mod task_mgr;
|
||||
pub mod tenant;
|
||||
pub mod tenant_config;
|
||||
@@ -125,13 +124,6 @@ pub const TEMP_FILE_SUFFIX: &str = "___temp";
|
||||
/// Full path: `tenants/<tenant_id>/timelines/<timeline_id>___uninit`.
|
||||
pub const TIMELINE_UNINIT_MARK_SUFFIX: &str = "___uninit";
|
||||
|
||||
/// A marker file to prevent pageserver from loading a certain tenant on restart.
|
||||
/// Different from [`TIMELINE_UNINIT_MARK_SUFFIX`] due to semantics of the corresponding
|
||||
/// `ignore` management API command, that expects the ignored tenant to be properly loaded
|
||||
/// into pageserver's memory before being ignored.
|
||||
/// Full path: `tenants/<tenant_id>/___ignored_tenant`.
|
||||
pub const IGNORED_TENANT_FILE_NAME: &str = "___ignored_tenant";
|
||||
|
||||
pub fn is_temporary(path: &Path) -> bool {
|
||||
match path.file_name() {
|
||||
Some(name) => name.to_string_lossy().ends_with(TEMP_FILE_SUFFIX),
|
||||
@@ -148,28 +140,6 @@ pub fn is_uninit_mark(path: &Path) -> bool {
|
||||
}
|
||||
}
|
||||
|
||||
///
|
||||
/// Wrapper around fail::fail_point! macro that returns quickly if testing_mode was
|
||||
/// disabled in the pageserver config. Also enabled in unit tests.
|
||||
///
|
||||
/// fail::fail_point! is fairly quick, but it does acquire an RwLock and perform a HashMap
|
||||
/// lookup. This macro is hopefully cheap enough that we don't need to worry about the
|
||||
/// overhead even in production, and even if the macro is used in hot spots. (This check
|
||||
/// compiles to two cmp instructions; get_unchecked() would shrink it to one.)
|
||||
///
|
||||
#[macro_export]
|
||||
macro_rules! fail_point {
|
||||
($($name:expr),*) => {{
|
||||
if cfg!(test) || *$crate::TESTING_MODE.get().expect("testing_mode not initialized") {
|
||||
fail::fail_point!($($name), *)
|
||||
}
|
||||
}};
|
||||
}
|
||||
|
||||
/// This is set early in the pageserver startup, from the "testing_mode" setting in the
|
||||
/// config file.
|
||||
pub static TESTING_MODE: once_cell::sync::OnceCell<bool> = once_cell::sync::OnceCell::new();
|
||||
|
||||
#[cfg(test)]
|
||||
mod backoff_defaults_tests {
|
||||
use super::*;
|
||||
|
||||
@@ -315,7 +315,6 @@ impl PageServerHandler {
|
||||
|
||||
let copy_data_bytes = match msg? {
|
||||
Some(FeMessage::CopyData(bytes)) => bytes,
|
||||
Some(FeMessage::Terminate) => break,
|
||||
Some(m) => {
|
||||
bail!("unexpected message: {m:?} during COPY");
|
||||
}
|
||||
@@ -941,7 +940,7 @@ impl postgres_backend_async::Handler for PageServerHandler {
|
||||
/// ensures that queries don't fail immediately after pageserver startup, because
|
||||
/// all tenants are still loading.
|
||||
async fn get_active_tenant_with_timeout(tenant_id: TenantId) -> Result<Arc<Tenant>> {
|
||||
let tenant = tenant_mgr::get_tenant(tenant_id, false).await?;
|
||||
let tenant = tenant_mgr::get_tenant(tenant_id, false)?;
|
||||
match tokio::time::timeout(Duration::from_secs(30), tenant.wait_to_become_active()).await {
|
||||
Ok(wait_result) => wait_result
|
||||
// no .context(), the error message is good enough and some tests depend on it
|
||||
|
||||
@@ -79,13 +79,6 @@
|
||||
//! - We rely on read-after write consistency in the remote storage.
|
||||
//! - Layer files are immutable
|
||||
//!
|
||||
//! NB: Pageserver assumes that it has exclusive write access to the tenant in remote
|
||||
//! storage. Different tenants can be attached to different pageservers, but if the
|
||||
//! same tenant is attached to two pageservers at the same time, they will overwrite
|
||||
//! each other's index file updates, and confusion will ensue. There's no interlock or
|
||||
//! mechanism to detect that in the pageserver, we rely on the control plane to ensure
|
||||
//! that that doesn't happen.
|
||||
//!
|
||||
//! ## Implementation Note
|
||||
//!
|
||||
//! The *actual* remote state lags behind the *desired* remote state while
|
||||
@@ -152,10 +145,6 @@
|
||||
//!
|
||||
//! # Downloads (= Tenant Attach)
|
||||
//!
|
||||
//! In addition to the upload queue, [`RemoteTimelineClient`] has functions for
|
||||
//! downloading files from the remote storage. Downloads are performed immediately,
|
||||
//! independently of the uploads.
|
||||
//!
|
||||
//! When we attach a tenant, we perform the following steps:
|
||||
//! - create `Tenant` object in `TenantState::Attaching` state
|
||||
//! - List timelines that are present in remote storage, and download their remote [`IndexPart`]s
|
||||
@@ -185,6 +174,60 @@
|
||||
//! in remote storage.
|
||||
//! But note that we don't test any of this right now.
|
||||
//!
|
||||
//!
|
||||
//! # RANDOM NOTES FROM THE PAST (TODO: DELETE / DEDUP WITH CONTENT ABOVE)
|
||||
//!
|
||||
//! * pageserver assumes it has exclusive write access to the remote storage. If supported, multiple pageservers can be separated within the same storage
|
||||
//! (i.e. using different directories in the local filesystem external storage), but that is totally up to the storage implementation and not covered by the trait API.
|
||||
//!
|
||||
//! * the sync tasks may not be processed immediately after submission: if they error and get re-enqueued, their execution might be backed off to ensure the error cap is not exceeded too fast.
|
||||
//! The sync queue processing also happens in batches, so the sync tasks can wait in the queue for some time.
|
||||
//!
|
||||
//! Uploads are queued and executed in the background and in parallel, enforcing the ordering rules.
|
||||
//! Downloads are performed immediately, and independently of the uploads.
|
||||
//!
|
||||
//! Deletion happens only after a successful upload; otherwise the compaction output might make the timeline inconsistent until both tasks are fully processed without errors.
|
||||
//! Upload and download update the remote data (inmemory index and S3 json index part file) only after every layer is successfully synchronized, while the deletion task
|
||||
//! does the opposite: it requires the remote data to be updated successfully first, so that the blob files become invisible to the pageserver.
|
||||
//!
|
||||
//! FIXME: how is the initial list of remote files created now? Update this paragraph
|
||||
//! During the loop startup, an initial [`RemoteTimelineIndex`] state is constructed via downloading and merging the index data for all timelines,
|
||||
//! present locally.
|
||||
//! It's enough to poll such timelines' remote state once on startup only, due to an agreement that only one pageserver at a time has an exclusive
|
||||
//! write access to the remote portion of timelines that are attached to the pageserver.
|
||||
//! The index state is used to issue initial sync tasks, if needed:
|
||||
//! * all timelines with local state behind the remote get download tasks scheduled.
|
||||
//! Such timelines are considered "remote" before the download succeeds, so a number of operations (gc, checkpoints) on that timeline are unavailable
|
||||
//! before up-to-date layers and metadata file are downloaded locally.
|
||||
//! * all newer local state gets scheduled for upload, such timelines are "local" and fully operational
|
||||
//! * remote timelines not present locally are unknown to pageserver, but can be downloaded on a separate request
|
||||
//!
|
||||
//! Then, the index is shared across pageserver under [`RemoteIndex`] guard to ensure proper synchronization.
|
||||
//! The remote index gets updated after every remote storage change (after an upload), same as the index part files remotely.
|
||||
//!
|
||||
//! Remote timeline contains a set of layer files, created during checkpoint(s) and the serialized [`IndexPart`] file with timeline metadata and all remote layer paths inside.
|
||||
//! Those paths are used instead of the `S3 list` command to avoid its slowness and cost for a large number of files.
|
||||
//! If the index part does not contain some file path but it's present remotely, such file is invisible to pageserver and ignored.
|
||||
//! Among other tasks, the index is used to prevent invalid uploads and non-existing downloads on demand, refer to [`index`] for more details.
|
||||
//!
|
||||
//! FIXME: update this paragraph
|
||||
//! Index construction is currently the only place where the storage sync can return an [`Err`] to the user.
|
||||
//! New sync tasks are accepted via [`schedule_layer_upload`], [`schedule_layer_download`] and [`schedule_layer_delete`] functions.
|
||||
//! After the initial state is loaded into memory and the loop starts, any further [`Err`] results do not stop the loop, but rather
|
||||
//! reschedule the same task, with possibly less files to sync:
|
||||
//! * download tasks currently never replace existing local file with metadata file as an exception
|
||||
//! (but this is subject to change when checksum checks are implemented: all files could get overwritten on a checksum mismatch)
|
||||
//! * download tasks carry the information of skipped archives, so resubmissions are not downloading successfully processed layers again
|
||||
//! * downloads do not contain any actual files to download, so that "external", sync pageserver code is able to schedule the timeline download
|
||||
//! without accessing any extra information about its files.
|
||||
//!
|
||||
//! FIXME: update this paragraph
|
||||
//! Uploads and downloads sync layer files in arbitrary order, but only after all layer files are synced are the local metadata (for download) and remote index part (for upload) updated,
|
||||
//! to avoid having a corrupt state without the relevant layer files.
|
||||
//! Refer to [`upload`] and [`download`] for more details.
|
||||
//!
|
||||
//! Synchronization never removes any local files from pageserver workdir or remote files from the remote storage, yet there could be overwrites of the same files (index part and metadata file updates, future checksum mismatch fixes).
|
||||
//! NOTE: No real contents or checksum check happens right now; this is subject to improvement later.
|
||||
|
||||
mod delete;
|
||||
mod download;
|
||||
@@ -202,9 +245,9 @@ use std::sync::atomic::{AtomicU32, Ordering};
|
||||
use std::sync::{Arc, Mutex};
|
||||
|
||||
use anyhow::ensure;
|
||||
use remote_storage::{DownloadError, GenericRemoteStorage, RemotePath};
|
||||
use remote_storage::{DownloadError, GenericRemoteStorage};
|
||||
use tokio::runtime::Runtime;
|
||||
use tracing::{info, warn};
|
||||
use tracing::{error, info, warn};
|
||||
use tracing::{info_span, Instrument};
|
||||
|
||||
use utils::lsn::Lsn;
|
||||
@@ -217,7 +260,7 @@ use crate::metrics::RemoteOpKind;
|
||||
use crate::metrics::REMOTE_UPLOAD_QUEUE_UNFINISHED_TASKS;
|
||||
use crate::{
|
||||
config::PageServerConf,
|
||||
storage_sync::index::LayerFileMetadata,
|
||||
storage_sync::index::{LayerFileMetadata, RelativePath},
|
||||
task_mgr,
|
||||
task_mgr::TaskKind,
|
||||
task_mgr::BACKGROUND_RUNTIME,
|
||||
@@ -287,7 +330,7 @@ struct UploadQueueInitialized {
|
||||
|
||||
/// All layer files stored in the remote storage, taking into account all
|
||||
/// in-progress and queued operations
|
||||
latest_files: HashMap<RemotePath, LayerFileMetadata>,
|
||||
latest_files: HashMap<RelativePath, LayerFileMetadata>,
|
||||
|
||||
/// Metadata stored in the remote storage, taking into account all
|
||||
/// in-progress and queued operations.
|
||||
@@ -337,18 +380,18 @@ impl UploadQueue {
|
||||
|
||||
let state = UploadQueueInitialized {
|
||||
// As described in the doc comment, it's ok for `latest_files` and `latest_metadata` to be ahead.
|
||||
latest_files: HashMap::new(),
|
||||
latest_files: Default::default(),
|
||||
latest_metadata: metadata.clone(),
|
||||
// We haven't uploaded anything yet, so, `last_uploaded_consistent_lsn` must be 0 to prevent
|
||||
// safekeepers from garbage-collecting anything.
|
||||
last_uploaded_consistent_lsn: Lsn(0),
|
||||
// what follows are boring default initializations
|
||||
task_counter: 0,
|
||||
task_counter: Default::default(),
|
||||
num_inprogress_layer_uploads: 0,
|
||||
num_inprogress_metadata_uploads: 0,
|
||||
num_inprogress_deletions: 0,
|
||||
inprogress_tasks: HashMap::new(),
|
||||
queued_operations: VecDeque::new(),
|
||||
inprogress_tasks: Default::default(),
|
||||
queued_operations: Default::default(),
|
||||
};
|
||||
|
||||
*self = UploadQueue::Initialized(state);
|
||||
@@ -357,10 +400,6 @@ impl UploadQueue {
|
||||
|
||||
fn initialize_with_current_remote_index_part(
|
||||
&mut self,
|
||||
conf: &'static PageServerConf,
|
||||
tenant_id: TenantId,
|
||||
timeline_id: TimelineId,
|
||||
|
||||
index_part: &IndexPart,
|
||||
) -> anyhow::Result<&mut UploadQueueInitialized> {
|
||||
match self {
|
||||
@@ -370,19 +409,14 @@ impl UploadQueue {
|
||||
}
|
||||
}
|
||||
|
||||
let mut files = HashMap::with_capacity(index_part.timeline_layers.len());
|
||||
let timeline_path = conf.timeline_path(&timeline_id, &tenant_id);
|
||||
for timeline_name in &index_part.timeline_layers {
|
||||
let local_path = timeline_path.join(timeline_name);
|
||||
let remote_timeline_path = conf.remote_path(&local_path).expect(
|
||||
"Remote timeline path and local timeline path were constructed form the same conf",
|
||||
);
|
||||
let mut files = HashMap::new();
|
||||
for path in &index_part.timeline_layers {
|
||||
let layer_metadata = index_part
|
||||
.layer_metadata
|
||||
.get(timeline_name)
|
||||
.get(path)
|
||||
.map(LayerFileMetadata::from)
|
||||
.unwrap_or(LayerFileMetadata::MISSING);
|
||||
files.insert(remote_timeline_path, layer_metadata);
|
||||
files.insert(path.clone(), layer_metadata);
|
||||
}
|
||||
|
||||
let index_part_metadata = index_part.parse_metadata()?;
|
||||
@@ -400,8 +434,8 @@ impl UploadQueue {
|
||||
num_inprogress_layer_uploads: 0,
|
||||
num_inprogress_metadata_uploads: 0,
|
||||
num_inprogress_deletions: 0,
|
||||
inprogress_tasks: HashMap::new(),
|
||||
queued_operations: VecDeque::new(),
|
||||
inprogress_tasks: Default::default(),
|
||||
queued_operations: Default::default(),
|
||||
};
|
||||
|
||||
*self = UploadQueue::Initialized(state);
|
||||
@@ -465,12 +499,7 @@ impl RemoteTimelineClient {
|
||||
/// The given `index_part` must be the one on the remote.
|
||||
pub fn init_upload_queue(&self, index_part: &IndexPart) -> anyhow::Result<()> {
|
||||
let mut upload_queue = self.upload_queue.lock().unwrap();
|
||||
upload_queue.initialize_with_current_remote_index_part(
|
||||
self.conf,
|
||||
self.tenant_id,
|
||||
self.timeline_id,
|
||||
index_part,
|
||||
)?;
|
||||
upload_queue.initialize_with_current_remote_index_part(index_part)?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
@@ -524,13 +553,15 @@ impl RemoteTimelineClient {
|
||||
/// On success, returns the size of the downloaded file.
|
||||
pub async fn download_layer_file(
|
||||
&self,
|
||||
remote_path: &RemotePath,
|
||||
path: &RelativePath,
|
||||
layer_metadata: &LayerFileMetadata,
|
||||
) -> anyhow::Result<u64> {
|
||||
let downloaded_size = download::download_layer_file(
|
||||
self.conf,
|
||||
&self.storage_impl,
|
||||
remote_path,
|
||||
self.tenant_id,
|
||||
self.timeline_id,
|
||||
path,
|
||||
layer_metadata,
|
||||
)
|
||||
.measure_remote_op(
|
||||
@@ -548,13 +579,13 @@ impl RemoteTimelineClient {
|
||||
let new_metadata = LayerFileMetadata::new(downloaded_size);
|
||||
let mut guard = self.upload_queue.lock().unwrap();
|
||||
let upload_queue = guard.initialized_mut()?;
|
||||
if let Some(upgraded) = upload_queue.latest_files.get_mut(remote_path) {
|
||||
if let Some(upgraded) = upload_queue.latest_files.get_mut(path) {
|
||||
upgraded.merge(&new_metadata);
|
||||
} else {
|
||||
// The file should exist, since we just downloaded it.
|
||||
warn!(
|
||||
"downloaded file {:?} not found in local copy of the index file",
|
||||
remote_path
|
||||
path
|
||||
);
|
||||
}
|
||||
}
|
||||
@@ -624,9 +655,14 @@ impl RemoteTimelineClient {
|
||||
"file size not initialized in metadata"
|
||||
);
|
||||
|
||||
let relative_path = RelativePath::from_local_path(
|
||||
&self.conf.timeline_path(&self.timeline_id, &self.tenant_id),
|
||||
path,
|
||||
)?;
|
||||
|
||||
upload_queue
|
||||
.latest_files
|
||||
.insert(self.conf.remote_path(path)?, layer_metadata.clone());
|
||||
.insert(relative_path, layer_metadata.clone());
|
||||
|
||||
let op = UploadOp::UploadLayer(PathBuf::from(path), layer_metadata.clone());
|
||||
self.update_upload_queue_unfinished_metric(1, &op);
|
||||
@@ -648,10 +684,13 @@ impl RemoteTimelineClient {
|
||||
let mut guard = self.upload_queue.lock().unwrap();
|
||||
let upload_queue = guard.initialized_mut()?;
|
||||
|
||||
// Convert the paths into RemotePaths, and gather other information we need.
|
||||
let mut remote_paths = Vec::with_capacity(paths.len());
|
||||
// Convert the paths into RelativePaths, and gather other information we need.
|
||||
let mut relative_paths = Vec::with_capacity(paths.len());
|
||||
for path in paths {
|
||||
remote_paths.push(self.conf.remote_path(path)?);
|
||||
relative_paths.push(RelativePath::from_local_path(
|
||||
&self.conf.timeline_path(&self.timeline_id, &self.tenant_id),
|
||||
path,
|
||||
)?);
|
||||
}
|
||||
|
||||
// Deleting layers doesn't affect the values stored in TimelineMetadata,
|
||||
@@ -667,8 +706,8 @@ impl RemoteTimelineClient {
|
||||
// from latest_files, but not yet scheduled for deletion. Use a closure
|
||||
// to syntactically forbid ? or bail! calls here.
|
||||
let no_bail_here = || {
|
||||
for remote_path in remote_paths {
|
||||
upload_queue.latest_files.remove(&remote_path);
|
||||
for relative_path in relative_paths {
|
||||
upload_queue.latest_files.remove(&relative_path);
|
||||
}
|
||||
|
||||
let index_part = IndexPart::new(
|
||||
@@ -842,19 +881,14 @@ impl RemoteTimelineClient {
|
||||
|
||||
let upload_result: anyhow::Result<()> = match &task.op {
|
||||
UploadOp::UploadLayer(ref path, ref layer_metadata) => {
|
||||
upload::upload_timeline_layer(
|
||||
self.conf,
|
||||
&self.storage_impl,
|
||||
path,
|
||||
layer_metadata,
|
||||
)
|
||||
.measure_remote_op(
|
||||
self.tenant_id,
|
||||
self.timeline_id,
|
||||
RemoteOpFileKind::Layer,
|
||||
RemoteOpKind::Upload,
|
||||
)
|
||||
.await
|
||||
upload::upload_timeline_layer(&self.storage_impl, path, layer_metadata)
|
||||
.measure_remote_op(
|
||||
self.tenant_id,
|
||||
self.timeline_id,
|
||||
RemoteOpFileKind::Layer,
|
||||
RemoteOpKind::Upload,
|
||||
)
|
||||
.await
|
||||
}
|
||||
UploadOp::UploadMetadata(ref index_part, _lsn) => {
|
||||
upload::upload_index_part(
|
||||
@@ -873,7 +907,7 @@ impl RemoteTimelineClient {
|
||||
.await
|
||||
}
|
||||
UploadOp::Delete(metric_file_kind, ref path) => {
|
||||
delete::delete_layer(self.conf, &self.storage_impl, path)
|
||||
delete::delete_layer(&self.storage_impl, path)
|
||||
.measure_remote_op(
|
||||
self.tenant_id,
|
||||
self.timeline_id,
|
||||
@@ -897,20 +931,10 @@ impl RemoteTimelineClient {
|
||||
Err(e) => {
|
||||
let retries = task.retries.fetch_add(1, Ordering::SeqCst);
|
||||
|
||||
// uploads may fail due to rate limits (IAM, S3) or spurious network and external errors
|
||||
// such issues are relatively regular, so don't use WARN or ERROR to avoid alerting
|
||||
// people and tests until the retries are definitely causing delays.
|
||||
if retries < 3 {
|
||||
info!(
|
||||
"failed to perform remote task {}, will retry (attempt {}): {:?}",
|
||||
task.op, retries, e
|
||||
);
|
||||
} else {
|
||||
warn!(
|
||||
"failed to perform remote task {}, will retry (attempt {}): {:?}",
|
||||
task.op, retries, e
|
||||
);
|
||||
}
|
||||
error!(
|
||||
"failed to perform remote task {}, will retry (attempt {}): {:?}",
|
||||
task.op, retries, e
|
||||
);
|
||||
|
||||
// sleep until it's time to retry, or we're cancelled
|
||||
tokio::select! {
|
||||
@@ -975,8 +999,7 @@ impl RemoteTimelineClient {
|
||||
UploadOp::UploadMetadata(_, _) => (RemoteOpFileKind::Index, RemoteOpKind::Upload),
|
||||
UploadOp::Delete(file_kind, _) => (*file_kind, RemoteOpKind::Delete),
|
||||
UploadOp::Barrier(_) => {
|
||||
// we do not account these
|
||||
return;
|
||||
unreachable!("we execute barriers synchronously")
|
||||
}
|
||||
};
|
||||
REMOTE_UPLOAD_QUEUE_UNFINISHED_TASKS
|
||||
@@ -1102,11 +1125,15 @@ mod tests {
|
||||
TimelineMetadata::from_bytes(&metadata.to_bytes().unwrap()).unwrap()
|
||||
}
|
||||
|
||||
fn assert_file_list(a: &HashSet<String>, b: &[&str]) {
|
||||
let mut avec: Vec<&str> = a.iter().map(|a| a.as_str()).collect();
|
||||
fn assert_file_list(a: &HashSet<RelativePath>, b: &[&str]) {
|
||||
let xx = PathBuf::from("");
|
||||
let mut avec: Vec<String> = a
|
||||
.iter()
|
||||
.map(|x| x.to_local_path(&xx).to_string_lossy().into())
|
||||
.collect();
|
||||
avec.sort();
|
||||
|
||||
let mut bvec = b.to_vec();
|
||||
let mut bvec = b.to_owned();
|
||||
bvec.sort_unstable();
|
||||
|
||||
assert_eq!(avec, bvec);
|
||||
@@ -1174,7 +1201,8 @@ mod tests {
|
||||
|
||||
println!("workdir: {}", harness.conf.workdir.display());
|
||||
|
||||
let storage_impl = GenericRemoteStorage::from_config(&storage_config)?;
|
||||
let storage_impl =
|
||||
GenericRemoteStorage::from_config(harness.conf.workdir.clone(), &storage_config)?;
|
||||
let client = Arc::new(RemoteTimelineClient {
|
||||
conf: harness.conf,
|
||||
runtime,
|
||||
38
pageserver/src/storage_sync/delete.rs
Normal file
38
pageserver/src/storage_sync/delete.rs
Normal file
@@ -0,0 +1,38 @@
|
||||
//! Helper functions to delete files from remote storage with a RemoteStorage
|
||||
use anyhow::Context;
|
||||
use std::path::Path;
|
||||
use tracing::debug;
|
||||
|
||||
use remote_storage::GenericRemoteStorage;
|
||||
|
||||
/// Deletes the remote-storage object that corresponds to the layer file at `local_layer_path`.
///
/// The local path is first mapped to a storage path with `storage.remote_object_id`, and that
/// storage path is then passed to `storage.delete`.
///
/// Returns an error if the `before-delete-layer` failpoint is triggered, if the local path
/// cannot be mapped to a storage path, or if the delete call itself fails. Each error carries
/// context naming the path involved.
pub(super) async fn delete_layer(
|
||||
storage: &GenericRemoteStorage,
|
||||
local_layer_path: &Path,
|
||||
) -> anyhow::Result<()> {
|
||||
// Failpoint hook: when it fires, bail out before anything is touched.
fail::fail_point!("before-delete-layer", |_| {
|
||||
anyhow::bail!("failpoint before-delete-layer")
|
||||
});
|
||||
debug!(
|
||||
"Deleting layer from remote storage: {:?}",
|
||||
local_layer_path.display()
|
||||
);
|
||||
|
||||
// Map the local file path to the path used by the remote storage.
let storage_path = storage
|
||||
.remote_object_id(local_layer_path)
|
||||
.with_context(|| {
|
||||
format!(
|
||||
"Failed to get the layer storage path for local path '{}'",
|
||||
local_layer_path.display()
|
||||
)
|
||||
})?;
|
||||
|
||||
// XXX: If the deletion fails because the object already didn't exist,
|
||||
// it would be good to just issue a warning but consider it success.
|
||||
// https://github.com/neondatabase/neon/issues/2934
|
||||
storage.delete(&storage_path).await.with_context(|| {
|
||||
format!(
|
||||
"Failed to delete remote layer from storage at '{:?}'",
|
||||
storage_path
|
||||
)
|
||||
})
|
||||
}
|
||||
@@ -10,11 +10,12 @@ use tracing::debug;
|
||||
|
||||
use crate::config::PageServerConf;
|
||||
use crate::storage_sync::index::LayerFileMetadata;
|
||||
use remote_storage::{DownloadError, GenericRemoteStorage, RemotePath};
|
||||
use remote_storage::{DownloadError, GenericRemoteStorage};
|
||||
use utils::crashsafe::path_with_suffix_extension;
|
||||
use utils::id::{TenantId, TimelineId};
|
||||
|
||||
use super::index::IndexPart;
|
||||
use super::RelativePath;
|
||||
|
||||
async fn fsync_path(path: impl AsRef<std::path::Path>) -> Result<(), std::io::Error> {
|
||||
fs::File::open(path).await?.sync_all().await
|
||||
@@ -28,10 +29,21 @@ async fn fsync_path(path: impl AsRef<std::path::Path>) -> Result<(), std::io::Er
|
||||
pub async fn download_layer_file<'a>(
|
||||
conf: &'static PageServerConf,
|
||||
storage: &'a GenericRemoteStorage,
|
||||
remote_path: &'a RemotePath,
|
||||
tenant_id: TenantId,
|
||||
timeline_id: TimelineId,
|
||||
path: &'a RelativePath,
|
||||
layer_metadata: &'a LayerFileMetadata,
|
||||
) -> anyhow::Result<u64> {
|
||||
let local_path = conf.local_path(remote_path);
|
||||
let timeline_path = conf.timeline_path(&timeline_id, &tenant_id);
|
||||
|
||||
let local_path = path.to_local_path(&timeline_path);
|
||||
|
||||
let layer_storage_path = storage.remote_object_id(&local_path).with_context(|| {
|
||||
format!(
|
||||
"Failed to get the layer storage path for local path '{}'",
|
||||
local_path.display()
|
||||
)
|
||||
})?;
|
||||
|
||||
// Perform a rename inspired by durable_rename from file_utils.c.
|
||||
// The sequence:
|
||||
@@ -52,13 +64,18 @@ pub async fn download_layer_file<'a>(
|
||||
temp_file_path.display()
|
||||
)
|
||||
})?;
|
||||
let mut download = storage.download(remote_path).await.with_context(|| {
|
||||
format!(
|
||||
"Failed to open a download stream for layer with remote storage path '{remote_path:?}'"
|
||||
)
|
||||
})?;
|
||||
let mut download = storage
|
||||
.download(&layer_storage_path)
|
||||
.await
|
||||
.with_context(|| {
|
||||
format!(
|
||||
"Failed to open a download stream for layer with remote storage path '{layer_storage_path:?}'"
|
||||
)
|
||||
})?;
|
||||
let bytes_amount = tokio::io::copy(&mut download.download_stream, &mut destination_file).await.with_context(|| {
|
||||
format!("Failed to download layer with remote storage path '{remote_path:?}' into file {temp_file_path:?}")
|
||||
format!(
|
||||
"Failed to download layer with remote storage path '{layer_storage_path:?}' into file '{}'", temp_file_path.display()
|
||||
)
|
||||
})?;
|
||||
|
||||
// Tokio doc here: https://docs.rs/tokio/1.17.0/tokio/fs/struct.File.html states that:
|
||||
@@ -97,7 +114,7 @@ pub async fn download_layer_file<'a>(
|
||||
})?;
|
||||
drop(destination_file);
|
||||
|
||||
crate::fail_point!("remote-storage-download-pre-rename", |_| {
|
||||
fail::fail_point!("remote-storage-download-pre-rename", |_| {
|
||||
bail!("remote-storage-download-pre-rename failpoint triggered")
|
||||
});
|
||||
|
||||
@@ -134,7 +151,12 @@ pub async fn list_remote_timelines<'a>(
|
||||
tenant_id: TenantId,
|
||||
) -> anyhow::Result<Vec<(TimelineId, IndexPart)>> {
|
||||
let tenant_path = conf.timelines_path(&tenant_id);
|
||||
let tenant_storage_path = conf.remote_path(&tenant_path)?;
|
||||
let tenant_storage_path = storage.remote_object_id(&tenant_path).with_context(|| {
|
||||
format!(
|
||||
"Failed to get tenant storage path for local path '{}'",
|
||||
tenant_path.display()
|
||||
)
|
||||
})?;
|
||||
|
||||
let timelines = storage
|
||||
.list_prefixes(Some(&tenant_storage_path))
|
||||
@@ -196,8 +218,14 @@ pub async fn download_index_part(
|
||||
let index_part_path = conf
|
||||
.metadata_path(timeline_id, tenant_id)
|
||||
.with_file_name(IndexPart::FILE_NAME);
|
||||
let part_storage_path = conf
|
||||
.remote_path(&index_part_path)
|
||||
let part_storage_path = storage
|
||||
.remote_object_id(&index_part_path)
|
||||
.with_context(|| {
|
||||
format!(
|
||||
"Failed to get the index part storage path for local path '{}'",
|
||||
index_part_path.display()
|
||||
)
|
||||
})
|
||||
.map_err(DownloadError::BadInput)?;
|
||||
|
||||
let mut index_part_download = storage.download(&part_storage_path).await?;
|
||||
@@ -208,12 +236,20 @@ pub async fn download_index_part(
|
||||
&mut index_part_bytes,
|
||||
)
|
||||
.await
|
||||
.with_context(|| format!("Failed to download an index part into file {index_part_path:?}"))
|
||||
.with_context(|| {
|
||||
format!(
|
||||
"Failed to download an index part into file '{}'",
|
||||
index_part_path.display()
|
||||
)
|
||||
})
|
||||
.map_err(DownloadError::Other)?;
|
||||
|
||||
let index_part: IndexPart = serde_json::from_slice(&index_part_bytes)
|
||||
.with_context(|| {
|
||||
format!("Failed to deserialize index part file into file {index_part_path:?}")
|
||||
format!(
|
||||
"Failed to deserialize index part file into file '{}'",
|
||||
index_part_path.display()
|
||||
)
|
||||
})
|
||||
.map_err(DownloadError::Other)?;
|
||||
|
||||
@@ -2,9 +2,12 @@
|
||||
//! Able to restore itself from the storage index parts, that are located in every timeline's remote directory and contain all data about
|
||||
//! remote timeline layers and its metadata.
|
||||
|
||||
use std::collections::{HashMap, HashSet};
|
||||
use std::{
|
||||
collections::{HashMap, HashSet},
|
||||
path::{Path, PathBuf},
|
||||
};
|
||||
|
||||
use remote_storage::RemotePath;
|
||||
use anyhow::{Context, Ok};
|
||||
use serde::{Deserialize, Serialize};
|
||||
use serde_with::{serde_as, DisplayFromStr};
|
||||
|
||||
@@ -12,6 +15,33 @@ use crate::tenant::metadata::TimelineMetadata;
|
||||
|
||||
use utils::lsn::Lsn;
|
||||
|
||||
/// A part of the filesystem path, that needs a root to become a path again.
///
/// The relative part is kept as a `String`. Because of `#[serde(transparent)]` the type is
/// (de)serialized exactly like that inner string, not as a wrapper object.
|
||||
#[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord, Hash, Serialize, Deserialize)]
|
||||
#[serde(transparent)]
|
||||
pub struct RelativePath(String);
|
||||
|
||||
impl RelativePath {
|
||||
/// Attempts to strip off the base from path, producing a relative path or an error.
///
/// Fails (with both paths named in the error context) when `path` does not start with
/// `timeline_path`.
|
||||
pub fn from_local_path(timeline_path: &Path, path: &Path) -> anyhow::Result<RelativePath> {
|
||||
let relative = path.strip_prefix(timeline_path).with_context(|| {
|
||||
format!(
|
||||
"path '{}' is not relative to base '{}'",
|
||||
path.display(),
|
||||
timeline_path.display()
|
||||
)
|
||||
})?;
|
||||
Ok(Self::from_filename(relative))
|
||||
}
|
||||
|
||||
/// Wraps `path` as given, without stripping any prefix.
///
/// The conversion goes through `Path::to_string_lossy`, so a path that is not valid
/// Unicode is stored in its lossy form.
pub fn from_filename(path: &Path) -> RelativePath {
|
||||
RelativePath(path.to_string_lossy().to_string())
|
||||
}
|
||||
|
||||
/// Joins the stored relative part onto `timeline_path` and returns the resulting path.
pub fn to_local_path(&self, timeline_path: &Path) -> PathBuf {
|
||||
timeline_path.join(&self.0)
|
||||
}
|
||||
}
|
||||
|
||||
/// Metadata gathered for each of the layer files.
|
||||
///
|
||||
/// Fields have to be `Option`s because remote [`IndexPart`]'s can be from different version, which
|
||||
@@ -67,22 +97,21 @@ pub struct IndexPart {
|
||||
#[serde(default)]
|
||||
version: usize,
|
||||
|
||||
/// Layer names, which are stored on the remote storage.
|
||||
/// Each of the layers present on remote storage.
|
||||
///
|
||||
/// Additional metadata might exist in `layer_metadata`.
|
||||
pub timeline_layers: HashSet<String>,
|
||||
pub timeline_layers: HashSet<RelativePath>,
|
||||
|
||||
/// FIXME: unused field. This should be removed, but that changes the on-disk format,
|
||||
/// so we need to make sure we're backwards-` (and maybe forwards-) compatible
|
||||
/// First pass is to move it to Optional and the next would be its removal
|
||||
missing_layers: Option<HashSet<String>>,
|
||||
/// so we need to make sure we're backwards- (and maybe forwards-) compatible
|
||||
missing_layers: HashSet<RelativePath>,
|
||||
|
||||
/// Per layer file name metadata, which can be present for a present or missing layer file.
|
||||
/// Per layer file metadata, which can be present for a present or missing layer file.
|
||||
///
|
||||
/// Older versions of `IndexPart` will not have this property or have only a part of metadata
|
||||
/// that latest version stores.
|
||||
#[serde(default)]
|
||||
pub layer_metadata: HashMap<String, IndexLayerMetadata>,
|
||||
pub layer_metadata: HashMap<RelativePath, IndexLayerMetadata>,
|
||||
|
||||
// 'disk_consistent_lsn' is a copy of the 'disk_consistent_lsn' in the metadata.
|
||||
// It's duplicated here for convenience.
|
||||
@@ -100,29 +129,23 @@ impl IndexPart {
|
||||
pub const FILE_NAME: &'static str = "index_part.json";
|
||||
|
||||
pub fn new(
|
||||
layers_and_metadata: HashMap<RemotePath, LayerFileMetadata>,
|
||||
layers_and_metadata: HashMap<RelativePath, LayerFileMetadata>,
|
||||
disk_consistent_lsn: Lsn,
|
||||
metadata_bytes: Vec<u8>,
|
||||
) -> Self {
|
||||
let mut timeline_layers = HashSet::with_capacity(layers_and_metadata.len());
|
||||
let mut layer_metadata = HashMap::with_capacity(layers_and_metadata.len());
|
||||
let mut timeline_layers = HashSet::new();
|
||||
let mut layer_metadata = HashMap::new();
|
||||
|
||||
for (remote_path, metadata) in &layers_and_metadata {
|
||||
let metadata = IndexLayerMetadata::from(metadata);
|
||||
match remote_path.object_name() {
|
||||
Some(layer_name) => {
|
||||
timeline_layers.insert(layer_name.to_owned());
|
||||
layer_metadata.insert(layer_name.to_owned(), metadata);
|
||||
}
|
||||
// TODO move this on a type level: we know, that every layer entry does have a name
|
||||
None => panic!("Layer {remote_path:?} has no file name, skipping"),
|
||||
}
|
||||
}
|
||||
separate_paths_and_metadata(
|
||||
&layers_and_metadata,
|
||||
&mut timeline_layers,
|
||||
&mut layer_metadata,
|
||||
);
|
||||
|
||||
Self {
|
||||
version: Self::LATEST_VERSION,
|
||||
timeline_layers,
|
||||
missing_layers: Some(HashSet::new()),
|
||||
missing_layers: HashSet::new(),
|
||||
layer_metadata,
|
||||
disk_consistent_lsn,
|
||||
metadata_bytes,
|
||||
@@ -148,6 +171,18 @@ impl From<&'_ LayerFileMetadata> for IndexLayerMetadata {
|
||||
}
|
||||
}
|
||||
|
||||
/// Splits a path-to-metadata map into the two collections stored in an index part.
///
/// For every entry of `input`, the path is cloned into `output` and the metadata, converted
/// with `IndexLayerMetadata::from`, is inserted into `layer_metadata` under the same path.
/// Both output collections are extended in place; they are not cleared first.
fn separate_paths_and_metadata(
|
||||
input: &HashMap<RelativePath, LayerFileMetadata>,
|
||||
output: &mut HashSet<RelativePath>,
|
||||
layer_metadata: &mut HashMap<RelativePath, IndexLayerMetadata>,
|
||||
) {
|
||||
for (path, metadata) in input {
|
||||
let metadata = IndexLayerMetadata::from(metadata);
|
||||
layer_metadata.insert(path.clone(), metadata);
|
||||
output.insert(path.clone());
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
@@ -163,8 +198,8 @@ mod tests {
|
||||
|
||||
let expected = IndexPart {
|
||||
version: 0,
|
||||
timeline_layers: HashSet::from([String::from("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9")]),
|
||||
missing_layers: Some(HashSet::from([String::from("not_a_real_layer_but_adding_coverage")])),
|
||||
timeline_layers: [RelativePath("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9".to_owned())].into_iter().collect(),
|
||||
missing_layers: [RelativePath("not_a_real_layer_but_adding_coverage".to_owned())].into_iter().collect(),
|
||||
layer_metadata: HashMap::default(),
|
||||
disk_consistent_lsn: "0/16960E8".parse::<Lsn>().unwrap(),
|
||||
metadata_bytes: [113,11,159,210,0,54,0,4,0,0,0,0,1,105,96,232,1,0,0,0,0,1,105,96,112,0,0,0,0,0,0,0,0,0,0,0,0,0,1,105,96,112,0,0,0,0,1,105,96,112,0,0,0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0].to_vec(),
|
||||
@@ -191,13 +226,13 @@ mod tests {
|
||||
let expected = IndexPart {
|
||||
// note this is not verified, could be anything, but exists for humans debugging.. could be the git version instead?
|
||||
version: 1,
|
||||
timeline_layers: HashSet::from([String::from("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9")]),
|
||||
missing_layers: Some(HashSet::from([String::from("not_a_real_layer_but_adding_coverage")])),
|
||||
timeline_layers: [RelativePath("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9".to_owned())].into_iter().collect(),
|
||||
missing_layers: [RelativePath("not_a_real_layer_but_adding_coverage".to_owned())].into_iter().collect(),
|
||||
layer_metadata: HashMap::from([
|
||||
(String::from("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9"), IndexLayerMetadata {
|
||||
(RelativePath("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9".to_owned()), IndexLayerMetadata {
|
||||
file_size: Some(25600000),
|
||||
}),
|
||||
(String::from("not_a_real_layer_but_adding_coverage"), IndexLayerMetadata {
|
||||
(RelativePath("not_a_real_layer_but_adding_coverage".to_owned()), IndexLayerMetadata {
|
||||
// serde_json should always parse this but this might be a double with jq for
|
||||
// example.
|
||||
file_size: Some(9007199254741001),
|
||||
@@ -210,46 +245,4 @@ mod tests {
|
||||
let part = serde_json::from_str::<IndexPart>(example).unwrap();
|
||||
assert_eq!(part, expected);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn v1_indexpart_is_parsed_with_optional_missing_layers() {
|
||||
let example = r#"{
|
||||
"version":1,
|
||||
"timeline_layers":["000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9"],
|
||||
"layer_metadata":{
|
||||
"000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9": { "file_size": 25600000 },
|
||||
"not_a_real_layer_but_adding_coverage": { "file_size": 9007199254741001 }
|
||||
},
|
||||
"disk_consistent_lsn":"0/16960E8",
|
||||
"metadata_bytes":[112,11,159,210,0,54,0,4,0,0,0,0,1,105,96,232,1,0,0,0,0,1,105,96,112,0,0,0,0,0,0,0,0,0,0,0,0,0,1,105,96,112,0,0,0,0,1,105,96,112,0,0,0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
|
||||
}"#;
|
||||
|
||||
let expected = IndexPart {
|
||||
// note this is not verified, could be anything, but exists for humans debugging.. could be the git version instead?
|
||||
version: 1,
|
||||
timeline_layers: HashSet::from(["000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9".to_string()]),
|
||||
layer_metadata: HashMap::from([
|
||||
(
|
||||
"000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9".to_string(),
|
||||
IndexLayerMetadata {
|
||||
file_size: Some(25600000),
|
||||
}
|
||||
),
|
||||
(
|
||||
"not_a_real_layer_but_adding_coverage".to_string(),
|
||||
IndexLayerMetadata {
|
||||
// serde_json should always parse this but this might be a double with jq for
|
||||
// example.
|
||||
file_size: Some(9007199254741001),
|
||||
}
|
||||
)
|
||||
]),
|
||||
disk_consistent_lsn: "0/16960E8".parse::<Lsn>().unwrap(),
|
||||
metadata_bytes: [112,11,159,210,0,54,0,4,0,0,0,0,1,105,96,232,1,0,0,0,0,1,105,96,112,0,0,0,0,0,0,0,0,0,0,0,0,0,1,105,96,112,0,0,0,0,1,105,96,112,0,0,0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0].to_vec(),
|
||||
missing_layers: None,
|
||||
};
|
||||
|
||||
let part = serde_json::from_str::<IndexPart>(example).unwrap();
|
||||
assert_eq!(part, expected);
|
||||
}
|
||||
}
|
||||
@@ -1,12 +1,12 @@
|
||||
//! Helper functions to upload files to remote storage with a RemoteStorage
|
||||
|
||||
use anyhow::{bail, Context};
|
||||
use fail::fail_point;
|
||||
use std::path::Path;
|
||||
use tokio::fs;
|
||||
|
||||
use super::index::IndexPart;
|
||||
use crate::config::PageServerConf;
|
||||
use crate::fail_point;
|
||||
use crate::storage_sync::LayerFileMetadata;
|
||||
use remote_storage::GenericRemoteStorage;
|
||||
use utils::id::{TenantId, TimelineId};
|
||||
@@ -30,9 +30,12 @@ pub(super) async fn upload_index_part<'a>(
|
||||
let index_part_path = conf
|
||||
.metadata_path(timeline_id, tenant_id)
|
||||
.with_file_name(IndexPart::FILE_NAME);
|
||||
let storage_path = conf.remote_path(&index_part_path)?;
|
||||
storage
|
||||
.upload_storage_object(Box::new(index_part_bytes), index_part_size, &storage_path)
|
||||
.upload_storage_object(
|
||||
Box::new(index_part_bytes),
|
||||
index_part_size,
|
||||
&index_part_path,
|
||||
)
|
||||
.await
|
||||
.with_context(|| format!("Failed to upload index part for '{tenant_id} / {timeline_id}'"))
|
||||
}
|
||||
@@ -41,26 +44,36 @@ pub(super) async fn upload_index_part<'a>(
|
||||
/// No extra checks for overlapping files are made and any files that are already present remotely will be overwritten, if submitted during the upload.
|
||||
///
|
||||
/// On an error, bumps the retries count and reschedules the entire task.
|
||||
pub(super) async fn upload_timeline_layer<'a>(
|
||||
conf: &'static PageServerConf,
|
||||
storage: &'a GenericRemoteStorage,
|
||||
source_path: &'a Path,
|
||||
known_metadata: &'a LayerFileMetadata,
|
||||
pub(super) async fn upload_timeline_layer(
|
||||
storage: &GenericRemoteStorage,
|
||||
source_path: &Path,
|
||||
known_metadata: &LayerFileMetadata,
|
||||
) -> anyhow::Result<()> {
|
||||
fail_point!("before-upload-layer", |_| {
|
||||
bail!("failpoint before-upload-layer")
|
||||
});
|
||||
let storage_path = conf.remote_path(source_path)?;
|
||||
let storage_path = storage.remote_object_id(source_path).with_context(|| {
|
||||
format!(
|
||||
"Failed to get the layer storage path for local path '{}'",
|
||||
source_path.display()
|
||||
)
|
||||
})?;
|
||||
|
||||
let source_file = fs::File::open(&source_path)
|
||||
.await
|
||||
.with_context(|| format!("Failed to open a source file for layer {source_path:?}"))?;
|
||||
let source_file = fs::File::open(&source_path).await.with_context(|| {
|
||||
format!(
|
||||
"Failed to open a source file for layer '{}'",
|
||||
source_path.display()
|
||||
)
|
||||
})?;
|
||||
|
||||
let fs_size = source_file
|
||||
.metadata()
|
||||
.await
|
||||
.with_context(|| {
|
||||
format!("Failed to get the source file metadata for layer {source_path:?}")
|
||||
format!(
|
||||
"Failed to get the source file metadata for layer '{}'",
|
||||
source_path.display()
|
||||
)
|
||||
})?
|
||||
.len();
|
||||
|
||||
@@ -1,28 +0,0 @@
|
||||
//! Helper functions to delete files from remote storage with a RemoteStorage
|
||||
use anyhow::Context;
|
||||
use std::path::Path;
|
||||
use tracing::debug;
|
||||
|
||||
use remote_storage::GenericRemoteStorage;
|
||||
|
||||
use crate::config::PageServerConf;
|
||||
|
||||
/// Deletes the remote-storage object that corresponds to the layer file at `local_layer_path`.
///
/// The remote path is obtained from `conf.remote_path(local_layer_path)` and then passed to
/// `storage.delete`.
///
/// Returns an error if the `before-delete-layer` failpoint is triggered, if the remote path
/// cannot be derived, or if the delete call fails (with context naming the remote path).
pub(super) async fn delete_layer<'a>(
|
||||
conf: &'static PageServerConf,
|
||||
storage: &'a GenericRemoteStorage,
|
||||
local_layer_path: &'a Path,
|
||||
) -> anyhow::Result<()> {
|
||||
// Failpoint hook: when it fires, bail out before anything is touched.
crate::fail_point!("before-delete-layer", |_| {
|
||||
anyhow::bail!("failpoint before-delete-layer")
|
||||
});
|
||||
debug!("Deleting layer from remote storage: {local_layer_path:?}",);
|
||||
|
||||
let path_to_delete = conf.remote_path(local_layer_path)?;
|
||||
|
||||
// XXX: If the deletion fails because the object already didn't exist,
|
||||
// it would be good to just issue a warning but consider it success.
|
||||
// https://github.com/neondatabase/neon/issues/2934
|
||||
storage.delete(&path_to_delete).await.with_context(|| {
|
||||
format!("Failed to delete remote layer from storage at {path_to_delete:?}")
|
||||
})
|
||||
}
|
||||
@@ -46,7 +46,6 @@ use std::time::{Duration, Instant};
|
||||
|
||||
use self::metadata::TimelineMetadata;
|
||||
use crate::config::PageServerConf;
|
||||
use crate::fail_point;
|
||||
use crate::import_datadir;
|
||||
use crate::is_uninit_mark;
|
||||
use crate::metrics::{remove_tenant_metrics, STORAGE_TIME};
|
||||
@@ -256,7 +255,7 @@ impl UninitializedTimeline<'_> {
|
||||
// Thus spawning flush loop manually and skipping flush_loop setup in initialize_with_lock
|
||||
raw_timeline.maybe_spawn_flush_loop();
|
||||
|
||||
fail_point!("before-checkpoint-new-timeline", |_| {
|
||||
fail::fail_point!("before-checkpoint-new-timeline", |_| {
|
||||
bail!("failpoint before-checkpoint-new-timeline");
|
||||
});
|
||||
|
||||
@@ -442,6 +441,8 @@ struct RemoteStartupData {
|
||||
remote_metadata: TimelineMetadata,
|
||||
}
|
||||
|
||||
/// A repository corresponds to one .neon directory. One repository holds multiple
|
||||
/// timelines, forked off from the same initial call to 'initdb'.
|
||||
impl Tenant {
|
||||
/// Yet another helper for timeline initialization.
|
||||
/// Contains common part for `load_local_timeline` and `load_remote_timeline`
|
||||
@@ -572,7 +573,7 @@ impl Tenant {
|
||||
pub fn spawn_attach(
|
||||
conf: &'static PageServerConf,
|
||||
tenant_id: TenantId,
|
||||
remote_storage: GenericRemoteStorage,
|
||||
remote_storage: &GenericRemoteStorage,
|
||||
) -> Arc<Tenant> {
|
||||
// XXX: Attach should provide the config, especially during tenant migration.
|
||||
// See https://github.com/neondatabase/neon/issues/1555
|
||||
@@ -585,7 +586,7 @@ impl Tenant {
|
||||
tenant_conf,
|
||||
wal_redo_manager,
|
||||
tenant_id,
|
||||
Some(remote_storage),
|
||||
Some(remote_storage.clone()),
|
||||
));
|
||||
|
||||
// Do all the hard work in the background
|
||||
@@ -783,7 +784,7 @@ impl Tenant {
|
||||
let tenant_conf = match Self::load_tenant_config(conf, tenant_id) {
|
||||
Ok(conf) => conf,
|
||||
Err(e) => {
|
||||
error!("load tenant config failed: {:?}", e);
|
||||
error!("load tenant config failed: {}", e);
|
||||
return Tenant::create_broken_tenant(conf, tenant_id);
|
||||
}
|
||||
};
|
||||
@@ -1202,12 +1203,10 @@ impl Tenant {
|
||||
// compaction runs.
|
||||
let timelines_to_compact = {
|
||||
let timelines = self.timelines.lock().unwrap();
|
||||
let timelines_to_compact = timelines
|
||||
timelines
|
||||
.iter()
|
||||
.map(|(timeline_id, timeline)| (*timeline_id, timeline.clone()))
|
||||
.collect::<Vec<_>>();
|
||||
drop(timelines);
|
||||
timelines_to_compact
|
||||
.collect::<Vec<_>>()
|
||||
};
|
||||
|
||||
for (timeline_id, timeline) in &timelines_to_compact {
|
||||
@@ -1248,87 +1247,42 @@ impl Tenant {
|
||||
}
|
||||
|
||||
/// Removes timeline-related in-memory data
|
||||
pub async fn delete_timeline(&self, timeline_id: TimelineId) -> anyhow::Result<()> {
|
||||
// Transition the timeline into TimelineState::Stopping.
|
||||
// This should prevent new operations from starting.
|
||||
let timeline = {
|
||||
let mut timelines = self.timelines.lock().unwrap();
|
||||
pub fn delete_timeline(&self, timeline_id: TimelineId) -> anyhow::Result<()> {
|
||||
// in order to be retriable detach needs to be idempotent
|
||||
// (or at least to a point that each time the detach is called it can make progress)
|
||||
let mut timelines = self.timelines.lock().unwrap();
|
||||
|
||||
// Ensure that there are no child timelines **attached to that pageserver**,
|
||||
// because detach removes files, which will break child branches
|
||||
let children_exist = timelines
|
||||
.iter()
|
||||
.any(|(_, entry)| entry.get_ancestor_timeline_id() == Some(timeline_id));
|
||||
// Ensure that there are no child timelines **attached to that pageserver**,
|
||||
// because detach removes files, which will break child branches
|
||||
let children_exist = timelines
|
||||
.iter()
|
||||
.any(|(_, entry)| entry.get_ancestor_timeline_id() == Some(timeline_id));
|
||||
|
||||
anyhow::ensure!(
|
||||
!children_exist,
|
||||
"Cannot delete timeline which has child timelines"
|
||||
);
|
||||
let timeline_entry = match timelines.entry(timeline_id) {
|
||||
Entry::Occupied(e) => e,
|
||||
Entry::Vacant(_) => bail!("timeline not found"),
|
||||
};
|
||||
|
||||
let timeline = Arc::clone(timeline_entry.get());
|
||||
timeline.set_state(TimelineState::Stopping);
|
||||
|
||||
drop(timelines);
|
||||
timeline
|
||||
anyhow::ensure!(
|
||||
!children_exist,
|
||||
"Cannot delete timeline which has child timelines"
|
||||
);
|
||||
let timeline_entry = match timelines.entry(timeline_id) {
|
||||
Entry::Occupied(e) => e,
|
||||
Entry::Vacant(_) => bail!("timeline not found"),
|
||||
};
|
||||
|
||||
info!("waiting for layer_removal_cs.lock()");
|
||||
// No timeout here, GC & Compaction should be responsive to the `TimelineState::Stopping` change.
|
||||
let layer_removal_guard = timeline.layer_removal_cs.lock().await;
|
||||
info!("got layer_removal_cs.lock(), deleting layer files");
|
||||
let timeline = timeline_entry.get();
|
||||
timeline.set_state(TimelineState::Paused);
|
||||
|
||||
// NB: storage_sync upload tasks that reference these layers have been cancelled
|
||||
// by the caller.
|
||||
// FIXME: Wait for all tasks, including GC and compaction, that are working on the
|
||||
// timeline, to finish.
|
||||
|
||||
let local_timeline_directory = self.conf.timeline_path(&timeline_id, &self.tenant_id);
|
||||
// XXX make this atomic so that, if we crash-mid-way, the timeline won't be picked up
|
||||
// with some layers missing.
|
||||
std::fs::remove_dir_all(&local_timeline_directory).with_context(|| {
|
||||
format!(
|
||||
"Failed to remove local timeline directory '{}'",
|
||||
local_timeline_directory.display()
|
||||
)
|
||||
})?;
|
||||
info!("finished deleting layer files, releasing layer_removal_cs.lock()");
|
||||
info!("detach removed files");
|
||||
|
||||
drop(layer_removal_guard);
|
||||
|
||||
// Remove the timeline from the map.
|
||||
let mut timelines = self.timelines.lock().unwrap();
|
||||
let children_exist = timelines
|
||||
.iter()
|
||||
.any(|(_, entry)| entry.get_ancestor_timeline_id() == Some(timeline_id));
|
||||
// XXX this can happen because `branch_timeline` doesn't check `TimelineState::Stopping`.
|
||||
// We already deleted the layer files, so it's probably best to panic.
|
||||
// (Ideally, above remove_dir_all is atomic so we don't see this timeline after a restart)
|
||||
if children_exist {
|
||||
panic!("Timeline grew children while we removed layer files");
|
||||
}
|
||||
let removed_timeline = timelines.remove(&timeline_id);
|
||||
if removed_timeline.is_none() {
|
||||
// This can legitimately happen if there's a concurrent call to this function.
|
||||
// T1 T2
|
||||
// lock
|
||||
// unlock
|
||||
// lock
|
||||
// unlock
|
||||
// remove files
|
||||
// lock
|
||||
// remove from map
|
||||
// unlock
|
||||
// return
|
||||
// remove files
|
||||
// lock
|
||||
// remove from map observes empty map
|
||||
// unlock
|
||||
// return
|
||||
debug!("concurrent call to this function won the race");
|
||||
}
|
||||
drop(timelines);
|
||||
timeline_entry.remove();
|
||||
|
||||
Ok(())
|
||||
}
|
||||
@@ -1356,10 +1310,10 @@ impl Tenant {
|
||||
"Could not activate tenant because it is in broken state"
|
||||
));
|
||||
}
|
||||
TenantState::Stopping => {
|
||||
TenantState::Paused => {
|
||||
// The tenant was detached, or system shutdown was requested, while we were
|
||||
// loading or attaching the tenant.
|
||||
info!("Tenant is already in Stopping state, skipping activation");
|
||||
info!("Tenant is already in Paused state, skipping activation");
|
||||
}
|
||||
TenantState::Loading | TenantState::Attaching => {
|
||||
*current_state = TenantState::Active;
|
||||
@@ -1385,16 +1339,16 @@ impl Tenant {
|
||||
result
|
||||
}
|
||||
|
||||
/// Change tenant status to Stopping, to mark that it is being shut down
|
||||
pub fn set_stopping(&self) {
|
||||
/// Change tenant status to paused, to mark that it is being shut down
|
||||
pub fn set_paused(&self) {
|
||||
self.state.send_modify(|current_state| {
|
||||
match *current_state {
|
||||
TenantState::Active | TenantState::Loading | TenantState::Attaching => {
|
||||
*current_state = TenantState::Stopping;
|
||||
*current_state = TenantState::Paused;
|
||||
|
||||
// FIXME: If the tenant is still Loading or Attaching, new timelines
|
||||
// might be created after this. That's harmless, as the Timelines
|
||||
// won't be accessible to anyone, when the Tenant is in Stopping
|
||||
// won't be accessible to anyone, when the Tenant is in Paused
|
||||
// state.
|
||||
let timelines_accessor = self.timelines.lock().unwrap();
|
||||
let not_broken_timelines = timelines_accessor
|
||||
@@ -1405,12 +1359,12 @@ impl Tenant {
|
||||
}
|
||||
}
|
||||
TenantState::Broken => {
|
||||
info!("Cannot set tenant to Stopping state, it is already in Broken state");
|
||||
info!("Cannot set tenant to Paused state, it is already in Broken state");
|
||||
}
|
||||
TenantState::Stopping => {
|
||||
TenantState::Paused => {
|
||||
// The tenant was detached, or system shutdown was requested, while we were
|
||||
// loading or attaching the tenant.
|
||||
info!("Tenant is already in Stopping state");
|
||||
info!("Tenant is already in Paused state");
|
||||
}
|
||||
}
|
||||
});
|
||||
@@ -1431,10 +1385,10 @@ impl Tenant {
|
||||
// This shouldn't happen either
|
||||
warn!("Tenant is already broken");
|
||||
}
|
||||
TenantState::Stopping => {
|
||||
TenantState::Paused => {
|
||||
// This shouldn't happen either
|
||||
*current_state = TenantState::Broken;
|
||||
warn!("Marking Stopping tenant as Broken");
|
||||
warn!("Marking Paused tenant as Broken");
|
||||
}
|
||||
TenantState::Loading | TenantState::Attaching => {
|
||||
*current_state = TenantState::Broken;
|
||||
@@ -1459,7 +1413,7 @@ impl Tenant {
|
||||
TenantState::Active { .. } => {
|
||||
return Ok(());
|
||||
}
|
||||
TenantState::Broken | TenantState::Stopping => {
|
||||
TenantState::Broken | TenantState::Paused => {
|
||||
// There's no chance the tenant can transition back into ::Active
|
||||
anyhow::bail!(
|
||||
"Tenant {} will not become active. Current state: {:?}",
|
||||
@@ -2093,18 +2047,17 @@ impl Tenant {
|
||||
format!("Failed to import pgdatadir for timeline {tenant_id}/{timeline_id}")
|
||||
})?;
|
||||
|
||||
// Flush the new layer files to disk, before we mark the timeline as available to
|
||||
// the outside world.
|
||||
//
|
||||
// Thus spawn flush loop manually and skip flush_loop setup in initialize_with_lock
|
||||
// Flush loop needs to be spawned in order for checkpoint to be able to flush.
|
||||
// We want to run proper checkpoint before we mark timeline as available to outside world
|
||||
// Thus spawning flush loop manually and skipping flush_loop setup in initialize_with_lock
|
||||
unfinished_timeline.maybe_spawn_flush_loop();
|
||||
|
||||
fail_point!("before-checkpoint-new-timeline", |_| {
|
||||
fail::fail_point!("before-checkpoint-new-timeline", |_| {
|
||||
anyhow::bail!("failpoint before-checkpoint-new-timeline");
|
||||
});
|
||||
|
||||
unfinished_timeline
|
||||
.checkpoint(CheckpointConfig::Flush).await
|
||||
.checkpoint(CheckpointConfig::Forced).await
|
||||
.with_context(|| format!("Failed to checkpoint after pgdatadir import for timeline {tenant_id}/{timeline_id}"))?;
|
||||
|
||||
let timeline = {
|
||||
@@ -2193,7 +2146,7 @@ impl Tenant {
|
||||
.context("Failed to create timeline data structure")?;
|
||||
crashsafe::create_dir_all(timeline_path).context("Failed to create timeline directory")?;
|
||||
|
||||
fail_point!("after-timeline-uninit-mark-creation", |_| {
|
||||
fail::fail_point!("after-timeline-uninit-mark-creation", |_| {
|
||||
anyhow::bail!("failpoint after-timeline-uninit-mark-creation");
|
||||
});
|
||||
|
||||
@@ -2382,7 +2335,7 @@ fn try_create_target_tenant_dir(
|
||||
temporary_tenant_timelines_dir.display()
|
||||
)
|
||||
})?;
|
||||
fail_point!("tenant-creation-before-tmp-rename", |_| {
|
||||
fail::fail_point!("tenant-creation-before-tmp-rename", |_| {
|
||||
anyhow::bail!("failpoint tenant-creation-before-tmp-rename");
|
||||
});
|
||||
|
||||
@@ -2602,11 +2555,7 @@ pub mod harness {
|
||||
// OK in a test.
|
||||
let conf: &'static PageServerConf = Box::leak(Box::new(conf));
|
||||
|
||||
// Disable automatic GC and compaction to make the unit tests more deterministic.
|
||||
// The tests perform them manually if needed.
|
||||
let mut tenant_conf = TenantConf::dummy_conf();
|
||||
tenant_conf.gc_period = Duration::ZERO;
|
||||
tenant_conf.compaction_period = Duration::ZERO;
|
||||
let tenant_conf = TenantConf::dummy_conf();
|
||||
|
||||
let tenant_id = TenantId::generate();
|
||||
fs::create_dir_all(conf.tenant_path(&tenant_id))?;
|
||||
@@ -2670,7 +2619,7 @@ pub mod harness {
|
||||
&self,
|
||||
key: Key,
|
||||
lsn: Lsn,
|
||||
base_img: Option<(Lsn, Bytes)>,
|
||||
base_img: Option<Bytes>,
|
||||
records: Vec<(Lsn, NeonWalRecord)>,
|
||||
_pg_version: u32,
|
||||
) -> Result<Bytes, WalRedoError> {
|
||||
|
||||
@@ -30,14 +30,15 @@ use crate::tenant::blob_io::{BlobCursor, BlobWriter, WriteBlobWriter};
|
||||
use crate::tenant::block_io::{BlockBuf, BlockCursor, BlockReader, FileBlockReader};
|
||||
use crate::tenant::disk_btree::{DiskBtreeBuilder, DiskBtreeReader, VisitDirection};
|
||||
use crate::tenant::filename::{DeltaFileName, PathOrConf};
|
||||
use crate::tenant::storage_layer::{Layer, ValueReconstructResult, ValueReconstructState};
|
||||
use crate::tenant::storage_layer::{
|
||||
DropNotify, Layer, ValueReconstructResult, ValueReconstructState,
|
||||
};
|
||||
use crate::virtual_file::VirtualFile;
|
||||
use crate::{walrecord, TEMP_FILE_SUFFIX};
|
||||
use crate::{DELTA_FILE_MAGIC, STORAGE_FORMAT_VERSION};
|
||||
use anyhow::{bail, ensure, Context, Result};
|
||||
use rand::{distributions::Alphanumeric, Rng};
|
||||
use serde::{Deserialize, Serialize};
|
||||
use std::fs;
|
||||
use std::io::{BufWriter, Write};
|
||||
use std::io::{Seek, SeekFrom};
|
||||
use std::ops::Range;
|
||||
@@ -191,6 +192,8 @@ pub struct DeltaLayerInner {
|
||||
|
||||
/// Reader object for reading blocks from the file. (None if not loaded yet)
|
||||
file: Option<FileBlockReader<VirtualFile>>,
|
||||
|
||||
drop_watch: Option<DropNotify>,
|
||||
}
|
||||
|
||||
impl Layer for DeltaLayer {
|
||||
@@ -327,10 +330,13 @@ impl Layer for DeltaLayer {
|
||||
}
|
||||
}
|
||||
|
||||
fn delete(&self) -> Result<()> {
|
||||
// delete underlying file
|
||||
fs::remove_file(self.path())?;
|
||||
Ok(())
|
||||
fn drop_notify(&self) -> DropNotify {
|
||||
let mut inner = self.inner.write().unwrap();
|
||||
|
||||
inner
|
||||
.drop_watch
|
||||
.get_or_insert_with(|| DropNotify::new())
|
||||
.clone()
|
||||
}
|
||||
|
||||
fn is_incremental(&self) -> bool {
|
||||
@@ -551,6 +557,7 @@ impl DeltaLayer {
|
||||
file: None,
|
||||
index_start_blk: 0,
|
||||
index_root_blk: 0,
|
||||
drop_watch: None,
|
||||
}),
|
||||
}
|
||||
}
|
||||
@@ -578,6 +585,7 @@ impl DeltaLayer {
|
||||
file: None,
|
||||
index_start_blk: 0,
|
||||
index_root_blk: 0,
|
||||
drop_watch: None,
|
||||
}),
|
||||
})
|
||||
}
|
||||
@@ -743,6 +751,7 @@ impl DeltaLayerWriterInner {
|
||||
file: None,
|
||||
index_start_blk,
|
||||
index_root_blk,
|
||||
drop_watch: None,
|
||||
}),
|
||||
};
|
||||
|
||||
|
||||
@@ -26,7 +26,9 @@ use crate::tenant::blob_io::{BlobCursor, BlobWriter, WriteBlobWriter};
|
||||
use crate::tenant::block_io::{BlockBuf, BlockReader, FileBlockReader};
|
||||
use crate::tenant::disk_btree::{DiskBtreeBuilder, DiskBtreeReader, VisitDirection};
|
||||
use crate::tenant::filename::{ImageFileName, PathOrConf};
|
||||
use crate::tenant::storage_layer::{Layer, ValueReconstructResult, ValueReconstructState};
|
||||
use crate::tenant::storage_layer::{
|
||||
DropNotify, Layer, ValueReconstructResult, ValueReconstructState,
|
||||
};
|
||||
use crate::virtual_file::VirtualFile;
|
||||
use crate::{IMAGE_FILE_MAGIC, STORAGE_FORMAT_VERSION, TEMP_FILE_SUFFIX};
|
||||
use anyhow::{bail, ensure, Context, Result};
|
||||
@@ -34,7 +36,6 @@ use bytes::Bytes;
|
||||
use hex;
|
||||
use rand::{distributions::Alphanumeric, Rng};
|
||||
use serde::{Deserialize, Serialize};
|
||||
use std::fs;
|
||||
use std::io::Write;
|
||||
use std::io::{Seek, SeekFrom};
|
||||
use std::ops::Range;
|
||||
@@ -117,6 +118,8 @@ pub struct ImageLayerInner {
|
||||
|
||||
/// Reader object for reading blocks from the file. (None if not loaded yet)
|
||||
file: Option<FileBlockReader<VirtualFile>>,
|
||||
|
||||
drop_watch: Option<DropNotify>,
|
||||
}
|
||||
|
||||
impl Layer for ImageLayer {
|
||||
@@ -184,10 +187,13 @@ impl Layer for ImageLayer {
|
||||
todo!();
|
||||
}
|
||||
|
||||
fn delete(&self) -> Result<()> {
|
||||
// delete underlying file
|
||||
fs::remove_file(self.path())?;
|
||||
Ok(())
|
||||
fn drop_notify(&self) -> DropNotify {
|
||||
let mut inner = self.inner.write().unwrap();
|
||||
|
||||
inner
|
||||
.drop_watch
|
||||
.get_or_insert_with(|| DropNotify::new())
|
||||
.clone()
|
||||
}
|
||||
|
||||
fn is_incremental(&self) -> bool {
|
||||
@@ -351,6 +357,7 @@ impl ImageLayer {
|
||||
file: None,
|
||||
index_start_blk: 0,
|
||||
index_root_blk: 0,
|
||||
drop_watch: None,
|
||||
}),
|
||||
}
|
||||
}
|
||||
@@ -378,6 +385,7 @@ impl ImageLayer {
|
||||
loaded: false,
|
||||
index_start_blk: 0,
|
||||
index_root_blk: 0,
|
||||
drop_watch: None,
|
||||
}),
|
||||
})
|
||||
}
|
||||
@@ -532,6 +540,7 @@ impl ImageLayerWriterInner {
|
||||
file: None,
|
||||
index_start_blk,
|
||||
index_root_blk,
|
||||
drop_watch: None,
|
||||
}),
|
||||
};
|
||||
|
||||
|
||||
@@ -10,9 +10,11 @@ use crate::tenant::blob_io::{BlobCursor, BlobWriter};
|
||||
use crate::tenant::block_io::BlockReader;
|
||||
use crate::tenant::delta_layer::{DeltaLayer, DeltaLayerWriter};
|
||||
use crate::tenant::ephemeral_file::EphemeralFile;
|
||||
use crate::tenant::storage_layer::{Layer, ValueReconstructResult, ValueReconstructState};
|
||||
use crate::tenant::storage_layer::{
|
||||
DropNotify, Layer, ValueReconstructResult, ValueReconstructState,
|
||||
};
|
||||
use crate::walrecord;
|
||||
use anyhow::{bail, ensure, Result};
|
||||
use anyhow::{ensure, Result};
|
||||
use std::cell::RefCell;
|
||||
use std::collections::HashMap;
|
||||
use tracing::*;
|
||||
@@ -172,8 +174,8 @@ impl Layer for InMemoryLayer {
|
||||
|
||||
/// Nothing to do here. When you drop the last reference to the layer, it will
|
||||
/// be deallocated.
|
||||
fn delete(&self) -> Result<()> {
|
||||
bail!("can't delete an InMemoryLayer")
|
||||
fn drop_notify(&self) -> DropNotify {
|
||||
panic!("can't delete an InMemoryLayer")
|
||||
}
|
||||
|
||||
fn is_incremental(&self) -> bool {
|
||||
|
||||
@@ -145,9 +145,31 @@ pub trait Layer: Send + Sync {
|
||||
panic!("Not implemented")
|
||||
}
|
||||
|
||||
/// Permanently remove this layer from disk.
|
||||
fn delete(&self) -> Result<()>;
|
||||
fn drop_notify(&self) -> DropNotify;
|
||||
|
||||
/// Dump summary of the contents of the layer to stdout
|
||||
fn dump(&self, verbose: bool) -> Result<()>;
|
||||
}
|
||||
|
||||
#[derive(Clone)]
|
||||
pub struct DropNotify(std::sync::Arc<tokio::sync::Notify>);
|
||||
|
||||
impl DropNotify {
|
||||
pub fn new() -> Self {
|
||||
DropNotify(std::sync::Arc::new(tokio::sync::Notify::new()))
|
||||
}
|
||||
|
||||
pub async fn dropped(&self) {
|
||||
self.0.notified().await
|
||||
}
|
||||
|
||||
pub fn notify_waiters(&self) {
|
||||
self.0.notify_waiters();
|
||||
}
|
||||
}
|
||||
|
||||
impl Drop for DropNotify {
|
||||
fn drop(&mut self) {
|
||||
self.0.notify_waiters();
|
||||
}
|
||||
}
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
@@ -51,7 +51,6 @@ pub struct TenantConf {
|
||||
// This parameter determines L1 layer file size.
|
||||
pub compaction_target_size: u64,
|
||||
// How often to check if there's compaction work to be done.
|
||||
// Duration::ZERO means automatic compaction is disabled.
|
||||
#[serde(with = "humantime_serde")]
|
||||
pub compaction_period: Duration,
|
||||
// Level0 delta layer threshold for compaction.
|
||||
@@ -62,7 +61,6 @@ pub struct TenantConf {
|
||||
// Page versions older than this are garbage collected away.
|
||||
pub gc_horizon: u64,
|
||||
// Interval at which garbage collection is triggered.
|
||||
// Duration::ZERO means automatic GC is disabled
|
||||
#[serde(with = "humantime_serde")]
|
||||
pub gc_period: Duration,
|
||||
// Delta layer churn threshold to create L1 image layers.
|
||||
@@ -185,9 +183,6 @@ impl TenantConfOpt {
|
||||
if let Some(max_lsn_wal_lag) = other.max_lsn_wal_lag {
|
||||
self.max_lsn_wal_lag = Some(max_lsn_wal_lag);
|
||||
}
|
||||
if let Some(trace_read_requests) = other.trace_read_requests {
|
||||
self.trace_read_requests = Some(trace_read_requests);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -1,62 +1,75 @@
|
||||
//! This module acts as a switchboard to access different repositories managed by this
|
||||
//! page server.
|
||||
|
||||
use std::collections::{hash_map, HashMap};
|
||||
use std::collections::hash_map;
|
||||
use std::ffi::OsStr;
|
||||
use std::fs;
|
||||
use std::path::Path;
|
||||
use std::sync::Arc;
|
||||
use tokio::fs;
|
||||
|
||||
use anyhow::Context;
|
||||
use once_cell::sync::Lazy;
|
||||
use tokio::sync::RwLock;
|
||||
use tracing::*;
|
||||
|
||||
use pageserver_api::models::TimelineGcRequest;
|
||||
use remote_storage::GenericRemoteStorage;
|
||||
use utils::crashsafe;
|
||||
|
||||
use crate::config::PageServerConf;
|
||||
use crate::repository::GcResult;
|
||||
use crate::task_mgr::{self, TaskKind};
|
||||
use crate::tenant::{Tenant, TenantState};
|
||||
use crate::tenant_config::TenantConfOpt;
|
||||
use crate::IGNORED_TENANT_FILE_NAME;
|
||||
|
||||
use utils::fs_ext::PathExt;
|
||||
use utils::http::error::ApiError;
|
||||
use utils::id::{TenantId, TimelineId};
|
||||
|
||||
static TENANTS: Lazy<RwLock<HashMap<TenantId, Arc<Tenant>>>> =
|
||||
Lazy::new(|| RwLock::new(HashMap::new()));
|
||||
mod tenants_state {
|
||||
use once_cell::sync::Lazy;
|
||||
use std::{
|
||||
collections::HashMap,
|
||||
sync::{Arc, RwLock, RwLockReadGuard, RwLockWriteGuard},
|
||||
};
|
||||
use utils::id::TenantId;
|
||||
|
||||
use crate::tenant::Tenant;
|
||||
|
||||
static TENANTS: Lazy<RwLock<HashMap<TenantId, Arc<Tenant>>>> =
|
||||
Lazy::new(|| RwLock::new(HashMap::new()));
|
||||
|
||||
pub(super) fn read_tenants() -> RwLockReadGuard<'static, HashMap<TenantId, Arc<Tenant>>> {
|
||||
TENANTS
|
||||
.read()
|
||||
.expect("Failed to read() tenants lock, it got poisoned")
|
||||
}
|
||||
|
||||
pub(super) fn write_tenants() -> RwLockWriteGuard<'static, HashMap<TenantId, Arc<Tenant>>> {
|
||||
TENANTS
|
||||
.write()
|
||||
.expect("Failed to write() tenants lock, it got poisoned")
|
||||
}
|
||||
}
|
||||
|
||||
/// Initialize repositories with locally available timelines.
|
||||
/// Timelines that are only partially available locally (remote storage has more data than this pageserver)
|
||||
/// are scheduled for download and added to the tenant once download is completed.
|
||||
#[instrument(skip(conf, remote_storage))]
|
||||
pub async fn init_tenant_mgr(
|
||||
pub fn init_tenant_mgr(
|
||||
conf: &'static PageServerConf,
|
||||
remote_storage: Option<GenericRemoteStorage>,
|
||||
) -> anyhow::Result<()> {
|
||||
let _entered = info_span!("init_tenant_mgr").entered();
|
||||
|
||||
// Scan local filesystem for attached tenants
|
||||
let mut number_of_tenants = 0;
|
||||
let tenants_dir = conf.tenants_path();
|
||||
|
||||
let mut dir_entries = fs::read_dir(&tenants_dir)
|
||||
.await
|
||||
.with_context(|| format!("Failed to list tenants dir {tenants_dir:?}"))?;
|
||||
|
||||
loop {
|
||||
match dir_entries.next_entry().await {
|
||||
Ok(None) => break,
|
||||
Ok(Some(dir_entry)) => {
|
||||
for dir_entry in std::fs::read_dir(&tenants_dir)
|
||||
.with_context(|| format!("Failed to list tenants dir {}", tenants_dir.display()))?
|
||||
{
|
||||
match &dir_entry {
|
||||
Ok(dir_entry) => {
|
||||
let tenant_dir_path = dir_entry.path();
|
||||
if crate::is_temporary(&tenant_dir_path) {
|
||||
info!(
|
||||
"Found temporary tenant directory, removing: {}",
|
||||
tenant_dir_path.display()
|
||||
);
|
||||
if let Err(e) = fs::remove_dir_all(&tenant_dir_path).await {
|
||||
if let Err(e) = std::fs::remove_dir_all(&tenant_dir_path) {
|
||||
error!(
|
||||
"Failed to remove temporary directory '{}': {:?}",
|
||||
tenant_dir_path.display(),
|
||||
@@ -64,38 +77,27 @@ pub async fn init_tenant_mgr(
|
||||
);
|
||||
}
|
||||
} else {
|
||||
// This case happens if we crash during attach before creating the attach marker file
|
||||
let is_empty = tenant_dir_path.is_empty_dir().with_context(|| {
|
||||
format!("Failed to check whether {tenant_dir_path:?} is an empty dir")
|
||||
})?;
|
||||
if is_empty {
|
||||
info!("removing empty tenant directory {tenant_dir_path:?}");
|
||||
if let Err(e) = fs::remove_dir(&tenant_dir_path).await {
|
||||
error!(
|
||||
"Failed to remove empty tenant directory '{}': {e:#}",
|
||||
tenant_dir_path.display()
|
||||
)
|
||||
}
|
||||
continue;
|
||||
}
|
||||
|
||||
let tenant_ignore_mark_file = tenant_dir_path.join(IGNORED_TENANT_FILE_NAME);
|
||||
if tenant_ignore_mark_file.exists() {
|
||||
info!("Found an ignore mark file {tenant_ignore_mark_file:?}, skipping the tenant");
|
||||
continue;
|
||||
}
|
||||
|
||||
match schedule_local_tenant_processing(
|
||||
conf,
|
||||
&tenant_dir_path,
|
||||
remote_storage.clone(),
|
||||
) {
|
||||
Ok(tenant) => {
|
||||
TENANTS.write().await.insert(tenant.tenant_id(), tenant);
|
||||
match load_local_tenant(conf, &tenant_dir_path, remote_storage.clone()) {
|
||||
Ok(Some(tenant)) => {
|
||||
tenants_state::write_tenants().insert(tenant.tenant_id(), tenant);
|
||||
number_of_tenants += 1;
|
||||
}
|
||||
Ok(None) => {
|
||||
// This case happens if we crash during attach before creating the attach marker file
|
||||
if let Err(e) = std::fs::remove_dir(&tenant_dir_path) {
|
||||
error!(
|
||||
"Failed to remove empty tenant directory '{}': {e:#}",
|
||||
tenant_dir_path.display()
|
||||
)
|
||||
}
|
||||
}
|
||||
Err(e) => {
|
||||
error!("Failed to collect tenant files from dir {tenants_dir:?} for entry {dir_entry:?}, reason: {e:#}");
|
||||
error!(
|
||||
"Failed to collect tenant files from dir '{}' for entry {:?}, reason: {:#}",
|
||||
tenants_dir.display(),
|
||||
dir_entry,
|
||||
e
|
||||
);
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -105,7 +107,10 @@ pub async fn init_tenant_mgr(
|
||||
// here, the pageserver startup fails altogether, causing outage for *all*
|
||||
// tenants. That seems worse.
|
||||
error!(
|
||||
"Failed to list tenants dir entry in directory {tenants_dir:?}, reason: {e:?}"
|
||||
"Failed to list tenants dir entry {:?} in directory {}, reason: {:?}",
|
||||
dir_entry,
|
||||
tenants_dir.display(),
|
||||
e,
|
||||
);
|
||||
}
|
||||
}
|
||||
@@ -115,45 +120,34 @@ pub async fn init_tenant_mgr(
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub fn schedule_local_tenant_processing(
|
||||
fn load_local_tenant(
|
||||
conf: &'static PageServerConf,
|
||||
tenant_path: &Path,
|
||||
remote_storage: Option<GenericRemoteStorage>,
|
||||
) -> anyhow::Result<Arc<Tenant>> {
|
||||
anyhow::ensure!(
|
||||
tenant_path.is_dir(),
|
||||
"Cannot load tenant from path {tenant_path:?}, it either does not exist or not a directory"
|
||||
);
|
||||
anyhow::ensure!(
|
||||
!crate::is_temporary(tenant_path),
|
||||
"Cannot load tenant from temporary path {tenant_path:?}"
|
||||
);
|
||||
anyhow::ensure!(
|
||||
!tenant_path.is_empty_dir().with_context(|| {
|
||||
format!("Failed to check whether {tenant_path:?} is an empty dir")
|
||||
})?,
|
||||
"Cannot load tenant from empty directory {tenant_path:?}"
|
||||
);
|
||||
) -> anyhow::Result<Option<Arc<Tenant>>> {
|
||||
if !tenant_path.is_dir() {
|
||||
anyhow::bail!("tenant_path is not a directory: {tenant_path:?}")
|
||||
}
|
||||
|
||||
let is_empty = tenant_path
|
||||
.is_empty_dir()
|
||||
.context("check whether tenant_path is an empty dir")?;
|
||||
if is_empty {
|
||||
info!("skipping empty tenant directory {tenant_path:?}");
|
||||
return Ok(None);
|
||||
}
|
||||
|
||||
let tenant_id = tenant_path
|
||||
.file_name()
|
||||
.and_then(OsStr::to_str)
|
||||
.unwrap_or_default()
|
||||
.parse::<TenantId>()
|
||||
.with_context(|| {
|
||||
format!("Could not parse tenant id out of the tenant dir name in path {tenant_path:?}")
|
||||
})?;
|
||||
|
||||
let tenant_ignore_mark = conf.tenant_ignore_mark_file_path(tenant_id);
|
||||
anyhow::ensure!(
|
||||
!conf.tenant_ignore_mark_file_path(tenant_id).exists(),
|
||||
"Cannot load tenant, ignore mark found at {tenant_ignore_mark:?}"
|
||||
);
|
||||
.context("Could not parse tenant id out of the tenant dir name")?;
|
||||
|
||||
let tenant = if conf.tenant_attaching_mark_file_path(&tenant_id).exists() {
|
||||
info!("tenant {tenant_id} has attaching mark file, resuming its attach operation");
|
||||
if let Some(remote_storage) = remote_storage {
|
||||
Tenant::spawn_attach(conf, tenant_id, remote_storage)
|
||||
Tenant::spawn_attach(conf, tenant_id, &remote_storage)
|
||||
} else {
|
||||
warn!("tenant {tenant_id} has attaching mark file, but pageserver has no remote storage configured");
|
||||
Tenant::create_broken_tenant(conf, tenant_id)
|
||||
@@ -163,7 +157,7 @@ pub fn schedule_local_tenant_processing(
|
||||
// Start loading the tenant into memory. It will initially be in Loading state.
|
||||
Tenant::spawn_load(conf, tenant_id, remote_storage)
|
||||
};
|
||||
Ok(tenant)
|
||||
Ok(Some(tenant))
|
||||
}
|
||||
|
||||
///
|
||||
@@ -171,12 +165,12 @@ pub fn schedule_local_tenant_processing(
|
||||
///
|
||||
pub async fn shutdown_all_tenants() {
|
||||
let tenants_to_shut_down = {
|
||||
let mut m = TENANTS.write().await;
|
||||
let mut m = tenants_state::write_tenants();
|
||||
let mut tenants_to_shut_down = Vec::with_capacity(m.len());
|
||||
for (_, tenant) in m.drain() {
|
||||
if tenant.is_active() {
|
||||
// updates tenant state, forbidding new GC and compaction iterations from starting
|
||||
tenant.set_stopping();
|
||||
tenant.set_paused();
|
||||
tenants_to_shut_down.push(tenant)
|
||||
}
|
||||
}
|
||||
@@ -205,13 +199,13 @@ pub async fn shutdown_all_tenants() {
|
||||
}
|
||||
}
|
||||
|
||||
pub async fn create_tenant(
|
||||
pub fn create_tenant(
|
||||
conf: &'static PageServerConf,
|
||||
tenant_conf: TenantConfOpt,
|
||||
tenant_id: TenantId,
|
||||
remote_storage: Option<GenericRemoteStorage>,
|
||||
) -> anyhow::Result<Option<Arc<Tenant>>> {
|
||||
match TENANTS.write().await.entry(tenant_id) {
|
||||
match tenants_state::write_tenants().entry(tenant_id) {
|
||||
hash_map::Entry::Occupied(_) => {
|
||||
debug!("tenant {tenant_id} already exists");
|
||||
Ok(None)
|
||||
@@ -221,36 +215,44 @@ pub async fn create_tenant(
|
||||
// If this section ever becomes contentious, introduce a new `TenantState::Creating`.
|
||||
let tenant_directory =
|
||||
super::tenant::create_tenant_files(conf, tenant_conf, tenant_id)?;
|
||||
let created_tenant =
|
||||
schedule_local_tenant_processing(conf, &tenant_directory, remote_storage)?;
|
||||
let crated_tenant_id = created_tenant.tenant_id();
|
||||
anyhow::ensure!(
|
||||
tenant_id == crated_tenant_id,
|
||||
"loaded created tenant has unexpected tenant id (expect {tenant_id} != actual {crated_tenant_id})",
|
||||
);
|
||||
v.insert(Arc::clone(&created_tenant));
|
||||
Ok(Some(created_tenant))
|
||||
let created_tenant = load_local_tenant(conf, &tenant_directory, remote_storage)?;
|
||||
match created_tenant {
|
||||
None => {
|
||||
// We get None in case the directory is empty.
|
||||
// This shouldn't happen here, because we just created the directory.
|
||||
// So, skip any cleanup work for now, we don't know how we reached this state.
|
||||
anyhow::bail!("we just created the tenant directory, it can't be empty");
|
||||
}
|
||||
Some(tenant) => {
|
||||
anyhow::ensure!(
|
||||
tenant_id == tenant.tenant_id(),
|
||||
"loaded created tenant has unexpected tenant id (expect {} != actual {})",
|
||||
tenant_id,
|
||||
tenant.tenant_id()
|
||||
);
|
||||
v.insert(Arc::clone(&tenant));
|
||||
Ok(Some(tenant))
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
pub async fn update_tenant_config(
|
||||
pub fn update_tenant_config(
|
||||
conf: &'static PageServerConf,
|
||||
tenant_conf: TenantConfOpt,
|
||||
tenant_id: TenantId,
|
||||
) -> anyhow::Result<()> {
|
||||
info!("configuring tenant {tenant_id}");
|
||||
get_tenant(tenant_id, true)
|
||||
.await?
|
||||
.update_tenant_config(tenant_conf);
|
||||
get_tenant(tenant_id, true)?.update_tenant_config(tenant_conf);
|
||||
Tenant::persist_tenant_config(&conf.tenant_config_path(tenant_id), tenant_conf, false)?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Gets the tenant from the in-memory data, erroring if it's absent or is not fitting to the query.
|
||||
/// `active_only = true` allows to query only tenants that are ready for operations, erroring on other kinds of tenants.
|
||||
pub async fn get_tenant(tenant_id: TenantId, active_only: bool) -> anyhow::Result<Arc<Tenant>> {
|
||||
let m = TENANTS.read().await;
|
||||
pub fn get_tenant(tenant_id: TenantId, active_only: bool) -> anyhow::Result<Arc<Tenant>> {
|
||||
let m = tenants_state::read_tenants();
|
||||
let tenant = m
|
||||
.get(&tenant_id)
|
||||
.with_context(|| format!("Tenant {tenant_id} not found in the local state"))?;
|
||||
@@ -286,9 +288,9 @@ pub async fn delete_timeline(tenant_id: TenantId, timeline_id: TimelineId) -> an
|
||||
info!("waiting for timeline tasks to shutdown");
|
||||
task_mgr::shutdown_tasks(None, Some(tenant_id), Some(timeline_id)).await;
|
||||
info!("timeline task shutdown completed");
|
||||
match get_tenant(tenant_id, true).await {
|
||||
match get_tenant(tenant_id, true) {
|
||||
Ok(tenant) => {
|
||||
tenant.delete_timeline(timeline_id).await?;
|
||||
tenant.delete_timeline(timeline_id)?;
|
||||
}
|
||||
Err(e) => anyhow::bail!("Cannot access tenant {tenant_id} in local tenant state: {e:?}"),
|
||||
}
|
||||
@@ -300,67 +302,40 @@ pub async fn detach_tenant(
|
||||
conf: &'static PageServerConf,
|
||||
tenant_id: TenantId,
|
||||
) -> anyhow::Result<()> {
|
||||
remove_tenant_from_memory(tenant_id, async {
|
||||
let local_tenant_directory = conf.tenant_path(&tenant_id);
|
||||
fs::remove_dir_all(&local_tenant_directory)
|
||||
.await
|
||||
.with_context(|| {
|
||||
format!("Failed to remove local tenant directory {local_tenant_directory:?}")
|
||||
})?;
|
||||
Ok(())
|
||||
})
|
||||
.await
|
||||
}
|
||||
let tenant = match {
|
||||
let mut tenants_accessor = tenants_state::write_tenants();
|
||||
tenants_accessor.remove(&tenant_id)
|
||||
} {
|
||||
Some(tenant) => tenant,
|
||||
None => anyhow::bail!("Tenant not found for id {tenant_id}"),
|
||||
};
|
||||
|
||||
pub async fn load_tenant(
|
||||
conf: &'static PageServerConf,
|
||||
tenant_id: TenantId,
|
||||
remote_storage: Option<GenericRemoteStorage>,
|
||||
) -> anyhow::Result<()> {
|
||||
run_if_no_tenant_in_memory(tenant_id, |vacant_entry| {
|
||||
let tenant_path = conf.tenant_path(&tenant_id);
|
||||
let tenant_ignore_mark = conf.tenant_ignore_mark_file_path(tenant_id);
|
||||
if tenant_ignore_mark.exists() {
|
||||
std::fs::remove_file(&tenant_ignore_mark)
|
||||
.with_context(|| format!("Failed to remove tenant ignore mark {tenant_ignore_mark:?} during tenant loading"))?;
|
||||
}
|
||||
tenant.set_paused();
|
||||
// shutdown all tenant and timeline tasks: gc, compaction, page service)
|
||||
task_mgr::shutdown_tasks(None, Some(tenant_id), None).await;
|
||||
|
||||
let new_tenant = schedule_local_tenant_processing(conf, &tenant_path, remote_storage)
|
||||
.with_context(|| {
|
||||
format!("Failed to schedule tenant processing in path {tenant_path:?}")
|
||||
})?;
|
||||
// If removal fails there will be no way to successfully retry detach,
|
||||
// because the tenant no longer exists in the in-memory map. And it needs to be removed from it
|
||||
// before we remove files, because it contains references to tenant
|
||||
// which references ephemeral files which are deleted on drop. So if we keep these references,
|
||||
// we will attempt to remove files which no longer exist. This can be fixed by having shutdown
|
||||
// mechanism for tenant that will clean temporary data to avoid any references to ephemeral files
|
||||
let local_tenant_directory = conf.tenant_path(&tenant_id);
|
||||
fs::remove_dir_all(&local_tenant_directory).with_context(|| {
|
||||
format!(
|
||||
"Failed to remove local tenant directory '{}'",
|
||||
local_tenant_directory.display()
|
||||
)
|
||||
})?;
|
||||
|
||||
vacant_entry.insert(new_tenant);
|
||||
Ok(())
|
||||
}).await
|
||||
}
|
||||
|
||||
pub async fn ignore_tenant(
|
||||
conf: &'static PageServerConf,
|
||||
tenant_id: TenantId,
|
||||
) -> anyhow::Result<()> {
|
||||
remove_tenant_from_memory(tenant_id, async {
|
||||
let ignore_mark_file = conf.tenant_ignore_mark_file_path(tenant_id);
|
||||
fs::File::create(&ignore_mark_file)
|
||||
.await
|
||||
.context("Failed to create ignore mark file")
|
||||
.and_then(|_| {
|
||||
crashsafe::fsync_file_and_parent(&ignore_mark_file)
|
||||
.context("Failed to fsync ignore mark file")
|
||||
})
|
||||
.with_context(|| format!("Failed to crate ignore mark for tenant {tenant_id}"))?;
|
||||
Ok(())
|
||||
})
|
||||
.await
|
||||
Ok(())
|
||||
}
|
||||
|
||||
///
|
||||
/// Get list of tenants, for the mgmt API
|
||||
///
|
||||
pub async fn list_tenants() -> Vec<(TenantId, TenantState)> {
|
||||
TENANTS
|
||||
.read()
|
||||
.await
|
||||
pub fn list_tenants() -> Vec<(TenantId, TenantState)> {
|
||||
tenants_state::read_tenants()
|
||||
.iter()
|
||||
.map(|(id, tenant)| (*id, tenant.current_state()))
|
||||
.collect()
|
||||
@@ -373,102 +348,42 @@ pub async fn list_tenants() -> Vec<(TenantId, TenantState)> {
|
||||
pub async fn attach_tenant(
|
||||
conf: &'static PageServerConf,
|
||||
tenant_id: TenantId,
|
||||
remote_storage: GenericRemoteStorage,
|
||||
remote_storage: &GenericRemoteStorage,
|
||||
) -> anyhow::Result<()> {
|
||||
run_if_no_tenant_in_memory(tenant_id, |vacant_entry| {
|
||||
let tenant_path = conf.tenant_path(&tenant_id);
|
||||
anyhow::ensure!(
|
||||
!tenant_path.exists(),
|
||||
"Cannot attach tenant {tenant_id}, local tenant directory already exists"
|
||||
);
|
||||
|
||||
let tenant = Tenant::spawn_attach(conf, tenant_id, remote_storage);
|
||||
vacant_entry.insert(tenant);
|
||||
|
||||
Ok(())
|
||||
})
|
||||
.await
|
||||
}
|
||||
|
||||
async fn run_if_no_tenant_in_memory<F, V>(tenant_id: TenantId, run: F) -> anyhow::Result<V>
|
||||
where
|
||||
F: FnOnce(hash_map::VacantEntry<TenantId, Arc<Tenant>>) -> anyhow::Result<V>,
|
||||
{
|
||||
match TENANTS.write().await.entry(tenant_id) {
|
||||
match tenants_state::write_tenants().entry(tenant_id) {
|
||||
hash_map::Entry::Occupied(e) => {
|
||||
anyhow::bail!(
|
||||
"tenant {tenant_id} already exists, state: {:?}",
|
||||
e.get().current_state()
|
||||
)
|
||||
}
|
||||
hash_map::Entry::Vacant(v) => run(v),
|
||||
}
|
||||
}
|
||||
|
||||
/// Stops and removes the tenant from memory, if it's not [`TenantState::Stopping`] already, bails otherwise.
|
||||
/// Allows to remove other tenant resources manually, via `tenant_cleanup`.
|
||||
/// If the cleanup fails, tenant will stay in memory in [`TenantState::Broken`] state, and another removal
|
||||
/// operation would be needed to remove it.
|
||||
async fn remove_tenant_from_memory<V, F>(
|
||||
tenant_id: TenantId,
|
||||
tenant_cleanup: F,
|
||||
) -> anyhow::Result<V>
|
||||
where
|
||||
F: std::future::Future<Output = anyhow::Result<V>>,
|
||||
{
|
||||
// It's important to keep the tenant in memory after the final cleanup, to avoid cleanup races.
|
||||
// The exclusive lock here ensures we don't miss the tenant state updates before trying another removal.
|
||||
// tenant-wde cleanup operations may take some time (removing the entire tenant directory), we want to
|
||||
// avoid holding the lock for the entire process.
|
||||
{
|
||||
let tenants_accessor = TENANTS.write().await;
|
||||
match tenants_accessor.get(&tenant_id) {
|
||||
Some(tenant) => match tenant.current_state() {
|
||||
TenantState::Attaching
|
||||
| TenantState::Loading
|
||||
| TenantState::Broken
|
||||
| TenantState::Active => tenant.set_stopping(),
|
||||
TenantState::Stopping => {
|
||||
anyhow::bail!("Tenant {tenant_id} is stopping already")
|
||||
// Cannot attach a tenant that already exists. The error message depends on
|
||||
// the state it's in.
|
||||
match e.get().current_state() {
|
||||
TenantState::Attaching => {
|
||||
anyhow::bail!("tenant {tenant_id} attach is already in progress")
|
||||
}
|
||||
current_state => {
|
||||
anyhow::bail!("tenant already exists, current state: {current_state:?}")
|
||||
}
|
||||
},
|
||||
None => anyhow::bail!("Tenant not found for id {tenant_id}"),
|
||||
}
|
||||
}
|
||||
|
||||
// shutdown all tenant and timeline tasks: gc, compaction, page service)
|
||||
// No new tasks will be started for this tenant because it's in `Stopping` state.
|
||||
// Hence, once we're done here, the `tenant_cleanup` callback can mutate tenant on-disk state freely.
|
||||
task_mgr::shutdown_tasks(None, Some(tenant_id), None).await;
|
||||
|
||||
match tenant_cleanup
|
||||
.await
|
||||
.with_context(|| format!("Failed to run cleanup for tenant {tenant_id}"))
|
||||
{
|
||||
Ok(hook_value) => {
|
||||
let mut tenants_accessor = TENANTS.write().await;
|
||||
if tenants_accessor.remove(&tenant_id).is_none() {
|
||||
warn!("Tenant {tenant_id} got removed from memory before operation finished");
|
||||
}
|
||||
Ok(hook_value)
|
||||
}
|
||||
Err(e) => {
|
||||
let tenants_accessor = TENANTS.read().await;
|
||||
match tenants_accessor.get(&tenant_id) {
|
||||
Some(tenant) => tenant.set_broken(),
|
||||
None => warn!("Tenant {tenant_id} got removed from memory"),
|
||||
}
|
||||
Err(e)
|
||||
hash_map::Entry::Vacant(v) => {
|
||||
let tenant = Tenant::spawn_attach(conf, tenant_id, remote_storage);
|
||||
v.insert(tenant);
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
pub async fn immediate_gc(
|
||||
#[cfg(feature = "testing")]
|
||||
use {
|
||||
crate::repository::GcResult, pageserver_api::models::TimelineGcRequest,
|
||||
utils::http::error::ApiError,
|
||||
};
|
||||
|
||||
#[cfg(feature = "testing")]
|
||||
pub fn immediate_gc(
|
||||
tenant_id: TenantId,
|
||||
timeline_id: TimelineId,
|
||||
gc_req: TimelineGcRequest,
|
||||
) -> Result<tokio::sync::oneshot::Receiver<Result<GcResult, anyhow::Error>>, ApiError> {
|
||||
let guard = TENANTS.read().await;
|
||||
let guard = tenants_state::read_tenants();
|
||||
|
||||
let tenant = guard
|
||||
.get(&tenant_id)
|
||||
@@ -490,7 +405,7 @@ pub async fn immediate_gc(
|
||||
&format!("timeline_gc_handler garbage collection run for tenant {tenant_id} timeline {timeline_id}"),
|
||||
false,
|
||||
async move {
|
||||
crate::fail_point!("immediate_gc_task_pre");
|
||||
fail::fail_point!("immediate_gc_task_pre");
|
||||
let result = tenant
|
||||
.gc_iteration(Some(timeline_id), gc_horizon, pitr, true)
|
||||
.instrument(info_span!("manual_gc", tenant = %tenant_id, timeline = %timeline_id))
|
||||
|
||||
@@ -7,12 +7,26 @@ use std::time::Duration;
|
||||
|
||||
use crate::metrics::TENANT_TASK_EVENTS;
|
||||
use crate::task_mgr;
|
||||
use crate::task_mgr::{TaskKind, BACKGROUND_RUNTIME};
|
||||
use crate::tenant::{Tenant, TenantState};
|
||||
use crate::tenant_mgr;
|
||||
use tracing::*;
|
||||
use utils::id::TenantId;
|
||||
|
||||
#[cfg(test)]
|
||||
pub fn start_background_loops(tenant_id: TenantId) {
|
||||
// Do not start the background loops.
|
||||
// Right now, in tests, Tenant is only created by TenantHarness,
|
||||
// and all tests that use TenantHarness assume that there are
|
||||
// no background loops that do compaction and GC. If they want it
|
||||
// to happen, they call the corresponding functions directly.
|
||||
//
|
||||
// XXX replace this with a TenantConfigRequest flag that is
|
||||
// also usable by tests, see https://github.com/neondatabase/neon/issues/2917
|
||||
}
|
||||
|
||||
#[cfg(not(test))]
|
||||
use crate::task_mgr::{TaskKind, BACKGROUND_RUNTIME};
|
||||
#[cfg(not(test))]
|
||||
pub fn start_background_loops(tenant_id: TenantId) {
|
||||
task_mgr::spawn(
|
||||
BACKGROUND_RUNTIME.handle(),
|
||||
@@ -66,17 +80,13 @@ async fn compaction_loop(tenant_id: TenantId) {
|
||||
},
|
||||
};
|
||||
|
||||
// Run blocking part of the task
|
||||
|
||||
// Run compaction
|
||||
let mut sleep_duration = tenant.get_compaction_period();
|
||||
if sleep_duration == Duration::ZERO {
|
||||
info!("automatic compaction is disabled");
|
||||
// check again in 10 seconds, in case it's been enabled again.
|
||||
sleep_duration = Duration::from_secs(10);
|
||||
} else {
|
||||
// Run compaction
|
||||
if let Err(e) = tenant.compaction_iteration().await {
|
||||
sleep_duration = wait_duration;
|
||||
error!("Compaction failed, retrying in {:?}: {e:?}", sleep_duration);
|
||||
}
|
||||
if let Err(e) = tenant.compaction_iteration().await {
|
||||
sleep_duration = wait_duration;
|
||||
error!("Compaction failed, retrying in {:?}: {e:?}", sleep_duration);
|
||||
}
|
||||
|
||||
// Sleep
|
||||
@@ -117,21 +127,15 @@ async fn gc_loop(tenant_id: TenantId) {
|
||||
},
|
||||
};
|
||||
|
||||
// Run gc
|
||||
let gc_period = tenant.get_gc_period();
|
||||
let gc_horizon = tenant.get_gc_horizon();
|
||||
let mut sleep_duration = gc_period;
|
||||
if sleep_duration == Duration::ZERO {
|
||||
info!("automatic GC is disabled");
|
||||
// check again in 10 seconds, in case it's been enabled again.
|
||||
sleep_duration = Duration::from_secs(10);
|
||||
} else {
|
||||
// Run gc
|
||||
if gc_horizon > 0 {
|
||||
if let Err(e) = tenant.gc_iteration(None, gc_horizon, tenant.get_pitr_interval(), false).await
|
||||
{
|
||||
sleep_duration = wait_duration;
|
||||
error!("Gc failed, retrying in {:?}: {e:?}", sleep_duration);
|
||||
}
|
||||
if gc_horizon > 0 {
|
||||
if let Err(e) = tenant.gc_iteration(None, gc_horizon, tenant.get_pitr_interval(), false).await
|
||||
{
|
||||
sleep_duration = wait_duration;
|
||||
error!("Gc failed, retrying in {:?}: {e:?}", sleep_duration);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -155,7 +159,7 @@ async fn wait_for_active_tenant(
|
||||
wait: Duration,
|
||||
) -> ControlFlow<(), Arc<Tenant>> {
|
||||
let tenant = loop {
|
||||
match tenant_mgr::get_tenant(tenant_id, false).await {
|
||||
match tenant_mgr::get_tenant(tenant_id, false) {
|
||||
Ok(tenant) => break tenant,
|
||||
Err(e) => {
|
||||
error!("Failed to get a tenant {tenant_id}: {e:#}");
|
||||
|
||||
@@ -214,7 +214,7 @@ async fn connection_manager_loop_step(
|
||||
match new_state {
|
||||
// we're already active as walreceiver, no need to reactivate
|
||||
TimelineState::Active => continue,
|
||||
TimelineState::Broken | TimelineState::Stopping | TimelineState::Suspended => return ControlFlow::Continue(new_state),
|
||||
TimelineState::Broken | TimelineState::Paused | TimelineState::Suspended => return ControlFlow::Continue(new_state),
|
||||
}
|
||||
}
|
||||
Err(_sender_dropped_error) => return ControlFlow::Break(()),
|
||||
|
||||
@@ -9,6 +9,7 @@ use std::{
|
||||
use anyhow::{bail, ensure, Context};
|
||||
use bytes::BytesMut;
|
||||
use chrono::{NaiveDateTime, Utc};
|
||||
use fail::fail_point;
|
||||
use futures::StreamExt;
|
||||
use postgres::{SimpleQueryMessage, SimpleQueryRow};
|
||||
use postgres_ffi::v14::xlog_utils::normalize_lsn;
|
||||
@@ -19,7 +20,6 @@ use tokio::{pin, select, sync::watch, time};
|
||||
use tokio_postgres::{replication::ReplicationStream, Client};
|
||||
use tracing::{debug, error, info, trace, warn};
|
||||
|
||||
use crate::fail_point;
|
||||
use crate::{metrics::LIVE_CONNECTIONS_COUNT, walreceiver::TaskStateUpdate};
|
||||
use crate::{
|
||||
task_mgr,
|
||||
|
||||
@@ -84,7 +84,7 @@ pub trait WalRedoManager: Send + Sync {
|
||||
&self,
|
||||
key: Key,
|
||||
lsn: Lsn,
|
||||
base_img: Option<(Lsn, Bytes)>,
|
||||
base_img: Option<Bytes>,
|
||||
records: Vec<(Lsn, NeonWalRecord)>,
|
||||
pg_version: u32,
|
||||
) -> Result<Bytes, WalRedoError>;
|
||||
@@ -147,7 +147,7 @@ impl WalRedoManager for PostgresRedoManager {
|
||||
&self,
|
||||
key: Key,
|
||||
lsn: Lsn,
|
||||
base_img: Option<(Lsn, Bytes)>,
|
||||
base_img: Option<Bytes>,
|
||||
records: Vec<(Lsn, NeonWalRecord)>,
|
||||
pg_version: u32,
|
||||
) -> Result<Bytes, WalRedoError> {
|
||||
@@ -156,8 +156,7 @@ impl WalRedoManager for PostgresRedoManager {
|
||||
return Err(WalRedoError::InvalidRequest);
|
||||
}
|
||||
|
||||
let base_img_lsn = base_img.as_ref().map(|p| p.0).unwrap_or(Lsn::INVALID);
|
||||
let mut img = base_img.map(|p| p.1);
|
||||
let mut img: Option<Bytes> = base_img;
|
||||
let mut batch_neon = can_apply_in_neon(&records[0].1);
|
||||
let mut batch_start = 0;
|
||||
for i in 1..records.len() {
|
||||
@@ -171,7 +170,6 @@ impl WalRedoManager for PostgresRedoManager {
|
||||
key,
|
||||
lsn,
|
||||
img,
|
||||
base_img_lsn,
|
||||
&records[batch_start..i],
|
||||
self.conf.wal_redo_timeout,
|
||||
pg_version,
|
||||
@@ -191,7 +189,6 @@ impl WalRedoManager for PostgresRedoManager {
|
||||
key,
|
||||
lsn,
|
||||
img,
|
||||
base_img_lsn,
|
||||
&records[batch_start..],
|
||||
self.conf.wal_redo_timeout,
|
||||
pg_version,
|
||||
@@ -226,13 +223,11 @@ impl PostgresRedoManager {
|
||||
///
|
||||
/// Process one request for WAL redo using wal-redo postgres
|
||||
///
|
||||
#[allow(clippy::too_many_arguments)]
|
||||
fn apply_batch_postgres(
|
||||
&self,
|
||||
key: Key,
|
||||
lsn: Lsn,
|
||||
base_img: Option<Bytes>,
|
||||
base_img_lsn: Lsn,
|
||||
records: &[(Lsn, NeonWalRecord)],
|
||||
wal_redo_timeout: Duration,
|
||||
pg_version: u32,
|
||||
@@ -287,12 +282,9 @@ impl PostgresRedoManager {
|
||||
// next request will launch a new one.
|
||||
if result.is_err() {
|
||||
error!(
|
||||
"error applying {} WAL records {}..{} ({} bytes) to base image with LSN {} to reconstruct page image at LSN {}",
|
||||
"error applying {} WAL records ({} bytes) to reconstruct page image at LSN {}",
|
||||
records.len(),
|
||||
records.first().map(|p| p.0).unwrap_or(Lsn(0)),
|
||||
records.last().map(|p| p.0).unwrap_or(Lsn(0)),
|
||||
nbytes,
|
||||
base_img_lsn,
|
||||
lsn
|
||||
);
|
||||
let process = process_guard.take().unwrap();
|
||||
@@ -930,7 +922,8 @@ impl NoLeakChild {
|
||||
|
||||
match child.wait() {
|
||||
Ok(exit_status) => {
|
||||
info!(exit_status = %exit_status, "wait successful");
|
||||
// log at error level since .kill() is something we only do on errors ATM
|
||||
error!(exit_status = %exit_status, "wait successful");
|
||||
}
|
||||
Err(e) => {
|
||||
error!(error = %e, "wait error; might leak the child process; it will show as zombie (defunct)");
|
||||
|
||||
@@ -464,12 +464,12 @@ pg_init_libpagestore(void)
|
||||
NULL, NULL, NULL);
|
||||
DefineCustomIntVariable("neon.readahead_buffer_size",
|
||||
"number of prefetches to buffer",
|
||||
"This buffer is used to hold and manage prefetched "
|
||||
"data; so it is important that this buffer is at "
|
||||
"least as large as the configured value of all "
|
||||
"tablespaces' effective_io_concurrency and "
|
||||
"maintenance_io_concurrency, and your sessions' "
|
||||
"values for these settings.",
|
||||
"This buffer is used to store prefetched data; so "
|
||||
"it is important that this buffer is at least as "
|
||||
"large as the configured value of all tablespaces' "
|
||||
"effective_io_concurrency and maintenance_io_concurrency, "
|
||||
"your sessions' values of these, and the value for "
|
||||
"seqscan_prefetch_buffers.",
|
||||
&readahead_buffer_size,
|
||||
128, 16, 1024,
|
||||
PGC_USERSET,
|
||||
|
||||
@@ -242,14 +242,6 @@ PrefetchState *MyPState;
|
||||
) \
|
||||
)
|
||||
|
||||
#define ReceiveBufferNeedsCompaction() (\
|
||||
(MyPState->n_responses_buffered / 8) < ( \
|
||||
MyPState->ring_receive - \
|
||||
MyPState->ring_last - \
|
||||
MyPState->n_responses_buffered \
|
||||
) \
|
||||
)
|
||||
|
||||
int n_prefetch_hits = 0;
|
||||
int n_prefetch_misses = 0;
|
||||
int n_prefetch_missed_caches = 0;
|
||||
@@ -257,99 +249,17 @@ int n_prefetch_dupes = 0;
|
||||
|
||||
XLogRecPtr prefetch_lsn = 0;
|
||||
|
||||
static bool compact_prefetch_buffers(void);
|
||||
static void consume_prefetch_responses(void);
|
||||
static uint64 prefetch_register_buffer(BufferTag tag, bool *force_latest, XLogRecPtr *force_lsn);
|
||||
static bool prefetch_read(PrefetchRequest *slot);
|
||||
static void prefetch_do_request(PrefetchRequest *slot, bool *force_latest, XLogRecPtr *force_lsn);
|
||||
static bool prefetch_wait_for(uint64 ring_index);
|
||||
static void prefetch_cleanup_trailing_unused(void);
|
||||
static void prefetch_cleanup(void);
|
||||
static inline void prefetch_set_unused(uint64 ring_index);
|
||||
|
||||
static XLogRecPtr neon_get_request_lsn(bool *latest, RelFileNode rnode,
|
||||
ForkNumber forknum, BlockNumber blkno);
|
||||
|
||||
static bool
|
||||
compact_prefetch_buffers(void)
|
||||
{
|
||||
uint64 empty_ring_index = MyPState->ring_last;
|
||||
uint64 search_ring_index = MyPState->ring_receive;
|
||||
int n_moved = 0;
|
||||
|
||||
if (MyPState->ring_receive == MyPState->ring_last)
|
||||
return false;
|
||||
|
||||
while (search_ring_index > MyPState->ring_last)
|
||||
{
|
||||
search_ring_index--;
|
||||
if (GetPrfSlot(search_ring_index)->status == PRFS_UNUSED)
|
||||
{
|
||||
empty_ring_index = search_ring_index;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* Here we have established:
|
||||
* slots < search_ring_index may be unused (not scanned)
|
||||
* slots >= search_ring_index and <= empty_ring_index are unused
|
||||
* slots > empty_ring_index are in use, or outside our buffer's range.
|
||||
*
|
||||
* Therefore, there is a gap of at least one unused items between
|
||||
* search_ring_index and empty_ring_index, which grows as we hit
|
||||
* more unused items while moving backwards through the array.
|
||||
*/
|
||||
|
||||
while (search_ring_index > MyPState->ring_last)
|
||||
{
|
||||
PrefetchRequest *source_slot;
|
||||
PrefetchRequest *target_slot;
|
||||
bool found;
|
||||
|
||||
search_ring_index--;
|
||||
|
||||
source_slot = GetPrfSlot(search_ring_index);
|
||||
|
||||
if (source_slot->status == PRFS_UNUSED)
|
||||
continue;
|
||||
|
||||
target_slot = GetPrfSlot(empty_ring_index);
|
||||
|
||||
Assert(source_slot->status == PRFS_RECEIVED);
|
||||
Assert(target_slot->status == PRFS_UNUSED);
|
||||
|
||||
target_slot->buftag = source_slot->buftag;
|
||||
target_slot->status = source_slot->status;
|
||||
target_slot->response = source_slot->response;
|
||||
target_slot->effective_request_lsn = source_slot->effective_request_lsn;
|
||||
target_slot->my_ring_index = empty_ring_index;
|
||||
|
||||
prfh_delete(MyPState->prf_hash, source_slot);
|
||||
prfh_insert(MyPState->prf_hash, target_slot, &found);
|
||||
|
||||
Assert(!found);
|
||||
|
||||
/* Adjust the location of our known-empty slot */
|
||||
empty_ring_index--;
|
||||
|
||||
source_slot->status = PRFS_UNUSED;
|
||||
source_slot->buftag = (BufferTag) {0};
|
||||
source_slot->response = NULL;
|
||||
source_slot->my_ring_index = 0;
|
||||
source_slot->effective_request_lsn = 0;
|
||||
|
||||
n_moved++;
|
||||
}
|
||||
|
||||
if (MyPState->ring_last != empty_ring_index)
|
||||
{
|
||||
MyPState->ring_last = empty_ring_index;
|
||||
return true;
|
||||
}
|
||||
|
||||
return false;
|
||||
}
|
||||
|
||||
void
|
||||
readahead_buffer_resize(int newsize, void *extra)
|
||||
{
|
||||
@@ -357,7 +267,7 @@ readahead_buffer_resize(int newsize, void *extra)
|
||||
nfree = newsize;
|
||||
PrefetchState *newPState;
|
||||
Size newprfs_size = offsetof(PrefetchState, prf_buffer) + (
|
||||
sizeof(PrefetchRequest) * newsize
|
||||
sizeof(PrefetchRequest) * readahead_buffer_size
|
||||
);
|
||||
|
||||
/* don't try to re-initialize if we haven't initialized yet */
|
||||
@@ -413,7 +323,7 @@ readahead_buffer_resize(int newsize, void *extra)
|
||||
prfh_insert(newPState->prf_hash, newslot, &found);
|
||||
|
||||
Assert(!found);
|
||||
|
||||
|
||||
switch (newslot->status)
|
||||
{
|
||||
case PRFS_UNUSED:
|
||||
@@ -460,7 +370,7 @@ consume_prefetch_responses(void)
|
||||
}
|
||||
|
||||
static void
|
||||
prefetch_cleanup_trailing_unused(void)
|
||||
prefetch_cleanup(void)
|
||||
{
|
||||
uint64 ring_index;
|
||||
PrefetchRequest *slot;
|
||||
@@ -621,10 +531,7 @@ prefetch_set_unused(uint64 ring_index)
|
||||
|
||||
/* run cleanup if we're holding back ring_last */
|
||||
if (MyPState->ring_last == ring_index)
|
||||
prefetch_cleanup_trailing_unused();
|
||||
/* ... and try to store the buffered responses more compactly if > 12.5% of the buffer is gaps */
|
||||
else if (ReceiveBufferNeedsCompaction())
|
||||
compact_prefetch_buffers();
|
||||
prefetch_cleanup();
|
||||
}
|
||||
|
||||
static void
|
||||
@@ -795,31 +702,20 @@ prefetch_register_buffer(BufferTag tag, bool *force_latest, XLogRecPtr *force_ls
|
||||
|
||||
Assert(slot->status != PRFS_UNUSED);
|
||||
|
||||
/*
|
||||
* If there is good reason to run compaction on the prefetch buffers,
|
||||
* try to do that.
|
||||
*/
|
||||
if (ReceiveBufferNeedsCompaction() && compact_prefetch_buffers())
|
||||
/* We have the slot for ring_last, so that must still be in progress */
|
||||
switch (slot->status)
|
||||
{
|
||||
Assert(slot->status == PRFS_UNUSED);
|
||||
}
|
||||
else
|
||||
{
|
||||
/* We have the slot for ring_last, so that must still be in progress */
|
||||
switch (slot->status)
|
||||
{
|
||||
case PRFS_REQUESTED:
|
||||
Assert(MyPState->ring_receive == cleanup_index);
|
||||
prefetch_wait_for(cleanup_index);
|
||||
prefetch_set_unused(cleanup_index);
|
||||
break;
|
||||
case PRFS_RECEIVED:
|
||||
case PRFS_TAG_REMAINS:
|
||||
prefetch_set_unused(cleanup_index);
|
||||
break;
|
||||
default:
|
||||
pg_unreachable();
|
||||
}
|
||||
case PRFS_REQUESTED:
|
||||
Assert(MyPState->ring_receive == cleanup_index);
|
||||
prefetch_wait_for(cleanup_index);
|
||||
prefetch_set_unused(cleanup_index);
|
||||
break;
|
||||
case PRFS_RECEIVED:
|
||||
case PRFS_TAG_REMAINS:
|
||||
prefetch_set_unused(cleanup_index);
|
||||
break;
|
||||
default:
|
||||
pg_unreachable();
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1206,7 +1102,7 @@ PageIsEmptyHeapPage(char *buffer)
|
||||
}
|
||||
|
||||
static void
|
||||
neon_wallog_page(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, char *buffer, bool force)
|
||||
neon_wallog_page(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, char *buffer)
|
||||
{
|
||||
XLogRecPtr lsn = PageGetLSN(buffer);
|
||||
|
||||
@@ -1220,7 +1116,7 @@ neon_wallog_page(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, ch
|
||||
* correctness, the non-logged updates are not critical. But we want to
|
||||
* have a reasonably up-to-date VM and FSM in the page server.
|
||||
*/
|
||||
if ((force || forknum == FSM_FORKNUM || forknum == VISIBILITYMAP_FORKNUM) && !RecoveryInProgress())
|
||||
if (forknum == FSM_FORKNUM && !RecoveryInProgress())
|
||||
{
|
||||
/* FSM is never WAL-logged and we don't care. */
|
||||
XLogRecPtr recptr;
|
||||
@@ -1229,7 +1125,30 @@ neon_wallog_page(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, ch
|
||||
XLogFlush(recptr);
|
||||
lsn = recptr;
|
||||
ereport(SmgrTrace,
|
||||
(errmsg("Page %u of relation %u/%u/%u.%u was force logged. Evicted at lsn=%X/%X",
|
||||
(errmsg("FSM page %u of relation %u/%u/%u.%u was force logged. Evicted at lsn=%X/%X",
|
||||
blocknum,
|
||||
reln->smgr_rnode.node.spcNode,
|
||||
reln->smgr_rnode.node.dbNode,
|
||||
reln->smgr_rnode.node.relNode,
|
||||
forknum, LSN_FORMAT_ARGS(lsn))));
|
||||
}
|
||||
else if (forknum == VISIBILITYMAP_FORKNUM && !RecoveryInProgress())
|
||||
{
|
||||
/*
|
||||
* Always WAL-log vm. We should never miss clearing visibility map
|
||||
* bits.
|
||||
*
|
||||
* TODO Is it too bad for performance? Hopefully we do not evict
|
||||
* actively used vm too often.
|
||||
*/
|
||||
XLogRecPtr recptr;
|
||||
|
||||
recptr = log_newpage_copy(&reln->smgr_rnode.node, forknum, blocknum, buffer, false);
|
||||
XLogFlush(recptr);
|
||||
lsn = recptr;
|
||||
|
||||
ereport(SmgrTrace,
|
||||
(errmsg("Visibilitymap page %u of relation %u/%u/%u.%u was force logged at lsn=%X/%X",
|
||||
blocknum,
|
||||
reln->smgr_rnode.node.spcNode,
|
||||
reln->smgr_rnode.node.dbNode,
|
||||
@@ -1624,7 +1543,6 @@ neon_extend(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno,
|
||||
char *buffer, bool skipFsync)
|
||||
{
|
||||
XLogRecPtr lsn;
|
||||
BlockNumber n_blocks = 0;
|
||||
|
||||
switch (reln->smgr_relpersistence)
|
||||
{
|
||||
@@ -1664,16 +1582,7 @@ neon_extend(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno,
|
||||
errhint("This limit is defined by neon.max_cluster_size GUC")));
|
||||
}
|
||||
|
||||
/*
|
||||
* Usually Postgres doesn't extend relation on more than one page
|
||||
* (leaving holes). But this rule is violated in PG-15 where CreateAndCopyRelationData
|
||||
* call smgrextend for destination relation n using size of source relation
|
||||
*/
|
||||
get_cached_relsize(reln->smgr_rnode.node, forkNum, &n_blocks);
|
||||
while (n_blocks < blkno)
|
||||
neon_wallog_page(reln, forkNum, n_blocks++, buffer, true);
|
||||
|
||||
neon_wallog_page(reln, forkNum, blkno, buffer, false);
|
||||
neon_wallog_page(reln, forkNum, blkno, buffer);
|
||||
set_cached_relsize(reln->smgr_rnode.node, forkNum, blkno + 1);
|
||||
|
||||
lsn = PageGetLSN(buffer);
|
||||
@@ -1871,17 +1780,6 @@ neon_read_at_lsn(RelFileNode rnode, ForkNumber forkNum, BlockNumber blkno,
|
||||
&request_lsn);
|
||||
slot = GetPrfSlot(ring_index);
|
||||
}
|
||||
else
|
||||
{
|
||||
/*
|
||||
* Empty our reference to the prefetch buffer's hash entry.
|
||||
* When we wait for prefetches, the entry reference is invalidated by
|
||||
* potential updates to the hash, and when we reconnect to the
|
||||
* pageserver the prefetch we're waiting for may be dropped,
|
||||
* in which case we need to retry and take the branch above.
|
||||
*/
|
||||
entry = NULL;
|
||||
}
|
||||
|
||||
Assert(slot->my_ring_index == ring_index);
|
||||
Assert(MyPState->ring_last <= ring_index &&
|
||||
@@ -1920,7 +1818,7 @@ neon_read_at_lsn(RelFileNode rnode, ForkNumber forkNum, BlockNumber blkno,
|
||||
|
||||
/* buffer was used, clean up for later reuse */
|
||||
prefetch_set_unused(ring_index);
|
||||
prefetch_cleanup_trailing_unused();
|
||||
prefetch_cleanup();
|
||||
}
|
||||
|
||||
/*
|
||||
@@ -2101,7 +1999,7 @@ neon_write(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
|
||||
elog(ERROR, "unknown relpersistence '%c'", reln->smgr_relpersistence);
|
||||
}
|
||||
|
||||
neon_wallog_page(reln, forknum, blocknum, buffer, false);
|
||||
neon_wallog_page(reln, forknum, blocknum, buffer);
|
||||
|
||||
lsn = PageGetLSN(buffer);
|
||||
elog(SmgrTrace, "smgrwrite called for %u/%u/%u.%u blk %u, page LSN: %X/%08X",
|
||||
|
||||
@@ -28,7 +28,6 @@ use std::{borrow::Cow, future::Future, net::SocketAddr};
|
||||
use tokio::{net::TcpListener, task::JoinError};
|
||||
use tracing::info;
|
||||
use utils::project_git_version;
|
||||
use utils::sentry_init::{init_sentry, release_name};
|
||||
|
||||
project_git_version!(GIT_VERSION);
|
||||
|
||||
@@ -46,9 +45,6 @@ async fn main() -> anyhow::Result<()> {
|
||||
.with_target(false)
|
||||
.init();
|
||||
|
||||
// initialize sentry if SENTRY_DSN is provided
|
||||
let _sentry_guard = init_sentry(release_name!(), &[]);
|
||||
|
||||
let arg_matches = cli().get_matches();
|
||||
|
||||
let tls_config = match (
|
||||
|
||||
@@ -13,7 +13,7 @@
|
||||
# avoid running regular linting script that checks every feature.
|
||||
if [[ "$OSTYPE" == "darwin"* ]]; then
|
||||
# no extra features to test currently, add more here when needed
|
||||
cargo clippy --locked --all --all-targets -- -A unknown_lints -D warnings
|
||||
cargo clippy --locked --all --all-targets --features testing -- -A unknown_lints -D warnings
|
||||
else
|
||||
# * `-A unknown_lints` – do not warn about unknown lint suppressions
|
||||
# that people with newer toolchains might use
|
||||
|
||||
@@ -4,6 +4,7 @@
|
||||
use anyhow::{bail, Context, Result};
|
||||
use clap::{value_parser, Arg, ArgAction, Command};
|
||||
use const_format::formatcp;
|
||||
use nix::unistd::Pid;
|
||||
use remote_storage::RemoteStorageConfig;
|
||||
use std::fs::{self, File};
|
||||
use std::io::{ErrorKind, Write};
|
||||
@@ -14,7 +15,7 @@ use tokio::sync::mpsc;
|
||||
use toml_edit::Document;
|
||||
use tracing::*;
|
||||
use url::{ParseError, Url};
|
||||
use utils::pid_file;
|
||||
use utils::lock_file;
|
||||
|
||||
use metrics::set_build_info_metric;
|
||||
use safekeeper::broker;
|
||||
@@ -34,14 +35,11 @@ use utils::{
|
||||
http::endpoint,
|
||||
id::NodeId,
|
||||
logging::{self, LogFormat},
|
||||
project_git_version,
|
||||
sentry_init::{init_sentry, release_name},
|
||||
signals, tcp_listener,
|
||||
project_git_version, signals, tcp_listener,
|
||||
};
|
||||
|
||||
const PID_FILE_NAME: &str = "safekeeper.pid";
|
||||
const ID_FILE_NAME: &str = "safekeeper.id";
|
||||
|
||||
project_git_version!(GIT_VERSION);
|
||||
|
||||
fn main() -> anyhow::Result<()> {
|
||||
@@ -135,8 +133,6 @@ fn main() -> anyhow::Result<()> {
|
||||
conf.log_format = LogFormat::from_config(log_format)?;
|
||||
}
|
||||
|
||||
// initialize sentry if SENTRY_DSN is provided
|
||||
let _sentry_guard = init_sentry(release_name!(), &[("node_id", &conf.my_id.to_string())]);
|
||||
start_safekeeper(conf, given_id, arg_matches.get_flag("init"))
|
||||
}
|
||||
|
||||
@@ -146,13 +142,28 @@ fn start_safekeeper(mut conf: SafeKeeperConf, given_id: Option<NodeId>, init: bo
|
||||
|
||||
// Prevent running multiple safekeepers on the same directory
|
||||
let lock_file_path = conf.workdir.join(PID_FILE_NAME);
|
||||
let lock_file =
|
||||
pid_file::claim_for_current_process(&lock_file_path).context("claim pid file")?;
|
||||
info!("Claimed pid file at {lock_file_path:?}");
|
||||
|
||||
let lock_file = match lock_file::create_lock_file(&lock_file_path, Pid::this().to_string()) {
|
||||
lock_file::LockCreationResult::Created {
|
||||
new_lock_contents,
|
||||
file,
|
||||
} => {
|
||||
info!("Created lock file at {lock_file_path:?} with contenst {new_lock_contents}");
|
||||
file
|
||||
}
|
||||
lock_file::LockCreationResult::AlreadyLocked {
|
||||
existing_lock_contents,
|
||||
} => anyhow::bail!(
|
||||
"Could not lock pid file; safekeeper is already running in {:?} with PID {}",
|
||||
conf.workdir,
|
||||
existing_lock_contents
|
||||
),
|
||||
lock_file::LockCreationResult::CreationFailed(e) => {
|
||||
return Err(e.context(format!("Failed to create lock file at {lock_file_path:?}")))
|
||||
}
|
||||
};
|
||||
// ensure that the lock file is held even if the main thread of the process is panics
|
||||
// we need to release the lock file only when the current process is gone
|
||||
std::mem::forget(lock_file);
|
||||
let _ = Box::leak(Box::new(lock_file));
|
||||
|
||||
// Set or read our ID.
|
||||
set_id(&mut conf, given_id)?;
|
||||
|
||||
@@ -226,7 +226,6 @@ impl ReplicationConn {
|
||||
let mut end_pos = stop_pos.unwrap_or(inmem_state.commit_lsn);
|
||||
|
||||
let mut wal_reader = WalReader::new(
|
||||
spg.conf.workdir.clone(),
|
||||
spg.conf.timeline_dir(&tli.ttid),
|
||||
&persisted_state,
|
||||
start_pos,
|
||||
|
||||
@@ -13,7 +13,7 @@ use std::time::Duration;
|
||||
use postgres_ffi::v14::xlog_utils::XLogSegNoOffsetToRecPtr;
|
||||
use postgres_ffi::XLogFileName;
|
||||
use postgres_ffi::{XLogSegNo, PG_TLI};
|
||||
use remote_storage::{GenericRemoteStorage, RemotePath};
|
||||
use remote_storage::GenericRemoteStorage;
|
||||
use tokio::fs::File;
|
||||
use tokio::runtime::Builder;
|
||||
|
||||
@@ -151,7 +151,7 @@ async fn update_task(
|
||||
let timeline_dir = conf.timeline_dir(&ttid);
|
||||
|
||||
let handle = tokio::spawn(
|
||||
backup_task_main(ttid, timeline_dir, conf.workdir.clone(), shutdown_rx)
|
||||
backup_task_main(ttid, timeline_dir, shutdown_rx)
|
||||
.instrument(info_span!("WAL backup task", ttid = %ttid)),
|
||||
);
|
||||
|
||||
@@ -182,10 +182,10 @@ async fn wal_backup_launcher_main_loop(
|
||||
|
||||
let conf_ = conf.clone();
|
||||
REMOTE_STORAGE.get_or_init(|| {
|
||||
conf_
|
||||
.remote_storage
|
||||
.as_ref()
|
||||
.map(|c| GenericRemoteStorage::from_config(c).expect("failed to create remote storage"))
|
||||
conf_.remote_storage.as_ref().map(|c| {
|
||||
GenericRemoteStorage::from_config(conf_.workdir, c)
|
||||
.expect("failed to create remote storage")
|
||||
})
|
||||
});
|
||||
|
||||
// Presense in this map means launcher is aware s3 offloading is needed for
|
||||
@@ -234,7 +234,6 @@ async fn wal_backup_launcher_main_loop(
|
||||
struct WalBackupTask {
|
||||
timeline: Arc<Timeline>,
|
||||
timeline_dir: PathBuf,
|
||||
workspace_dir: PathBuf,
|
||||
wal_seg_size: usize,
|
||||
commit_lsn_watch_rx: watch::Receiver<Lsn>,
|
||||
}
|
||||
@@ -243,7 +242,6 @@ struct WalBackupTask {
|
||||
async fn backup_task_main(
|
||||
ttid: TenantTimelineId,
|
||||
timeline_dir: PathBuf,
|
||||
workspace_dir: PathBuf,
|
||||
mut shutdown_rx: Receiver<()>,
|
||||
) {
|
||||
info!("started");
|
||||
@@ -259,7 +257,6 @@ async fn backup_task_main(
|
||||
commit_lsn_watch_rx: tli.get_commit_lsn_watch_rx(),
|
||||
timeline: tli,
|
||||
timeline_dir,
|
||||
workspace_dir,
|
||||
};
|
||||
|
||||
// task is spinned up only when wal_seg_size already initialized
|
||||
@@ -324,7 +321,6 @@ impl WalBackupTask {
|
||||
commit_lsn,
|
||||
self.wal_seg_size,
|
||||
&self.timeline_dir,
|
||||
&self.workspace_dir,
|
||||
)
|
||||
.await
|
||||
{
|
||||
@@ -357,12 +353,11 @@ pub async fn backup_lsn_range(
|
||||
end_lsn: Lsn,
|
||||
wal_seg_size: usize,
|
||||
timeline_dir: &Path,
|
||||
workspace_dir: &Path,
|
||||
) -> Result<Lsn> {
|
||||
let mut res = start_lsn;
|
||||
let segments = get_segments(start_lsn, end_lsn, wal_seg_size);
|
||||
for s in &segments {
|
||||
backup_single_segment(s, timeline_dir, workspace_dir)
|
||||
backup_single_segment(s, timeline_dir)
|
||||
.await
|
||||
.with_context(|| format!("offloading segno {}", s.seg_no))?;
|
||||
|
||||
@@ -377,24 +372,11 @@ pub async fn backup_lsn_range(
|
||||
Ok(res)
|
||||
}
|
||||
|
||||
async fn backup_single_segment(
|
||||
seg: &Segment,
|
||||
timeline_dir: &Path,
|
||||
workspace_dir: &Path,
|
||||
) -> Result<()> {
|
||||
let segment_file_path = seg.file_path(timeline_dir)?;
|
||||
let remote_segment_path = segment_file_path
|
||||
.strip_prefix(&workspace_dir)
|
||||
.context("Failed to strip workspace dir prefix")
|
||||
.and_then(RemotePath::new)
|
||||
.with_context(|| {
|
||||
format!(
|
||||
"Failed to resolve remote part of path {segment_file_path:?} for base {workspace_dir:?}",
|
||||
)
|
||||
})?;
|
||||
async fn backup_single_segment(seg: &Segment, timeline_dir: &Path) -> Result<()> {
|
||||
let segment_file_name = seg.file_path(timeline_dir)?;
|
||||
|
||||
backup_object(&segment_file_path, &remote_segment_path, seg.size()).await?;
|
||||
debug!("Backup of {} done", segment_file_path.display());
|
||||
backup_object(&segment_file_name, seg.size()).await?;
|
||||
debug!("Backup of {} done", segment_file_name.display());
|
||||
|
||||
Ok(())
|
||||
}
|
||||
@@ -444,7 +426,7 @@ fn get_segments(start: Lsn, end: Lsn, seg_size: usize) -> Vec<Segment> {
|
||||
|
||||
static REMOTE_STORAGE: OnceCell<Option<GenericRemoteStorage>> = OnceCell::new();
|
||||
|
||||
async fn backup_object(source_file: &Path, target_file: &RemotePath, size: usize) -> Result<()> {
|
||||
async fn backup_object(source_file: &Path, size: usize) -> Result<()> {
|
||||
let storage = REMOTE_STORAGE
|
||||
.get()
|
||||
.expect("failed to get remote storage")
|
||||
@@ -459,12 +441,12 @@ async fn backup_object(source_file: &Path, target_file: &RemotePath, size: usize
|
||||
})?);
|
||||
|
||||
storage
|
||||
.upload_storage_object(Box::new(file), size, target_file)
|
||||
.upload_storage_object(Box::new(file), size, source_file)
|
||||
.await
|
||||
}
|
||||
|
||||
pub async fn read_object(
|
||||
file_path: &RemotePath,
|
||||
file_path: PathBuf,
|
||||
offset: u64,
|
||||
) -> anyhow::Result<Pin<Box<dyn tokio::io::AsyncRead>>> {
|
||||
let storage = REMOTE_STORAGE
|
||||
@@ -473,13 +455,19 @@ pub async fn read_object(
|
||||
.as_ref()
|
||||
.context("No remote storage configured")?;
|
||||
|
||||
info!("segment download about to start from remote path {file_path:?} at offset {offset}");
|
||||
|
||||
info!(
|
||||
"segment download about to start for local path {} at offset {}",
|
||||
file_path.display(),
|
||||
offset
|
||||
);
|
||||
let download = storage
|
||||
.download_storage_object(Some((offset, None)), file_path)
|
||||
.download_storage_object(Some((offset, None)), &file_path)
|
||||
.await
|
||||
.with_context(|| {
|
||||
format!("Failed to open WAL segment download stream for remote path {file_path:?}")
|
||||
format!(
|
||||
"Failed to open WAL segment download stream for local path {}",
|
||||
file_path.display()
|
||||
)
|
||||
})?;
|
||||
|
||||
Ok(download.download_stream)
|
||||
|
||||
@@ -8,7 +8,6 @@
|
||||
//! Note that last file has `.partial` suffix, that's different from postgres.
|
||||
|
||||
use anyhow::{bail, Context, Result};
|
||||
use remote_storage::RemotePath;
|
||||
|
||||
use std::io::{self, Seek, SeekFrom};
|
||||
use std::pin::Pin;
|
||||
@@ -446,7 +445,6 @@ fn remove_segments_from_disk(
|
||||
}
|
||||
|
||||
pub struct WalReader {
|
||||
workdir: PathBuf,
|
||||
timeline_dir: PathBuf,
|
||||
wal_seg_size: usize,
|
||||
pos: Lsn,
|
||||
@@ -461,7 +459,6 @@ pub struct WalReader {
|
||||
|
||||
impl WalReader {
|
||||
pub fn new(
|
||||
workdir: PathBuf,
|
||||
timeline_dir: PathBuf,
|
||||
state: &SafeKeeperState,
|
||||
start_pos: Lsn,
|
||||
@@ -481,7 +478,6 @@ impl WalReader {
|
||||
}
|
||||
|
||||
Ok(Self {
|
||||
workdir,
|
||||
timeline_dir,
|
||||
wal_seg_size: state.server.wal_seg_size as usize,
|
||||
pos: start_pos,
|
||||
@@ -549,17 +545,7 @@ impl WalReader {
|
||||
|
||||
// Try to open remote file, if remote reads are enabled
|
||||
if self.enable_remote_read {
|
||||
let remote_wal_file_path = wal_file_path
|
||||
.strip_prefix(&self.workdir)
|
||||
.context("Failed to strip workdir prefix")
|
||||
.and_then(RemotePath::new)
|
||||
.with_context(|| {
|
||||
format!(
|
||||
"Failed to resolve remote part of path {:?} for base {:?}",
|
||||
wal_file_path, self.workdir,
|
||||
)
|
||||
})?;
|
||||
return read_object(&remote_wal_file_path, xlogoff as u64).await;
|
||||
return read_object(wal_file_path, xlogoff as u64).await;
|
||||
}
|
||||
|
||||
bail!("WAL segment is not found")
|
||||
|
||||
@@ -1,11 +1,7 @@
|
||||
fn main() -> Result<(), Box<dyn std::error::Error>> {
|
||||
// Generate rust code from .proto protobuf.
|
||||
//
|
||||
// Note: we previously tried to use deterministic location at proto/ for
|
||||
// easy location, but apparently interference with cachepot sometimes fails
|
||||
// the build then. Anyway, per cargo docs build script shouldn't output to
|
||||
// anywhere but $OUT_DIR.
|
||||
tonic_build::compile_protos("proto/broker.proto")
|
||||
.unwrap_or_else(|e| panic!("failed to compile protos {:?}", e));
|
||||
// Generate code to deterministic location to make finding it easier.
|
||||
tonic_build::configure()
|
||||
.out_dir("proto/") // put generated code to proto/
|
||||
.compile(&["proto/broker.proto"], &["proto/"])?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
@@ -11,7 +11,7 @@ use proto::{
|
||||
|
||||
// Code generated by protobuf.
|
||||
pub mod proto {
|
||||
tonic::include_proto!("storage_broker");
|
||||
include!("../proto/storage_broker.rs");
|
||||
}
|
||||
|
||||
pub mod metrics;
|
||||
|
||||
@@ -6,6 +6,9 @@ Prerequisites:
|
||||
- Correctly configured Python, see [`/docs/sourcetree.md`](/docs/sourcetree.md#using-python)
|
||||
- Neon and Postgres binaries
|
||||
- See the root [README.md](/README.md) for build directions
|
||||
If you want to test tests with test-only APIs, you would need to add `--features testing` to Rust code build commands.
|
||||
For convenience, repository cargo config contains `build_testing` alias, that serves as a subcommand, adding the required feature flags.
|
||||
Usage example: `cargo build_testing --release` is equivalent to `cargo build --features testing --release`
|
||||
- Tests can be run from the git tree; or see the environment variables
|
||||
below to run from other directories.
|
||||
- The neon git repo, including the postgres submodule
|
||||
|
||||
@@ -33,7 +33,7 @@ from _pytest.config import Config
|
||||
from _pytest.fixtures import FixtureRequest
|
||||
from fixtures.log_helper import log
|
||||
from fixtures.types import Lsn, TenantId, TimelineId
|
||||
from fixtures.utils import allure_attach_from_dir, etcd_path, get_self_dir, subprocess_capture
|
||||
from fixtures.utils import Fn, allure_attach_from_dir, etcd_path, get_self_dir, subprocess_capture
|
||||
|
||||
# Type-related stuff
|
||||
from psycopg2.extensions import connection as PgConnection
|
||||
@@ -587,7 +587,6 @@ class NeonEnvBuilder:
|
||||
auth_enabled: bool = False,
|
||||
rust_log_override: Optional[str] = None,
|
||||
default_branch_name: str = DEFAULT_BRANCH_NAME,
|
||||
testing_mode: bool = True,
|
||||
):
|
||||
self.repo_dir = repo_dir
|
||||
self.rust_log_override = rust_log_override
|
||||
@@ -609,7 +608,6 @@ class NeonEnvBuilder:
|
||||
self.neon_binpath = neon_binpath
|
||||
self.pg_distrib_dir = pg_distrib_dir
|
||||
self.pg_version = pg_version
|
||||
self.testing_mode = testing_mode
|
||||
|
||||
def init(self) -> NeonEnv:
|
||||
# Cannot create more than one environment from one builder
|
||||
@@ -860,7 +858,6 @@ class NeonEnv:
|
||||
http=self.port_distributor.get_port(),
|
||||
)
|
||||
pageserver_auth_type = "NeonJWT" if config.auth_enabled else "Trust"
|
||||
pageserver_testing_mode = "true" if config.testing_mode else "false"
|
||||
|
||||
toml += textwrap.dedent(
|
||||
f"""
|
||||
@@ -869,7 +866,6 @@ class NeonEnv:
|
||||
listen_pg_addr = 'localhost:{pageserver_port.pg}'
|
||||
listen_http_addr = 'localhost:{pageserver_port.http}'
|
||||
auth_type = '{pageserver_auth_type}'
|
||||
testing_mode = {pageserver_testing_mode}
|
||||
"""
|
||||
)
|
||||
|
||||
@@ -982,10 +978,6 @@ def _shared_simple_env(
|
||||
pg_distrib_dir=pg_distrib_dir,
|
||||
pg_version=pg_version,
|
||||
run_id=run_id,
|
||||
# Disable failpoint support. Failpoints could have unexpected consequences
|
||||
# when the pageserver is shared by concurrent tests. Also, it might affect
|
||||
# performance, and we use the shared simple env in performance tests.
|
||||
testing_mode=False,
|
||||
) as builder:
|
||||
env = builder.init_start()
|
||||
|
||||
@@ -1056,10 +1048,11 @@ class PageserverApiException(Exception):
|
||||
|
||||
|
||||
class PageserverHttpClient(requests.Session):
|
||||
def __init__(self, port: int, auth_token: Optional[str] = None):
|
||||
def __init__(self, port: int, is_testing_enabled_or_skip: Fn, auth_token: Optional[str] = None):
|
||||
super().__init__()
|
||||
self.port = port
|
||||
self.auth_token = auth_token
|
||||
self.is_testing_enabled_or_skip = is_testing_enabled_or_skip
|
||||
|
||||
if auth_token is not None:
|
||||
self.headers["Authorization"] = f"Bearer {auth_token}"
|
||||
@@ -1078,6 +1071,8 @@ class PageserverHttpClient(requests.Session):
|
||||
self.get(f"http://localhost:{self.port}/v1/status").raise_for_status()
|
||||
|
||||
def configure_failpoints(self, config_strings: Tuple[str, str] | List[Tuple[str, str]]):
|
||||
self.is_testing_enabled_or_skip()
|
||||
|
||||
if isinstance(config_strings, tuple):
|
||||
pairs = [config_strings]
|
||||
else:
|
||||
@@ -1124,14 +1119,6 @@ class PageserverHttpClient(requests.Session):
|
||||
res = self.post(f"http://localhost:{self.port}/v1/tenant/{tenant_id}/detach")
|
||||
self.verbose_error(res)
|
||||
|
||||
def tenant_load(self, tenant_id: TenantId):
|
||||
res = self.post(f"http://localhost:{self.port}/v1/tenant/{tenant_id}/load")
|
||||
self.verbose_error(res)
|
||||
|
||||
def tenant_ignore(self, tenant_id: TenantId):
|
||||
res = self.post(f"http://localhost:{self.port}/v1/tenant/{tenant_id}/ignore")
|
||||
self.verbose_error(res)
|
||||
|
||||
def tenant_status(self, tenant_id: TenantId) -> Dict[Any, Any]:
|
||||
res = self.get(f"http://localhost:{self.port}/v1/tenant/{tenant_id}")
|
||||
self.verbose_error(res)
|
||||
@@ -1217,6 +1204,8 @@ class PageserverHttpClient(requests.Session):
|
||||
def timeline_gc(
|
||||
self, tenant_id: TenantId, timeline_id: TimelineId, gc_horizon: Optional[int]
|
||||
) -> dict[str, Any]:
|
||||
self.is_testing_enabled_or_skip()
|
||||
|
||||
log.info(
|
||||
f"Requesting GC: tenant {tenant_id}, timeline {timeline_id}, gc_horizon {repr(gc_horizon)}"
|
||||
)
|
||||
@@ -1232,6 +1221,8 @@ class PageserverHttpClient(requests.Session):
|
||||
return res_json
|
||||
|
||||
def timeline_compact(self, tenant_id: TenantId, timeline_id: TimelineId):
|
||||
self.is_testing_enabled_or_skip()
|
||||
|
||||
log.info(f"Requesting compact: tenant {tenant_id}, timeline {timeline_id}")
|
||||
res = self.put(
|
||||
f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}/compact"
|
||||
@@ -1255,6 +1246,8 @@ class PageserverHttpClient(requests.Session):
|
||||
return res_json
|
||||
|
||||
def timeline_checkpoint(self, tenant_id: TenantId, timeline_id: TimelineId):
|
||||
self.is_testing_enabled_or_skip()
|
||||
|
||||
log.info(f"Requesting checkpoint: tenant {tenant_id}, timeline {timeline_id}")
|
||||
res = self.put(
|
||||
f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}/checkpoint"
|
||||
@@ -1768,13 +1761,6 @@ class NeonPageserver(PgProtocol):
|
||||
".*Removing intermediate uninit mark file.*",
|
||||
# FIXME: known race condition in TaskHandle: https://github.com/neondatabase/neon/issues/2885
|
||||
".*sender is dropped while join handle is still alive.*",
|
||||
# Tenant::delete_timeline() can cause any of the four following errors.
|
||||
# FIXME: we shouldn't be considering it an error: https://github.com/neondatabase/neon/issues/2946
|
||||
".*could not flush frozen layer.*queue is in state Stopped", # when schedule layer upload fails because queued got closed before compaction got killed
|
||||
".*wait for layer upload ops to complete.*", # .*Caused by:.*wait_completion aborted because upload queue was stopped
|
||||
".*gc_loop.*Gc failed, retrying in.*timeline is Stopping", # When gc checks timeline state after acquiring layer_removal_cs
|
||||
".*compaction_loop.*Compaction failed, retrying in.*timeline is Stopping", # When compaction checks timeline state after acquiring layer_removal_cs
|
||||
".*query handler for 'pagestream.*failed: Timeline .* was not found", # postgres reconnects while timeline_delete doesn't hold the tenant's timelines.lock()
|
||||
]
|
||||
|
||||
def start(
|
||||
@@ -1814,6 +1800,10 @@ class NeonPageserver(PgProtocol):
|
||||
):
|
||||
self.stop(immediate=True)
|
||||
|
||||
def is_testing_enabled_or_skip(self):
|
||||
if '"testing"' not in self.version:
|
||||
pytest.skip("pageserver was built without 'testing' feature")
|
||||
|
||||
def is_profiling_enabled_or_skip(self):
|
||||
if '"profiling"' not in self.version:
|
||||
pytest.skip("pageserver was built without 'profiling' feature")
|
||||
@@ -1822,6 +1812,7 @@ class NeonPageserver(PgProtocol):
|
||||
return PageserverHttpClient(
|
||||
port=self.service_port.http,
|
||||
auth_token=auth_token,
|
||||
is_testing_enabled_or_skip=self.is_testing_enabled_or_skip,
|
||||
)
|
||||
|
||||
def assert_no_errors(self):
|
||||
@@ -2537,7 +2528,6 @@ class SafekeeperTimelineStatus:
|
||||
acceptor_epoch: int
|
||||
pg_version: int
|
||||
flush_lsn: Lsn
|
||||
commit_lsn: Lsn
|
||||
timeline_start_lsn: Lsn
|
||||
backup_lsn: Lsn
|
||||
remote_consistent_lsn: Lsn
|
||||
@@ -2587,7 +2577,6 @@ class SafekeeperHttpClient(requests.Session):
|
||||
acceptor_epoch=resj["acceptor_state"]["epoch"],
|
||||
pg_version=resj["pg_info"]["pg_version"],
|
||||
flush_lsn=Lsn(resj["flush_lsn"]),
|
||||
commit_lsn=Lsn(resj["commit_lsn"]),
|
||||
timeline_start_lsn=Lsn(resj["timeline_start_lsn"]),
|
||||
backup_lsn=Lsn(resj["backup_lsn"]),
|
||||
remote_consistent_lsn=Lsn(resj["remote_consistent_lsn"]),
|
||||
|
||||
@@ -3,7 +3,7 @@
|
||||
First make a release build. The profiling flag is optional, used only for tests that
|
||||
generate flame graphs. The `-s` flag just silences a lot of output, and makes it
|
||||
easier to see if you have compile errors without scrolling up.
|
||||
`BUILD_TYPE=release CARGO_BUILD_FLAGS="--features=profiling" make -s -j8`
|
||||
`BUILD_TYPE=release CARGO_BUILD_FLAGS="--features=testing,profiling" make -s -j8`
|
||||
|
||||
NOTE: the `profiling` flag only works on linux because we use linux-specific
|
||||
libc APIs like `libc::timer_t`.
|
||||
|
||||
@@ -42,8 +42,7 @@ def test_bulk_update(neon_env_builder: NeonEnvBuilder, zenbenchmark, fillfactor)
|
||||
|
||||
cur.execute("drop table t")
|
||||
cur.execute("set enable_seqscan_prefetch=on")
|
||||
cur.execute("set effective_io_concurrency=32")
|
||||
cur.execute("set maintenance_io_concurrency=32")
|
||||
cur.execute("set seqscan_prefetch_buffers=100")
|
||||
|
||||
cur.execute(f"create table t2(x integer) WITH (fillfactor={fillfactor})")
|
||||
|
||||
|
||||
@@ -12,12 +12,12 @@ def test_layer_map(neon_env_builder: NeonEnvBuilder, zenbenchmark):
|
||||
n_iters = 10
|
||||
n_records = 100000
|
||||
|
||||
# We want to have a lot of layer files to exercise the layer map. Disable
|
||||
# GC, and make checkpoint_distance very small, so that we get a lot of small layer
|
||||
# files.
|
||||
# We want to have a lot of layer files to exercise the layer map. Make
|
||||
# gc_horizon and checkpoint_distance very small, so that we get a lot of small layer files.
|
||||
tenant, _ = env.neon_cli.create_tenant(
|
||||
conf={
|
||||
"gc_period": "0s",
|
||||
"gc_period": "100 m",
|
||||
"gc_horizon": "1048576",
|
||||
"checkpoint_distance": "8192",
|
||||
"compaction_period": "1 s",
|
||||
"compaction_threshold": "1",
|
||||
|
||||
@@ -1,111 +0,0 @@
|
||||
from dataclasses import dataclass
|
||||
from typing import Dict, Tuple
|
||||
|
||||
import pytest
|
||||
from fixtures.compare_fixtures import RemoteCompare
|
||||
from fixtures.log_helper import log
|
||||
|
||||
|
||||
@dataclass
class LabelledQuery:
    """Pair an SQL statement with the label it is reported under."""

    # Identifier of the query in the test report.
    label: str
    # SQL text of the query.
    query: str
|
||||
|
||||
|
||||
# A list of queries to run.
|
||||
# Please do not alter the label for the query, as it is used to identify it.
|
||||
# Labels for ClickBench queries match the labels in ClickBench reports
|
||||
# on https://benchmark.clickhouse.com/ (the DB size may differ).
|
||||
QUERIES: Tuple[LabelledQuery, ...] = (
|
||||
# Disable `black` formatting for the list of queries so that it's easier to read
|
||||
# fmt: off
|
||||
### ClickBench queries:
|
||||
LabelledQuery("Q0", r"SELECT COUNT(*) FROM hits;"),
|
||||
LabelledQuery("Q1", r"SELECT COUNT(*) FROM hits WHERE AdvEngineID <> 0;"),
|
||||
LabelledQuery("Q2", r"SELECT SUM(AdvEngineID), COUNT(*), AVG(ResolutionWidth) FROM hits;"),
|
||||
LabelledQuery("Q3", r"SELECT AVG(UserID) FROM hits;"),
|
||||
LabelledQuery("Q4", r"SELECT COUNT(DISTINCT UserID) FROM hits;"),
|
||||
LabelledQuery("Q5", r"SELECT COUNT(DISTINCT SearchPhrase) FROM hits;"),
|
||||
LabelledQuery("Q6", r"SELECT MIN(EventDate), MAX(EventDate) FROM hits;"),
|
||||
LabelledQuery("Q7", r"SELECT AdvEngineID, COUNT(*) FROM hits WHERE AdvEngineID <> 0 GROUP BY AdvEngineID ORDER BY COUNT(*) DESC;"),
|
||||
LabelledQuery("Q8", r"SELECT RegionID, COUNT(DISTINCT UserID) AS u FROM hits GROUP BY RegionID ORDER BY u DESC LIMIT 10;"),
|
||||
LabelledQuery("Q9", r"SELECT RegionID, SUM(AdvEngineID), COUNT(*) AS c, AVG(ResolutionWidth), COUNT(DISTINCT UserID) FROM hits GROUP BY RegionID ORDER BY c DESC LIMIT 10;"),
|
||||
LabelledQuery("Q10", r"SELECT MobilePhoneModel, COUNT(DISTINCT UserID) AS u FROM hits WHERE MobilePhoneModel <> '' GROUP BY MobilePhoneModel ORDER BY u DESC LIMIT 10;"),
|
||||
LabelledQuery("Q11", r"SELECT MobilePhone, MobilePhoneModel, COUNT(DISTINCT UserID) AS u FROM hits WHERE MobilePhoneModel <> '' GROUP BY MobilePhone, MobilePhoneModel ORDER BY u DESC LIMIT 10;"),
|
||||
LabelledQuery("Q12", r"SELECT SearchPhrase, COUNT(*) AS c FROM hits WHERE SearchPhrase <> '' GROUP BY SearchPhrase ORDER BY c DESC LIMIT 10;"),
|
||||
LabelledQuery("Q13", r"SELECT SearchPhrase, COUNT(DISTINCT UserID) AS u FROM hits WHERE SearchPhrase <> '' GROUP BY SearchPhrase ORDER BY u DESC LIMIT 10;"),
|
||||
LabelledQuery("Q14", r"SELECT SearchEngineID, SearchPhrase, COUNT(*) AS c FROM hits WHERE SearchPhrase <> '' GROUP BY SearchEngineID, SearchPhrase ORDER BY c DESC LIMIT 10;"),
|
||||
LabelledQuery("Q15", r"SELECT UserID, COUNT(*) FROM hits GROUP BY UserID ORDER BY COUNT(*) DESC LIMIT 10;"),
|
||||
LabelledQuery("Q16", r"SELECT UserID, SearchPhrase, COUNT(*) FROM hits GROUP BY UserID, SearchPhrase ORDER BY COUNT(*) DESC LIMIT 10;"),
|
||||
LabelledQuery("Q17", r"SELECT UserID, SearchPhrase, COUNT(*) FROM hits GROUP BY UserID, SearchPhrase LIMIT 10;"),
|
||||
LabelledQuery("Q18", r"SELECT UserID, extract(minute FROM EventTime) AS m, SearchPhrase, COUNT(*) FROM hits GROUP BY UserID, m, SearchPhrase ORDER BY COUNT(*) DESC LIMIT 10;"),
|
||||
LabelledQuery("Q19", r"SELECT UserID FROM hits WHERE UserID = 435090932899640449;"),
|
||||
LabelledQuery("Q20", r"SELECT COUNT(*) FROM hits WHERE URL LIKE '%google%';"),
|
||||
LabelledQuery("Q21", r"SELECT SearchPhrase, MIN(URL), COUNT(*) AS c FROM hits WHERE URL LIKE '%google%' AND SearchPhrase <> '' GROUP BY SearchPhrase ORDER BY c DESC LIMIT 10;"),
|
||||
LabelledQuery("Q22", r"SELECT SearchPhrase, MIN(URL), MIN(Title), COUNT(*) AS c, COUNT(DISTINCT UserID) FROM hits WHERE Title LIKE '%Google%' AND URL NOT LIKE '%.google.%' AND SearchPhrase <> '' GROUP BY SearchPhrase ORDER BY c DESC LIMIT 10;"),
|
||||
LabelledQuery("Q23", r"SELECT * FROM hits WHERE URL LIKE '%google%' ORDER BY EventTime LIMIT 10;"),
|
||||
LabelledQuery("Q24", r"SELECT SearchPhrase FROM hits WHERE SearchPhrase <> '' ORDER BY EventTime LIMIT 10;"),
|
||||
LabelledQuery("Q25", r"SELECT SearchPhrase FROM hits WHERE SearchPhrase <> '' ORDER BY SearchPhrase LIMIT 10;"),
|
||||
LabelledQuery("Q26", r"SELECT SearchPhrase FROM hits WHERE SearchPhrase <> '' ORDER BY EventTime, SearchPhrase LIMIT 10;"),
|
||||
LabelledQuery("Q27", r"SELECT CounterID, AVG(length(URL)) AS l, COUNT(*) AS c FROM hits WHERE URL <> '' GROUP BY CounterID HAVING COUNT(*) > 100000 ORDER BY l DESC LIMIT 25;"),
|
||||
LabelledQuery("Q28", r"SELECT REGEXP_REPLACE(Referer, '^https?://(?:www\.)?([^/]+)/.*$', '\1') AS k, AVG(length(Referer)) AS l, COUNT(*) AS c, MIN(Referer) FROM hits WHERE Referer <> '' GROUP BY k HAVING COUNT(*) > 100000 ORDER BY l DESC LIMIT 25;"),
|
||||
LabelledQuery("Q29", r"SELECT SUM(ResolutionWidth), SUM(ResolutionWidth + 1), SUM(ResolutionWidth + 2), SUM(ResolutionWidth + 3), SUM(ResolutionWidth + 4), SUM(ResolutionWidth + 5), SUM(ResolutionWidth + 6), SUM(ResolutionWidth + 7), SUM(ResolutionWidth + 8), SUM(ResolutionWidth + 9), SUM(ResolutionWidth + 10), SUM(ResolutionWidth + 11), SUM(ResolutionWidth + 12), SUM(ResolutionWidth + 13), SUM(ResolutionWidth + 14), SUM(ResolutionWidth + 15), SUM(ResolutionWidth + 16), SUM(ResolutionWidth + 17), SUM(ResolutionWidth + 18), SUM(ResolutionWidth + 19), SUM(ResolutionWidth + 20), SUM(ResolutionWidth + 21), SUM(ResolutionWidth + 22), SUM(ResolutionWidth + 23), SUM(ResolutionWidth + 24), SUM(ResolutionWidth + 25), SUM(ResolutionWidth + 26), SUM(ResolutionWidth + 27), SUM(ResolutionWidth + 28), SUM(ResolutionWidth + 29), SUM(ResolutionWidth + 30), SUM(ResolutionWidth + 31), SUM(ResolutionWidth + 32), SUM(ResolutionWidth + 33), SUM(ResolutionWidth + 34), SUM(ResolutionWidth + 35), SUM(ResolutionWidth + 36), SUM(ResolutionWidth + 37), SUM(ResolutionWidth + 38), SUM(ResolutionWidth + 39), SUM(ResolutionWidth + 40), SUM(ResolutionWidth + 41), SUM(ResolutionWidth + 42), SUM(ResolutionWidth + 43), SUM(ResolutionWidth + 44), SUM(ResolutionWidth + 45), SUM(ResolutionWidth + 46), SUM(ResolutionWidth + 47), SUM(ResolutionWidth + 48), SUM(ResolutionWidth + 49), SUM(ResolutionWidth + 50), SUM(ResolutionWidth + 51), SUM(ResolutionWidth + 52), SUM(ResolutionWidth + 53), SUM(ResolutionWidth + 54), SUM(ResolutionWidth + 55), SUM(ResolutionWidth + 56), SUM(ResolutionWidth + 57), SUM(ResolutionWidth + 58), SUM(ResolutionWidth + 59), SUM(ResolutionWidth + 60), SUM(ResolutionWidth + 61), SUM(ResolutionWidth + 62), SUM(ResolutionWidth + 63), SUM(ResolutionWidth + 64), SUM(ResolutionWidth + 65), SUM(ResolutionWidth + 66), SUM(ResolutionWidth + 67), SUM(ResolutionWidth + 68), SUM(ResolutionWidth + 69), SUM(ResolutionWidth + 70), SUM(ResolutionWidth + 71), SUM(ResolutionWidth + 72), 
SUM(ResolutionWidth + 73), SUM(ResolutionWidth + 74), SUM(ResolutionWidth + 75), SUM(ResolutionWidth + 76), SUM(ResolutionWidth + 77), SUM(ResolutionWidth + 78), SUM(ResolutionWidth + 79), SUM(ResolutionWidth + 80), SUM(ResolutionWidth + 81), SUM(ResolutionWidth + 82), SUM(ResolutionWidth + 83), SUM(ResolutionWidth + 84), SUM(ResolutionWidth + 85), SUM(ResolutionWidth + 86), SUM(ResolutionWidth + 87), SUM(ResolutionWidth + 88), SUM(ResolutionWidth + 89) FROM hits;"),
|
||||
LabelledQuery("Q30", r"SELECT SearchEngineID, ClientIP, COUNT(*) AS c, SUM(IsRefresh), AVG(ResolutionWidth) FROM hits WHERE SearchPhrase <> '' GROUP BY SearchEngineID, ClientIP ORDER BY c DESC LIMIT 10;"),
|
||||
LabelledQuery("Q31", r"SELECT WatchID, ClientIP, COUNT(*) AS c, SUM(IsRefresh), AVG(ResolutionWidth) FROM hits WHERE SearchPhrase <> '' GROUP BY WatchID, ClientIP ORDER BY c DESC LIMIT 10;"),
|
||||
LabelledQuery("Q32", r"SELECT WatchID, ClientIP, COUNT(*) AS c, SUM(IsRefresh), AVG(ResolutionWidth) FROM hits GROUP BY WatchID, ClientIP ORDER BY c DESC LIMIT 10;"),
|
||||
LabelledQuery("Q33", r"SELECT URL, COUNT(*) AS c FROM hits GROUP BY URL ORDER BY c DESC LIMIT 10;"),
|
||||
LabelledQuery("Q34", r"SELECT 1, URL, COUNT(*) AS c FROM hits GROUP BY 1, URL ORDER BY c DESC LIMIT 10;"),
|
||||
LabelledQuery("Q35", r"SELECT ClientIP, ClientIP - 1, ClientIP - 2, ClientIP - 3, COUNT(*) AS c FROM hits GROUP BY ClientIP, ClientIP - 1, ClientIP - 2, ClientIP - 3 ORDER BY c DESC LIMIT 10;"),
|
||||
LabelledQuery("Q36", r"SELECT URL, COUNT(*) AS PageViews FROM hits WHERE CounterID = 62 AND EventDate >= '2013-07-01' AND EventDate <= '2013-07-31' AND DontCountHits = 0 AND IsRefresh = 0 AND URL <> '' GROUP BY URL ORDER BY PageViews DESC LIMIT 10;"),
|
||||
LabelledQuery("Q37", r"SELECT Title, COUNT(*) AS PageViews FROM hits WHERE CounterID = 62 AND EventDate >= '2013-07-01' AND EventDate <= '2013-07-31' AND DontCountHits = 0 AND IsRefresh = 0 AND Title <> '' GROUP BY Title ORDER BY PageViews DESC LIMIT 10;"),
|
||||
LabelledQuery("Q38", r"SELECT URL, COUNT(*) AS PageViews FROM hits WHERE CounterID = 62 AND EventDate >= '2013-07-01' AND EventDate <= '2013-07-31' AND IsRefresh = 0 AND IsLink <> 0 AND IsDownload = 0 GROUP BY URL ORDER BY PageViews DESC LIMIT 10 OFFSET 1000;"),
|
||||
LabelledQuery("Q39", r"SELECT TraficSourceID, SearchEngineID, AdvEngineID, CASE WHEN (SearchEngineID = 0 AND AdvEngineID = 0) THEN Referer ELSE '' END AS Src, URL AS Dst, COUNT(*) AS PageViews FROM hits WHERE CounterID = 62 AND EventDate >= '2013-07-01' AND EventDate <= '2013-07-31' AND IsRefresh = 0 GROUP BY TraficSourceID, SearchEngineID, AdvEngineID, Src, Dst ORDER BY PageViews DESC LIMIT 10 OFFSET 1000;"),
|
||||
LabelledQuery("Q40", r"SELECT URLHash, EventDate, COUNT(*) AS PageViews FROM hits WHERE CounterID = 62 AND EventDate >= '2013-07-01' AND EventDate <= '2013-07-31' AND IsRefresh = 0 AND TraficSourceID IN (-1, 6) AND RefererHash = 3594120000172545465 GROUP BY URLHash, EventDate ORDER BY PageViews DESC LIMIT 10 OFFSET 100;"),
|
||||
LabelledQuery("Q41", r"SELECT WindowClientWidth, WindowClientHeight, COUNT(*) AS PageViews FROM hits WHERE CounterID = 62 AND EventDate >= '2013-07-01' AND EventDate <= '2013-07-31' AND IsRefresh = 0 AND DontCountHits = 0 AND URLHash = 2868770270353813622 GROUP BY WindowClientWidth, WindowClientHeight ORDER BY PageViews DESC LIMIT 10 OFFSET 10000;"),
|
||||
LabelledQuery("Q42", r"SELECT DATE_TRUNC('minute', EventTime) AS M, COUNT(*) AS PageViews FROM hits WHERE CounterID = 62 AND EventDate >= '2013-07-14' AND EventDate <= '2013-07-15' AND IsRefresh = 0 AND DontCountHits = 0 GROUP BY DATE_TRUNC('minute', EventTime) ORDER BY DATE_TRUNC('minute', EventTime) LIMIT 10 OFFSET 1000;"),
|
||||
### Custom Neon queries:
|
||||
# I suggest using the NQ prefix (which stands for Neon Query) instead of Q
|
||||
# to not intersect with the original ClickBench queries if their list is extended.
|
||||
#
|
||||
# LabelledQuery("NQ0", r"..."),
|
||||
# LabelledQuery("NQ1", r"..."),
|
||||
# ...
|
||||
# fmt: on
|
||||
)
|
||||
|
||||
|
||||
def run_psql(env: RemoteCompare, labelled_query: LabelledQuery, times: int) -> None:
    """Execute one labelled query through ``psql`` several times, timing each run.

    The connection string is built without a password; when the default
    connection options contain one, it is handed to ``psql`` through the
    ``PGPASSWORD`` environment variable instead. ``-cstatement_timeout=0`` is
    prepended to whatever ``options`` the defaults already carry.

    Each run is recorded with ``env.zenbenchmark.record_duration`` under the
    name ``"<label>/<run>"``, where runs are numbered starting from 1.

    Args:
        env: Comparison fixture providing ``pg``, ``pg_bin`` and ``zenbenchmark``.
        labelled_query: The query to run together with its report label.
        times: How many times to run the query.
    """
    # Prepare the connection string:
    # - keep the password out of it so it can be passed via the environment
    # - add our options on top of the default ones
    password = env.pg.default_options.get("password", None)
    options = f"-cstatement_timeout=0 {env.pg.default_options.get('options', '')}"
    connstr = env.pg.connstr(password=None, options=options)

    environ: Dict[str, str] = {} if password is None else {"PGPASSWORD": password}

    label = labelled_query.label
    query = labelled_query.query

    log.info(f"Running query {label} {times} times")
    for run in range(1, times + 1):
        log.info(f"Run {run}/{times}")
        with env.zenbenchmark.record_duration(f"{label}/{run}"):
            env.pg_bin.run_capture(["psql", connstr, "-c", query], env=environ)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("query", QUERIES)
@pytest.mark.remote_cluster
def test_clickbench(query: LabelledQuery, remote_compare: RemoteCompare):
    """
    Run one query of the OLAP-style ClickBench benchmark three times.

    Based on https://github.com/ClickHouse/ClickBench/tree/c00135ca5b6a0d86fedcdbf998fdaa8ed85c1c3b/aurora-postgresql
    The database is prepared manually in advance.
    """

    run_psql(env=remote_compare, labelled_query=query, times=3)
|
||||
@@ -1,14 +1,10 @@
|
||||
from contextlib import closing
|
||||
|
||||
from fixtures.neon_fixtures import NeonEnvBuilder, wait_for_last_record_lsn
|
||||
from fixtures.types import Lsn, TenantId, TimelineId
|
||||
from fixtures.utils import query_scalar
|
||||
from fixtures.neon_fixtures import NeonEnvBuilder
|
||||
|
||||
|
||||
# This test demonstrates how to collect a read trace. It's useful until
|
||||
# it gets replaced by a test that actually does stuff with the trace.
|
||||
#
|
||||
# Additionally, tests that pageserver is able to create tenants with custom configs.
|
||||
def test_read_request_tracing(neon_env_builder: NeonEnvBuilder):
|
||||
neon_env_builder.num_safekeepers = 1
|
||||
env = neon_env_builder.init_start()
|
||||
@@ -27,12 +23,6 @@ def test_read_request_tracing(neon_env_builder: NeonEnvBuilder):
|
||||
cur.execute("create table t (i integer);")
|
||||
cur.execute(f"insert into t values (generate_series(1,{10000}));")
|
||||
cur.execute("select count(*) from t;")
|
||||
tenant_id = TenantId(pg.safe_psql("show neon.tenant_id")[0][0])
|
||||
timeline_id = TimelineId(pg.safe_psql("show neon.timeline_id")[0][0])
|
||||
current_lsn = Lsn(query_scalar(cur, "SELECT pg_current_wal_flush_lsn()"))
|
||||
# wait until pageserver receives that data
|
||||
pageserver_http = env.pageserver.http_client()
|
||||
wait_for_last_record_lsn(pageserver_http, tenant_id, timeline_id, current_lsn)
|
||||
|
||||
# Stop pg so we drop the connection and flush the traces
|
||||
pg.stop()
|
||||
@@ -11,12 +11,16 @@ def test_ancestor_branch(neon_env_builder: NeonEnvBuilder):
|
||||
env = neon_env_builder.init_start()
|
||||
pageserver_http = env.pageserver.http_client()
|
||||
|
||||
# Override defaults: 4M checkpoint_distance, disable background compaction and gc.
|
||||
# Override defaults, 1M gc_horizon and 4M checkpoint_distance.
|
||||
# Extend compaction_period and gc_period to disable background compaction and gc.
|
||||
tenant, _ = env.neon_cli.create_tenant(
|
||||
conf={
|
||||
"gc_period": "10 m",
|
||||
"gc_horizon": "1048576",
|
||||
"checkpoint_distance": "4194304",
|
||||
"gc_period": "0s",
|
||||
"compaction_period": "0s",
|
||||
"compaction_period": "10 m",
|
||||
"compaction_threshold": "2",
|
||||
"compaction_target_size": "4194304",
|
||||
}
|
||||
)
|
||||
|
||||
|
||||
@@ -52,7 +52,8 @@ def test_branch_and_gc(neon_simple_env: NeonEnv):
|
||||
tenant, _ = env.neon_cli.create_tenant(
|
||||
conf={
|
||||
# disable background GC
|
||||
"gc_period": "0s",
|
||||
"gc_period": "10 m",
|
||||
"gc_horizon": f"{10 * 1024 ** 3}",
|
||||
# small checkpoint distance to create more delta layer files
|
||||
"checkpoint_distance": f"{1024 ** 2}",
|
||||
# set the target size to be large to allow the image layer to cover the whole key space
|
||||
@@ -126,7 +127,8 @@ def test_branch_creation_before_gc(neon_simple_env: NeonEnv):
|
||||
tenant, _ = env.neon_cli.create_tenant(
|
||||
conf={
|
||||
# disable background GC
|
||||
"gc_period": "0s",
|
||||
"gc_period": "10 m",
|
||||
"gc_horizon": f"{10 * 1024 ** 3}",
|
||||
# small checkpoint distance to create more delta layer files
|
||||
"checkpoint_distance": f"{1024 ** 2}",
|
||||
# set the target size to be large to allow the image layer to cover the whole key space
|
||||
|
||||
@@ -327,6 +327,7 @@ def check_neon_works(
|
||||
auth_token = snapshot_config["pageserver"]["auth_token"]
|
||||
pageserver_http = PageserverHttpClient(
|
||||
port=pageserver_port,
|
||||
is_testing_enabled_or_skip=lambda: True, # TODO: check if testing really enabled
|
||||
auth_token=auth_token,
|
||||
)
|
||||
|
||||
|
||||
@@ -120,12 +120,6 @@ def test_import_from_vanilla(test_output_dir, pg_bin, vanilla_pg, neon_env_build
|
||||
]
|
||||
)
|
||||
|
||||
# Importing empty file fails
|
||||
empty_file = os.path.join(test_output_dir, "empty_file")
|
||||
with open(empty_file, "w") as _:
|
||||
with pytest.raises(Exception):
|
||||
import_tar(empty_file, empty_file)
|
||||
|
||||
# Importing corrupt backup fails
|
||||
with pytest.raises(Exception):
|
||||
import_tar(corrupt_base_tar, wal_tar)
|
||||
|
||||
@@ -13,6 +13,7 @@ def test_pageserver_recovery(neon_env_builder: NeonEnvBuilder):
|
||||
neon_env_builder.pageserver_config_override = "tenant_config={checkpoint_distance = 1048576}"
|
||||
|
||||
env = neon_env_builder.init()
|
||||
env.pageserver.is_testing_enabled_or_skip()
|
||||
|
||||
neon_env_builder.start()
|
||||
|
||||
|
||||
@@ -4,7 +4,6 @@
|
||||
import os
|
||||
import re
|
||||
import shutil
|
||||
import threading
|
||||
import time
|
||||
from pathlib import Path
|
||||
|
||||
@@ -12,7 +11,6 @@ import pytest
|
||||
from fixtures.log_helper import log
|
||||
from fixtures.neon_fixtures import (
|
||||
NeonEnvBuilder,
|
||||
PageserverApiException,
|
||||
RemoteStorageKind,
|
||||
assert_no_in_progress_downloads_for_tenant,
|
||||
available_remote_storages,
|
||||
@@ -71,10 +69,8 @@ def test_remote_storage_backup_and_restore(
|
||||
# FIXME retry downloads without throwing errors
|
||||
env.pageserver.allowed_errors.append(".*failed to load remote timeline.*")
|
||||
# we have a bunch of pytest.raises for these below
|
||||
env.pageserver.allowed_errors.append(".*tenant .*? already exists, state:.*")
|
||||
env.pageserver.allowed_errors.append(
|
||||
".*Cannot attach tenant .*?, local tenant directory already exists.*"
|
||||
)
|
||||
env.pageserver.allowed_errors.append(".*tenant already exists.*")
|
||||
env.pageserver.allowed_errors.append(".*attach is already in progress.*")
|
||||
|
||||
pageserver_http = env.pageserver.http_client()
|
||||
pg = env.postgres.create_start("main")
|
||||
@@ -112,8 +108,9 @@ def test_remote_storage_backup_and_restore(
|
||||
# run checkpoint manually to be sure that data landed in remote storage
|
||||
pageserver_http.timeline_checkpoint(tenant_id, timeline_id)
|
||||
|
||||
# wait until pageserver successfully uploaded a checkpoint to remote storage
|
||||
log.info(f"waiting for checkpoint {checkpoint_number} upload")
|
||||
|
||||
# wait until pageserver successfully uploaded a checkpoint to remote storage
|
||||
wait_for_upload(client, tenant_id, timeline_id, current_lsn)
|
||||
log.info(f"upload of checkpoint {checkpoint_number} is done")
|
||||
|
||||
@@ -138,7 +135,7 @@ def test_remote_storage_backup_and_restore(
|
||||
|
||||
# assert cannot attach timeline that is scheduled for download
|
||||
# FIXME implement layer download retries
|
||||
with pytest.raises(Exception, match=f"tenant {tenant_id} already exists, state: Broken"):
|
||||
with pytest.raises(Exception, match="tenant already exists, current state: Broken"):
|
||||
client.tenant_attach(tenant_id)
|
||||
|
||||
tenant_status = client.tenant_status(tenant_id)
|
||||
@@ -151,7 +148,9 @@ def test_remote_storage_backup_and_restore(
|
||||
env.pageserver.start()
|
||||
|
||||
# ensure that an initiated attach operation survives pageserver restart
|
||||
with pytest.raises(Exception, match=f"tenant {tenant_id} already exists, state:"):
|
||||
with pytest.raises(
|
||||
Exception, match=r".*(tenant already exists|attach is already in progress).*"
|
||||
):
|
||||
client.tenant_attach(tenant_id)
|
||||
log.info("waiting for timeline redownload")
|
||||
wait_until(
|
||||
@@ -165,6 +164,7 @@ def test_remote_storage_backup_and_restore(
|
||||
assert (
|
||||
Lsn(detail["last_record_lsn"]) >= current_lsn
|
||||
), "current db Lsn should should not be less than the one stored on remote storage"
|
||||
assert not detail["awaits_download"]
|
||||
|
||||
pg = env.postgres.create_start("main")
|
||||
with pg.cursor() as cur:
|
||||
@@ -190,7 +190,7 @@ def test_remote_storage_upload_queue_retries(
|
||||
|
||||
neon_env_builder.enable_remote_storage(
|
||||
remote_storage_kind=remote_storage_kind,
|
||||
test_name="test_remote_storage_upload_queue_retries",
|
||||
test_name="test_remote_storage_backup_and_restore",
|
||||
)
|
||||
|
||||
env = neon_env_builder.init_start()
|
||||
@@ -199,17 +199,15 @@ def test_remote_storage_upload_queue_retries(
|
||||
# compaction and gc
|
||||
tenant_id, timeline_id = env.neon_cli.create_tenant(
|
||||
conf={
|
||||
# small checkpointing and compaction targets to ensure we generate many upload operations
|
||||
"checkpoint_distance": f"{128 * 1024}",
|
||||
# small checkpointing and compaction targets to ensure we generate many operations
|
||||
"checkpoint_distance": f"{32 * 1024}",
|
||||
"compaction_threshold": "1",
|
||||
"compaction_target_size": f"{128 * 1024}",
|
||||
# no PITR horizon, we specify the horizon when we request on-demand GC
|
||||
"compaction_target_size": f"{32 * 1024}",
|
||||
# large horizon to avoid automatic GC (our assert on gc_result below relies on that)
|
||||
"gc_horizon": f"{1024 ** 4}",
|
||||
"gc_period": "1h",
|
||||
# disable PITR so that GC considers just gc_horizon
|
||||
"pitr_interval": "0s",
|
||||
# disable background compaction and GC. We invoke it manually when we want it to happen.
|
||||
"gc_period": "0s",
|
||||
"compaction_period": "0s",
|
||||
# don't create image layers, that causes just noise
|
||||
"image_creation_threshold": "10000",
|
||||
}
|
||||
)
|
||||
|
||||
@@ -273,47 +271,27 @@ def test_remote_storage_upload_queue_retries(
|
||||
# let all future operations queue up
|
||||
configure_storage_sync_failpoints("return")
|
||||
|
||||
# Create more churn to generate all upload ops.
|
||||
# The checkpoint / compact / gc ops will block because they call remote_client.wait_completion().
|
||||
# So, run this in a different thread.
|
||||
churn_thread_result = [False]
|
||||
# create more churn to generate all upload ops
|
||||
overwrite_data_and_wait_for_it_to_arrive_at_pageserver("c")
|
||||
client.timeline_checkpoint(tenant_id, timeline_id)
|
||||
overwrite_data_and_wait_for_it_to_arrive_at_pageserver("d")
|
||||
client.timeline_compact(tenant_id, timeline_id)
|
||||
gc_result = client.timeline_gc(tenant_id, timeline_id, 0)
|
||||
print_gc_result(gc_result)
|
||||
assert gc_result["layers_removed"] > 0
|
||||
|
||||
def churn_while_failpoints_active(result):
|
||||
overwrite_data_and_wait_for_it_to_arrive_at_pageserver("c")
|
||||
client.timeline_checkpoint(tenant_id, timeline_id)
|
||||
client.timeline_compact(tenant_id, timeline_id)
|
||||
overwrite_data_and_wait_for_it_to_arrive_at_pageserver("d")
|
||||
client.timeline_checkpoint(tenant_id, timeline_id)
|
||||
client.timeline_compact(tenant_id, timeline_id)
|
||||
gc_result = client.timeline_gc(tenant_id, timeline_id, 0)
|
||||
print_gc_result(gc_result)
|
||||
assert gc_result["layers_removed"] > 0
|
||||
result[0] = True
|
||||
# ensure that all operation types that can be in the upload queue have queued up
|
||||
assert get_queued_count(file_kind="layer", op_kind="upload") > 0
|
||||
assert get_queued_count(file_kind="index", op_kind="upload") >= 2
|
||||
assert get_queued_count(file_kind="layer", op_kind="delete") > 0
|
||||
|
||||
churn_while_failpoints_active_thread = threading.Thread(
|
||||
target=churn_while_failpoints_active, args=[churn_thread_result]
|
||||
)
|
||||
churn_while_failpoints_active_thread.start()
|
||||
|
||||
# wait for churn thread's data to get stuck in the upload queue
|
||||
wait_until(10, 0.1, lambda: get_queued_count(file_kind="layer", op_kind="upload") > 0)
|
||||
wait_until(10, 0.1, lambda: get_queued_count(file_kind="index", op_kind="upload") >= 2)
|
||||
wait_until(10, 0.1, lambda: get_queued_count(file_kind="layer", op_kind="delete") > 0)
|
||||
|
||||
# unblock churn operations
|
||||
# unblock all operations and wait for them to finish
|
||||
configure_storage_sync_failpoints("off")
|
||||
|
||||
# ... and wait for them to finish. The upload queue uses exponential back-off, so use generous timeouts.
|
||||
wait_until(30, 1, lambda: get_queued_count(file_kind="layer", op_kind="upload") == 0)
|
||||
wait_until(30, 1, lambda: get_queued_count(file_kind="index", op_kind="upload") == 0)
|
||||
wait_until(30, 1, lambda: get_queued_count(file_kind="layer", op_kind="delete") == 0)
|
||||
|
||||
# The churn thread doesn't make progress once it blocks on the first wait_completion() call,
|
||||
# so, give it some time to wrap up.
|
||||
churn_while_failpoints_active_thread.join(30)
|
||||
assert not churn_while_failpoints_active_thread.is_alive()
|
||||
assert churn_thread_result[0]
|
||||
|
||||
# try a restore to verify that the uploads worked
|
||||
# XXX: should vary this test to selectively fail just layer uploads, index uploads, deletions
|
||||
# but how do we validate the result after restore?
|
||||
@@ -352,7 +330,7 @@ def test_timeline_deletion_with_files_stuck_in_upload_queue(
|
||||
):
|
||||
neon_env_builder.enable_remote_storage(
|
||||
remote_storage_kind=remote_storage_kind,
|
||||
test_name="test_timeline_deletion_with_files_stuck_in_upload_queue",
|
||||
test_name="test_remote_storage_backup_and_restore",
|
||||
)
|
||||
|
||||
env = neon_env_builder.init_start()
|
||||
@@ -372,20 +350,9 @@ def test_timeline_deletion_with_files_stuck_in_upload_queue(
|
||||
"pitr_interval": "0s",
|
||||
}
|
||||
)
|
||||
timeline_path = env.repo_dir / "tenants" / str(tenant_id) / "timelines" / str(timeline_id)
|
||||
|
||||
client = env.pageserver.http_client()
|
||||
|
||||
def get_queued_count(file_kind, op_kind):
|
||||
metrics = client.get_metrics()
|
||||
matches = re.search(
|
||||
f'^pageserver_remote_upload_queue_unfinished_tasks{{file_kind="{file_kind}",op_kind="{op_kind}",tenant_id="{tenant_id}",timeline_id="{timeline_id}"}} (\\S+)$',
|
||||
metrics,
|
||||
re.MULTILINE,
|
||||
)
|
||||
assert matches
|
||||
return int(matches[1])
|
||||
|
||||
pg = env.postgres.create_start("main", tenant_id=tenant_id)
|
||||
|
||||
client.configure_failpoints(("before-upload-layer", "return"))
|
||||
@@ -397,40 +364,26 @@ def test_timeline_deletion_with_files_stuck_in_upload_queue(
|
||||
]
|
||||
)
|
||||
wait_for_last_flush_lsn(env, pg, tenant_id, timeline_id)
|
||||
client.timeline_checkpoint(tenant_id, timeline_id)
|
||||
|
||||
# Kick off a checkpoint operation.
|
||||
# It will get stuck in remote_client.wait_completion(), since the select query will have
|
||||
# generated layer upload ops already.
|
||||
checkpoint_allowed_to_fail = threading.Event()
|
||||
timeline_path = env.repo_dir / "tenants" / str(tenant_id) / "timelines" / str(timeline_id)
|
||||
assert timeline_path.exists()
|
||||
assert len(list(timeline_path.glob("*"))) >= 8
|
||||
|
||||
def checkpoint_thread_fn():
|
||||
try:
|
||||
client.timeline_checkpoint(tenant_id, timeline_id)
|
||||
except PageserverApiException:
|
||||
assert (
|
||||
checkpoint_allowed_to_fail.is_set()
|
||||
), "checkpoint op should only fail in response to timeline deletion"
|
||||
def get_queued_count(file_kind, op_kind):
|
||||
metrics = client.get_metrics()
|
||||
matches = re.search(
|
||||
f'^pageserver_remote_upload_queue_unfinished_tasks{{file_kind="{file_kind}",op_kind="{op_kind}",tenant_id="{tenant_id}",timeline_id="{timeline_id}"}} (\\S+)$',
|
||||
metrics,
|
||||
re.MULTILINE,
|
||||
)
|
||||
assert matches
|
||||
return int(matches[1])
|
||||
|
||||
checkpoint_thread = threading.Thread(target=checkpoint_thread_fn)
|
||||
checkpoint_thread.start()
|
||||
assert get_queued_count(file_kind="index", op_kind="upload") > 0
|
||||
|
||||
# Wait for stuck uploads. NB: if there were earlier layer flushes initiated during `INSERT INTO`,
|
||||
# this will be their uploads. If there were none, it's the timeline_checkpoint()'s uploads.
|
||||
def assert_compacted_and_uploads_queued():
|
||||
assert timeline_path.exists()
|
||||
assert len(list(timeline_path.glob("*"))) >= 8
|
||||
assert get_queued_count(file_kind="index", op_kind="upload") > 0
|
||||
|
||||
wait_until(20, 0.1, assert_compacted_and_uploads_queued)
|
||||
|
||||
# Regardless, give checkpoint some time to block for good.
|
||||
# Not strictly necessary, but might help uncover failure modes in the future.
|
||||
time.sleep(2)
|
||||
|
||||
# Now delete the timeline. It should take priority over ongoing
|
||||
# checkpoint operations. Hence, checkpoint is allowed to fail now.
|
||||
# timeline delete should work despite layer files stuck in upload
|
||||
log.info("sending delete request")
|
||||
checkpoint_allowed_to_fail.set()
|
||||
client.timeline_delete(tenant_id, timeline_id)
|
||||
|
||||
assert not timeline_path.exists()
|
||||
@@ -438,10 +391,6 @@ def test_timeline_deletion_with_files_stuck_in_upload_queue(
|
||||
# timeline deletion should kill ongoing uploads
|
||||
assert get_queued_count(file_kind="index", op_kind="upload") == 0
|
||||
|
||||
# timeline deletion should be unblocking checkpoint ops
|
||||
checkpoint_thread.join(2.0)
|
||||
assert not checkpoint_thread.is_alive()
|
||||
|
||||
# Just to be sure, unblock ongoing uploads. If the previous assert was incorrect, or the prometheus metric broken,
|
||||
# this would likely generate some ERROR level log entries that the NeonEnvBuilder would detect
|
||||
client.configure_failpoints(("before-upload-layer", "off"))
|
||||
|
||||
@@ -7,7 +7,6 @@ from fixtures.neon_fixtures import (
|
||||
NeonEnvBuilder,
|
||||
PageserverApiException,
|
||||
PageserverHttpClient,
|
||||
Postgres,
|
||||
RemoteStorageKind,
|
||||
available_remote_storages,
|
||||
wait_for_last_record_lsn,
|
||||
@@ -168,337 +167,3 @@ def test_detach_while_attaching(
|
||||
|
||||
with pg.cursor() as cur:
|
||||
cur.execute("SELECT COUNT(*) FROM foo")
|
||||
|
||||
|
||||
# Tests that the combination of `ignore` and `load` operations is able to remove the tenant from
# pageserver's memory and to restore it there again:
# * creates a tenant and records its timelines and its files on disk
# * `ignore` the tenant
# * verify that ignored tenant files are unchanged, only a single new file (the ignore mark) appeared
# * verify the ignored tenant is gone from pageserver's memory
# * restart the pageserver and verify that ignored tenant is still not loaded
# * `load` the same tenant
# * ensure that its status is `Active` and it's present in pageserver's memory with all timelines
@pytest.mark.parametrize("remote_storage_kind", [RemoteStorageKind.NOOP, RemoteStorageKind.MOCK_S3])
def test_ignored_tenant_reattach(
    neon_env_builder: NeonEnvBuilder, remote_storage_kind: RemoteStorageKind
):
    neon_env_builder.enable_remote_storage(
        remote_storage_kind=remote_storage_kind,
        # name the remote storage after this test, not after a copy-pasted one
        test_name="test_ignored_tenant_reattach",
    )
    env = neon_env_builder.init_start()
    pageserver_http = env.pageserver.http_client()

    ignored_tenant_id, _ = env.neon_cli.create_tenant()
    tenant_dir = env.repo_dir / "tenants" / str(ignored_tenant_id)
    tenants_before_ignore = sorted(tenant["id"] for tenant in pageserver_http.tenant_list())
    timelines_before_ignore = [
        timeline["timeline_id"]
        for timeline in pageserver_http.timeline_list(tenant_id=ignored_tenant_id)
    ]
    files_before_ignore = set(tenant_dir.glob("**/*"))

    # ignore the tenant and verify it's not present in pageserver replies, with its files still on disk
    pageserver_http.tenant_ignore(ignored_tenant_id)

    files_after_ignore = set(tenant_dir.glob("**/*"))
    new_files = files_after_ignore - files_before_ignore
    disappeared_files = files_before_ignore - files_after_ignore
    assert (
        not disappeared_files
    ), f"Tenant ignore should not remove files from disk, missing: {disappeared_files}"
    assert (
        len(new_files) == 1
    ), f"Only tenant ignore file should appear on disk but got: {new_files}"

    # Sorted, because this list is compared for equality with the sorted post-restart list below;
    # without sorting, the comparison would depend on the order of the pageserver's reply.
    tenants_after_ignore = sorted(tenant["id"] for tenant in pageserver_http.tenant_list())
    # NOTE(review): this compares the id returned by create_tenant() against the raw `id` values
    # of the tenant list; confirm both sides compare equal, otherwise only the length check
    # below actually guards against the tenant still being listed.
    assert ignored_tenant_id not in tenants_after_ignore, "Ignored tenant should be missing"
    assert len(tenants_after_ignore) + 1 == len(
        tenants_before_ignore
    ), "Only ignored tenant should be missing"

    # restart the pageserver to ensure we don't load the ignored tenant
    env.pageserver.stop()
    env.pageserver.start()
    tenants_after_restart = sorted(tenant["id"] for tenant in pageserver_http.tenant_list())
    assert (
        tenants_after_restart == tenants_after_ignore
    ), "Ignored tenant should not be reloaded after pageserver restart"

    # now, load it from the local files and expect it works
    pageserver_http.tenant_load(tenant_id=ignored_tenant_id)
    wait_until_tenant_status(pageserver_http, ignored_tenant_id, "Active", 5)

    tenants_after_attach = sorted(tenant["id"] for tenant in pageserver_http.tenant_list())
    assert tenants_after_attach == tenants_before_ignore, "Should have all tenants back"

    timelines_after_ignore = [
        timeline["timeline_id"]
        for timeline in pageserver_http.timeline_list(tenant_id=ignored_tenant_id)
    ]
    assert timelines_before_ignore == timelines_after_ignore, "Should have all timelines back"
|
||||
|
||||
|
||||
# Verifies that a tenant whose local layer files went missing can still be `load`ed:
# * writes some data into tenant's timeline
# * ensures it's synced with the remote storage
# * `ignore` the tenant
# * removes all timeline's local layers
# * `load` the same tenant
# * ensure that its status is `Active`
# * check that timeline data is restored
@pytest.mark.parametrize("remote_storage_kind", [RemoteStorageKind.LOCAL_FS])
def test_ignored_tenant_download_missing_layers(
    neon_env_builder: NeonEnvBuilder, remote_storage_kind: RemoteStorageKind
):
    neon_env_builder.enable_remote_storage(
        remote_storage_kind=remote_storage_kind,
        test_name="test_ignored_tenant_download_and_attach",
    )
    env = neon_env_builder.init_start()
    http = env.pageserver.http_client()
    pg = env.postgres.create_start("main")

    tenant_id = TenantId(pg.safe_psql("show neon.tenant_id")[0][0])
    timeline_id = TimelineId(pg.safe_psql("show neon.timeline_id")[0][0])

    def sorted_tenant_ids():
        # ids of all tenants the pageserver currently reports, in a stable order
        return sorted(entry["id"] for entry in http.tenant_list())

    def timeline_ids():
        # ids of the tenant's timelines, in the order the pageserver reports them
        return [entry["timeline_id"] for entry in http.timeline_list(tenant_id=tenant_id)]

    row_id, secret = 1, "very secret secret"
    insert_test_data(http, tenant_id, timeline_id, row_id, secret, pg)

    expected_tenants = sorted_tenant_ids()
    expected_timelines = timeline_ids()

    # ignore the tenant, then delete every local file that looks like a layer
    http.tenant_ignore(tenant_id)
    timeline_dir = env.repo_dir / "tenants" / str(tenant_id) / "timelines" / str(timeline_id)
    removed_any = False
    for entry in timeline_dir.iterdir():
        if not entry.name.startswith("00000"):
            continue
        # names with this prefix are treated as layer files
        entry.unlink()
        removed_any = True
    assert removed_any, f"Found no layers for tenant {timeline_dir}"

    # loading from the local files is expected to succeed thanks to the remote storage copy
    http.tenant_load(tenant_id=tenant_id)
    wait_until_tenant_status(http, tenant_id, "Active", 5)

    assert sorted_tenant_ids() == expected_tenants, "Should have all tenants back"
    assert expected_timelines == timeline_ids(), "Should have all timelines back"

    # restart the compute node and read the row back
    pg.stop()
    pg.start()
    ensure_test_data(row_id, secret, pg)
|
||||
|
||||
|
||||
# Verifies how `load` treats a tenant with damaged local state:
# * `ignore` a tenant
# * removes its `metadata` file locally
# * `load` the same tenant
# * ensure that its status is `Broken`
@pytest.mark.parametrize("remote_storage_kind", [RemoteStorageKind.LOCAL_FS])
def test_ignored_tenant_stays_broken_without_metadata(
    neon_env_builder: NeonEnvBuilder, remote_storage_kind: RemoteStorageKind
):
    neon_env_builder.enable_remote_storage(
        remote_storage_kind=remote_storage_kind,
        test_name="test_ignored_tenant_stays_broken_without_metadata",
    )
    env = neon_env_builder.init_start()
    http = env.pageserver.http_client()
    pg = env.postgres.create_start("main")

    tenant_id = TenantId(pg.safe_psql("show neon.tenant_id")[0][0])
    timeline_id = TimelineId(pg.safe_psql("show neon.timeline_id")[0][0])

    # ignore the tenant, then delete the timeline's `metadata` file
    http.tenant_ignore(tenant_id)
    timeline_dir = env.repo_dir / "tenants" / str(tenant_id) / "timelines" / str(timeline_id)
    found_metadata = False
    for entry in timeline_dir.iterdir():
        if entry.name != "metadata":
            continue
        # this is the timeline's metadata file; remove it
        entry.unlink()
        found_metadata = True
    assert found_metadata, f"Failed to find metadata file in {timeline_dir}"

    # the load below is expected to log this error, so allow it
    env.pageserver.allowed_errors.append(".*could not load tenant .*?: failed to load metadata.*")

    # loading from the local files must leave the tenant broken, since its files
    # can no longer be loaded into memory
    http.tenant_load(tenant_id=tenant_id)
    wait_until_tenant_status(http, tenant_id, "Broken", 5)
|
||||
|
||||
|
||||
# Tests that attach never works on a tenant, ignored or not, as long as it is still present locally.
# Similarly, tests that it's not possible to schedule a `load` for a tenant that's not ignored.
@pytest.mark.parametrize("remote_storage_kind", [RemoteStorageKind.LOCAL_FS])
def test_load_attach_negatives(
    neon_env_builder: NeonEnvBuilder, remote_storage_kind: RemoteStorageKind
):
    neon_env_builder.enable_remote_storage(
        remote_storage_kind=remote_storage_kind,
        test_name="test_load_attach_negatives",
    )
    env = neon_env_builder.init_start()
    http = env.pageserver.http_client()
    pg = env.postgres.create_start("main")

    tenant_id = TenantId(pg.safe_psql("show neon.tenant_id")[0][0])

    # The tenant is loaded and active: both `load` and `attach` must be rejected.
    env.pageserver.allowed_errors.append(".*tenant .*? already exists, state:.*")
    already_active = f"tenant {tenant_id} already exists, state: Active"
    with pytest.raises(PageserverApiException, match=already_active):
        http.tenant_load(tenant_id)

    with pytest.raises(PageserverApiException, match=already_active):
        http.tenant_attach(tenant_id)

    http.tenant_ignore(tenant_id)

    # The tenant is ignored but its directory still exists: `attach` must still be rejected.
    env.pageserver.allowed_errors.append(
        ".*Cannot attach tenant .*?, local tenant directory already exists.*"
    )
    directory_exists = f"Cannot attach tenant {tenant_id}, local tenant directory already exists"
    with pytest.raises(PageserverApiException, match=directory_exists):
        http.tenant_attach(tenant_id)
|
||||
|
||||
|
||||
# Tests `ignore` issued while an attach of the same tenant is still in progress:
# * writes some data and waits until it is uploaded to the remote storage
# * detaches the tenant and re-attaches it with a failpoint set before activation
# * `ignore` the tenant while that attach is pending
# * verify that a further attach is rejected because local files exist
# * `load` the tenant, wait for `Active` and check that the data is still readable
@pytest.mark.parametrize("remote_storage_kind", [RemoteStorageKind.LOCAL_FS])
def test_ignore_while_attaching(
    neon_env_builder: NeonEnvBuilder,
    remote_storage_kind: RemoteStorageKind,
):
    neon_env_builder.enable_remote_storage(
        remote_storage_kind=remote_storage_kind,
        test_name="test_ignore_while_attaching",
    )

    env = neon_env_builder.init_start()
    # a single client is enough; it is reused for every request below
    pageserver_http = env.pageserver.http_client()
    pg = env.postgres.create_start("main")

    tenant_id = TenantId(pg.safe_psql("show neon.tenant_id")[0][0])
    timeline_id = TimelineId(pg.safe_psql("show neon.timeline_id")[0][0])

    data_id = 1
    data_secret = "very secret secret"
    insert_test_data(pageserver_http, tenant_id, timeline_id, data_id, data_secret, pg)

    tenants_before_ignore = [tenant["id"] for tenant in pageserver_http.tenant_list()]

    # Detach it
    pageserver_http.tenant_detach(tenant_id)
    # And re-attach, but stop attach task_mgr task from completing
    pageserver_http.configure_failpoints([("attach-before-activate", "return(5000)")])
    pageserver_http.tenant_attach(tenant_id)
    # Run ignore on the tenant while the attach is still in progress.
    # XXX This should take priority over attach, i.e., it should cancel the attach task.
    # But neither the failpoint, nor the proper storage_sync2 download functions,
    # are sensitive to task_mgr::shutdown.
    # This problem is tracked in https://github.com/neondatabase/neon/issues/2996 .
    # So, for now, effectively, this ignore here will block until attach task completes.
    pageserver_http.tenant_ignore(tenant_id)

    # Cannot attach it due to some local files existing
    env.pageserver.allowed_errors.append(
        ".*Cannot attach tenant .*?, local tenant directory already exists.*"
    )
    with pytest.raises(
        expected_exception=PageserverApiException,
        match=f"Cannot attach tenant {tenant_id}, local tenant directory already exists",
    ):
        pageserver_http.tenant_attach(tenant_id)

    tenants_after_ignore = [tenant["id"] for tenant in pageserver_http.tenant_list()]
    assert tenant_id not in tenants_after_ignore, "Ignored tenant should be missing"
    assert len(tenants_after_ignore) + 1 == len(
        tenants_before_ignore
    ), "Only ignored tenant should be missing"

    # But can load it from local files, that will restore attach.
    pageserver_http.tenant_load(tenant_id)

    wait_until_tenant_status(pageserver_http, tenant_id, "Active", 5)

    # restart the compute node and read the row back
    pg.stop()
    pg.start()
    ensure_test_data(data_id, data_secret, pg)
|
||||
|
||||
|
||||
def insert_test_data(
    pageserver_http: PageserverHttpClient,
    tenant_id: TenantId,
    timeline_id: TimelineId,
    data_id: int,
    data: str,
    pg: Postgres,
):
    """Create table `test` with one row and wait until it reached the remote storage.

    Inserts `(data_id, data)` through `pg`, waits for the pageserver to receive
    the WAL up to the flush LSN observed right after the insert, requests a
    checkpoint of the timeline and then waits for the upload up to that LSN.
    """
    create_and_fill = f"""
            CREATE TABLE test(id int primary key, secret text);
            INSERT INTO test VALUES ({data_id}, '{data}');
            """
    with pg.cursor() as cursor:
        cursor.execute(create_and_fill)
        flush_lsn = Lsn(query_scalar(cursor, "SELECT pg_current_wal_flush_lsn()"))

    # the pageserver has to receive the data first ...
    wait_for_last_record_lsn(pageserver_http, tenant_id, timeline_id, flush_lsn)

    # ... then a manual checkpoint makes sure it is scheduled for the remote storage ...
    pageserver_http.timeline_checkpoint(tenant_id, timeline_id)

    # ... and finally wait for the pageserver to finish uploading it
    log.info("waiting for to be ignored tenant data checkpoint upload")
    wait_for_upload(pageserver_http, tenant_id, timeline_id, flush_lsn)
|
||||
|
||||
|
||||
def ensure_test_data(data_id: int, data: str, pg: Postgres):
    """Assert that the row `data_id` of table `test` holds `data` as its secret."""
    lookup = f"SELECT secret FROM test WHERE id = {data_id};"
    with pg.cursor() as cursor:
        stored_secret = query_scalar(cursor, lookup)
        assert stored_secret == data, "Should have timeline data back"
|
||||
|
||||
|
||||
# Deliberately not built on `wait_until`, for debugging purposes
def wait_until_tenant_status(
    pageserver_http: PageserverHttpClient,
    tenant_id: TenantId,
    expected_status: str,
    iterations: int,
) -> bool:
    """Poll the tenant status until its `state` equals `expected_status`.

    Makes up to `iterations` attempts and sleeps one second after every
    unsuccessful one. An attempt that raises is logged at debug level and
    counted as unsuccessful.

    Returns True as soon as the expected state is observed.
    Raises Exception when no attempt observed the expected state.
    """
    for _attempt in range(iterations):
        try:
            status = pageserver_http.tenant_status(tenant_id=tenant_id)
            log.debug(f"Tenant {tenant_id} status: {status}")
            reached = status["state"] == expected_status
        except Exception as err:
            log.debug(f"Tenant {tenant_id} status retrieval failure: {err}")
            reached = False

        if reached:
            return True
        time.sleep(1)

    raise Exception(f"Tenant {tenant_id} did not become {expected_status} in {iterations} seconds")
|
||||
|
||||
@@ -58,6 +58,7 @@ def new_pageserver_service(
|
||||
pageserver_client = PageserverHttpClient(
|
||||
port=http_port,
|
||||
auth_token=None,
|
||||
is_testing_enabled_or_skip=lambda: True, # TODO: check if testing really enabled
|
||||
)
|
||||
try:
|
||||
pageserver_process = start_in_background(
|
||||
@@ -359,6 +360,7 @@ def test_tenant_relocation(
|
||||
new_pageserver_http = PageserverHttpClient(
|
||||
port=new_pageserver_http_port,
|
||||
auth_token=None,
|
||||
is_testing_enabled_or_skip=env.pageserver.is_testing_enabled_or_skip,
|
||||
)
|
||||
|
||||
with new_pageserver_service(
|
||||
|
||||
@@ -1,7 +1,13 @@
|
||||
import time
|
||||
from typing import List, Tuple
|
||||
|
||||
from fixtures.log_helper import log
|
||||
from fixtures.neon_fixtures import NeonEnv, NeonEnvBuilder, wait_for_last_flush_lsn
|
||||
from fixtures.neon_fixtures import (
|
||||
NeonEnv,
|
||||
NeonEnvBuilder,
|
||||
PageserverApiException,
|
||||
wait_for_last_flush_lsn,
|
||||
)
|
||||
from fixtures.types import Lsn
|
||||
|
||||
|
||||
@@ -38,8 +44,8 @@ def test_single_branch_get_tenant_size_grows(neon_env_builder: NeonEnvBuilder):
|
||||
Operate on single branch reading the tenants size after each transaction.
|
||||
"""
|
||||
|
||||
# Disable automatic gc and compaction.
|
||||
# The pitr_interval here is quite problematic, so we cannot really use it.
|
||||
# gc and compaction is not wanted automatically
|
||||
# the pitr_interval here is quite problematic, so we cannot really use it.
|
||||
# it'd have to be calibrated per test executing env.
|
||||
|
||||
# there was a bug which was hidden if the create table and first batch of
|
||||
@@ -47,7 +53,7 @@ def test_single_branch_get_tenant_size_grows(neon_env_builder: NeonEnvBuilder):
|
||||
# that there next_gc_cutoff could be smaller than initdb_lsn, which will
|
||||
# obviously lead to issues when calculating the size.
|
||||
gc_horizon = 0x30000
|
||||
neon_env_builder.pageserver_config_override = f"tenant_config={{compaction_period='0s', gc_period='0s', pitr_interval='0sec', gc_horizon={gc_horizon}}}"
|
||||
neon_env_builder.pageserver_config_override = f"tenant_config={{compaction_period='1h', gc_period='1h', pitr_interval='0sec', gc_horizon={gc_horizon}}}"
|
||||
|
||||
env = neon_env_builder.init_start()
|
||||
|
||||
@@ -156,7 +162,7 @@ def test_get_tenant_size_with_multiple_branches(neon_env_builder: NeonEnvBuilder
|
||||
|
||||
gc_horizon = 128 * 1024
|
||||
|
||||
neon_env_builder.pageserver_config_override = f"tenant_config={{compaction_period='0s', gc_period='0s', pitr_interval='0sec', gc_horizon={gc_horizon}}}"
|
||||
neon_env_builder.pageserver_config_override = f"tenant_config={{compaction_period='1h', gc_period='1h', pitr_interval='0sec', gc_horizon={gc_horizon}}}"
|
||||
|
||||
env = neon_env_builder.init_start()
|
||||
|
||||
@@ -250,7 +256,22 @@ def test_get_tenant_size_with_multiple_branches(neon_env_builder: NeonEnvBuilder
|
||||
assert size_after == size_after_thinning_branch
|
||||
|
||||
# teardown, delete branches, and the size should be going down
|
||||
http_client.timeline_delete(tenant_id, first_branch_timeline_id)
|
||||
deleted = False
|
||||
for _ in range(10):
|
||||
try:
|
||||
http_client.timeline_delete(tenant_id, first_branch_timeline_id)
|
||||
deleted = True
|
||||
break
|
||||
except PageserverApiException as e:
|
||||
# compaction is ok but just retry if this fails; related to #2442
|
||||
if "cannot lock compaction critical section" in str(e):
|
||||
# also ignore it in the log
|
||||
env.pageserver.allowed_errors.append(".*cannot lock compaction critical section.*")
|
||||
time.sleep(1)
|
||||
continue
|
||||
raise
|
||||
|
||||
assert deleted
|
||||
|
||||
size_after_deleting_first = http_client.tenant_size(tenant_id)
|
||||
assert size_after_deleting_first < size_after_thinning_branch
|
||||
|
||||
@@ -11,6 +11,13 @@ def get_only_element(l): # noqa: E741
|
||||
|
||||
# Test that gc and compaction tenant tasks start and stop correctly
|
||||
def test_tenant_tasks(neon_env_builder: NeonEnvBuilder):
|
||||
# The gc and compaction loops don't bother to watch for tenant state
|
||||
# changes while sleeping, so we use small periods to make this test
|
||||
# run faster. With default settings we'd have to wait longer for tasks
|
||||
# to notice state changes and shut down.
|
||||
# TODO fix this behavior in the pageserver
|
||||
tenant_config = "{gc_period = '1 s', compaction_period = '1 s'}"
|
||||
neon_env_builder.pageserver_config_override = f"tenant_config={tenant_config}"
|
||||
name = "test_tenant_tasks"
|
||||
env = neon_env_builder.init_start()
|
||||
client = env.pageserver.http_client()
|
||||
|
||||
@@ -161,17 +161,6 @@ def test_tenants_attached_after_download(
|
||||
##### Stop the pageserver, erase its layer file to force it being downloaded from S3
|
||||
env.postgres.stop_all()
|
||||
|
||||
sk_commit_lsns = [
|
||||
sk.http_client().timeline_status(tenant_id, timeline_id).commit_lsn
|
||||
for sk in env.safekeepers
|
||||
]
|
||||
log.info("wait for pageserver to process all the WAL")
|
||||
wait_for_last_record_lsn(client, tenant_id, timeline_id, max(sk_commit_lsns))
|
||||
log.info("wait for it to reach remote storage")
|
||||
pageserver_http.timeline_checkpoint(tenant_id, timeline_id)
|
||||
wait_for_upload(client, tenant_id, timeline_id, max(sk_commit_lsns))
|
||||
log.info("latest safekeeper_commit_lsn reached remote storage")
|
||||
|
||||
detail_before = client.timeline_detail(
|
||||
tenant_id, timeline_id, include_non_incremental_physical_size=True
|
||||
)
|
||||
|
||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user