mirror of
https://github.com/neondatabase/neon.git
synced 2026-06-01 04:20:39 +00:00
Compare commits
16 Commits
arpad/virt
...
arpad/virt
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
d2314afaf6 | ||
|
|
0aed765c6d | ||
|
|
aa4763bed1 | ||
|
|
a13b0cc86b | ||
|
|
167342db13 | ||
|
|
dc6668d4cd | ||
|
|
df28e7afa0 | ||
|
|
9cffba255c | ||
|
|
8ef27a3bf8 | ||
|
|
afbebce051 | ||
|
|
486a7f6d63 | ||
|
|
3413937eb2 | ||
|
|
67f7e6f192 | ||
|
|
69d046fd7e | ||
|
|
353ebefefc | ||
|
|
17b3f4bc7d |
37
.github/workflows/approved-for-ci-run.yml
vendored
37
.github/workflows/approved-for-ci-run.yml
vendored
@@ -2,9 +2,7 @@ name: Handle `approved-for-ci-run` label
|
||||
# This workflow helps to run CI pipeline for PRs made by external contributors (from forks).
|
||||
|
||||
on:
|
||||
pull_request_target:
|
||||
branches:
|
||||
- main
|
||||
pull_request:
|
||||
types:
|
||||
# Default types that triggers a workflow ([1]):
|
||||
# - [1] https://docs.github.com/en/actions/using-workflows/events-that-trigger-workflows#pull_request
|
||||
@@ -20,34 +18,29 @@ env:
|
||||
GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
|
||||
PR_NUMBER: ${{ github.event.pull_request.number }}
|
||||
|
||||
permissions: write-all
|
||||
|
||||
concurrency:
|
||||
group: ${{ github.workflow }}-${{ github.event.pull_request.number }}
|
||||
|
||||
jobs:
|
||||
remove-label:
|
||||
# Remove `approved-for-ci-run` label if the workflow is triggered by changes in a PR.
|
||||
# The PR should be reviewed and labelled manually again.
|
||||
|
||||
runs-on: [ ubuntu-latest ]
|
||||
|
||||
if: |
|
||||
contains(fromJSON('["opened", "synchronize", "reopened", "closed"]'), github.event.action) &&
|
||||
contains(github.event.pull_request.labels.*.name, 'approved-for-ci-run')
|
||||
|
||||
runs-on: ubuntu-latest
|
||||
|
||||
steps:
|
||||
- run: gh pr --repo "${GITHUB_REPOSITORY}" edit "${PR_NUMBER}" --remove-label "approved-for-ci-run"
|
||||
|
||||
create-or-update-pr-for-ci-run:
|
||||
# Create local PR for an `approved-for-ci-run` labelled PR to run CI pipeline in it.
|
||||
create-branch:
|
||||
# Create a local branch for an `approved-for-ci-run` labelled PR to run CI pipeline in it.
|
||||
|
||||
runs-on: [ ubuntu-latest ]
|
||||
|
||||
if: |
|
||||
github.event.action == 'labeled' &&
|
||||
contains(github.event.pull_request.labels.*.name, 'approved-for-ci-run')
|
||||
|
||||
runs-on: ubuntu-latest
|
||||
|
||||
steps:
|
||||
- run: gh pr --repo "${GITHUB_REPOSITORY}" edit "${PR_NUMBER}" --remove-label "approved-for-ci-run"
|
||||
|
||||
@@ -60,19 +53,3 @@ jobs:
|
||||
- run: git checkout -b "ci-run/pr-${PR_NUMBER}"
|
||||
|
||||
- run: git push --force origin "ci-run/pr-${PR_NUMBER}"
|
||||
|
||||
- name: Create a Pull Request for CI run (if required)
|
||||
env:
|
||||
GH_TOKEN: ${{ secrets.CI_ACCESS_TOKEN }}
|
||||
run: |
|
||||
HEAD="ci-run/pr-${PR_NUMBER}"
|
||||
BODY="This Pull Request was create automatically to run CI pipeline for #${PR_NUMBER}.\n\nPlease do not alter or merge/close it.\n\nFeel free to comment the original PR."
|
||||
|
||||
ALREADY_CREATED=$(gh pr --repo "${GITHUB_REPOSITORY}" list --head "${HEAD}" --base "main" --json "number" --jq '.[].number')
|
||||
if [ -z "${ALREADY_CREATED}" ]; then
|
||||
gh pr --repo "${GITHUB_REPOSITORY}" create --title "CI run for PR #${PR_NUMBER}" \
|
||||
--body "${BODY}" \
|
||||
--head "${HEAD}" \
|
||||
--base "main" \
|
||||
--draft
|
||||
fi
|
||||
|
||||
66
.github/workflows/benchmarking.yml
vendored
66
.github/workflows/benchmarking.yml
vendored
@@ -117,7 +117,6 @@ jobs:
|
||||
outputs:
|
||||
pgbench-compare-matrix: ${{ steps.pgbench-compare-matrix.outputs.matrix }}
|
||||
olap-compare-matrix: ${{ steps.olap-compare-matrix.outputs.matrix }}
|
||||
tpch-compare-matrix: ${{ steps.tpch-compare-matrix.outputs.matrix }}
|
||||
|
||||
steps:
|
||||
- name: Generate matrix for pgbench benchmark
|
||||
@@ -159,25 +158,6 @@ jobs:
|
||||
|
||||
echo "matrix=$(echo $matrix | jq --compact-output '.')" >> $GITHUB_OUTPUT
|
||||
|
||||
- name: Generate matrix for TPC-H benchmarks
|
||||
id: tpch-compare-matrix
|
||||
run: |
|
||||
matrix='{
|
||||
"platform": [
|
||||
"neon-captest-reuse"
|
||||
],
|
||||
"scale": [
|
||||
"10"
|
||||
]
|
||||
}'
|
||||
|
||||
if [ "$(date +%A)" = "Saturday" ]; then
|
||||
matrix=$(echo $matrix | jq '.include += [{ "platform": "rds-postgres", "scale": "10" },
|
||||
{ "platform": "rds-aurora", "scale": "10" }]')
|
||||
fi
|
||||
|
||||
echo "matrix=$(echo $matrix | jq --compact-output '.')" >> $GITHUB_OUTPUT
|
||||
|
||||
pgbench-compare:
|
||||
needs: [ generate-matrices ]
|
||||
|
||||
@@ -253,11 +233,7 @@ jobs:
|
||||
|
||||
echo "connstr=${CONNSTR}" >> $GITHUB_OUTPUT
|
||||
|
||||
QUERY="SELECT version();"
|
||||
if [ "${PLATFORM}" = "neon"* ]; then
|
||||
QUERY="${QUERY} SHOW neon.tenant_id; SHOW neon.timeline_id;"
|
||||
fi
|
||||
psql ${CONNSTR} -c "${QUERY}"
|
||||
psql ${CONNSTR} -c "SELECT version();"
|
||||
|
||||
- name: Benchmark init
|
||||
uses: ./.github/actions/run-python-test-set
|
||||
@@ -382,11 +358,7 @@ jobs:
|
||||
|
||||
echo "connstr=${CONNSTR}" >> $GITHUB_OUTPUT
|
||||
|
||||
QUERY="SELECT version();"
|
||||
if [ "${PLATFORM}" = "neon"* ]; then
|
||||
QUERY="${QUERY} SHOW neon.tenant_id; SHOW neon.timeline_id;"
|
||||
fi
|
||||
psql ${CONNSTR} -c "${QUERY}"
|
||||
psql ${CONNSTR} -c "SELECT version();"
|
||||
|
||||
- name: ClickBench benchmark
|
||||
uses: ./.github/actions/run-python-test-set
|
||||
@@ -400,7 +372,6 @@ jobs:
|
||||
VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}"
|
||||
PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}"
|
||||
BENCHMARK_CONNSTR: ${{ steps.set-up-connstr.outputs.connstr }}
|
||||
TEST_OLAP_SCALE: 10
|
||||
|
||||
- name: Create Allure report
|
||||
if: ${{ !cancelled() }}
|
||||
@@ -427,7 +398,7 @@ jobs:
|
||||
|
||||
strategy:
|
||||
fail-fast: false
|
||||
matrix: ${{ fromJson(needs.generate-matrices.outputs.tpch-compare-matrix) }}
|
||||
matrix: ${{ fromJson(needs.generate-matrices.outputs.olap-compare-matrix) }}
|
||||
|
||||
env:
|
||||
POSTGRES_DISTRIB_DIR: /tmp/neon/pg_install
|
||||
@@ -436,7 +407,6 @@ jobs:
|
||||
BUILD_TYPE: remote
|
||||
SAVE_PERF_REPORT: ${{ github.event.inputs.save_perf_report || ( github.ref_name == 'main' ) }}
|
||||
PLATFORM: ${{ matrix.platform }}
|
||||
TEST_OLAP_SCALE: ${{ matrix.scale }}
|
||||
|
||||
runs-on: [ self-hosted, us-east-2, x64 ]
|
||||
container:
|
||||
@@ -458,17 +428,18 @@ jobs:
|
||||
${POSTGRES_DISTRIB_DIR}/v${DEFAULT_PG_VERSION}/bin/pgbench --version
|
||||
echo "${POSTGRES_DISTRIB_DIR}/v${DEFAULT_PG_VERSION}/bin" >> $GITHUB_PATH
|
||||
|
||||
- name: Get Connstring Secret Name
|
||||
- name: Set up Connection String
|
||||
id: set-up-connstr
|
||||
run: |
|
||||
case "${PLATFORM}" in
|
||||
neon-captest-reuse)
|
||||
ENV_PLATFORM=CAPTEST_TPCH
|
||||
CONNSTR=${{ secrets.BENCHMARK_CAPTEST_TPCH_S10_CONNSTR }}
|
||||
;;
|
||||
rds-aurora)
|
||||
ENV_PLATFORM=RDS_AURORA_TPCH
|
||||
CONNSTR=${{ secrets.BENCHMARK_RDS_AURORA_TPCH_S10_CONNSTR }}
|
||||
;;
|
||||
rds-postgres)
|
||||
ENV_PLATFORM=RDS_AURORA_TPCH
|
||||
CONNSTR=${{ secrets.BENCHMARK_RDS_POSTGRES_TPCH_S10_CONNSTR }}
|
||||
;;
|
||||
*)
|
||||
echo >&2 "Unknown PLATFORM=${PLATFORM}. Allowed only 'neon-captest-reuse', 'rds-aurora', or 'rds-postgres'"
|
||||
@@ -476,21 +447,9 @@ jobs:
|
||||
;;
|
||||
esac
|
||||
|
||||
CONNSTR_SECRET_NAME="BENCHMARK_${ENV_PLATFORM}_S${SCALE}_CONNSTR"
|
||||
echo "CONNSTR_SECRET_NAME=${CONNSTR_SECRET_NAME}" >> $GITHUB_ENV
|
||||
|
||||
- name: Set up Connection String
|
||||
id: set-up-connstr
|
||||
run: |
|
||||
CONNSTR=${{ secrets[env.CONNSTR_SECRET_NAME] }}
|
||||
|
||||
echo "connstr=${CONNSTR}" >> $GITHUB_OUTPUT
|
||||
|
||||
QUERY="SELECT version();"
|
||||
if [ "${PLATFORM}" = "neon"* ]; then
|
||||
QUERY="${QUERY} SHOW neon.tenant_id; SHOW neon.timeline_id;"
|
||||
fi
|
||||
psql ${CONNSTR} -c "${QUERY}"
|
||||
psql ${CONNSTR} -c "SELECT version();"
|
||||
|
||||
- name: Run TPC-H benchmark
|
||||
uses: ./.github/actions/run-python-test-set
|
||||
@@ -504,7 +463,6 @@ jobs:
|
||||
VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}"
|
||||
PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}"
|
||||
BENCHMARK_CONNSTR: ${{ steps.set-up-connstr.outputs.connstr }}
|
||||
TEST_OLAP_SCALE: ${{ matrix.scale }}
|
||||
|
||||
- name: Create Allure report
|
||||
if: ${{ !cancelled() }}
|
||||
@@ -576,11 +534,7 @@ jobs:
|
||||
|
||||
echo "connstr=${CONNSTR}" >> $GITHUB_OUTPUT
|
||||
|
||||
QUERY="SELECT version();"
|
||||
if [ "${PLATFORM}" = "neon"* ]; then
|
||||
QUERY="${QUERY} SHOW neon.tenant_id; SHOW neon.timeline_id;"
|
||||
fi
|
||||
psql ${CONNSTR} -c "${QUERY}"
|
||||
psql ${CONNSTR} -c "SELECT version();"
|
||||
|
||||
- name: Run user examples
|
||||
uses: ./.github/actions/run-python-test-set
|
||||
|
||||
119
.github/workflows/build_and_test.yml
vendored
119
.github/workflows/build_and_test.yml
vendored
@@ -5,6 +5,7 @@ on:
|
||||
branches:
|
||||
- main
|
||||
- release
|
||||
- ci-run/pr-*
|
||||
pull_request:
|
||||
|
||||
defaults:
|
||||
@@ -421,7 +422,7 @@ jobs:
|
||||
container:
|
||||
image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned
|
||||
options: --init
|
||||
needs: [ regress-tests, coverage-report, benchmarks ]
|
||||
needs: [ regress-tests, benchmarks ]
|
||||
if: ${{ !cancelled() }}
|
||||
|
||||
steps:
|
||||
@@ -448,18 +449,12 @@ jobs:
|
||||
reportJsonUrl: "${{ steps.create-allure-report.outputs.report-json-url }}",
|
||||
}
|
||||
|
||||
const coverage = {
|
||||
coverageUrl: "${{ needs.coverage-report.outputs.coverage-html }}",
|
||||
summaryJsonUrl: "${{ needs.coverage-report.outputs.coverage-json }}",
|
||||
}
|
||||
|
||||
const script = require("./scripts/comment-test-report.js")
|
||||
await script({
|
||||
github,
|
||||
context,
|
||||
fetch,
|
||||
report,
|
||||
coverage,
|
||||
})
|
||||
|
||||
coverage-report:
|
||||
@@ -472,15 +467,24 @@ jobs:
|
||||
fail-fast: false
|
||||
matrix:
|
||||
build_type: [ debug ]
|
||||
outputs:
|
||||
coverage-html: ${{ steps.upload-coverage-report-new.outputs.report-url }}
|
||||
coverage-json: ${{ steps.upload-coverage-report-new.outputs.summary-json }}
|
||||
steps:
|
||||
- name: Checkout
|
||||
uses: actions/checkout@v3
|
||||
with:
|
||||
submodules: true
|
||||
fetch-depth: 0
|
||||
fetch-depth: 1
|
||||
|
||||
# Disabled for now
|
||||
# - name: Restore cargo deps cache
|
||||
# id: cache_cargo
|
||||
# uses: actions/cache@v3
|
||||
# with:
|
||||
# path: |
|
||||
# ~/.cargo/registry/
|
||||
# !~/.cargo/registry/src
|
||||
# ~/.cargo/git/
|
||||
# target/
|
||||
# key: v1-${{ runner.os }}-${{ matrix.build_type }}-cargo-${{ hashFiles('rust-toolchain.toml') }}-${{ hashFiles('Cargo.lock') }}
|
||||
|
||||
- name: Get Neon artifact
|
||||
uses: ./.github/actions/download
|
||||
@@ -523,45 +527,13 @@ jobs:
|
||||
REPORT_URL=https://${BUCKET}.s3.amazonaws.com/code-coverage/${COMMIT_SHA}/index.html
|
||||
echo "report-url=${REPORT_URL}" >> $GITHUB_OUTPUT
|
||||
|
||||
- name: Build coverage report NEW
|
||||
id: upload-coverage-report-new
|
||||
env:
|
||||
BUCKET: neon-github-public-dev
|
||||
COMMIT_SHA: ${{ github.event.pull_request.head.sha || github.sha }}
|
||||
run: |
|
||||
BASELINE="$(git merge-base HEAD origin/main)"
|
||||
CURRENT="${COMMIT_SHA}"
|
||||
|
||||
cp /tmp/coverage/report/lcov.info ./${CURRENT}.info
|
||||
|
||||
GENHTML_ARGS="--ignore-errors path,unmapped,empty --synthesize-missing --demangle-cpp rustfilt --output-directory lcov-html ${CURRENT}.info"
|
||||
|
||||
# Use differential coverage if the baseline coverage exists.
|
||||
# It can be missing if the coverage repoer wasn't uploaded yet or tests has failed on BASELINE commit.
|
||||
if aws s3 cp --only-show-errors s3://${BUCKET}/code-coverage/${BASELINE}/lcov.info ./${BASELINE}.info; then
|
||||
git diff ${BASELINE} ${CURRENT} -- '*.rs' > baseline-current.diff
|
||||
|
||||
GENHTML_ARGS="--baseline-file ${BASELINE}.info --diff-file baseline-current.diff ${GENHTML_ARGS}"
|
||||
fi
|
||||
|
||||
genhtml ${GENHTML_ARGS}
|
||||
|
||||
aws s3 cp --only-show-errors --recursive ./lcov-html/ s3://${BUCKET}/code-coverage/${COMMIT_SHA}/lcov
|
||||
|
||||
REPORT_URL=https://${BUCKET}.s3.amazonaws.com/code-coverage/${COMMIT_SHA}/lcov/index.html
|
||||
echo "report-url=${REPORT_URL}" >> $GITHUB_OUTPUT
|
||||
|
||||
REPORT_URL=https://${BUCKET}.s3.amazonaws.com/code-coverage/${COMMIT_SHA}/lcov/summary.json
|
||||
echo "summary-json=${REPORT_URL}" >> $GITHUB_OUTPUT
|
||||
|
||||
- uses: actions/github-script@v6
|
||||
env:
|
||||
REPORT_URL: ${{ steps.upload-coverage-report.outputs.report-url }}
|
||||
REPORT_URL_NEW: ${{ steps.upload-coverage-report-new.outputs.report-url }}
|
||||
COMMIT_SHA: ${{ github.event.pull_request.head.sha || github.sha }}
|
||||
with:
|
||||
script: |
|
||||
const { REPORT_URL, REPORT_URL_NEW, COMMIT_SHA } = process.env
|
||||
const { REPORT_URL, COMMIT_SHA } = process.env
|
||||
|
||||
await github.rest.repos.createCommitStatus({
|
||||
owner: context.repo.owner,
|
||||
@@ -572,15 +544,6 @@ jobs:
|
||||
context: 'Code coverage report',
|
||||
})
|
||||
|
||||
await github.rest.repos.createCommitStatus({
|
||||
owner: context.repo.owner,
|
||||
repo: context.repo.repo,
|
||||
sha: `${COMMIT_SHA}`,
|
||||
state: 'success',
|
||||
target_url: `${REPORT_URL_NEW}`,
|
||||
context: 'Code coverage report NEW',
|
||||
})
|
||||
|
||||
trigger-e2e-tests:
|
||||
runs-on: [ self-hosted, gen3, small ]
|
||||
container:
|
||||
@@ -936,13 +899,17 @@ jobs:
|
||||
- name: Cleanup ECR folder
|
||||
run: rm -rf ~/.ecr
|
||||
|
||||
trigger-custom-extensions-build-and-wait:
|
||||
runs-on: ubuntu-latest
|
||||
build-private-extensions:
|
||||
runs-on: [ self-hosted, gen3, small ]
|
||||
container:
|
||||
image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/base:pinned
|
||||
options: --init
|
||||
needs: [ tag ]
|
||||
steps:
|
||||
- name: Set PR's status to pending and request a remote CI test
|
||||
run: |
|
||||
COMMIT_SHA=${{ github.event.pull_request.head.sha || github.sha }}
|
||||
COMMIT_SHA=${{ github.event.pull_request.head.sha }}
|
||||
COMMIT_SHA=${COMMIT_SHA:-${{ github.sha }}}
|
||||
REMOTE_REPO="${{ github.repository_owner }}/build-custom-extensions"
|
||||
|
||||
curl -f -X POST \
|
||||
@@ -972,48 +939,10 @@ jobs:
|
||||
}
|
||||
}"
|
||||
|
||||
- name: Wait for extension build to finish
|
||||
env:
|
||||
GH_TOKEN: ${{ secrets.CI_ACCESS_TOKEN }}
|
||||
run: |
|
||||
TIMEOUT=1800 # 30 minutes, usually it takes ~2-3 minutes, but if runners are busy, it might take longer
|
||||
INTERVAL=15 # try each N seconds
|
||||
|
||||
last_status="" # a variable to carry the last status of the "build-and-upload-extensions" context
|
||||
|
||||
for ((i=0; i <= $TIMEOUT; i+=$INTERVAL)); do
|
||||
sleep $INTERVAL
|
||||
|
||||
# Get statuses for the latest commit in the PR / branch
|
||||
gh api \
|
||||
-H "Accept: application/vnd.github+json" \
|
||||
-H "X-GitHub-Api-Version: 2022-11-28" \
|
||||
"/repos/${{ github.repository }}/statuses/${{ github.event.pull_request.head.sha || github.sha }}" > statuses.json
|
||||
|
||||
# Get the latest status for the "build-and-upload-extensions" context
|
||||
last_status=$(jq --raw-output '[.[] | select(.context == "build-and-upload-extensions")] | sort_by(.created_at)[-1].state' statuses.json)
|
||||
if [ "${last_status}" = "pending" ]; then
|
||||
# Extension build is still in progress.
|
||||
continue
|
||||
elif [ "${last_status}" = "success" ]; then
|
||||
# Extension build is successful.
|
||||
exit 0
|
||||
else
|
||||
# Status is neither "pending" nor "success", exit the loop and fail the job.
|
||||
break
|
||||
fi
|
||||
done
|
||||
|
||||
# Extension build failed, print `statuses.json` for debugging and fail the job.
|
||||
jq '.' statuses.json
|
||||
|
||||
echo >&2 "Status of extension build is '${last_status}' != 'success'"
|
||||
exit 1
|
||||
|
||||
deploy:
|
||||
runs-on: [ self-hosted, gen3, small ]
|
||||
container: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/ansible:latest
|
||||
needs: [ promote-images, tag, regress-tests, trigger-custom-extensions-build-and-wait ]
|
||||
needs: [ promote-images, tag, regress-tests ]
|
||||
if: ( github.ref_name == 'main' || github.ref_name == 'release' ) && github.event_name != 'workflow_dispatch'
|
||||
steps:
|
||||
- name: Fix git ownership
|
||||
|
||||
1
.github/workflows/neon_extra_builds.yml
vendored
1
.github/workflows/neon_extra_builds.yml
vendored
@@ -4,6 +4,7 @@ on:
|
||||
push:
|
||||
branches:
|
||||
- main
|
||||
- ci-run/pr-*
|
||||
pull_request:
|
||||
|
||||
defaults:
|
||||
|
||||
2
.github/workflows/release.yml
vendored
2
.github/workflows/release.yml
vendored
@@ -2,7 +2,7 @@ name: Create Release Branch
|
||||
|
||||
on:
|
||||
schedule:
|
||||
- cron: '0 7 * * 2'
|
||||
- cron: '0 10 * * 2'
|
||||
workflow_dispatch:
|
||||
|
||||
jobs:
|
||||
|
||||
39
Cargo.lock
generated
39
Cargo.lock
generated
@@ -213,6 +213,17 @@ dependencies = [
|
||||
"critical-section",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "atty"
|
||||
version = "0.2.14"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "d9b39be18770d11421cdb1b9947a45dd3f37e93092cbf377614828a319d5fee8"
|
||||
dependencies = [
|
||||
"hermit-abi 0.1.19",
|
||||
"libc",
|
||||
"winapi",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "autocfg"
|
||||
version = "1.1.0"
|
||||
@@ -1057,8 +1068,6 @@ dependencies = [
|
||||
"comfy-table",
|
||||
"compute_api",
|
||||
"git-version",
|
||||
"hex",
|
||||
"hyper",
|
||||
"nix 0.26.2",
|
||||
"once_cell",
|
||||
"pageserver_api",
|
||||
@@ -1074,7 +1083,6 @@ dependencies = [
|
||||
"storage_broker",
|
||||
"tar",
|
||||
"thiserror",
|
||||
"tokio",
|
||||
"toml",
|
||||
"tracing",
|
||||
"url",
|
||||
@@ -1778,6 +1786,15 @@ version = "0.4.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "95505c38b4572b2d910cecb0281560f54b440a19336cbbcb27bf6ce6adc6f5a8"
|
||||
|
||||
[[package]]
|
||||
name = "hermit-abi"
|
||||
version = "0.1.19"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "62b467343b94ba476dcb2500d242dadbb39557df889310ac77c5d99100aaac33"
|
||||
dependencies = [
|
||||
"libc",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "hermit-abi"
|
||||
version = "0.2.6"
|
||||
@@ -1808,16 +1825,6 @@ version = "0.4.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "6fe2267d4ed49bc07b63801559be28c718ea06c4738b7a03c94df7386d2cde46"
|
||||
|
||||
[[package]]
|
||||
name = "histogram"
|
||||
version = "0.7.4"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "e673d137229619d5c2c8903b6ed5852b43636c0017ff2e66b1aafb8ccf04b80b"
|
||||
dependencies = [
|
||||
"serde",
|
||||
"thiserror",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "hmac"
|
||||
version = "0.12.1"
|
||||
@@ -3731,7 +3738,7 @@ name = "s3_scrubber"
|
||||
version = "0.1.0"
|
||||
dependencies = [
|
||||
"anyhow",
|
||||
"async-stream",
|
||||
"atty",
|
||||
"aws-config",
|
||||
"aws-sdk-s3",
|
||||
"aws-smithy-http",
|
||||
@@ -3742,10 +3749,7 @@ dependencies = [
|
||||
"clap",
|
||||
"crc32c",
|
||||
"either",
|
||||
"futures-util",
|
||||
"hex",
|
||||
"histogram",
|
||||
"itertools",
|
||||
"pageserver",
|
||||
"rand",
|
||||
"reqwest",
|
||||
@@ -3755,7 +3759,6 @@ dependencies = [
|
||||
"thiserror",
|
||||
"tokio",
|
||||
"tokio-rustls",
|
||||
"tokio-stream",
|
||||
"tracing",
|
||||
"tracing-appender",
|
||||
"tracing-subscriber",
|
||||
|
||||
@@ -1,6 +1,4 @@
|
||||
use std::convert::Infallible;
|
||||
use std::net::IpAddr;
|
||||
use std::net::Ipv6Addr;
|
||||
use std::net::SocketAddr;
|
||||
use std::sync::Arc;
|
||||
use std::thread;
|
||||
@@ -300,9 +298,7 @@ fn render_json_error(e: &str, status: StatusCode) -> Response<Body> {
|
||||
// Main Hyper HTTP server function that runs it and blocks waiting on it forever.
|
||||
#[tokio::main]
|
||||
async fn serve(port: u16, state: Arc<ComputeNode>) {
|
||||
// this usually binds to both IPv4 and IPv6 on linux
|
||||
// see e.g. https://github.com/rust-lang/rust/pull/34440
|
||||
let addr = SocketAddr::new(IpAddr::from(Ipv6Addr::UNSPECIFIED), port);
|
||||
let addr = SocketAddr::from(([0, 0, 0, 0], port));
|
||||
|
||||
let make_service = make_service_fn(move |_conn| {
|
||||
let state = state.clone();
|
||||
|
||||
@@ -6,4 +6,4 @@ pub const DEFAULT_LOG_LEVEL: &str = "info";
|
||||
// https://www.postgresql.org/docs/15/auth-password.html
|
||||
//
|
||||
// So it's safe to set md5 here, as `control-plane` anyway uses SCRAM for all roles.
|
||||
pub const PG_HBA_ALL_MD5: &str = "host\tall\t\tall\t\tall\t\tmd5";
|
||||
pub const PG_HBA_ALL_MD5: &str = "host\tall\t\tall\t\t0.0.0.0/0\t\tmd5";
|
||||
|
||||
@@ -12,8 +12,6 @@ git-version.workspace = true
|
||||
nix.workspace = true
|
||||
once_cell.workspace = true
|
||||
postgres.workspace = true
|
||||
hex.workspace = true
|
||||
hyper.workspace = true
|
||||
regex.workspace = true
|
||||
reqwest = { workspace = true, features = ["blocking", "json"] }
|
||||
serde.workspace = true
|
||||
@@ -22,7 +20,6 @@ serde_with.workspace = true
|
||||
tar.workspace = true
|
||||
thiserror.workspace = true
|
||||
toml.workspace = true
|
||||
tokio.workspace = true
|
||||
url.workspace = true
|
||||
# Note: Do not directly depend on pageserver or safekeeper; use pageserver_api or safekeeper_api
|
||||
# instead, so that recompile times are better.
|
||||
|
||||
@@ -1,106 +0,0 @@
|
||||
use crate::{background_process, local_env::LocalEnv};
|
||||
use anyhow::anyhow;
|
||||
use serde::{Deserialize, Serialize};
|
||||
use serde_with::{serde_as, DisplayFromStr};
|
||||
use std::{path::PathBuf, process::Child};
|
||||
use utils::id::{NodeId, TenantId};
|
||||
|
||||
pub struct AttachmentService {
|
||||
env: LocalEnv,
|
||||
listen: String,
|
||||
path: PathBuf,
|
||||
}
|
||||
|
||||
const COMMAND: &str = "attachment_service";
|
||||
|
||||
#[serde_as]
|
||||
#[derive(Serialize, Deserialize)]
|
||||
pub struct AttachHookRequest {
|
||||
#[serde_as(as = "DisplayFromStr")]
|
||||
pub tenant_id: TenantId,
|
||||
pub pageserver_id: Option<NodeId>,
|
||||
}
|
||||
|
||||
#[derive(Serialize, Deserialize)]
|
||||
pub struct AttachHookResponse {
|
||||
pub gen: Option<u32>,
|
||||
}
|
||||
|
||||
impl AttachmentService {
|
||||
pub fn from_env(env: &LocalEnv) -> Self {
|
||||
let path = env.base_data_dir.join("attachments.json");
|
||||
|
||||
// Makes no sense to construct this if pageservers aren't going to use it: assume
|
||||
// pageservers have control plane API set
|
||||
let listen_url = env.pageserver.control_plane_api.clone().unwrap();
|
||||
|
||||
let listen = format!(
|
||||
"{}:{}",
|
||||
listen_url.host_str().unwrap(),
|
||||
listen_url.port().unwrap()
|
||||
);
|
||||
|
||||
Self {
|
||||
env: env.clone(),
|
||||
path,
|
||||
listen,
|
||||
}
|
||||
}
|
||||
|
||||
fn pid_file(&self) -> PathBuf {
|
||||
self.env.base_data_dir.join("attachment_service.pid")
|
||||
}
|
||||
|
||||
pub fn start(&self) -> anyhow::Result<Child> {
|
||||
let path_str = self.path.to_string_lossy();
|
||||
|
||||
background_process::start_process(
|
||||
COMMAND,
|
||||
&self.env.base_data_dir,
|
||||
&self.env.attachment_service_bin(),
|
||||
["-l", &self.listen, "-p", &path_str],
|
||||
[],
|
||||
background_process::InitialPidFile::Create(&self.pid_file()),
|
||||
// TODO: a real status check
|
||||
|| Ok(true),
|
||||
)
|
||||
}
|
||||
|
||||
pub fn stop(&self, immediate: bool) -> anyhow::Result<()> {
|
||||
background_process::stop_process(immediate, COMMAND, &self.pid_file())
|
||||
}
|
||||
|
||||
/// Call into the attach_hook API, for use before handing out attachments to pageservers
|
||||
pub fn attach_hook(
|
||||
&self,
|
||||
tenant_id: TenantId,
|
||||
pageserver_id: NodeId,
|
||||
) -> anyhow::Result<Option<u32>> {
|
||||
use hyper::StatusCode;
|
||||
|
||||
let url = self
|
||||
.env
|
||||
.pageserver
|
||||
.control_plane_api
|
||||
.clone()
|
||||
.unwrap()
|
||||
.join("attach_hook")
|
||||
.unwrap();
|
||||
let client = reqwest::blocking::ClientBuilder::new()
|
||||
.build()
|
||||
.expect("Failed to construct http client");
|
||||
|
||||
let request = AttachHookRequest {
|
||||
tenant_id,
|
||||
pageserver_id: Some(pageserver_id),
|
||||
};
|
||||
|
||||
let response = client.post(url).json(&request).send()?;
|
||||
if response.status() != StatusCode::OK {
|
||||
return Err(anyhow!("Unexpected status {}", response.status()));
|
||||
}
|
||||
|
||||
let response = response.json::<AttachHookResponse>()?;
|
||||
Ok(response.gen)
|
||||
}
|
||||
}
|
||||
@@ -1,273 +0,0 @@
|
||||
/// The attachment service mimics the aspects of the control plane API
|
||||
/// that are required for a pageserver to operate.
|
||||
///
|
||||
/// This enables running & testing pageservers without a full-blown
|
||||
/// deployment of the Neon cloud platform.
|
||||
///
|
||||
use anyhow::anyhow;
|
||||
use clap::Parser;
|
||||
use hex::FromHex;
|
||||
use hyper::StatusCode;
|
||||
use hyper::{Body, Request, Response};
|
||||
use serde::{Deserialize, Serialize};
|
||||
use std::path::{Path, PathBuf};
|
||||
use std::{collections::HashMap, sync::Arc};
|
||||
use utils::logging::{self, LogFormat};
|
||||
|
||||
use utils::{
|
||||
http::{
|
||||
endpoint::{self},
|
||||
error::ApiError,
|
||||
json::{json_request, json_response},
|
||||
RequestExt, RouterBuilder,
|
||||
},
|
||||
id::{NodeId, TenantId},
|
||||
tcp_listener,
|
||||
};
|
||||
|
||||
use pageserver_api::control_api::{
|
||||
ReAttachRequest, ReAttachResponse, ReAttachResponseTenant, ValidateRequest, ValidateResponse,
|
||||
ValidateResponseTenant,
|
||||
};
|
||||
|
||||
use control_plane::attachment_service::{AttachHookRequest, AttachHookResponse};
|
||||
|
||||
#[derive(Parser)]
|
||||
#[command(author, version, about, long_about = None)]
|
||||
#[command(arg_required_else_help(true))]
|
||||
struct Cli {
|
||||
/// Host and port to listen on, like `127.0.0.1:1234`
|
||||
#[arg(short, long)]
|
||||
listen: std::net::SocketAddr,
|
||||
|
||||
/// Path to the .json file to store state (will be created if it doesn't exist)
|
||||
#[arg(short, long)]
|
||||
path: PathBuf,
|
||||
}
|
||||
|
||||
// The persistent state of each Tenant
|
||||
#[derive(Serialize, Deserialize, Clone)]
|
||||
struct TenantState {
|
||||
// Currently attached pageserver
|
||||
pageserver: Option<NodeId>,
|
||||
|
||||
// Latest generation number: next time we attach, increment this
|
||||
// and use the incremented number when attaching
|
||||
generation: u32,
|
||||
}
|
||||
|
||||
fn to_hex_map<S, V>(input: &HashMap<TenantId, V>, serializer: S) -> Result<S::Ok, S::Error>
|
||||
where
|
||||
S: serde::Serializer,
|
||||
V: Clone + Serialize,
|
||||
{
|
||||
let transformed = input.iter().map(|(k, v)| (hex::encode(k), v.clone()));
|
||||
|
||||
transformed
|
||||
.collect::<HashMap<String, V>>()
|
||||
.serialize(serializer)
|
||||
}
|
||||
|
||||
fn from_hex_map<'de, D, V>(deserializer: D) -> Result<HashMap<TenantId, V>, D::Error>
|
||||
where
|
||||
D: serde::de::Deserializer<'de>,
|
||||
V: Deserialize<'de>,
|
||||
{
|
||||
let hex_map = HashMap::<String, V>::deserialize(deserializer)?;
|
||||
hex_map
|
||||
.into_iter()
|
||||
.map(|(k, v)| {
|
||||
TenantId::from_hex(k)
|
||||
.map(|k| (k, v))
|
||||
.map_err(serde::de::Error::custom)
|
||||
})
|
||||
.collect()
|
||||
}
|
||||
|
||||
// Top level state available to all HTTP handlers
|
||||
#[derive(Serialize, Deserialize)]
|
||||
struct PersistentState {
|
||||
#[serde(serialize_with = "to_hex_map", deserialize_with = "from_hex_map")]
|
||||
tenants: HashMap<TenantId, TenantState>,
|
||||
|
||||
#[serde(skip)]
|
||||
path: PathBuf,
|
||||
}
|
||||
|
||||
impl PersistentState {
|
||||
async fn save(&self) -> anyhow::Result<()> {
|
||||
let bytes = serde_json::to_vec(self)?;
|
||||
tokio::fs::write(&self.path, &bytes).await?;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
async fn load(path: &Path) -> anyhow::Result<Self> {
|
||||
let bytes = tokio::fs::read(path).await?;
|
||||
let mut decoded = serde_json::from_slice::<Self>(&bytes)?;
|
||||
decoded.path = path.to_owned();
|
||||
Ok(decoded)
|
||||
}
|
||||
|
||||
async fn load_or_new(path: &Path) -> Self {
|
||||
match Self::load(path).await {
|
||||
Ok(s) => {
|
||||
tracing::info!("Loaded state file at {}", path.display());
|
||||
s
|
||||
}
|
||||
Err(e)
|
||||
if e.downcast_ref::<std::io::Error>()
|
||||
.map(|e| e.kind() == std::io::ErrorKind::NotFound)
|
||||
.unwrap_or(false) =>
|
||||
{
|
||||
tracing::info!("Will create state file at {}", path.display());
|
||||
Self {
|
||||
tenants: HashMap::new(),
|
||||
path: path.to_owned(),
|
||||
}
|
||||
}
|
||||
Err(e) => {
|
||||
panic!("Failed to load state from '{}': {e:#} (maybe your .neon/ dir was written by an older version?)", path.display())
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// State available to HTTP request handlers
|
||||
#[derive(Clone)]
|
||||
struct State {
|
||||
inner: Arc<tokio::sync::RwLock<PersistentState>>,
|
||||
}
|
||||
|
||||
impl State {
|
||||
fn new(persistent_state: PersistentState) -> State {
|
||||
Self {
|
||||
inner: Arc::new(tokio::sync::RwLock::new(persistent_state)),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[inline(always)]
|
||||
fn get_state(request: &Request<Body>) -> &State {
|
||||
request
|
||||
.data::<Arc<State>>()
|
||||
.expect("unknown state type")
|
||||
.as_ref()
|
||||
}
|
||||
|
||||
/// Pageserver calls into this on startup, to learn which tenants it should attach
|
||||
async fn handle_re_attach(mut req: Request<Body>) -> Result<Response<Body>, ApiError> {
|
||||
let reattach_req = json_request::<ReAttachRequest>(&mut req).await?;
|
||||
|
||||
let state = get_state(&req).inner.clone();
|
||||
let mut locked = state.write().await;
|
||||
|
||||
let mut response = ReAttachResponse {
|
||||
tenants: Vec::new(),
|
||||
};
|
||||
for (t, state) in &mut locked.tenants {
|
||||
if state.pageserver == Some(reattach_req.node_id) {
|
||||
state.generation += 1;
|
||||
response.tenants.push(ReAttachResponseTenant {
|
||||
id: *t,
|
||||
generation: state.generation,
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
locked.save().await.map_err(ApiError::InternalServerError)?;
|
||||
|
||||
json_response(StatusCode::OK, response)
|
||||
}
|
||||
|
||||
/// Pageserver calls into this before doing deletions, to confirm that it still
|
||||
/// holds the latest generation for the tenants with deletions enqueued
|
||||
async fn handle_validate(mut req: Request<Body>) -> Result<Response<Body>, ApiError> {
|
||||
let validate_req = json_request::<ValidateRequest>(&mut req).await?;
|
||||
|
||||
let locked = get_state(&req).inner.read().await;
|
||||
|
||||
let mut response = ValidateResponse {
|
||||
tenants: Vec::new(),
|
||||
};
|
||||
|
||||
for req_tenant in validate_req.tenants {
|
||||
if let Some(tenant_state) = locked.tenants.get(&req_tenant.id) {
|
||||
let valid = tenant_state.generation == req_tenant.gen;
|
||||
response.tenants.push(ValidateResponseTenant {
|
||||
id: req_tenant.id,
|
||||
valid,
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
json_response(StatusCode::OK, response)
|
||||
}
|
||||
/// Call into this before attaching a tenant to a pageserver, to acquire a generation number
|
||||
/// (in the real control plane this is unnecessary, because the same program is managing
|
||||
/// generation numbers and doing attachments).
|
||||
async fn handle_attach_hook(mut req: Request<Body>) -> Result<Response<Body>, ApiError> {
|
||||
let attach_req = json_request::<AttachHookRequest>(&mut req).await?;
|
||||
|
||||
let state = get_state(&req).inner.clone();
|
||||
let mut locked = state.write().await;
|
||||
|
||||
let tenant_state = locked
|
||||
.tenants
|
||||
.entry(attach_req.tenant_id)
|
||||
.or_insert_with(|| TenantState {
|
||||
pageserver: attach_req.pageserver_id,
|
||||
generation: 0,
|
||||
});
|
||||
|
||||
if attach_req.pageserver_id.is_some() {
|
||||
tenant_state.generation += 1;
|
||||
}
|
||||
let generation = tenant_state.generation;
|
||||
|
||||
locked.save().await.map_err(ApiError::InternalServerError)?;
|
||||
|
||||
json_response(
|
||||
StatusCode::OK,
|
||||
AttachHookResponse {
|
||||
gen: attach_req.pageserver_id.map(|_| generation),
|
||||
},
|
||||
)
|
||||
}
|
||||
|
||||
fn make_router(persistent_state: PersistentState) -> RouterBuilder<hyper::Body, ApiError> {
|
||||
endpoint::make_router()
|
||||
.data(Arc::new(State::new(persistent_state)))
|
||||
.post("/re-attach", handle_re_attach)
|
||||
.post("/validate", handle_validate)
|
||||
.post("/attach_hook", handle_attach_hook)
|
||||
}
|
||||
|
||||
#[tokio::main]
|
||||
async fn main() -> anyhow::Result<()> {
|
||||
logging::init(
|
||||
LogFormat::Plain,
|
||||
logging::TracingErrorLayerEnablement::Disabled,
|
||||
)?;
|
||||
|
||||
let args = Cli::parse();
|
||||
tracing::info!(
|
||||
"Starting, state at {}, listening on {}",
|
||||
args.path.to_string_lossy(),
|
||||
args.listen
|
||||
);
|
||||
|
||||
let persistent_state = PersistentState::load_or_new(&args.path).await;
|
||||
|
||||
let http_listener = tcp_listener::bind(args.listen)?;
|
||||
let router = make_router(persistent_state)
|
||||
.build()
|
||||
.map_err(|err| anyhow!(err))?;
|
||||
let service = utils::http::RouterService::new(router).unwrap();
|
||||
let server = hyper::Server::from_tcp(http_listener)?.serve(service);
|
||||
|
||||
tracing::info!("Serving on {0}", args.listen);
|
||||
server.await?;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
@@ -8,7 +8,6 @@
|
||||
use anyhow::{anyhow, bail, Context, Result};
|
||||
use clap::{value_parser, Arg, ArgAction, ArgMatches, Command};
|
||||
use compute_api::spec::ComputeMode;
|
||||
use control_plane::attachment_service::AttachmentService;
|
||||
use control_plane::endpoint::ComputeControlPlane;
|
||||
use control_plane::local_env::LocalEnv;
|
||||
use control_plane::pageserver::PageServerNode;
|
||||
@@ -44,8 +43,6 @@ project_git_version!(GIT_VERSION);
|
||||
|
||||
const DEFAULT_PG_VERSION: &str = "15";
|
||||
|
||||
const DEFAULT_PAGESERVER_CONTROL_PLANE_API: &str = "http://127.0.0.1:1234/";
|
||||
|
||||
fn default_conf() -> String {
|
||||
format!(
|
||||
r#"
|
||||
@@ -59,13 +56,11 @@ listen_pg_addr = '{DEFAULT_PAGESERVER_PG_ADDR}'
|
||||
listen_http_addr = '{DEFAULT_PAGESERVER_HTTP_ADDR}'
|
||||
pg_auth_type = '{trust_auth}'
|
||||
http_auth_type = '{trust_auth}'
|
||||
control_plane_api = '{DEFAULT_PAGESERVER_CONTROL_PLANE_API}'
|
||||
|
||||
[[safekeepers]]
|
||||
id = {DEFAULT_SAFEKEEPER_ID}
|
||||
pg_port = {DEFAULT_SAFEKEEPER_PG_PORT}
|
||||
http_port = {DEFAULT_SAFEKEEPER_HTTP_PORT}
|
||||
|
||||
"#,
|
||||
trust_auth = AuthType::Trust,
|
||||
)
|
||||
@@ -112,7 +107,6 @@ fn main() -> Result<()> {
|
||||
"start" => handle_start_all(sub_args, &env),
|
||||
"stop" => handle_stop_all(sub_args, &env),
|
||||
"pageserver" => handle_pageserver(sub_args, &env),
|
||||
"attachment_service" => handle_attachment_service(sub_args, &env),
|
||||
"safekeeper" => handle_safekeeper(sub_args, &env),
|
||||
"endpoint" => handle_endpoint(sub_args, &env),
|
||||
"pg" => bail!("'pg' subcommand has been renamed to 'endpoint'"),
|
||||
@@ -348,25 +342,13 @@ fn handle_tenant(tenant_match: &ArgMatches, env: &mut local_env::LocalEnv) -> an
|
||||
}
|
||||
}
|
||||
Some(("create", create_match)) => {
|
||||
let initial_tenant_id = parse_tenant_id(create_match)?;
|
||||
let tenant_conf: HashMap<_, _> = create_match
|
||||
.get_many::<String>("config")
|
||||
.map(|vals| vals.flat_map(|c| c.split_once(':')).collect())
|
||||
.unwrap_or_default();
|
||||
|
||||
// If tenant ID was not specified, generate one
|
||||
let tenant_id = parse_tenant_id(create_match)?.unwrap_or_else(TenantId::generate);
|
||||
|
||||
let generation = if env.pageserver.control_plane_api.is_some() {
|
||||
// We must register the tenant with the attachment service, so
|
||||
// that when the pageserver restarts, it will be re-attached.
|
||||
let attachment_service = AttachmentService::from_env(env);
|
||||
attachment_service.attach_hook(tenant_id, env.pageserver.id)?
|
||||
} else {
|
||||
None
|
||||
};
|
||||
|
||||
pageserver.tenant_create(tenant_id, generation, tenant_conf)?;
|
||||
println!("tenant {tenant_id} successfully created on the pageserver");
|
||||
let new_tenant_id = pageserver.tenant_create(initial_tenant_id, tenant_conf)?;
|
||||
println!("tenant {new_tenant_id} successfully created on the pageserver");
|
||||
|
||||
// Create an initial timeline for the new tenant
|
||||
let new_timeline_id = parse_timeline_id(create_match)?;
|
||||
@@ -376,7 +358,7 @@ fn handle_tenant(tenant_match: &ArgMatches, env: &mut local_env::LocalEnv) -> an
|
||||
.context("Failed to parse postgres version from the argument string")?;
|
||||
|
||||
let timeline_info = pageserver.timeline_create(
|
||||
tenant_id,
|
||||
new_tenant_id,
|
||||
new_timeline_id,
|
||||
None,
|
||||
None,
|
||||
@@ -387,17 +369,17 @@ fn handle_tenant(tenant_match: &ArgMatches, env: &mut local_env::LocalEnv) -> an
|
||||
|
||||
env.register_branch_mapping(
|
||||
DEFAULT_BRANCH_NAME.to_string(),
|
||||
tenant_id,
|
||||
new_tenant_id,
|
||||
new_timeline_id,
|
||||
)?;
|
||||
|
||||
println!(
|
||||
"Created an initial timeline '{new_timeline_id}' at Lsn {last_record_lsn} for tenant: {tenant_id}",
|
||||
"Created an initial timeline '{new_timeline_id}' at Lsn {last_record_lsn} for tenant: {new_tenant_id}",
|
||||
);
|
||||
|
||||
if create_match.get_flag("set-default") {
|
||||
println!("Setting tenant {tenant_id} as a default one");
|
||||
env.default_tenant_id = Some(tenant_id);
|
||||
println!("Setting tenant {new_tenant_id} as a default one");
|
||||
env.default_tenant_id = Some(new_tenant_id);
|
||||
}
|
||||
}
|
||||
Some(("set-default", set_default_match)) => {
|
||||
@@ -835,33 +817,6 @@ fn handle_pageserver(sub_match: &ArgMatches, env: &local_env::LocalEnv) -> Resul
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn handle_attachment_service(sub_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<()> {
|
||||
let svc = AttachmentService::from_env(env);
|
||||
match sub_match.subcommand() {
|
||||
Some(("start", _start_match)) => {
|
||||
if let Err(e) = svc.start() {
|
||||
eprintln!("start failed: {e}");
|
||||
exit(1);
|
||||
}
|
||||
}
|
||||
|
||||
Some(("stop", stop_match)) => {
|
||||
let immediate = stop_match
|
||||
.get_one::<String>("stop-mode")
|
||||
.map(|s| s.as_str())
|
||||
== Some("immediate");
|
||||
|
||||
if let Err(e) = svc.stop(immediate) {
|
||||
eprintln!("stop failed: {}", e);
|
||||
exit(1);
|
||||
}
|
||||
}
|
||||
Some((sub_name, _)) => bail!("Unexpected attachment_service subcommand '{}'", sub_name),
|
||||
None => bail!("no attachment_service subcommand provided"),
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn get_safekeeper(env: &local_env::LocalEnv, id: NodeId) -> Result<SafekeeperNode> {
|
||||
if let Some(node) = env.safekeepers.iter().find(|node| node.id == id) {
|
||||
Ok(SafekeeperNode::from_env(env, node))
|
||||
@@ -942,16 +897,6 @@ fn handle_start_all(sub_match: &ArgMatches, env: &local_env::LocalEnv) -> anyhow
|
||||
|
||||
broker::start_broker_process(env)?;
|
||||
|
||||
// Only start the attachment service if the pageserver is configured to need it
|
||||
if env.pageserver.control_plane_api.is_some() {
|
||||
let attachment_service = AttachmentService::from_env(env);
|
||||
if let Err(e) = attachment_service.start() {
|
||||
eprintln!("attachment_service start failed: {:#}", e);
|
||||
try_stop_all(env, true);
|
||||
exit(1);
|
||||
}
|
||||
}
|
||||
|
||||
let pageserver = PageServerNode::from_env(env);
|
||||
if let Err(e) = pageserver.start(&pageserver_config_overrides(sub_match)) {
|
||||
eprintln!("pageserver {} start failed: {:#}", env.pageserver.id, e);
|
||||
@@ -1010,13 +955,6 @@ fn try_stop_all(env: &local_env::LocalEnv, immediate: bool) {
|
||||
if let Err(e) = broker::stop_broker_process(env) {
|
||||
eprintln!("neon broker stop failed: {e:#}");
|
||||
}
|
||||
|
||||
if env.pageserver.control_plane_api.is_some() {
|
||||
let attachment_service = AttachmentService::from_env(env);
|
||||
if let Err(e) = attachment_service.stop(immediate) {
|
||||
eprintln!("attachment service stop failed: {e:#}");
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
fn cli() -> Command {
|
||||
@@ -1200,14 +1138,6 @@ fn cli() -> Command {
|
||||
.arg(stop_mode_arg.clone()))
|
||||
.subcommand(Command::new("restart").about("Restart local pageserver").arg(pageserver_config_args.clone()))
|
||||
)
|
||||
.subcommand(
|
||||
Command::new("attachment_service")
|
||||
.arg_required_else_help(true)
|
||||
.about("Manage attachment_service")
|
||||
.subcommand(Command::new("start").about("Start local pageserver").arg(pageserver_config_args.clone()))
|
||||
.subcommand(Command::new("stop").about("Stop local pageserver")
|
||||
.arg(stop_mode_arg.clone()))
|
||||
)
|
||||
.subcommand(
|
||||
Command::new("safekeeper")
|
||||
.arg_required_else_help(true)
|
||||
|
||||
@@ -7,7 +7,6 @@
|
||||
// local installations.
|
||||
//
|
||||
|
||||
pub mod attachment_service;
|
||||
mod background_process;
|
||||
pub mod broker;
|
||||
pub mod endpoint;
|
||||
|
||||
@@ -118,9 +118,6 @@ pub struct PageServerConf {
|
||||
// auth type used for the PG and HTTP ports
|
||||
pub pg_auth_type: AuthType,
|
||||
pub http_auth_type: AuthType,
|
||||
|
||||
// Control plane location
|
||||
pub control_plane_api: Option<Url>,
|
||||
}
|
||||
|
||||
impl Default for PageServerConf {
|
||||
@@ -131,7 +128,6 @@ impl Default for PageServerConf {
|
||||
listen_http_addr: String::new(),
|
||||
pg_auth_type: AuthType::Trust,
|
||||
http_auth_type: AuthType::Trust,
|
||||
control_plane_api: None,
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -206,10 +202,6 @@ impl LocalEnv {
|
||||
self.neon_distrib_dir.join("pageserver")
|
||||
}
|
||||
|
||||
pub fn attachment_service_bin(&self) -> PathBuf {
|
||||
self.neon_distrib_dir.join("attachment_service")
|
||||
}
|
||||
|
||||
pub fn safekeeper_bin(&self) -> PathBuf {
|
||||
self.neon_distrib_dir.join("safekeeper")
|
||||
}
|
||||
|
||||
@@ -126,13 +126,6 @@ impl PageServerNode {
|
||||
broker_endpoint_param,
|
||||
];
|
||||
|
||||
if let Some(control_plane_api) = &self.env.pageserver.control_plane_api {
|
||||
overrides.push(format!(
|
||||
"control_plane_api='{}'",
|
||||
control_plane_api.as_str()
|
||||
));
|
||||
}
|
||||
|
||||
if self.env.pageserver.http_auth_type != AuthType::Trust
|
||||
|| self.env.pageserver.pg_auth_type != AuthType::Trust
|
||||
{
|
||||
@@ -323,8 +316,7 @@ impl PageServerNode {
|
||||
|
||||
pub fn tenant_create(
|
||||
&self,
|
||||
new_tenant_id: TenantId,
|
||||
generation: Option<u32>,
|
||||
new_tenant_id: Option<TenantId>,
|
||||
settings: HashMap<&str, &str>,
|
||||
) -> anyhow::Result<TenantId> {
|
||||
let mut settings = settings.clone();
|
||||
@@ -390,9 +382,11 @@ impl PageServerNode {
|
||||
.context("Failed to parse 'gc_feedback' as bool")?,
|
||||
};
|
||||
|
||||
// If tenant ID was not specified, generate one
|
||||
let new_tenant_id = new_tenant_id.unwrap_or(TenantId::generate());
|
||||
|
||||
let request = models::TenantCreateRequest {
|
||||
new_tenant_id,
|
||||
generation,
|
||||
config,
|
||||
};
|
||||
if !settings.is_empty() {
|
||||
|
||||
@@ -1,281 +0,0 @@
|
||||
|
||||
# Crash-Consistent Layer Map Updates By Leveraging `index_part.json`
|
||||
|
||||
* Created on: Aug 23, 2023
|
||||
* Author: Christian Schwarz
|
||||
|
||||
## Summary
|
||||
|
||||
This RFC describes a simple scheme to make layer map updates crash consistent by leveraging the `index_part.json` in remote storage.
|
||||
Without such a mechanism, crashes can induce certain edge cases in which broadly held assumptions about system invariants don't hold.
|
||||
|
||||
## Motivation
|
||||
|
||||
### Background
|
||||
|
||||
We can currently easily make complex, atomic updates to the layer map by means of an RwLock.
|
||||
If we crash or restart pageserver, we reconstruct the layer map from:
|
||||
1. local timeline directory contents
|
||||
2. remote `index_part.json` contents.
|
||||
|
||||
The function that is responsible for this is called `Timeline::load_layer_map()`.
|
||||
The reconciliation process's behavior is the following:
|
||||
* local-only files will become part of the layer map as local-only layers and rescheduled for upload
|
||||
* For a file name that, by its name, is present locally and in the remote `index_part.json`, but where the local file has a different size (future: checksum) than the remote file, we will delete the local file and leave the remote file as a `RemoteLayer` in the layer map.
|
||||
|
||||
### The Problem
|
||||
|
||||
There are are cases where we need to make an atomic update to the layer map that involves **more than one layer**.
|
||||
The best example is compaction, where we need to insert the L1 layers generated from the L0 layers, and remove the L0 layers.
|
||||
As stated above, making the update to the layer map in atomic way is trivial.
|
||||
But, there is no system call API to make an atomic update to a directory that involves more than one file rename and deletion.
|
||||
Currently, we issue the system calls one by one and hope we don't crash.
|
||||
|
||||
What happens if we crash and restart in the middle of that system call sequence?
|
||||
We will reconstruct the layer map according to the reconciliation process, taking as input whatever transitory state the timeline directory ended up in.
|
||||
|
||||
We cannot roll back or complete the timeline directory update during which we crashed, because we keep no record of the changes we plan to make.
|
||||
|
||||
### Problem's Implications For Compaction
|
||||
|
||||
The implications of the above are primarily problematic for compaction.
|
||||
Specifically, the part of it that compacts L0 layers into L1 layers.
|
||||
|
||||
Remember that compaction takes a set of L0 layers and reshuffles the delta records in them into L1 layer files.
|
||||
Once the L1 layer files are written to disk, it atomically removes the L0 layers from the layer map and adds the L1 layers to the layer map.
|
||||
It then deletes the L0 layers locally, and schedules an upload of the L1 layers and and updated index part.
|
||||
|
||||
If we crash before deleting L0s, but after writing out L1s, the next compaction after restart will re-digest the L0s and produce new L1s.
|
||||
This means the compaction after restart will **overwrite** the previously written L1s.
|
||||
Currently we also schedule an S3 upload of the overwritten L1.
|
||||
|
||||
If the compaction algorithm doesn't change between the two compaction runs, is deterministic, and uses the same set of L0s as input, then the second run will produce identical L1s and the overwrites will go unnoticed.
|
||||
|
||||
*However*:
|
||||
1. the file size of the overwritten L1s may not be identical, and
|
||||
2. the bit pattern of the overwritten L1s may not be identical, and,
|
||||
3. in the future, we may want to make the compaction code non-determinstic, influenced by past access patterns, or otherwise change it, resulting in L1 overwrites with a different set of delta records than before the overwrite
|
||||
|
||||
The items above are a problem for the [split-brain protection RFC](https://github.com/neondatabase/neon/pull/4919) because it assumes that layer files in S3 are only ever deleted, but never replaced (overPUTted).
|
||||
|
||||
For example, if an unresponsive node A becomes active again after control plane has relocated the tenant to a new node B, the node A may overwrite some L1s.
|
||||
But node B based its world view on the version of node A's `index_part.json` from _before_ the overwrite.
|
||||
That earlier `index_part.json`` contained the file size of the pre-overwrite L1.
|
||||
If the overwritten L1 has a different file size, node B will refuse to read data from the overwritten L1.
|
||||
Effectively, the data in the L1 has become inaccessible to node B.
|
||||
If node B already uploaded an index part itself, all subsequent attachments will use node B's index part, and run into the same probem.
|
||||
|
||||
If we ever introduce checksums instead of checking just the file size, then a mismatching bit pattern (2) will cause similar problems.
|
||||
|
||||
In case of (1) and (2), where we know that the logical content of the layers is still the same, we can recover by manually patching the `index_part.json` of the new node to the overwritten L1's file size / checksum.
|
||||
|
||||
But if (3) ever happens, the logical content may be different, and, we could have truly lost data.
|
||||
|
||||
Given the above considerations, we should avoid making correctness of split-brain protection dependent on overwrites preserving _logical_ layer file contents.
|
||||
**It is a much cleaner separation of concerns to require that layer files are truly immutable in S3, i.e., PUT once and then only DELETEd, never overwritten (overPUTted).**
|
||||
|
||||
## Design
|
||||
|
||||
Instead of reconciling a layer map from local timeline directory contents and remote index part, this RFC proposes to view the remote index part as authoritative during timeline load.
|
||||
Local layer files will be recognized if they match what's listed in remote index part, and removed otherwise.
|
||||
|
||||
During **timeline load**, the only thing that matters is the remote index part content.
|
||||
Essentially, timeline load becomes much like attach, except we don't need to prefix-list the remote timelines.
|
||||
The local timeline dir's `metadata` file does not matter.
|
||||
The layer files in the local timeline dir are seen as a nice-to-have cache of layer files that are in the remote index part.
|
||||
Any layer files in the local timeline dir that aren't in the remote index part are removed during startup.
|
||||
The `Timeline::load_layer_map()` no longer "merges" local timeline dir contents with the remote index part.
|
||||
Instead, it treats the remote index part as the authoritative layer map.
|
||||
If the local timeline dir contains a layer that is in the remote index part, that's nice, and we'll re-use it if file size (and in the future, check sum) match what's stated in the index part.
|
||||
If it doesn't match, we remove the file from the local timeline dir.
|
||||
|
||||
After load, **at runtime**, nothing changes compared to what we did before this RFC.
|
||||
The procedure for single- and multi-object changes is reproduced here for reference:
|
||||
* For any new layers that the change adds:
|
||||
* Write them to a temporary location.
|
||||
* While holding layer map lock:
|
||||
* Move them to the final location.
|
||||
* Insert into layer map.
|
||||
* Make the S3 changes.
|
||||
We won't reproduce the remote timeline client method calls here because these are subject to change.
|
||||
Instead we reproduce the sequence of s3 changes that must result for a given single-/multi-object change:
|
||||
* PUT layer files inserted by the change.
|
||||
* PUT an index part that has insertions and deletions of the change.
|
||||
* DELETE the layer files that are deleted by the change.
|
||||
|
||||
Note that it is safe for the DELETE to be deferred arbitrarily.
|
||||
* If it never happens, we leak the object, but, that's not a correctness concern.
|
||||
* As of #4938, we don't schedule the remote timeline client operation for deletion immediately, but, only when we drop the `LayerInner`.
|
||||
* With the [split-brain protection RFC](https://github.com/neondatabase/neon/pull/4919), the deletions will be written to deletion queue for processing when it's safe to do so (see the RFC for details).
|
||||
|
||||
## How This Solves The Problem
|
||||
|
||||
If we crash before we've finished the S3 changes, then timeline load will reset layer map to the state that's in the S3 index part.
|
||||
The S3 change sequence above is obviously crash-consistent.
|
||||
If we crash before the index part PUT, then we leak the inserted layer files to S3.
|
||||
If we crash after the index part PUT, we leak the to-be-DELETEd layer files to S3.
|
||||
Leaking is fine, it's a pre-existing condition and not addressed in this RFC.
|
||||
|
||||
Multi-object changes that previously created and removed files in timeline dir are now atomic because the layer map updates are atomic and crash consistent:
|
||||
* atomic layer map update at runtime, currently by using an RwLock in write mode
|
||||
* atomic `index_part.json` update in S3, as per guarantee that S3 PUT is atomic
|
||||
* local timeline dir state:
|
||||
* irrelevant for layer map content => irrelevant for atomic updates / crash consistency
|
||||
* if we crash after index part PUT, local layer files will be used, so, no on-demand downloads neede for them
|
||||
* if we crash before index part PUT, local layer files will be deleted
|
||||
|
||||
## Trade-Offs
|
||||
|
||||
### Fundamental
|
||||
|
||||
If we crash before finishing the index part PUT, we lose all the work that hasn't reached the S3 `index_part.json`:
|
||||
* wal ingest: we lose not-yet-uploaded L0s; load on the **safekeepers** + work for pageserver
|
||||
* compaction: we lose the entire compaction iteration work; need to re-do it again
|
||||
* gc: no change to what we have today
|
||||
|
||||
If the work is still deemed necessary after restart, the restarted restarted pageserver will re-do this work.
|
||||
The amount of work to be re-do is capped to the lag of S3 changes to the local changes.
|
||||
Assuming upload queue allows for unlimited queue depth (that's what it does today), this means:
|
||||
* on-demand downloads that were needed to do the work: are likely still present, not lost
|
||||
* wal ingest: currently unbounded
|
||||
* L0 => L1 compaction: CPU time proportional to `O(sum(L0 size))` and upload work proportional to `O()`
|
||||
* Compaction threshold is 10 L0s and each L0 can be up to 256M in size. Target size for L1 is 128M.
|
||||
* In practive, most L0s are tiny due to 10minute `DEFAULT_CHECKPOINT_TIMEOUT`.
|
||||
* image layer generation: CPU time `O(sum(input data))` + upload work `O(sum(new image layer size))`
|
||||
* I have no intuition how expensive / long-running it is in reality.
|
||||
* gc: `update_gc_info`` work (not substantial, AFAIK)
|
||||
|
||||
To limit the amount of lost upload work, and ingest work, we can limit the upload queue depth (see suggestions in the next sub-section).
|
||||
However, to limit the amount of lost CPU work, we would need a way to make make the compaction/image-layer-generation algorithms interruptible & resumable.
|
||||
We aren't there yet, the need for it is tracked by ([#4580](https://github.com/neondatabase/neon/issues/4580)).
|
||||
However, this RFC is not constraining the design space either.
|
||||
|
||||
### Practical
|
||||
|
||||
#### Pageserver Restarts
|
||||
|
||||
Pageserver crashes are very rare ; it would likely be acceptable to re-do the lost work in that case.
|
||||
However, regular pageserver restart happen frequently, e.g., during weekly deploys.
|
||||
|
||||
In general, pageserver restart faces the problem of tenants that "take too long" to shut down.
|
||||
They are a problem because other tenants that shut down quickly are unavailble while we wait for the slow tenants to shut down.
|
||||
We currently allot 10 seconds for graceful shutdown until we SIGKILL the pageserver process (as per `pageserver.service` unit file).
|
||||
A longer budget would expose tenants that are done early to a longer downtime.
|
||||
A short budget would risk throwing away more work that'd have to be re-done after restart.
|
||||
|
||||
In the context of this RFC, killing the process would mean losing the work that hasn't made it to S3.
|
||||
We can mitigate this problem as follows:
|
||||
0. initially, by accepting that we need to do the work again
|
||||
1. short-term, introducing measures to cap the amount of in-flight work:
|
||||
|
||||
- cap upload queue length, use backpressure to slow down compaction
|
||||
- disabling compaction/image-layer-generation X minutes before `systemctl restart pageserver`
|
||||
- introducing a read-only shutdown state for tenants that are fast to shut down;
|
||||
that state would be equivalent to the state of a tenant in hot standby / readonly mode.
|
||||
|
||||
2. mid term, by not restarting pageserver in place, but using [*seamless tenant migration*](https://github.com/neondatabase/neon/pull/5029) to drain a pageserver's tenants before we restart it.
|
||||
|
||||
#### `disk_consistent_lsn` can go backwards
|
||||
|
||||
`disk_consistent_lsn` can go backwards across restarts if we crash before we've finished the index part PUT.
|
||||
Nobody should care about it, because the only thing that matters is `remote_consistent_lsn`.
|
||||
Compute certainly doesn't care about `disk_consistent_lsn`.
|
||||
|
||||
|
||||
## Side-Effects Of This Design
|
||||
|
||||
* local `metadata` is basically reduced to a cache of which timelines exist for this tenant; i.e., we can avoid a `ListObjects` requests for a tenant's timelines during tenant load.
|
||||
|
||||
## Limitations
|
||||
|
||||
Multi-object changes that span multiple timelines aren't covered by this RFC.
|
||||
That's fine because we currently don't need them, as evidenced by the absence
|
||||
of a Pageserver operation that holds multiple timelines' layer map lock at a time.
|
||||
|
||||
## Impacted components
|
||||
|
||||
Primarily pageservers.
|
||||
|
||||
Safekeepers will experience more load when we need to re-ingest WAL because we've thrown away work.
|
||||
No changes to safekeepers are needed.
|
||||
|
||||
## Alternatives considered
|
||||
|
||||
### Alternative 1: WAL
|
||||
|
||||
We could have a local WAL for timeline dir changes, as proposed here https://github.com/neondatabase/neon/issues/4418 and partially implemented here https://github.com/neondatabase/neon/pull/4422 .
|
||||
The WAL would be used to
|
||||
1. make multi-object changes atomic
|
||||
2. replace `reconcile_with_remote()` reconciliation: scheduling of layer upload would be part of WAL replay.
|
||||
|
||||
The WAL is appealing in a local-first world, but, it's much more complex than the design described above:
|
||||
* New on-disk state to get right.
|
||||
* Forward- and backward-compatibility development costs in the future.
|
||||
|
||||
### Alternative 2: Flow Everything Through `index_part.json`
|
||||
|
||||
We could have gone to the other extreme and **only** update the layer map whenever we've PUT `index_part.json`.
|
||||
I.e., layer map would always be the last-persisted S3 state.
|
||||
That's axiomatically beautiful, not least because it fully separates the layer file production and consumption path (=> [layer file spreading proposal](https://www.notion.so/neondatabase/One-Pager-Layer-File-Spreading-Christian-eb6b64182a214e11b3fceceee688d843?pvs=4)).
|
||||
And it might make hot standbys / read-only pageservers less of a special case in the future.
|
||||
|
||||
But, I have some uncertainties with regard to WAL ingestion, because it needs to be able to do some reads for the logical size feedback to safekeepers.
|
||||
|
||||
And it's silly that we wouldn't be able to use the results of compaction or image layer generation before we're done with the upload.
|
||||
|
||||
Lastly, a temporarily clogged-up upload queue (e.g. S3 is down) shouldn't immediately render ingestion unavailable.
|
||||
|
||||
### Alternative 3: Sequence Numbers For Layers
|
||||
|
||||
Instead of what's proposed in this RFC, we could use unique numbers to identify layer files:
|
||||
|
||||
```
|
||||
# before
|
||||
tenants/$tenant/timelines/$timeline/$key_and_lsn_range
|
||||
# after
|
||||
tenants/$tenant/timelines/$timeline/$layer_file_id-$key_and_lsn_range
|
||||
```
|
||||
|
||||
To guarantee uniqueness, the unqiue number is a sequence number, stored in `index_part.json`.
|
||||
|
||||
This alternative does not solve atomic layer map updates.
|
||||
In our crash-during-compaction scenario above, the compaction run after the crash will not overwrite the L1s, but write/PUT new files with new sequence numbers.
|
||||
In fact, this alternative makes it worse because the data is now duplicated in the not-overwritten and overwritten L1 layer files.
|
||||
We'd need to write a deduplication pass that checks if perfectly overlapping layers have identical contents.
|
||||
|
||||
However, this alternative is appealing because it systematically prevents overwrites at a lower level than this RFC.
|
||||
|
||||
So, this alternative is sufficient for the needs of the split-brain safety RFC (immutable layer files locally and in S3).
|
||||
But it doesn't solve the problems with crash-during-compaction outlined earlier in this RFC, and in fact, makes it much more accute.
|
||||
The proposed design in this RFC addresses both.
|
||||
|
||||
So, if this alternative sounds appealing, we should implement the proposal in this RFC first, then implement this alternative on top.
|
||||
That way, we avoid a phase where the crash-during-compaction problem is accute.
|
||||
|
||||
## Related issues
|
||||
|
||||
- https://github.com/neondatabase/neon/issues/4749
|
||||
- https://github.com/neondatabase/neon/issues/4418
|
||||
- https://github.com/neondatabase/neon/pull/4422
|
||||
- https://github.com/neondatabase/neon/issues/5077
|
||||
- https://github.com/neondatabase/neon/issues/4088
|
||||
- (re)resolutions:
|
||||
- https://github.com/neondatabase/neon/pull/4696
|
||||
- https://github.com/neondatabase/neon/pull/4094
|
||||
- https://neondb.slack.com/archives/C033QLM5P7D/p1682519017949719
|
||||
|
||||
Note that the test case introduced in https://github.com/neondatabase/neon/pull/4696/files#diff-13114949d1deb49ae394405d4c49558adad91150ba8a34004133653a8a5aeb76 will produce L1s with the same logical content, but, as outlined in the last paragraph of the _Problem Statement_ section above, we don't want to make that assumption in order to fix the problem.
|
||||
|
||||
|
||||
## Implementation Plan
|
||||
|
||||
1. Remove support for `remote_storage=None`, because we now rely on the existence of an index part.
|
||||
|
||||
- The nasty part here is to fix all the tests that fiddle with the local timeline directory.
|
||||
Possibly they are just irrelevant with this change, but, each case will require inspection.
|
||||
|
||||
2. Implement the design above.
|
||||
|
||||
- Initially, ship without the mitigations for restart and accept we will do some work twice.
|
||||
- Measure the impact and implement one of the mitigations.
|
||||
|
||||
@@ -1,52 +0,0 @@
|
||||
//! Types in this file are for pageserver's upward-facing API calls to the control plane,
|
||||
//! required for acquiring and validating tenant generation numbers.
|
||||
//!
|
||||
//! See docs/rfcs/025-generation-numbers.md
|
||||
|
||||
use serde::{Deserialize, Serialize};
|
||||
use serde_with::{serde_as, DisplayFromStr};
|
||||
use utils::id::{NodeId, TenantId};
|
||||
|
||||
#[derive(Serialize, Deserialize)]
|
||||
pub struct ReAttachRequest {
|
||||
pub node_id: NodeId,
|
||||
}
|
||||
|
||||
#[serde_as]
|
||||
#[derive(Serialize, Deserialize)]
|
||||
pub struct ReAttachResponseTenant {
|
||||
#[serde_as(as = "DisplayFromStr")]
|
||||
pub id: TenantId,
|
||||
pub generation: u32,
|
||||
}
|
||||
|
||||
#[derive(Serialize, Deserialize)]
|
||||
pub struct ReAttachResponse {
|
||||
pub tenants: Vec<ReAttachResponseTenant>,
|
||||
}
|
||||
|
||||
#[serde_as]
|
||||
#[derive(Serialize, Deserialize)]
|
||||
pub struct ValidateRequestTenant {
|
||||
#[serde_as(as = "DisplayFromStr")]
|
||||
pub id: TenantId,
|
||||
pub gen: u32,
|
||||
}
|
||||
|
||||
#[derive(Serialize, Deserialize)]
|
||||
pub struct ValidateRequest {
|
||||
pub tenants: Vec<ValidateRequestTenant>,
|
||||
}
|
||||
|
||||
#[derive(Serialize, Deserialize)]
|
||||
pub struct ValidateResponse {
|
||||
pub tenants: Vec<ValidateResponseTenant>,
|
||||
}
|
||||
|
||||
#[serde_as]
|
||||
#[derive(Serialize, Deserialize)]
|
||||
pub struct ValidateResponseTenant {
|
||||
#[serde_as(as = "DisplayFromStr")]
|
||||
pub id: TenantId,
|
||||
pub valid: bool,
|
||||
}
|
||||
@@ -1,7 +1,6 @@
|
||||
use const_format::formatcp;
|
||||
|
||||
/// Public API types
|
||||
pub mod control_api;
|
||||
pub mod models;
|
||||
pub mod reltag;
|
||||
|
||||
|
||||
@@ -194,22 +194,10 @@ pub struct TimelineCreateRequest {
|
||||
pub struct TenantCreateRequest {
|
||||
#[serde_as(as = "DisplayFromStr")]
|
||||
pub new_tenant_id: TenantId,
|
||||
#[serde(default)]
|
||||
#[serde(skip_serializing_if = "Option::is_none")]
|
||||
pub generation: Option<u32>,
|
||||
#[serde(flatten)]
|
||||
pub config: TenantConfig, // as we have a flattened field, we should reject all unknown fields in it
|
||||
}
|
||||
|
||||
#[serde_as]
|
||||
#[derive(Deserialize, Debug)]
|
||||
#[serde(deny_unknown_fields)]
|
||||
pub struct TenantLoadRequest {
|
||||
#[serde(default)]
|
||||
#[serde(skip_serializing_if = "Option::is_none")]
|
||||
pub generation: Option<u32>,
|
||||
}
|
||||
|
||||
impl std::ops::Deref for TenantCreateRequest {
|
||||
type Target = TenantConfig;
|
||||
|
||||
@@ -253,6 +241,15 @@ pub struct StatusResponse {
|
||||
pub id: NodeId,
|
||||
}
|
||||
|
||||
impl TenantCreateRequest {
|
||||
pub fn new(new_tenant_id: TenantId) -> TenantCreateRequest {
|
||||
TenantCreateRequest {
|
||||
new_tenant_id,
|
||||
config: TenantConfig::default(),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[serde_as]
|
||||
#[derive(Serialize, Deserialize, Debug)]
|
||||
#[serde(deny_unknown_fields)]
|
||||
@@ -296,11 +293,9 @@ impl TenantConfigRequest {
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Debug, Deserialize)]
|
||||
#[derive(Debug, Serialize, Deserialize)]
|
||||
pub struct TenantAttachRequest {
|
||||
pub config: TenantAttachConfig,
|
||||
#[serde(default)]
|
||||
pub generation: Option<u32>,
|
||||
}
|
||||
|
||||
/// Newtype to enforce deny_unknown_fields on TenantConfig for
|
||||
|
||||
@@ -148,55 +148,21 @@ impl RemoteStorage for LocalFs {
|
||||
Some(folder) => folder.with_base(&self.storage_root),
|
||||
None => self.storage_root.clone(),
|
||||
};
|
||||
|
||||
// If we were given a directory, we may use it as our starting point.
|
||||
// Otherwise, we must go up to the parent directory. This is because
|
||||
// S3 object list prefixes can be arbitrary strings, but when reading
|
||||
// the local filesystem we need a directory to start calling read_dir on.
|
||||
let mut initial_dir = full_path.clone();
|
||||
match fs::metadata(full_path.clone()).await {
|
||||
Ok(meta) => {
|
||||
if !meta.is_dir() {
|
||||
// It's not a directory: strip back to the parent
|
||||
initial_dir.pop();
|
||||
}
|
||||
}
|
||||
Err(e) if e.kind() == ErrorKind::NotFound => {
|
||||
// It's not a file that exists: strip the prefix back to the parent directory
|
||||
initial_dir.pop();
|
||||
}
|
||||
Err(e) => {
|
||||
// Unexpected I/O error
|
||||
anyhow::bail!(e)
|
||||
}
|
||||
}
|
||||
|
||||
// Note that PathBuf starts_with only considers full path segments, but
|
||||
// object prefixes are arbitrary strings, so we need the strings for doing
|
||||
// starts_with later.
|
||||
let prefix = full_path.to_string_lossy();
|
||||
|
||||
let mut files = vec![];
|
||||
let mut directory_queue = vec![initial_dir.clone()];
|
||||
let mut directory_queue = vec![full_path.clone()];
|
||||
|
||||
while let Some(cur_folder) = directory_queue.pop() {
|
||||
let mut entries = fs::read_dir(cur_folder.clone()).await?;
|
||||
while let Some(entry) = entries.next_entry().await? {
|
||||
let file_name: PathBuf = entry.file_name().into();
|
||||
let full_file_name = cur_folder.clone().join(&file_name);
|
||||
if full_file_name
|
||||
.to_str()
|
||||
.map(|s| s.starts_with(prefix.as_ref()))
|
||||
.unwrap_or(false)
|
||||
{
|
||||
let file_remote_path = self.local_file_to_relative_path(full_file_name.clone());
|
||||
files.push(file_remote_path.clone());
|
||||
if full_file_name.is_dir() {
|
||||
directory_queue.push(full_file_name);
|
||||
}
|
||||
let file_remote_path = self.local_file_to_relative_path(full_file_name.clone());
|
||||
files.push(file_remote_path.clone());
|
||||
if full_file_name.is_dir() {
|
||||
directory_queue.push(full_file_name);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
Ok(files)
|
||||
}
|
||||
|
||||
|
||||
@@ -53,7 +53,6 @@ impl Generation {
|
||||
matches!(self, Self::None)
|
||||
}
|
||||
|
||||
#[track_caller]
|
||||
pub fn get_suffix(&self) -> String {
|
||||
match self {
|
||||
Self::Valid(v) => {
|
||||
@@ -65,30 +64,6 @@ impl Generation {
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// `suffix` is the part after "-" in a key
|
||||
///
|
||||
/// Returns None if parsing was unsuccessful
|
||||
pub fn parse_suffix(suffix: &str) -> Option<Generation> {
|
||||
u32::from_str_radix(suffix, 16).map(Generation::new).ok()
|
||||
}
|
||||
|
||||
#[track_caller]
|
||||
pub fn previous(&self) -> Generation {
|
||||
match self {
|
||||
Self::Valid(n) => {
|
||||
if *n == 0 {
|
||||
// Since a tenant may be upgraded from a pre-generations state, interpret the "previous" generation
|
||||
// to 0 as being "no generation".
|
||||
Self::None
|
||||
} else {
|
||||
Self::Valid(n - 1)
|
||||
}
|
||||
}
|
||||
Self::None => Self::None,
|
||||
Self::Broken => panic!("Attempted to use a broken generation"),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl Serialize for Generation {
|
||||
|
||||
@@ -3,7 +3,6 @@
|
||||
//! Currently it only analyzes holes, which are regions within the layer range that the layer contains no updates for. In the future it might do more analysis (maybe key quantiles?) but it should never return sensitive data.
|
||||
|
||||
use anyhow::Result;
|
||||
use pageserver::tenant::{TENANTS_SEGMENT_NAME, TIMELINES_SEGMENT_NAME};
|
||||
use std::cmp::Ordering;
|
||||
use std::collections::BinaryHeap;
|
||||
use std::ops::Range;
|
||||
@@ -97,7 +96,7 @@ pub(crate) fn parse_filename(name: &str) -> Option<LayerFile> {
|
||||
|
||||
// Finds the max_holes largest holes, ignoring any that are smaller than MIN_HOLE_LENGTH"
|
||||
async fn get_holes(path: &Path, max_holes: usize) -> Result<Vec<Hole>> {
|
||||
let file = FileBlockReader::new(VirtualFile::open(path).await?);
|
||||
let file = FileBlockReader::new(VirtualFile::open(path)?);
|
||||
let summary_blk = file.read_blk(0).await?;
|
||||
let actual_summary = Summary::des_prefix(summary_blk.as_ref())?;
|
||||
let tree_reader = DiskBtreeReader::<_, DELTA_KEY_SIZE>::new(
|
||||
@@ -143,12 +142,12 @@ pub(crate) async fn main(cmd: &AnalyzeLayerMapCmd) -> Result<()> {
|
||||
let mut total_delta_layers = 0usize;
|
||||
let mut total_image_layers = 0usize;
|
||||
let mut total_excess_layers = 0usize;
|
||||
for tenant in fs::read_dir(storage_path.join(TENANTS_SEGMENT_NAME))? {
|
||||
for tenant in fs::read_dir(storage_path.join("tenants"))? {
|
||||
let tenant = tenant?;
|
||||
if !tenant.file_type()?.is_dir() {
|
||||
continue;
|
||||
}
|
||||
for timeline in fs::read_dir(tenant.path().join(TIMELINES_SEGMENT_NAME))? {
|
||||
for timeline in fs::read_dir(tenant.path().join("timelines"))? {
|
||||
let timeline = timeline?;
|
||||
if !timeline.file_type()?.is_dir() {
|
||||
continue;
|
||||
|
||||
@@ -5,7 +5,6 @@ use clap::Subcommand;
|
||||
use pageserver::tenant::block_io::BlockCursor;
|
||||
use pageserver::tenant::disk_btree::DiskBtreeReader;
|
||||
use pageserver::tenant::storage_layer::delta_layer::{BlobRef, Summary};
|
||||
use pageserver::tenant::{TENANTS_SEGMENT_NAME, TIMELINES_SEGMENT_NAME};
|
||||
use pageserver::{page_cache, virtual_file};
|
||||
use pageserver::{
|
||||
repository::{Key, KEY_SIZE},
|
||||
@@ -48,7 +47,7 @@ async fn read_delta_file(path: impl AsRef<Path>) -> Result<()> {
|
||||
let path = path.as_ref();
|
||||
virtual_file::init(10);
|
||||
page_cache::init(100);
|
||||
let file = FileBlockReader::new(VirtualFile::open(path).await?);
|
||||
let file = FileBlockReader::new(VirtualFile::open(path)?);
|
||||
let summary_blk = file.read_blk(0).await?;
|
||||
let actual_summary = Summary::des_prefix(summary_blk.as_ref())?;
|
||||
let tree_reader = DiskBtreeReader::<_, DELTA_KEY_SIZE>::new(
|
||||
@@ -69,7 +68,7 @@ async fn read_delta_file(path: impl AsRef<Path>) -> Result<()> {
|
||||
},
|
||||
)
|
||||
.await?;
|
||||
let cursor = BlockCursor::new_fileblockreader(&file);
|
||||
let cursor = BlockCursor::new_fileblockreader_virtual(&file);
|
||||
for (k, v) in all {
|
||||
let value = cursor.read_blob(v.pos()).await?;
|
||||
println!("key:{} value_len:{}", k, value.len());
|
||||
@@ -81,13 +80,13 @@ async fn read_delta_file(path: impl AsRef<Path>) -> Result<()> {
|
||||
pub(crate) async fn main(cmd: &LayerCmd) -> Result<()> {
|
||||
match cmd {
|
||||
LayerCmd::List { path } => {
|
||||
for tenant in fs::read_dir(path.join(TENANTS_SEGMENT_NAME))? {
|
||||
for tenant in fs::read_dir(path.join("tenants"))? {
|
||||
let tenant = tenant?;
|
||||
if !tenant.file_type()?.is_dir() {
|
||||
continue;
|
||||
}
|
||||
println!("tenant {}", tenant.file_name().to_string_lossy());
|
||||
for timeline in fs::read_dir(tenant.path().join(TIMELINES_SEGMENT_NAME))? {
|
||||
for timeline in fs::read_dir(tenant.path().join("timelines"))? {
|
||||
let timeline = timeline?;
|
||||
if !timeline.file_type()?.is_dir() {
|
||||
continue;
|
||||
@@ -102,9 +101,9 @@ pub(crate) async fn main(cmd: &LayerCmd) -> Result<()> {
|
||||
timeline,
|
||||
} => {
|
||||
let timeline_path = path
|
||||
.join(TENANTS_SEGMENT_NAME)
|
||||
.join("tenants")
|
||||
.join(tenant)
|
||||
.join(TIMELINES_SEGMENT_NAME)
|
||||
.join("timelines")
|
||||
.join(timeline);
|
||||
let mut idx = 0;
|
||||
for layer in fs::read_dir(timeline_path)? {
|
||||
|
||||
@@ -388,7 +388,6 @@ fn start_pageserver(
|
||||
remote_storage: remote_storage.clone(),
|
||||
},
|
||||
order,
|
||||
shutdown_pageserver.clone(),
|
||||
))?;
|
||||
|
||||
BACKGROUND_RUNTIME.spawn({
|
||||
|
||||
@@ -32,8 +32,7 @@ use crate::disk_usage_eviction_task::DiskUsageEvictionTaskConfig;
|
||||
use crate::tenant::config::TenantConf;
|
||||
use crate::tenant::config::TenantConfOpt;
|
||||
use crate::tenant::{
|
||||
TENANTS_SEGMENT_NAME, TENANT_ATTACHING_MARKER_FILENAME, TENANT_DELETED_MARKER_FILE_NAME,
|
||||
TIMELINES_SEGMENT_NAME,
|
||||
TENANT_ATTACHING_MARKER_FILENAME, TENANT_DELETED_MARKER_FILE_NAME, TIMELINES_SEGMENT_NAME,
|
||||
};
|
||||
use crate::{
|
||||
IGNORED_TENANT_FILE_NAME, METADATA_FILE_NAME, TENANT_CONFIG_NAME, TIMELINE_DELETE_MARK_SUFFIX,
|
||||
@@ -205,8 +204,6 @@ pub struct PageServerConf {
|
||||
/// has it's initial logical size calculated. Not running background tasks for some seconds is
|
||||
/// not terrible.
|
||||
pub background_task_maximum_delay: Duration,
|
||||
|
||||
pub control_plane_api: Option<Url>,
|
||||
}
|
||||
|
||||
/// We do not want to store this in a PageServerConf because the latter may be logged
|
||||
@@ -281,8 +278,6 @@ struct PageServerConfigBuilder {
|
||||
ondemand_download_behavior_treat_error_as_warn: BuilderValue<bool>,
|
||||
|
||||
background_task_maximum_delay: BuilderValue<Duration>,
|
||||
|
||||
control_plane_api: BuilderValue<Option<Url>>,
|
||||
}
|
||||
|
||||
impl Default for PageServerConfigBuilder {
|
||||
@@ -345,8 +340,6 @@ impl Default for PageServerConfigBuilder {
|
||||
DEFAULT_BACKGROUND_TASK_MAXIMUM_DELAY,
|
||||
)
|
||||
.unwrap()),
|
||||
|
||||
control_plane_api: Set(None),
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -475,10 +468,6 @@ impl PageServerConfigBuilder {
|
||||
self.background_task_maximum_delay = BuilderValue::Set(delay);
|
||||
}
|
||||
|
||||
pub fn control_plane_api(&mut self, api: Url) {
|
||||
self.control_plane_api = BuilderValue::Set(Some(api))
|
||||
}
|
||||
|
||||
pub fn build(self) -> anyhow::Result<PageServerConf> {
|
||||
let concurrent_tenant_size_logical_size_queries = self
|
||||
.concurrent_tenant_size_logical_size_queries
|
||||
@@ -564,9 +553,6 @@ impl PageServerConfigBuilder {
|
||||
background_task_maximum_delay: self
|
||||
.background_task_maximum_delay
|
||||
.ok_or(anyhow!("missing background_task_maximum_delay"))?,
|
||||
control_plane_api: self
|
||||
.control_plane_api
|
||||
.ok_or(anyhow!("missing control_plane_api"))?,
|
||||
})
|
||||
}
|
||||
}
|
||||
@@ -577,7 +563,7 @@ impl PageServerConf {
|
||||
//
|
||||
|
||||
pub fn tenants_path(&self) -> PathBuf {
|
||||
self.workdir.join(TENANTS_SEGMENT_NAME)
|
||||
self.workdir.join("tenants")
|
||||
}
|
||||
|
||||
pub fn tenant_path(&self, tenant_id: &TenantId) -> PathBuf {
|
||||
@@ -755,7 +741,6 @@ impl PageServerConf {
|
||||
},
|
||||
"ondemand_download_behavior_treat_error_as_warn" => builder.ondemand_download_behavior_treat_error_as_warn(parse_toml_bool(key, item)?),
|
||||
"background_task_maximum_delay" => builder.background_task_maximum_delay(parse_toml_duration(key, item)?),
|
||||
"control_plane_api" => builder.control_plane_api(parse_toml_string(key, item)?.parse().context("failed to parse control plane URL")?),
|
||||
_ => bail!("unrecognized pageserver option '{key}'"),
|
||||
}
|
||||
}
|
||||
@@ -924,7 +909,6 @@ impl PageServerConf {
|
||||
test_remote_failures: 0,
|
||||
ondemand_download_behavior_treat_error_as_warn: false,
|
||||
background_task_maximum_delay: Duration::ZERO,
|
||||
control_plane_api: None,
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -1148,7 +1132,6 @@ background_task_maximum_delay = '334 s'
|
||||
background_task_maximum_delay: humantime::parse_duration(
|
||||
defaults::DEFAULT_BACKGROUND_TASK_MAXIMUM_DELAY
|
||||
)?,
|
||||
control_plane_api: None
|
||||
},
|
||||
"Correct defaults should be used when no config values are provided"
|
||||
);
|
||||
@@ -1204,7 +1187,6 @@ background_task_maximum_delay = '334 s'
|
||||
test_remote_failures: 0,
|
||||
ondemand_download_behavior_treat_error_as_warn: false,
|
||||
background_task_maximum_delay: Duration::from_secs(334),
|
||||
control_plane_api: None
|
||||
},
|
||||
"Should be able to parse all basic config values correctly"
|
||||
);
|
||||
|
||||
@@ -1,119 +0,0 @@
|
||||
use std::collections::HashMap;
|
||||
|
||||
use hyper::StatusCode;
|
||||
use pageserver_api::control_api::{ReAttachRequest, ReAttachResponse};
|
||||
use tokio_util::sync::CancellationToken;
|
||||
use url::Url;
|
||||
use utils::{
|
||||
backoff,
|
||||
generation::Generation,
|
||||
id::{NodeId, TenantId},
|
||||
};
|
||||
|
||||
use crate::config::PageServerConf;
|
||||
|
||||
// Backoffs when control plane requests do not succeed: compromise between reducing load
|
||||
// on control plane, and retrying frequently when we are blocked on a control plane
|
||||
// response to make progress.
|
||||
const BACKOFF_INCREMENT: f64 = 0.1;
|
||||
const BACKOFF_MAX: f64 = 10.0;
|
||||
|
||||
/// The Pageserver's client for using the control plane API: this is a small subset
|
||||
/// of the overall control plane API, for dealing with generations (see docs/rfcs/025-generation-numbers.md)
|
||||
pub(crate) struct ControlPlaneClient {
|
||||
http_client: reqwest::Client,
|
||||
base_url: Url,
|
||||
node_id: NodeId,
|
||||
cancel: CancellationToken,
|
||||
}
|
||||
|
||||
impl ControlPlaneClient {
|
||||
/// A None return value indicates that the input `conf` object does not have control
|
||||
/// plane API enabled.
|
||||
pub(crate) fn new(conf: &'static PageServerConf, cancel: &CancellationToken) -> Option<Self> {
|
||||
let mut url = match conf.control_plane_api.as_ref() {
|
||||
Some(u) => u.clone(),
|
||||
None => return None,
|
||||
};
|
||||
|
||||
if let Ok(mut segs) = url.path_segments_mut() {
|
||||
// This ensures that `url` ends with a slash if it doesn't already.
|
||||
// That way, we can subsequently use join() to safely attach extra path elements.
|
||||
segs.pop_if_empty().push("");
|
||||
}
|
||||
|
||||
let client = reqwest::ClientBuilder::new()
|
||||
.build()
|
||||
.expect("Failed to construct http client");
|
||||
|
||||
Some(Self {
|
||||
http_client: client,
|
||||
base_url: url,
|
||||
node_id: conf.id,
|
||||
cancel: cancel.clone(),
|
||||
})
|
||||
}
|
||||
|
||||
async fn try_re_attach(
|
||||
&self,
|
||||
url: Url,
|
||||
request: &ReAttachRequest,
|
||||
) -> anyhow::Result<ReAttachResponse> {
|
||||
match self.http_client.post(url).json(request).send().await {
|
||||
Err(e) => Err(anyhow::Error::from(e)),
|
||||
Ok(r) => {
|
||||
if r.status() == StatusCode::OK {
|
||||
r.json::<ReAttachResponse>()
|
||||
.await
|
||||
.map_err(anyhow::Error::from)
|
||||
} else {
|
||||
Err(anyhow::anyhow!("Unexpected status {}", r.status()))
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Block until we get a successful response
|
||||
pub(crate) async fn re_attach(&self) -> anyhow::Result<HashMap<TenantId, Generation>> {
|
||||
let re_attach_path = self
|
||||
.base_url
|
||||
.join("re-attach")
|
||||
.expect("Failed to build re-attach path");
|
||||
let request = ReAttachRequest {
|
||||
node_id: self.node_id,
|
||||
};
|
||||
|
||||
let mut attempt = 0;
|
||||
loop {
|
||||
let result = self.try_re_attach(re_attach_path.clone(), &request).await;
|
||||
match result {
|
||||
Ok(res) => {
|
||||
tracing::info!(
|
||||
"Received re-attach response with {} tenants",
|
||||
res.tenants.len()
|
||||
);
|
||||
|
||||
return Ok(res
|
||||
.tenants
|
||||
.into_iter()
|
||||
.map(|t| (t.id, Generation::new(t.generation)))
|
||||
.collect::<HashMap<_, _>>());
|
||||
}
|
||||
Err(e) => {
|
||||
tracing::error!("Error re-attaching tenants, retrying: {e:#}");
|
||||
backoff::exponential_backoff(
|
||||
attempt,
|
||||
BACKOFF_INCREMENT,
|
||||
BACKOFF_MAX,
|
||||
&self.cancel,
|
||||
)
|
||||
.await;
|
||||
if self.cancel.is_cancelled() {
|
||||
return Err(anyhow::anyhow!("Shutting down"));
|
||||
}
|
||||
attempt += 1;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -383,6 +383,7 @@ paths:
|
||||
schema:
|
||||
type: string
|
||||
format: hex
|
||||
|
||||
post:
|
||||
description: |
|
||||
Schedules attach operation to happen in the background for the given tenant.
|
||||
@@ -1019,9 +1020,6 @@ components:
|
||||
properties:
|
||||
config:
|
||||
$ref: '#/components/schemas/TenantConfig'
|
||||
generation:
|
||||
type: integer
|
||||
description: Attachment generation number.
|
||||
TenantConfigRequest:
|
||||
allOf:
|
||||
- $ref: '#/components/schemas/TenantConfig'
|
||||
|
||||
@@ -8,9 +8,7 @@ use anyhow::{anyhow, Context, Result};
|
||||
use hyper::StatusCode;
|
||||
use hyper::{Body, Request, Response, Uri};
|
||||
use metrics::launch_timestamp::LaunchTimestamp;
|
||||
use pageserver_api::models::{
|
||||
DownloadRemoteLayersTaskSpawnRequest, TenantAttachRequest, TenantLoadRequest,
|
||||
};
|
||||
use pageserver_api::models::{DownloadRemoteLayersTaskSpawnRequest, TenantAttachRequest};
|
||||
use remote_storage::GenericRemoteStorage;
|
||||
use storage_broker::BrokerClientChannel;
|
||||
use tenant_size_model::{SizeResult, StorageModel};
|
||||
@@ -34,13 +32,11 @@ use crate::tenant::mgr::{
|
||||
};
|
||||
use crate::tenant::size::ModelInputs;
|
||||
use crate::tenant::storage_layer::LayerAccessStatsReset;
|
||||
use crate::tenant::timeline::Timeline;
|
||||
use crate::tenant::{LogicalSizeCalculationCause, PageReconstructError};
|
||||
use crate::tenant::{LogicalSizeCalculationCause, PageReconstructError, Timeline};
|
||||
use crate::{config::PageServerConf, tenant::mgr};
|
||||
use crate::{disk_usage_eviction_task, tenant};
|
||||
use utils::{
|
||||
auth::JwtAuth,
|
||||
generation::Generation,
|
||||
http::{
|
||||
endpoint::{self, attach_openapi_ui, auth_middleware, check_permission_with},
|
||||
error::{ApiError, HttpErrorBody},
|
||||
@@ -476,7 +472,7 @@ async fn tenant_attach_handler(
|
||||
check_permission(&request, Some(tenant_id))?;
|
||||
|
||||
let maybe_body: Option<TenantAttachRequest> = json_request_or_empty_body(&mut request).await?;
|
||||
let tenant_conf = match &maybe_body {
|
||||
let tenant_conf = match maybe_body {
|
||||
Some(request) => TenantConfOpt::try_from(&*request.config).map_err(ApiError::BadRequest)?,
|
||||
None => TenantConfOpt::default(),
|
||||
};
|
||||
@@ -487,13 +483,10 @@ async fn tenant_attach_handler(
|
||||
|
||||
let state = get_state(&request);
|
||||
|
||||
let generation = get_request_generation(state, maybe_body.as_ref().and_then(|r| r.generation))?;
|
||||
|
||||
if let Some(remote_storage) = &state.remote_storage {
|
||||
mgr::attach_tenant(
|
||||
state.conf,
|
||||
tenant_id,
|
||||
generation,
|
||||
tenant_conf,
|
||||
state.broker_client.clone(),
|
||||
remote_storage.clone(),
|
||||
@@ -545,7 +538,7 @@ async fn tenant_detach_handler(
|
||||
}
|
||||
|
||||
async fn tenant_load_handler(
|
||||
mut request: Request<Body>,
|
||||
request: Request<Body>,
|
||||
_cancel: CancellationToken,
|
||||
) -> Result<Response<Body>, ApiError> {
|
||||
let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?;
|
||||
@@ -553,18 +546,10 @@ async fn tenant_load_handler(
|
||||
|
||||
let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Warn);
|
||||
|
||||
let maybe_body: Option<TenantLoadRequest> = json_request_or_empty_body(&mut request).await?;
|
||||
|
||||
let state = get_state(&request);
|
||||
|
||||
// The /load request is only usable when control_plane_api is not set. Once it is set, callers
|
||||
// should always use /attach instead.
|
||||
let generation = get_request_generation(state, maybe_body.as_ref().and_then(|r| r.generation))?;
|
||||
|
||||
mgr::load_tenant(
|
||||
state.conf,
|
||||
tenant_id,
|
||||
generation,
|
||||
state.broker_client.clone(),
|
||||
state.remote_storage.clone(),
|
||||
&ctx,
|
||||
@@ -866,21 +851,6 @@ pub fn html_response(status: StatusCode, data: String) -> Result<Response<Body>,
|
||||
Ok(response)
|
||||
}
|
||||
|
||||
/// Helper for requests that may take a generation, which is mandatory
|
||||
/// when control_plane_api is set, but otherwise defaults to Generation::none()
|
||||
fn get_request_generation(state: &State, req_gen: Option<u32>) -> Result<Generation, ApiError> {
|
||||
if state.conf.control_plane_api.is_some() {
|
||||
req_gen
|
||||
.map(Generation::new)
|
||||
.ok_or(ApiError::BadRequest(anyhow!(
|
||||
"generation attribute missing"
|
||||
)))
|
||||
} else {
|
||||
// Legacy mode: all tenants operate with no generation
|
||||
Ok(Generation::none())
|
||||
}
|
||||
}
|
||||
|
||||
async fn tenant_create_handler(
|
||||
mut request: Request<Body>,
|
||||
_cancel: CancellationToken,
|
||||
@@ -897,17 +867,14 @@ async fn tenant_create_handler(
|
||||
let tenant_conf =
|
||||
TenantConfOpt::try_from(&request_data.config).map_err(ApiError::BadRequest)?;
|
||||
|
||||
let state = get_state(&request);
|
||||
|
||||
let generation = get_request_generation(state, request_data.generation)?;
|
||||
|
||||
let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Warn);
|
||||
|
||||
let state = get_state(&request);
|
||||
|
||||
let new_tenant = mgr::create_tenant(
|
||||
state.conf,
|
||||
tenant_conf,
|
||||
target_tenant_id,
|
||||
generation,
|
||||
state.broker_client.clone(),
|
||||
state.remote_storage.clone(),
|
||||
&ctx,
|
||||
|
||||
@@ -3,7 +3,6 @@ pub mod basebackup;
|
||||
pub mod config;
|
||||
pub mod consumption_metrics;
|
||||
pub mod context;
|
||||
mod control_plane_client;
|
||||
pub mod disk_usage_eviction_task;
|
||||
pub mod http;
|
||||
pub mod import_datadir;
|
||||
|
||||
@@ -32,6 +32,7 @@ use std::fmt::Debug;
|
||||
use std::fmt::Display;
|
||||
use std::fs;
|
||||
use std::fs::File;
|
||||
use std::fs::OpenOptions;
|
||||
use std::io;
|
||||
use std::ops::Bound::Included;
|
||||
use std::path::Path;
|
||||
@@ -113,6 +114,7 @@ pub mod block_io;
|
||||
pub mod disk_btree;
|
||||
pub(crate) mod ephemeral_file;
|
||||
pub mod layer_map;
|
||||
pub mod manifest;
|
||||
mod span;
|
||||
|
||||
pub mod metadata;
|
||||
@@ -141,9 +143,6 @@ pub use crate::tenant::metadata::save_metadata;
|
||||
// re-export for use in walreceiver
|
||||
pub use crate::tenant::timeline::WalReceiverInfo;
|
||||
|
||||
/// The "tenants" part of `tenants/<tenant>/timelines...`
|
||||
pub const TENANTS_SEGMENT_NAME: &str = "tenants";
|
||||
|
||||
/// Parts of the `.neon/tenants/<tenant_id>/timelines/<timeline_id>` directory prefix.
|
||||
pub const TIMELINES_SEGMENT_NAME: &str = "timelines";
|
||||
|
||||
@@ -195,7 +194,7 @@ pub struct Tenant {
|
||||
walredo_mgr: Arc<dyn WalRedoManager + Send + Sync>,
|
||||
|
||||
// provides access to timeline data sitting in the remote storage
|
||||
pub(crate) remote_storage: Option<GenericRemoteStorage>,
|
||||
remote_storage: Option<GenericRemoteStorage>,
|
||||
|
||||
/// Cached logical sizes updated updated on each [`Tenant::gather_size_inputs`].
|
||||
cached_logical_sizes: tokio::sync::Mutex<HashMap<(TimelineId, Lsn), u64>>,
|
||||
@@ -407,6 +406,7 @@ impl Tenant {
|
||||
remote_startup_data: Option<RemoteStartupData>,
|
||||
local_metadata: Option<TimelineMetadata>,
|
||||
ancestor: Option<Arc<Timeline>>,
|
||||
first_save: bool,
|
||||
init_order: Option<&InitializationOrder>,
|
||||
_ctx: &RequestContext,
|
||||
) -> anyhow::Result<()> {
|
||||
@@ -440,9 +440,15 @@ impl Tenant {
|
||||
|
||||
// Save the metadata file to local disk.
|
||||
if !picked_local {
|
||||
save_metadata(self.conf, &tenant_id, &timeline_id, up_to_date_metadata)
|
||||
.await
|
||||
.context("save_metadata")?;
|
||||
save_metadata(
|
||||
self.conf,
|
||||
&tenant_id,
|
||||
&timeline_id,
|
||||
up_to_date_metadata,
|
||||
first_save,
|
||||
)
|
||||
.await
|
||||
.context("save_metadata")?;
|
||||
}
|
||||
|
||||
let index_part = remote_startup_data.as_ref().map(|x| &x.index_part);
|
||||
@@ -827,6 +833,7 @@ impl Tenant {
|
||||
}),
|
||||
local_metadata,
|
||||
ancestor,
|
||||
true,
|
||||
None,
|
||||
ctx,
|
||||
)
|
||||
@@ -1379,6 +1386,7 @@ impl Tenant {
|
||||
remote_startup_data,
|
||||
Some(local_metadata),
|
||||
ancestor,
|
||||
false,
|
||||
init_order,
|
||||
ctx,
|
||||
)
|
||||
@@ -1512,15 +1520,6 @@ impl Tenant {
|
||||
tline.maybe_spawn_flush_loop();
|
||||
tline.freeze_and_flush().await.context("freeze_and_flush")?;
|
||||
|
||||
// Make sure the freeze_and_flush reaches remote storage.
|
||||
tline
|
||||
.remote_client
|
||||
.as_ref()
|
||||
.unwrap()
|
||||
.wait_completion()
|
||||
.await
|
||||
.unwrap();
|
||||
|
||||
let tl = uninit_tl.finish_creation()?;
|
||||
// The non-test code would call tl.activate() here.
|
||||
tl.set_state(TimelineState::Active);
|
||||
@@ -1697,6 +1696,65 @@ impl Tenant {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Flush all in-memory data to disk and remote storage, if any.
|
||||
///
|
||||
/// Used at graceful shutdown.
|
||||
async fn freeze_and_flush_on_shutdown(&self) {
|
||||
let mut js = tokio::task::JoinSet::new();
|
||||
|
||||
// execute on each timeline on the JoinSet, join after.
|
||||
let per_timeline = |timeline_id: TimelineId, timeline: Arc<Timeline>| {
|
||||
async move {
|
||||
debug_assert_current_span_has_tenant_and_timeline_id();
|
||||
|
||||
match timeline.freeze_and_flush().await {
|
||||
Ok(()) => {}
|
||||
Err(e) => {
|
||||
warn!("failed to freeze and flush: {e:#}");
|
||||
return;
|
||||
}
|
||||
}
|
||||
|
||||
let res = if let Some(client) = timeline.remote_client.as_ref() {
|
||||
// if we did not wait for completion here, it might be our shutdown process
|
||||
// didn't wait for remote uploads to complete at all, as new tasks can forever
|
||||
// be spawned.
|
||||
//
|
||||
// what is problematic is the shutting down of RemoteTimelineClient, because
|
||||
// obviously it does not make sense to stop while we wait for it, but what
|
||||
// about corner cases like s3 suddenly hanging up?
|
||||
client.wait_completion().await
|
||||
} else {
|
||||
Ok(())
|
||||
};
|
||||
|
||||
if let Err(e) = res {
|
||||
warn!("failed to await for frozen and flushed uploads: {e:#}");
|
||||
}
|
||||
}
|
||||
.instrument(tracing::info_span!("freeze_and_flush_on_shutdown", %timeline_id))
|
||||
};
|
||||
|
||||
{
|
||||
let timelines = self.timelines.lock().unwrap();
|
||||
timelines
|
||||
.iter()
|
||||
.map(|(id, tl)| (*id, Arc::clone(tl)))
|
||||
.for_each(|(timeline_id, timeline)| {
|
||||
js.spawn(per_timeline(timeline_id, timeline));
|
||||
})
|
||||
};
|
||||
|
||||
while let Some(res) = js.join_next().await {
|
||||
match res {
|
||||
Ok(()) => {}
|
||||
Err(je) if je.is_cancelled() => unreachable!("no cancelling used"),
|
||||
Err(je) if je.is_panic() => { /* logged already */ }
|
||||
Err(je) => warn!("unexpected JoinError: {je:?}"),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
pub fn current_state(&self) -> TenantState {
|
||||
self.state.borrow().clone()
|
||||
}
|
||||
@@ -1827,22 +1885,19 @@ impl Tenant {
|
||||
}
|
||||
};
|
||||
|
||||
let mut js = tokio::task::JoinSet::new();
|
||||
{
|
||||
let timelines = self.timelines.lock().unwrap();
|
||||
timelines.values().for_each(|timeline| {
|
||||
let timeline = Arc::clone(timeline);
|
||||
let span = Span::current();
|
||||
js.spawn(async move { timeline.shutdown(freeze_and_flush).instrument(span).await });
|
||||
})
|
||||
};
|
||||
while let Some(res) = js.join_next().await {
|
||||
match res {
|
||||
Ok(()) => {}
|
||||
Err(je) if je.is_cancelled() => unreachable!("no cancelling used"),
|
||||
Err(je) if je.is_panic() => { /* logged already */ }
|
||||
Err(je) => warn!("unexpected JoinError: {je:?}"),
|
||||
}
|
||||
if freeze_and_flush {
|
||||
// walreceiver has already began to shutdown with TenantState::Stopping, but we need to
|
||||
// await for them to stop.
|
||||
task_mgr::shutdown_tasks(
|
||||
Some(TaskKind::WalReceiverManager),
|
||||
Some(self.tenant_id),
|
||||
None,
|
||||
)
|
||||
.await;
|
||||
|
||||
// this will wait for uploads to complete; in the past, it was done outside tenant
|
||||
// shutdown in pageserver::shutdown_pageserver.
|
||||
self.freeze_and_flush_on_shutdown().await;
|
||||
}
|
||||
|
||||
// shutdown all tenant and timeline tasks: gc, compaction, page service
|
||||
@@ -2374,32 +2429,67 @@ impl Tenant {
|
||||
tenant_id: &TenantId,
|
||||
target_config_path: &Path,
|
||||
tenant_conf: TenantConfOpt,
|
||||
creating_tenant: bool,
|
||||
) -> anyhow::Result<()> {
|
||||
// imitate a try-block with a closure
|
||||
info!("persisting tenantconf to {}", target_config_path.display());
|
||||
let do_persist = |target_config_path: PathBuf| -> _ {
|
||||
async move {
|
||||
let target_config_parent = target_config_path.parent().with_context(|| {
|
||||
format!(
|
||||
"Config path does not have a parent: {}",
|
||||
target_config_path.display()
|
||||
)
|
||||
})?;
|
||||
|
||||
let mut conf_content = r#"# This file contains a specific per-tenant's config.
|
||||
info!("persisting tenantconf to {}", target_config_path.display());
|
||||
|
||||
let mut conf_content = r#"# This file contains a specific per-tenant's config.
|
||||
# It is read in case of pageserver restart.
|
||||
|
||||
[tenant_config]
|
||||
"#
|
||||
.to_string();
|
||||
.to_string();
|
||||
|
||||
// Convert the config to a toml file.
|
||||
conf_content += &toml_edit::ser::to_string(&tenant_conf)?;
|
||||
// Convert the config to a toml file.
|
||||
conf_content += &toml_edit::ser::to_string(&tenant_conf)?;
|
||||
|
||||
let conf_content = conf_content.as_bytes();
|
||||
let mut target_config_file = VirtualFile::open_with_options(
|
||||
&target_config_path,
|
||||
OpenOptions::new()
|
||||
.truncate(true) // This needed for overwriting with small config files
|
||||
.write(true)
|
||||
.create_new(creating_tenant)
|
||||
// when creating a new tenant, first_save will be true and `.create(true)` will be
|
||||
// ignored (per rust std docs).
|
||||
//
|
||||
// later when updating the config of created tenant, or persisting config for the
|
||||
// first time for attached tenant, the `.create(true)` is used.
|
||||
.create(true),
|
||||
)?;
|
||||
|
||||
let temp_path = path_with_suffix_extension(target_config_path, TEMP_FILE_SUFFIX);
|
||||
VirtualFile::crashsafe_overwrite(target_config_path, &temp_path, conf_content)
|
||||
.await
|
||||
.with_context(|| {
|
||||
format!(
|
||||
"write tenant {tenant_id} config to {}",
|
||||
target_config_path.display()
|
||||
)
|
||||
})?;
|
||||
Ok(())
|
||||
target_config_file
|
||||
.write_and_fsync(conf_content.as_bytes())
|
||||
.context("write tenant config toml bytes into file")?;
|
||||
|
||||
// fsync the parent directory to ensure the directory entry is durable.
|
||||
// before this was done conditionally on creating_tenant, but these management actions are rare
|
||||
// enough to just fsync it always.
|
||||
|
||||
crashsafe::fsync(target_config_parent)?;
|
||||
// XXX we're not fsyncing the parent dir, need to do that in case `creating_tenant`
|
||||
Result::<_, anyhow::Error>::Ok(())
|
||||
}
|
||||
};
|
||||
|
||||
let pb = target_config_path.to_owned();
|
||||
// this function is called from creating the tenant and updating the tenant config, which
|
||||
// would otherwise share this context, so keep it here in one place.
|
||||
do_persist(pb).await.with_context(|| {
|
||||
format!(
|
||||
"write tenant {tenant_id} config to {}",
|
||||
target_config_path.display()
|
||||
)
|
||||
})
|
||||
}
|
||||
|
||||
//
|
||||
@@ -2933,9 +3023,15 @@ impl Tenant {
|
||||
anyhow::bail!("failpoint after-timeline-uninit-mark-creation");
|
||||
});
|
||||
|
||||
save_metadata(self.conf, &self.tenant_id, new_timeline_id, new_metadata)
|
||||
.await
|
||||
.context("Failed to create timeline metadata")?;
|
||||
save_metadata(
|
||||
self.conf,
|
||||
&self.tenant_id,
|
||||
new_timeline_id,
|
||||
new_metadata,
|
||||
true,
|
||||
)
|
||||
.await
|
||||
.context("Failed to create timeline metadata")?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
@@ -3176,7 +3272,8 @@ async fn try_create_target_tenant_dir(
|
||||
)
|
||||
.with_context(|| format!("resolve tenant {tenant_id} temporary config path"))?;
|
||||
|
||||
Tenant::persist_tenant_config(tenant_id, &temporary_tenant_config_path, tenant_conf).await?;
|
||||
Tenant::persist_tenant_config(tenant_id, &temporary_tenant_config_path, tenant_conf, true)
|
||||
.await?;
|
||||
|
||||
crashsafe::create_dir(&temporary_tenant_timelines_dir).with_context(|| {
|
||||
format!(
|
||||
@@ -3381,8 +3478,6 @@ pub mod harness {
|
||||
pub tenant_conf: TenantConf,
|
||||
pub tenant_id: TenantId,
|
||||
pub generation: Generation,
|
||||
pub remote_storage: GenericRemoteStorage,
|
||||
pub remote_fs_dir: PathBuf,
|
||||
}
|
||||
|
||||
static LOG_HANDLE: OnceCell<()> = OnceCell::new();
|
||||
@@ -3420,39 +3515,29 @@ pub mod harness {
|
||||
fs::create_dir_all(conf.tenant_path(&tenant_id))?;
|
||||
fs::create_dir_all(conf.timelines_path(&tenant_id))?;
|
||||
|
||||
use remote_storage::{RemoteStorageConfig, RemoteStorageKind};
|
||||
let remote_fs_dir = conf.workdir.join("localfs");
|
||||
std::fs::create_dir_all(&remote_fs_dir).unwrap();
|
||||
let config = RemoteStorageConfig {
|
||||
// TODO: why not remote_storage::DEFAULT_REMOTE_STORAGE_MAX_CONCURRENT_SYNCS,
|
||||
max_concurrent_syncs: std::num::NonZeroUsize::new(2_000_000).unwrap(),
|
||||
// TODO: why not remote_storage::DEFAULT_REMOTE_STORAGE_MAX_SYNC_ERRORS,
|
||||
max_sync_errors: std::num::NonZeroU32::new(3_000_000).unwrap(),
|
||||
storage: RemoteStorageKind::LocalFs(remote_fs_dir.clone()),
|
||||
};
|
||||
let remote_storage = GenericRemoteStorage::from_config(&config).unwrap();
|
||||
|
||||
Ok(Self {
|
||||
conf,
|
||||
tenant_conf,
|
||||
tenant_id,
|
||||
generation: Generation::new(0xdeadbeef),
|
||||
remote_storage,
|
||||
remote_fs_dir,
|
||||
})
|
||||
}
|
||||
|
||||
pub async fn load(&self) -> (Arc<Tenant>, RequestContext) {
|
||||
let ctx = RequestContext::new(TaskKind::UnitTest, DownloadBehavior::Error);
|
||||
(
|
||||
self.try_load(&ctx)
|
||||
self.try_load(&ctx, None)
|
||||
.await
|
||||
.expect("failed to load test tenant"),
|
||||
ctx,
|
||||
)
|
||||
}
|
||||
|
||||
pub async fn try_load(&self, ctx: &RequestContext) -> anyhow::Result<Arc<Tenant>> {
|
||||
pub async fn try_load(
|
||||
&self,
|
||||
ctx: &RequestContext,
|
||||
remote_storage: Option<remote_storage::GenericRemoteStorage>,
|
||||
) -> anyhow::Result<Arc<Tenant>> {
|
||||
let walredo_mgr = Arc::new(TestRedoManager);
|
||||
|
||||
let tenant = Arc::new(Tenant::new(
|
||||
@@ -3462,7 +3547,7 @@ pub mod harness {
|
||||
walredo_mgr,
|
||||
self.tenant_id,
|
||||
self.generation,
|
||||
Some(self.remote_storage.clone()),
|
||||
remote_storage,
|
||||
));
|
||||
tenant
|
||||
.load(None, ctx)
|
||||
@@ -3933,13 +4018,6 @@ mod tests {
|
||||
.create_test_timeline(TIMELINE_ID, Lsn(0x7000), DEFAULT_PG_VERSION, &ctx)
|
||||
.await?;
|
||||
make_some_layers(tline.as_ref(), Lsn(0x8000)).await?;
|
||||
// so that all uploads finish & we can call harness.load() below again
|
||||
tenant
|
||||
.shutdown(Default::default(), true)
|
||||
.instrument(info_span!("test_shutdown", tenant_id=%tenant.tenant_id))
|
||||
.await
|
||||
.ok()
|
||||
.unwrap();
|
||||
}
|
||||
|
||||
let (tenant, _ctx) = harness.load().await;
|
||||
@@ -3973,14 +4051,6 @@ mod tests {
|
||||
.expect("Should have a local timeline");
|
||||
|
||||
make_some_layers(newtline.as_ref(), Lsn(0x60)).await?;
|
||||
|
||||
// so that all uploads finish & we can call harness.load() below again
|
||||
tenant
|
||||
.shutdown(Default::default(), true)
|
||||
.instrument(info_span!("test_shutdown", tenant_id=%tenant.tenant_id))
|
||||
.await
|
||||
.ok()
|
||||
.unwrap();
|
||||
}
|
||||
|
||||
// check that both of them are initially unloaded
|
||||
@@ -4033,13 +4103,6 @@ mod tests {
|
||||
.create_test_timeline(TIMELINE_ID, Lsn(0x10), DEFAULT_PG_VERSION, &ctx)
|
||||
.await?;
|
||||
drop(tline);
|
||||
// so that all uploads finish & we can call harness.try_load() below again
|
||||
tenant
|
||||
.shutdown(Default::default(), true)
|
||||
.instrument(info_span!("test_shutdown", tenant_id=%tenant.tenant_id))
|
||||
.await
|
||||
.ok()
|
||||
.unwrap();
|
||||
drop(tenant);
|
||||
|
||||
let metadata_path = harness.timeline_path(&TIMELINE_ID).join(METADATA_FILE_NAME);
|
||||
@@ -4051,7 +4114,11 @@ mod tests {
|
||||
metadata_bytes[8] ^= 1;
|
||||
std::fs::write(metadata_path, metadata_bytes)?;
|
||||
|
||||
let err = harness.try_load(&ctx).await.err().expect("should fail");
|
||||
let err = harness
|
||||
.try_load(&ctx, None)
|
||||
.await
|
||||
.err()
|
||||
.expect("should fail");
|
||||
// get all the stack with all .context, not only the last one
|
||||
let message = format!("{err:#}");
|
||||
let expected = "failed to load metadata";
|
||||
@@ -4507,11 +4574,6 @@ mod tests {
|
||||
.create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION, &ctx)
|
||||
.await?;
|
||||
// Keeps uninit mark in place
|
||||
let raw_tline = tline.raw_timeline().unwrap();
|
||||
raw_tline
|
||||
.shutdown(false)
|
||||
.instrument(info_span!("test_shutdown", tenant_id=%raw_tline.tenant_id))
|
||||
.await;
|
||||
std::mem::forget(tline);
|
||||
}
|
||||
|
||||
|
||||
@@ -13,7 +13,6 @@
|
||||
//!
|
||||
use crate::page_cache::PAGE_SZ;
|
||||
use crate::tenant::block_io::BlockCursor;
|
||||
use crate::virtual_file::VirtualFile;
|
||||
use std::cmp::min;
|
||||
use std::io::{Error, ErrorKind};
|
||||
|
||||
@@ -84,24 +83,29 @@ impl<'a> BlockCursor<'a> {
|
||||
}
|
||||
}
|
||||
|
||||
/// A wrapper of `VirtualFile` that allows users to write blobs.
|
||||
///
|
||||
/// If a `BlobWriter` is dropped, the internal buffer will be
|
||||
/// discarded. You need to call [`flush_buffer`](Self::flush_buffer)
|
||||
/// manually before dropping.
|
||||
pub struct BlobWriter<const BUFFERED: bool> {
|
||||
inner: VirtualFile,
|
||||
offset: u64,
|
||||
/// A buffer to save on write calls, only used if BUFFERED=true
|
||||
buf: Vec<u8>,
|
||||
/// Abstract trait for a data sink that you can write blobs to.
|
||||
///
|
||||
pub trait BlobWriter {
|
||||
/// Write a blob of data. Returns the offset that it was written to,
|
||||
/// which can be used to retrieve the data later.
|
||||
fn write_blob(&mut self, srcbuf: &[u8]) -> Result<u64, Error>;
|
||||
}
|
||||
|
||||
impl<const BUFFERED: bool> BlobWriter<BUFFERED> {
|
||||
pub fn new(inner: VirtualFile, start_offset: u64) -> Self {
|
||||
Self {
|
||||
///
|
||||
/// An implementation of BlobWriter to write blobs to anything that
|
||||
/// implements std::io::Write.
|
||||
///
|
||||
pub struct WriteBlobWriter<W> {
|
||||
inner: W,
|
||||
offset: u64,
|
||||
}
|
||||
|
||||
impl<W> WriteBlobWriter<W> {
|
||||
pub fn new(inner: W, start_offset: u64) -> Self {
|
||||
WriteBlobWriter {
|
||||
inner,
|
||||
offset: start_offset,
|
||||
buf: Vec::with_capacity(Self::CAPACITY),
|
||||
}
|
||||
}
|
||||
|
||||
@@ -109,79 +113,28 @@ impl<const BUFFERED: bool> BlobWriter<BUFFERED> {
|
||||
self.offset
|
||||
}
|
||||
|
||||
const CAPACITY: usize = if BUFFERED { PAGE_SZ } else { 0 };
|
||||
|
||||
#[inline(always)]
|
||||
/// Writes the given buffer directly to the underlying `VirtualFile`.
|
||||
/// You need to make sure that the internal buffer is empty, otherwise
|
||||
/// data will be written in wrong order.
|
||||
async fn write_all_unbuffered(&mut self, src_buf: &[u8]) -> Result<(), Error> {
|
||||
self.inner.write_all(src_buf).await?;
|
||||
self.offset += src_buf.len() as u64;
|
||||
Ok(())
|
||||
/// Access the underlying Write object.
|
||||
///
|
||||
/// NOTE: WriteBlobWriter keeps track of the current write offset. If
|
||||
/// you write something directly to the inner Write object, it makes the
|
||||
/// internally tracked 'offset' to go out of sync. So don't do that.
|
||||
pub fn into_inner(self) -> W {
|
||||
self.inner
|
||||
}
|
||||
}
|
||||
|
||||
#[inline(always)]
|
||||
/// Flushes the internal buffer to the underlying `VirtualFile`.
|
||||
pub async fn flush_buffer(&mut self) -> Result<(), Error> {
|
||||
self.inner.write_all(&self.buf).await?;
|
||||
self.buf.clear();
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[inline(always)]
|
||||
/// Writes as much of `src_buf` into the internal buffer as it fits
|
||||
fn write_into_buffer(&mut self, src_buf: &[u8]) -> usize {
|
||||
let remaining = Self::CAPACITY - self.buf.len();
|
||||
let to_copy = src_buf.len().min(remaining);
|
||||
self.buf.extend_from_slice(&src_buf[..to_copy]);
|
||||
self.offset += to_copy as u64;
|
||||
to_copy
|
||||
}
|
||||
|
||||
/// Internal, possibly buffered, write function
|
||||
async fn write_all(&mut self, mut src_buf: &[u8]) -> Result<(), Error> {
|
||||
if !BUFFERED {
|
||||
assert!(self.buf.is_empty());
|
||||
self.write_all_unbuffered(src_buf).await?;
|
||||
return Ok(());
|
||||
}
|
||||
let remaining = Self::CAPACITY - self.buf.len();
|
||||
// First try to copy as much as we can into the buffer
|
||||
if remaining > 0 {
|
||||
let copied = self.write_into_buffer(src_buf);
|
||||
src_buf = &src_buf[copied..];
|
||||
}
|
||||
// Then, if the buffer is full, flush it out
|
||||
if self.buf.len() == Self::CAPACITY {
|
||||
self.flush_buffer().await?;
|
||||
}
|
||||
// Finally, write the tail of src_buf:
|
||||
// If it wholly fits into the buffer without
|
||||
// completely filling it, then put it there.
|
||||
// If not, write it out directly.
|
||||
if !src_buf.is_empty() {
|
||||
assert_eq!(self.buf.len(), 0);
|
||||
if src_buf.len() < Self::CAPACITY {
|
||||
let copied = self.write_into_buffer(src_buf);
|
||||
// We just verified above that src_buf fits into our internal buffer.
|
||||
assert_eq!(copied, src_buf.len());
|
||||
} else {
|
||||
self.write_all_unbuffered(src_buf).await?;
|
||||
}
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Write a blob of data. Returns the offset that it was written to,
|
||||
/// which can be used to retrieve the data later.
|
||||
pub async fn write_blob(&mut self, srcbuf: &[u8]) -> Result<u64, Error> {
|
||||
impl<W> BlobWriter for WriteBlobWriter<W>
|
||||
where
|
||||
W: std::io::Write,
|
||||
{
|
||||
fn write_blob(&mut self, srcbuf: &[u8]) -> Result<u64, Error> {
|
||||
let offset = self.offset;
|
||||
|
||||
if srcbuf.len() < 128 {
|
||||
// Short blob. Write a 1-byte length header
|
||||
let len_buf = srcbuf.len() as u8;
|
||||
self.write_all(&[len_buf]).await?;
|
||||
self.inner.write_all(&[len_buf])?;
|
||||
self.offset += 1;
|
||||
} else {
|
||||
// Write a 4-byte length header
|
||||
if srcbuf.len() > 0x7fff_ffff {
|
||||
@@ -192,153 +145,11 @@ impl<const BUFFERED: bool> BlobWriter<BUFFERED> {
|
||||
}
|
||||
let mut len_buf = ((srcbuf.len()) as u32).to_be_bytes();
|
||||
len_buf[0] |= 0x80;
|
||||
self.write_all(&len_buf).await?;
|
||||
self.inner.write_all(&len_buf)?;
|
||||
self.offset += 4;
|
||||
}
|
||||
self.write_all(srcbuf).await?;
|
||||
self.inner.write_all(srcbuf)?;
|
||||
self.offset += srcbuf.len() as u64;
|
||||
Ok(offset)
|
||||
}
|
||||
}
|
||||
|
||||
impl BlobWriter<true> {
|
||||
/// Access the underlying `VirtualFile`.
|
||||
///
|
||||
/// This function flushes the internal buffer before giving access
|
||||
/// to the underlying `VirtualFile`.
|
||||
pub async fn into_inner(mut self) -> Result<VirtualFile, Error> {
|
||||
self.flush_buffer().await?;
|
||||
Ok(self.inner)
|
||||
}
|
||||
|
||||
/// Access the underlying `VirtualFile`.
|
||||
///
|
||||
/// Unlike [`into_inner`](Self::into_inner), this doesn't flush
|
||||
/// the internal buffer before giving access.
|
||||
pub fn into_inner_no_flush(self) -> VirtualFile {
|
||||
self.inner
|
||||
}
|
||||
}
|
||||
|
||||
impl BlobWriter<false> {
|
||||
/// Access the underlying `VirtualFile`.
|
||||
pub fn into_inner(self) -> VirtualFile {
|
||||
self.inner
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
use crate::tenant::block_io::BlockReaderRef;
|
||||
use rand::{Rng, SeedableRng};
|
||||
|
||||
async fn round_trip_test<const BUFFERED: bool>(blobs: &[Vec<u8>]) -> Result<(), Error> {
|
||||
let temp_dir = tempfile::tempdir()?;
|
||||
let path = temp_dir.path().join("file");
|
||||
|
||||
// Write part (in block to drop the file)
|
||||
let mut offsets = Vec::new();
|
||||
{
|
||||
let file = VirtualFile::create(&path).await?;
|
||||
let mut wtr = BlobWriter::<BUFFERED>::new(file, 0);
|
||||
for blob in blobs.iter() {
|
||||
let offs = wtr.write_blob(blob).await?;
|
||||
offsets.push(offs);
|
||||
}
|
||||
// Write out one page worth of zeros so that we can
|
||||
// read again with read_blk
|
||||
let offs = wtr.write_blob(&vec![0; PAGE_SZ]).await?;
|
||||
println!("Writing final blob at offs={offs}");
|
||||
wtr.flush_buffer().await?;
|
||||
}
|
||||
|
||||
let file = VirtualFile::open(&path).await?;
|
||||
let rdr = BlockReaderRef::VirtualFile(&file);
|
||||
let rdr = BlockCursor::new(rdr);
|
||||
for (idx, (blob, offset)) in blobs.iter().zip(offsets.iter()).enumerate() {
|
||||
let blob_read = rdr.read_blob(*offset).await?;
|
||||
assert_eq!(
|
||||
blob, &blob_read,
|
||||
"mismatch for idx={idx} at offset={offset}"
|
||||
);
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn random_array(len: usize) -> Vec<u8> {
|
||||
let mut rng = rand::thread_rng();
|
||||
(0..len).map(|_| rng.gen()).collect::<_>()
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_one() -> Result<(), Error> {
|
||||
let blobs = &[vec![12, 21, 22]];
|
||||
round_trip_test::<false>(blobs).await?;
|
||||
round_trip_test::<true>(blobs).await?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_hello_simple() -> Result<(), Error> {
|
||||
let blobs = &[
|
||||
vec![0, 1, 2, 3],
|
||||
b"Hello, World!".to_vec(),
|
||||
Vec::new(),
|
||||
b"foobar".to_vec(),
|
||||
];
|
||||
round_trip_test::<false>(blobs).await?;
|
||||
round_trip_test::<true>(blobs).await?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_really_big_array() -> Result<(), Error> {
|
||||
let blobs = &[
|
||||
b"test".to_vec(),
|
||||
random_array(10 * PAGE_SZ),
|
||||
b"foobar".to_vec(),
|
||||
];
|
||||
round_trip_test::<false>(blobs).await?;
|
||||
round_trip_test::<true>(blobs).await?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_arrays_inc() -> Result<(), Error> {
|
||||
let blobs = (0..PAGE_SZ / 8)
|
||||
.map(|v| random_array(v * 16))
|
||||
.collect::<Vec<_>>();
|
||||
round_trip_test::<false>(&blobs).await?;
|
||||
round_trip_test::<true>(&blobs).await?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_arrays_random_size() -> Result<(), Error> {
|
||||
let mut rng = rand::rngs::StdRng::seed_from_u64(42);
|
||||
let blobs = (0..1024)
|
||||
.map(|_| {
|
||||
let mut sz: u16 = rng.gen();
|
||||
// Make 50% of the arrays small
|
||||
if rng.gen() {
|
||||
sz |= 63;
|
||||
}
|
||||
random_array(sz.into())
|
||||
})
|
||||
.collect::<Vec<_>>();
|
||||
round_trip_test::<false>(&blobs).await?;
|
||||
round_trip_test::<true>(&blobs).await?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_arrays_page_boundary() -> Result<(), Error> {
|
||||
let blobs = &[
|
||||
random_array(PAGE_SZ - 4),
|
||||
random_array(PAGE_SZ - 4),
|
||||
random_array(PAGE_SZ - 4),
|
||||
];
|
||||
round_trip_test::<false>(blobs).await?;
|
||||
round_trip_test::<true>(blobs).await?;
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
@@ -71,13 +71,11 @@ impl<'a> Deref for BlockLease<'a> {
|
||||
///
|
||||
/// Unlike traits, we also support the read function to be async though.
|
||||
pub(crate) enum BlockReaderRef<'a> {
|
||||
FileBlockReaderVirtual(&'a FileBlockReader),
|
||||
FileBlockReaderVirtual(&'a FileBlockReader<VirtualFile>),
|
||||
EphemeralFile(&'a EphemeralFile),
|
||||
Adapter(Adapter<&'a DeltaLayerInner>),
|
||||
#[cfg(test)]
|
||||
TestDisk(&'a super::disk_btree::tests::TestDisk),
|
||||
#[cfg(test)]
|
||||
VirtualFile(&'a VirtualFile),
|
||||
}
|
||||
|
||||
impl<'a> BlockReaderRef<'a> {
|
||||
@@ -90,8 +88,6 @@ impl<'a> BlockReaderRef<'a> {
|
||||
Adapter(r) => r.read_blk(blknum).await,
|
||||
#[cfg(test)]
|
||||
TestDisk(r) => r.read_blk(blknum),
|
||||
#[cfg(test)]
|
||||
VirtualFile(r) => r.read_blk(blknum).await,
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -105,7 +101,7 @@ impl<'a> BlockReaderRef<'a> {
|
||||
///
|
||||
/// ```no_run
|
||||
/// # use pageserver::tenant::block_io::{BlockReader, FileBlockReader};
|
||||
/// # let reader: FileBlockReader = unimplemented!("stub");
|
||||
/// # let reader: FileBlockReader<std::fs::File> = unimplemented!("stub");
|
||||
/// let cursor = reader.block_cursor();
|
||||
/// let buf = cursor.read_blk(1);
|
||||
/// // do stuff with 'buf'
|
||||
@@ -122,7 +118,7 @@ impl<'a> BlockCursor<'a> {
|
||||
BlockCursor { reader }
|
||||
}
|
||||
// Needed by cli
|
||||
pub fn new_fileblockreader(reader: &'a FileBlockReader) -> Self {
|
||||
pub fn new_fileblockreader_virtual(reader: &'a FileBlockReader<VirtualFile>) -> Self {
|
||||
BlockCursor {
|
||||
reader: BlockReaderRef::FileBlockReaderVirtual(reader),
|
||||
}
|
||||
@@ -143,14 +139,14 @@ impl<'a> BlockCursor<'a> {
|
||||
///
|
||||
/// The file is assumed to be immutable. This doesn't provide any functions
|
||||
/// for modifying the file, nor for invalidating the cache if it is modified.
|
||||
pub struct FileBlockReader {
|
||||
pub file: VirtualFile,
|
||||
pub struct FileBlockReader<F> {
|
||||
pub file: F,
|
||||
|
||||
/// Unique ID of this file, used as key in the page cache.
|
||||
file_id: page_cache::FileId,
|
||||
}
|
||||
|
||||
impl FileBlockReader {
|
||||
impl FileBlockReader<VirtualFile> {
|
||||
pub fn new(file: VirtualFile) -> Self {
|
||||
let file_id = page_cache::next_file_id();
|
||||
|
||||
@@ -195,7 +191,7 @@ impl FileBlockReader {
|
||||
}
|
||||
}
|
||||
|
||||
impl BlockReader for FileBlockReader {
|
||||
impl BlockReader for FileBlockReader<VirtualFile> {
|
||||
fn block_cursor(&self) -> BlockCursor<'_> {
|
||||
BlockCursor::new(BlockReaderRef::FileBlockReaderVirtual(self))
|
||||
}
|
||||
|
||||
@@ -28,7 +28,7 @@ pub struct EphemeralFile {
|
||||
}
|
||||
|
||||
impl EphemeralFile {
|
||||
pub async fn create(
|
||||
pub fn create(
|
||||
conf: &PageServerConf,
|
||||
tenant_id: TenantId,
|
||||
timeline_id: TimelineId,
|
||||
@@ -44,8 +44,7 @@ impl EphemeralFile {
|
||||
let file = VirtualFile::open_with_options(
|
||||
&filename,
|
||||
OpenOptions::new().read(true).write(true).create(true),
|
||||
)
|
||||
.await?;
|
||||
)?;
|
||||
|
||||
Ok(EphemeralFile {
|
||||
page_cache_file_id: page_cache::next_file_id(),
|
||||
@@ -287,7 +286,7 @@ mod tests {
|
||||
async fn test_ephemeral_blobs() -> Result<(), io::Error> {
|
||||
let (conf, tenant_id, timeline_id) = harness("ephemeral_blobs")?;
|
||||
|
||||
let mut file = EphemeralFile::create(conf, tenant_id, timeline_id).await?;
|
||||
let mut file = EphemeralFile::create(conf, tenant_id, timeline_id)?;
|
||||
|
||||
let pos_foo = file.write_blob(b"foo").await?;
|
||||
assert_eq!(
|
||||
|
||||
327
pageserver/src/tenant/manifest.rs
Normal file
327
pageserver/src/tenant/manifest.rs
Normal file
@@ -0,0 +1,327 @@
|
||||
//! This module contains the encoding and decoding of the local manifest file.
|
||||
//!
|
||||
//! MANIFEST is a write-ahead log which is stored locally to each timeline. It
|
||||
//! records the state of the storage engine. It contains a snapshot of the
|
||||
//! state and all operations proceeding that snapshot. The file begins with a
|
||||
//! header recording MANIFEST version number. After that, it contains a snapshot.
|
||||
//! The snapshot is followed by a list of operations. Each operation is a list
|
||||
//! of records. Each record is either an addition or a removal of a layer.
|
||||
//!
|
||||
//! With MANIFEST, we can:
|
||||
//!
|
||||
//! 1. recover state quickly by reading the file, potentially boosting the
|
||||
//! startup speed.
|
||||
//! 2. ensure all operations are atomic and avoid corruption, solving issues
|
||||
//! like redundant image layer and preparing us for future compaction
|
||||
//! strategies.
|
||||
//!
|
||||
//! There is also a format for storing all layer files on S3, called
|
||||
//! `index_part.json`. Compared with index_part, MANIFEST is an WAL which
|
||||
//! records all operations as logs, and therefore we can easily replay the
|
||||
//! operations when recovering from crash, while ensuring those operations
|
||||
//! are atomic upon restart.
|
||||
//!
|
||||
//! Currently, this is not used in the system. Future refactors will ensure
|
||||
//! the storage state will be recorded in this file, and the system can be
|
||||
//! recovered from this file. This is tracked in
|
||||
//! <https://github.com/neondatabase/neon/issues/4418>
|
||||
|
||||
use std::io::{self, Write};
|
||||
|
||||
use crate::virtual_file::VirtualFile;
|
||||
use anyhow::Result;
|
||||
use bytes::{Buf, BufMut, Bytes, BytesMut};
|
||||
use crc32c::crc32c;
|
||||
use serde::{Deserialize, Serialize};
|
||||
use tracing::log::warn;
|
||||
use utils::lsn::Lsn;
|
||||
|
||||
use super::storage_layer::PersistentLayerDesc;
|
||||
|
||||
pub struct Manifest {
|
||||
file: VirtualFile,
|
||||
}
|
||||
|
||||
#[derive(Clone, Serialize, Deserialize, PartialEq, Eq, Debug)]
|
||||
pub struct Snapshot {
|
||||
pub layers: Vec<PersistentLayerDesc>,
|
||||
}
|
||||
|
||||
/// serde by default encode this in tagged enum, and therefore it will be something
|
||||
/// like `{ "AddLayer": { ... } }`.
|
||||
#[derive(Clone, Serialize, Deserialize, PartialEq, Eq, Debug)]
|
||||
pub enum Record {
|
||||
AddLayer(PersistentLayerDesc),
|
||||
RemoveLayer(PersistentLayerDesc),
|
||||
}
|
||||
|
||||
/// `echo neon.manifest | sha1sum` and take the leading 8 bytes.
|
||||
const MANIFEST_MAGIC_NUMBER: u64 = 0xf5c44592b806109c;
|
||||
const MANIFEST_VERSION: u64 = 1;
|
||||
|
||||
#[derive(Clone, Serialize, Deserialize, PartialEq, Eq, Debug)]
|
||||
pub struct ManifestHeader {
|
||||
magic_number: u64,
|
||||
version: u64,
|
||||
}
|
||||
|
||||
const MANIFEST_HEADER_LEN: usize = 16;
|
||||
|
||||
impl ManifestHeader {
|
||||
fn encode(&self) -> BytesMut {
|
||||
let mut buf = BytesMut::with_capacity(MANIFEST_HEADER_LEN);
|
||||
buf.put_u64(self.magic_number);
|
||||
buf.put_u64(self.version);
|
||||
buf
|
||||
}
|
||||
|
||||
fn decode(mut buf: &[u8]) -> Self {
|
||||
assert!(buf.len() == MANIFEST_HEADER_LEN, "invalid header");
|
||||
Self {
|
||||
magic_number: buf.get_u64(),
|
||||
version: buf.get_u64(),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Clone, Serialize, Deserialize, PartialEq, Eq, Debug)]
|
||||
pub enum Operation {
|
||||
/// A snapshot of the current state.
|
||||
///
|
||||
/// Lsn field represents the LSN that is persisted to disk for this snapshot.
|
||||
Snapshot(Snapshot, Lsn),
|
||||
/// An atomic operation that changes the state.
|
||||
///
|
||||
/// Lsn field represents the LSN that is persisted to disk after the operation is done.
|
||||
/// This will only change when new L0 is flushed to the disk.
|
||||
Operation(Vec<Record>, Lsn),
|
||||
}
|
||||
|
||||
struct RecordHeader {
|
||||
size: u32,
|
||||
checksum: u32,
|
||||
}
|
||||
|
||||
const RECORD_HEADER_LEN: usize = 8;
|
||||
|
||||
impl RecordHeader {
|
||||
fn encode(&self) -> BytesMut {
|
||||
let mut buf = BytesMut::with_capacity(RECORD_HEADER_LEN);
|
||||
buf.put_u32(self.size);
|
||||
buf.put_u32(self.checksum);
|
||||
buf
|
||||
}
|
||||
|
||||
fn decode(mut buf: &[u8]) -> Self {
|
||||
assert!(buf.len() == RECORD_HEADER_LEN, "invalid header");
|
||||
Self {
|
||||
size: buf.get_u32(),
|
||||
checksum: buf.get_u32(),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Debug, thiserror::Error)]
|
||||
pub enum ManifestLoadError {
|
||||
#[error("manifest header is corrupted")]
|
||||
CorruptedManifestHeader,
|
||||
#[error("unsupported manifest version: got {0}, expected {1}")]
|
||||
UnsupportedVersion(u64, u64),
|
||||
#[error("error when decoding record: {0}")]
|
||||
DecodeRecord(serde_json::Error),
|
||||
#[error("I/O error: {0}")]
|
||||
Io(io::Error),
|
||||
}
|
||||
|
||||
#[must_use = "Should check if the manifest is partially corrupted"]
|
||||
pub struct ManifestPartiallyCorrupted(bool);
|
||||
|
||||
impl Manifest {
|
||||
/// Create a new manifest by writing the manifest header and a snapshot record to the given file.
|
||||
pub fn init(file: VirtualFile, snapshot: Snapshot, lsn: Lsn) -> Result<Self> {
|
||||
let mut manifest = Self { file };
|
||||
manifest.append_manifest_header(ManifestHeader {
|
||||
magic_number: MANIFEST_MAGIC_NUMBER,
|
||||
version: MANIFEST_VERSION,
|
||||
})?;
|
||||
manifest.append_operation(Operation::Snapshot(snapshot, lsn))?;
|
||||
Ok(manifest)
|
||||
}
|
||||
|
||||
/// Load a manifest. Returns the manifest and a list of operations. If the manifest is corrupted,
|
||||
/// the bool flag will be set to true and the user is responsible to reconstruct a new manifest and
|
||||
/// backup the current one.
|
||||
pub async fn load(
|
||||
file: VirtualFile,
|
||||
) -> Result<(Self, Vec<Operation>, ManifestPartiallyCorrupted), ManifestLoadError> {
|
||||
let mut buf = vec![];
|
||||
file.read_exact_at(&mut buf, 0)
|
||||
.await
|
||||
.map_err(ManifestLoadError::Io)?;
|
||||
|
||||
// Read manifest header
|
||||
let mut buf = Bytes::from(buf);
|
||||
if buf.remaining() < MANIFEST_HEADER_LEN {
|
||||
return Err(ManifestLoadError::CorruptedManifestHeader);
|
||||
}
|
||||
let header = ManifestHeader::decode(&buf[..MANIFEST_HEADER_LEN]);
|
||||
buf.advance(MANIFEST_HEADER_LEN);
|
||||
if header.version != MANIFEST_VERSION {
|
||||
return Err(ManifestLoadError::UnsupportedVersion(
|
||||
header.version,
|
||||
MANIFEST_VERSION,
|
||||
));
|
||||
}
|
||||
|
||||
// Read operations
|
||||
let mut operations = Vec::new();
|
||||
let corrupted = loop {
|
||||
if buf.remaining() == 0 {
|
||||
break false;
|
||||
}
|
||||
if buf.remaining() < RECORD_HEADER_LEN {
|
||||
warn!("incomplete header when decoding manifest, could be corrupted");
|
||||
break true;
|
||||
}
|
||||
let RecordHeader { size, checksum } = RecordHeader::decode(&buf[..RECORD_HEADER_LEN]);
|
||||
let size = size as usize;
|
||||
buf.advance(RECORD_HEADER_LEN);
|
||||
if buf.remaining() < size {
|
||||
warn!("incomplete data when decoding manifest, could be corrupted");
|
||||
break true;
|
||||
}
|
||||
let data = &buf[..size];
|
||||
if crc32c(data) != checksum {
|
||||
warn!("checksum mismatch when decoding manifest, could be corrupted");
|
||||
break true;
|
||||
}
|
||||
// if the following decode fails, we cannot use the manifest or safely ignore any record.
|
||||
operations.push(serde_json::from_slice(data).map_err(ManifestLoadError::DecodeRecord)?);
|
||||
buf.advance(size);
|
||||
};
|
||||
Ok((
|
||||
Self { file },
|
||||
operations,
|
||||
ManifestPartiallyCorrupted(corrupted),
|
||||
))
|
||||
}
|
||||
|
||||
fn append_data(&mut self, data: &[u8]) -> Result<()> {
|
||||
if data.len() >= u32::MAX as usize {
|
||||
panic!("data too large");
|
||||
}
|
||||
let header = RecordHeader {
|
||||
size: data.len() as u32,
|
||||
checksum: crc32c(data),
|
||||
};
|
||||
let header = header.encode();
|
||||
self.file.write_all(&header)?;
|
||||
self.file.write_all(data)?;
|
||||
self.file.sync_all()?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn append_manifest_header(&mut self, header: ManifestHeader) -> Result<()> {
|
||||
let encoded = header.encode();
|
||||
self.file.write_all(&encoded)?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Add an operation to the manifest. The operation will be appended to the end of the file,
|
||||
/// and the file will fsync.
|
||||
pub fn append_operation(&mut self, operation: Operation) -> Result<()> {
|
||||
let encoded = Vec::from(serde_json::to_string(&operation)?);
|
||||
self.append_data(&encoded)
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use std::fs::OpenOptions;
|
||||
|
||||
use crate::repository::Key;
|
||||
|
||||
use super::*;
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_read_manifest() {
|
||||
let testdir = crate::config::PageServerConf::test_repo_dir("test_read_manifest");
|
||||
std::fs::create_dir_all(&testdir).unwrap();
|
||||
let file = VirtualFile::create(&testdir.join("MANIFEST")).unwrap();
|
||||
let layer1 = PersistentLayerDesc::new_test(Key::from_i128(0)..Key::from_i128(233));
|
||||
let layer2 = PersistentLayerDesc::new_test(Key::from_i128(233)..Key::from_i128(2333));
|
||||
let layer3 = PersistentLayerDesc::new_test(Key::from_i128(2333)..Key::from_i128(23333));
|
||||
let layer4 = PersistentLayerDesc::new_test(Key::from_i128(23333)..Key::from_i128(233333));
|
||||
|
||||
// Write a manifest with a snapshot and some operations
|
||||
let snapshot = Snapshot {
|
||||
layers: vec![layer1, layer2],
|
||||
};
|
||||
let mut manifest = Manifest::init(file, snapshot.clone(), Lsn::from(0)).unwrap();
|
||||
manifest
|
||||
.append_operation(Operation::Operation(
|
||||
vec![Record::AddLayer(layer3.clone())],
|
||||
Lsn::from(1),
|
||||
))
|
||||
.unwrap();
|
||||
drop(manifest);
|
||||
|
||||
// Open the second time and write
|
||||
let file = VirtualFile::open_with_options(
|
||||
&testdir.join("MANIFEST"),
|
||||
OpenOptions::new()
|
||||
.read(true)
|
||||
.write(true)
|
||||
.create_new(false)
|
||||
.truncate(false),
|
||||
)
|
||||
.unwrap();
|
||||
let (mut manifest, operations, corrupted) = Manifest::load(file).await.unwrap();
|
||||
assert!(!corrupted.0);
|
||||
assert_eq!(operations.len(), 2);
|
||||
assert_eq!(
|
||||
&operations[0],
|
||||
&Operation::Snapshot(snapshot.clone(), Lsn::from(0))
|
||||
);
|
||||
assert_eq!(
|
||||
&operations[1],
|
||||
&Operation::Operation(vec![Record::AddLayer(layer3.clone())], Lsn::from(1))
|
||||
);
|
||||
manifest
|
||||
.append_operation(Operation::Operation(
|
||||
vec![
|
||||
Record::RemoveLayer(layer3.clone()),
|
||||
Record::AddLayer(layer4.clone()),
|
||||
],
|
||||
Lsn::from(2),
|
||||
))
|
||||
.unwrap();
|
||||
drop(manifest);
|
||||
|
||||
// Open the third time and verify
|
||||
let file = VirtualFile::open_with_options(
|
||||
&testdir.join("MANIFEST"),
|
||||
OpenOptions::new()
|
||||
.read(true)
|
||||
.write(true)
|
||||
.create_new(false)
|
||||
.truncate(false),
|
||||
)
|
||||
.unwrap();
|
||||
let (_manifest, operations, corrupted) = Manifest::load(file).await.unwrap();
|
||||
assert!(!corrupted.0);
|
||||
assert_eq!(operations.len(), 3);
|
||||
assert_eq!(&operations[0], &Operation::Snapshot(snapshot, Lsn::from(0)));
|
||||
assert_eq!(
|
||||
&operations[1],
|
||||
&Operation::Operation(vec![Record::AddLayer(layer3.clone())], Lsn::from(1))
|
||||
);
|
||||
assert_eq!(
|
||||
&operations[2],
|
||||
&Operation::Operation(
|
||||
vec![Record::RemoveLayer(layer3), Record::AddLayer(layer4)],
|
||||
Lsn::from(2)
|
||||
)
|
||||
);
|
||||
}
|
||||
}
|
||||
@@ -8,13 +8,14 @@
|
||||
//!
|
||||
//! [`remote_timeline_client`]: super::remote_timeline_client
|
||||
|
||||
use std::io::{self};
|
||||
use std::fs::{File, OpenOptions};
|
||||
use std::io;
|
||||
|
||||
use anyhow::{ensure, Context};
|
||||
use serde::{de::Error, Deserialize, Serialize, Serializer};
|
||||
use thiserror::Error;
|
||||
use tracing::info_span;
|
||||
use utils::bin_ser::SerializeError;
|
||||
use utils::crashsafe::path_with_suffix_extension;
|
||||
use utils::{
|
||||
bin_ser::BeSer,
|
||||
id::{TenantId, TimelineId},
|
||||
@@ -23,7 +24,6 @@ use utils::{
|
||||
|
||||
use crate::config::PageServerConf;
|
||||
use crate::virtual_file::VirtualFile;
|
||||
use crate::TEMP_FILE_SUFFIX;
|
||||
|
||||
/// Use special format number to enable backward compatibility.
|
||||
const METADATA_FORMAT_VERSION: u16 = 4;
|
||||
@@ -230,23 +230,6 @@ impl TimelineMetadata {
|
||||
pub fn pg_version(&self) -> u32 {
|
||||
self.body.pg_version
|
||||
}
|
||||
|
||||
// Checksums make it awkward to build a valid instance by hand. This helper
|
||||
// provides a TimelineMetadata with a valid checksum in its header.
|
||||
#[cfg(test)]
|
||||
pub fn example() -> Self {
|
||||
let instance = Self::new(
|
||||
"0/16960E8".parse::<Lsn>().unwrap(),
|
||||
None,
|
||||
None,
|
||||
Lsn::from_hex("00000000").unwrap(),
|
||||
Lsn::from_hex("00000000").unwrap(),
|
||||
Lsn::from_hex("00000000").unwrap(),
|
||||
0,
|
||||
);
|
||||
let bytes = instance.to_bytes().unwrap();
|
||||
Self::from_bytes(&bytes).unwrap()
|
||||
}
|
||||
}
|
||||
|
||||
impl<'de> Deserialize<'de> for TimelineMetadata {
|
||||
@@ -272,19 +255,35 @@ impl Serialize for TimelineMetadata {
|
||||
}
|
||||
|
||||
/// Save timeline metadata to file
|
||||
#[tracing::instrument(skip_all, fields(%tenant_id, %timeline_id))]
|
||||
pub async fn save_metadata(
|
||||
conf: &'static PageServerConf,
|
||||
tenant_id: &TenantId,
|
||||
timeline_id: &TimelineId,
|
||||
data: &TimelineMetadata,
|
||||
first_save: bool,
|
||||
) -> anyhow::Result<()> {
|
||||
let _enter = info_span!("saving metadata").entered();
|
||||
let path = conf.metadata_path(tenant_id, timeline_id);
|
||||
let temp_path = path_with_suffix_extension(&path, TEMP_FILE_SUFFIX);
|
||||
let metadata_bytes = data.to_bytes().context("serialize metadata")?;
|
||||
VirtualFile::crashsafe_overwrite(&path, &temp_path, &metadata_bytes)
|
||||
.await
|
||||
.context("write metadata")?;
|
||||
// use OpenOptions to ensure file presence is consistent with first_save
|
||||
let mut file = VirtualFile::open_with_options(
|
||||
&path,
|
||||
OpenOptions::new().write(true).create_new(first_save),
|
||||
)
|
||||
.context("open_with_options")?;
|
||||
|
||||
let metadata_bytes = data.to_bytes().context("Failed to get metadata bytes")?;
|
||||
|
||||
file.write_and_fsync(&metadata_bytes)?;
|
||||
|
||||
// fsync the parent directory to ensure the directory entry is durable
|
||||
if first_save {
|
||||
let timeline_dir = File::open(
|
||||
path.parent()
|
||||
.expect("Metadata should always have a parent dir"),
|
||||
)?;
|
||||
timeline_dir.sync_all()?;
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
|
||||
@@ -11,7 +11,6 @@ use anyhow::Context;
|
||||
use once_cell::sync::Lazy;
|
||||
use tokio::sync::RwLock;
|
||||
use tokio::task::JoinSet;
|
||||
use tokio_util::sync::CancellationToken;
|
||||
use tracing::*;
|
||||
|
||||
use remote_storage::GenericRemoteStorage;
|
||||
@@ -19,14 +18,12 @@ use utils::crashsafe;
|
||||
|
||||
use crate::config::PageServerConf;
|
||||
use crate::context::{DownloadBehavior, RequestContext};
|
||||
use crate::control_plane_client::ControlPlaneClient;
|
||||
use crate::task_mgr::{self, TaskKind};
|
||||
use crate::tenant::config::TenantConfOpt;
|
||||
use crate::tenant::delete::DeleteTenantFlow;
|
||||
use crate::tenant::{create_tenant_files, CreateTenantFilesMode, Tenant, TenantState};
|
||||
use crate::{InitializationOrder, IGNORED_TENANT_FILE_NAME, TEMP_FILE_SUFFIX};
|
||||
use crate::{InitializationOrder, IGNORED_TENANT_FILE_NAME};
|
||||
|
||||
use utils::crashsafe::path_with_suffix_extension;
|
||||
use utils::fs_ext::PathExt;
|
||||
use utils::generation::Generation;
|
||||
use utils::id::{TenantId, TimelineId};
|
||||
@@ -63,29 +60,6 @@ impl TenantsMap {
|
||||
}
|
||||
}
|
||||
|
||||
/// This is "safe" in that that it won't leave behind a partially deleted directory
|
||||
/// at the original path, because we rename with TEMP_FILE_SUFFIX before starting deleting
|
||||
/// the contents.
|
||||
///
|
||||
/// This is pageserver-specific, as it relies on future processes after a crash to check
|
||||
/// for TEMP_FILE_SUFFIX when loading things.
|
||||
async fn safe_remove_tenant_dir_all(path: impl AsRef<Path>) -> std::io::Result<()> {
|
||||
let parent = path
|
||||
.as_ref()
|
||||
.parent()
|
||||
// It is invalid to call this function with a relative path. Tenant directories
|
||||
// should always have a parent.
|
||||
.ok_or(std::io::Error::new(
|
||||
std::io::ErrorKind::InvalidInput,
|
||||
"Path must be absolute",
|
||||
))?;
|
||||
|
||||
let tmp_path = path_with_suffix_extension(&path, TEMP_FILE_SUFFIX);
|
||||
fs::rename(&path, &tmp_path).await?;
|
||||
fs::File::open(parent).await?.sync_all().await?;
|
||||
fs::remove_dir_all(tmp_path).await
|
||||
}
|
||||
|
||||
static TENANTS: Lazy<RwLock<TenantsMap>> = Lazy::new(|| RwLock::new(TenantsMap::Initializing));
|
||||
|
||||
/// Initialize repositories with locally available timelines.
|
||||
@@ -96,21 +70,12 @@ pub async fn init_tenant_mgr(
|
||||
conf: &'static PageServerConf,
|
||||
resources: TenantSharedResources,
|
||||
init_order: InitializationOrder,
|
||||
cancel: CancellationToken,
|
||||
) -> anyhow::Result<()> {
|
||||
// Scan local filesystem for attached tenants
|
||||
let tenants_dir = conf.tenants_path();
|
||||
|
||||
let mut tenants = HashMap::new();
|
||||
|
||||
// If we are configured to use the control plane API, then it is the source of truth for what tenants to load.
|
||||
let tenant_generations = if let Some(client) = ControlPlaneClient::new(conf, &cancel) {
|
||||
Some(client.re_attach().await?)
|
||||
} else {
|
||||
info!("Control plane API not configured, tenant generations are disabled");
|
||||
None
|
||||
};
|
||||
|
||||
let mut dir_entries = fs::read_dir(&tenants_dir)
|
||||
.await
|
||||
.with_context(|| format!("Failed to list tenants dir {tenants_dir:?}"))?;
|
||||
@@ -127,8 +92,6 @@ pub async fn init_tenant_mgr(
|
||||
"Found temporary tenant directory, removing: {}",
|
||||
tenant_dir_path.display()
|
||||
);
|
||||
// No need to use safe_remove_tenant_dir_all because this is already
|
||||
// a temporary path
|
||||
if let Err(e) = fs::remove_dir_all(&tenant_dir_path).await {
|
||||
error!(
|
||||
"Failed to remove temporary directory '{}': {:?}",
|
||||
@@ -160,53 +123,9 @@ pub async fn init_tenant_mgr(
|
||||
continue;
|
||||
}
|
||||
|
||||
let tenant_id = match tenant_dir_path
|
||||
.file_name()
|
||||
.and_then(OsStr::to_str)
|
||||
.unwrap_or_default()
|
||||
.parse::<TenantId>()
|
||||
{
|
||||
Ok(id) => id,
|
||||
Err(_) => {
|
||||
warn!(
|
||||
"Invalid tenant path (garbage in our repo directory?): {}",
|
||||
tenant_dir_path.display()
|
||||
);
|
||||
continue;
|
||||
}
|
||||
};
|
||||
|
||||
let generation = if let Some(generations) = &tenant_generations {
|
||||
// We have a generation map: treat it as the authority for whether
|
||||
// this tenant is really attached.
|
||||
if let Some(gen) = generations.get(&tenant_id) {
|
||||
*gen
|
||||
} else {
|
||||
info!("Detaching tenant {tenant_id}, control plane omitted it in re-attach response");
|
||||
if let Err(e) = safe_remove_tenant_dir_all(&tenant_dir_path).await {
|
||||
error!(
|
||||
"Failed to remove detached tenant directory '{}': {:?}",
|
||||
tenant_dir_path.display(),
|
||||
e
|
||||
);
|
||||
}
|
||||
continue;
|
||||
}
|
||||
} else {
|
||||
// Legacy mode: no generation information, any tenant present
|
||||
// on local disk may activate
|
||||
info!(
|
||||
"Starting tenant {} in legacy mode, no generation",
|
||||
tenant_dir_path.display()
|
||||
);
|
||||
Generation::none()
|
||||
};
|
||||
|
||||
match schedule_local_tenant_processing(
|
||||
conf,
|
||||
tenant_id,
|
||||
&tenant_dir_path,
|
||||
generation,
|
||||
resources.clone(),
|
||||
Some(init_order.clone()),
|
||||
&TENANTS,
|
||||
@@ -240,12 +159,9 @@ pub async fn init_tenant_mgr(
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[allow(clippy::too_many_arguments)]
|
||||
pub(crate) fn schedule_local_tenant_processing(
|
||||
conf: &'static PageServerConf,
|
||||
tenant_id: TenantId,
|
||||
tenant_path: &Path,
|
||||
generation: Generation,
|
||||
resources: TenantSharedResources,
|
||||
init_order: Option<InitializationOrder>,
|
||||
tenants: &'static tokio::sync::RwLock<TenantsMap>,
|
||||
@@ -266,6 +182,15 @@ pub(crate) fn schedule_local_tenant_processing(
|
||||
"Cannot load tenant from empty directory {tenant_path:?}"
|
||||
);
|
||||
|
||||
let tenant_id = tenant_path
|
||||
.file_name()
|
||||
.and_then(OsStr::to_str)
|
||||
.unwrap_or_default()
|
||||
.parse::<TenantId>()
|
||||
.with_context(|| {
|
||||
format!("Could not parse tenant id out of the tenant dir name in path {tenant_path:?}")
|
||||
})?;
|
||||
|
||||
let tenant_ignore_mark = conf.tenant_ignore_mark_file_path(&tenant_id);
|
||||
anyhow::ensure!(
|
||||
!conf.tenant_ignore_mark_file_path(&tenant_id).exists(),
|
||||
@@ -278,7 +203,7 @@ pub(crate) fn schedule_local_tenant_processing(
|
||||
match Tenant::spawn_attach(
|
||||
conf,
|
||||
tenant_id,
|
||||
generation,
|
||||
Generation::none(),
|
||||
resources.broker_client,
|
||||
tenants,
|
||||
remote_storage,
|
||||
@@ -302,7 +227,13 @@ pub(crate) fn schedule_local_tenant_processing(
|
||||
info!("tenant {tenant_id} is assumed to be loadable, starting load operation");
|
||||
// Start loading the tenant into memory. It will initially be in Loading state.
|
||||
Tenant::spawn_load(
|
||||
conf, tenant_id, generation, resources, init_order, tenants, ctx,
|
||||
conf,
|
||||
tenant_id,
|
||||
Generation::none(),
|
||||
resources,
|
||||
init_order,
|
||||
tenants,
|
||||
ctx,
|
||||
)
|
||||
};
|
||||
Ok(tenant)
|
||||
@@ -426,7 +357,6 @@ pub async fn create_tenant(
|
||||
conf: &'static PageServerConf,
|
||||
tenant_conf: TenantConfOpt,
|
||||
tenant_id: TenantId,
|
||||
generation: Generation,
|
||||
broker_client: storage_broker::BrokerClientChannel,
|
||||
remote_storage: Option<GenericRemoteStorage>,
|
||||
ctx: &RequestContext,
|
||||
@@ -444,8 +374,7 @@ pub async fn create_tenant(
|
||||
remote_storage,
|
||||
};
|
||||
let created_tenant =
|
||||
schedule_local_tenant_processing(conf, tenant_id, &tenant_directory,
|
||||
generation, tenant_resources, None, &TENANTS, ctx)?;
|
||||
schedule_local_tenant_processing(conf, &tenant_directory, tenant_resources, None, &TENANTS, ctx)?;
|
||||
// TODO: tenant object & its background loops remain, untracked in tenant map, if we fail here.
|
||||
// See https://github.com/neondatabase/neon/issues/4233
|
||||
|
||||
@@ -475,7 +404,7 @@ pub async fn set_new_tenant_config(
|
||||
let tenant = get_tenant(tenant_id, true).await?;
|
||||
|
||||
let tenant_config_path = conf.tenant_config_path(&tenant_id);
|
||||
Tenant::persist_tenant_config(&tenant_id, &tenant_config_path, new_tenant_conf)
|
||||
Tenant::persist_tenant_config(&tenant_id, &tenant_config_path, new_tenant_conf, false)
|
||||
.await
|
||||
.map_err(SetNewTenantConfigError::Persist)?;
|
||||
tenant.set_new_tenant_config(new_tenant_conf);
|
||||
@@ -562,7 +491,7 @@ async fn detach_tenant0(
|
||||
) -> Result<(), TenantStateError> {
|
||||
let local_files_cleanup_operation = |tenant_id_to_clean| async move {
|
||||
let local_tenant_directory = conf.tenant_path(&tenant_id_to_clean);
|
||||
safe_remove_tenant_dir_all(&local_tenant_directory)
|
||||
fs::remove_dir_all(&local_tenant_directory)
|
||||
.await
|
||||
.with_context(|| {
|
||||
format!("local tenant directory {local_tenant_directory:?} removal")
|
||||
@@ -593,7 +522,6 @@ async fn detach_tenant0(
|
||||
pub async fn load_tenant(
|
||||
conf: &'static PageServerConf,
|
||||
tenant_id: TenantId,
|
||||
generation: Generation,
|
||||
broker_client: storage_broker::BrokerClientChannel,
|
||||
remote_storage: Option<GenericRemoteStorage>,
|
||||
ctx: &RequestContext,
|
||||
@@ -610,7 +538,7 @@ pub async fn load_tenant(
|
||||
broker_client,
|
||||
remote_storage,
|
||||
};
|
||||
let new_tenant = schedule_local_tenant_processing(conf, tenant_id, &tenant_path, generation, resources, None, &TENANTS, ctx)
|
||||
let new_tenant = schedule_local_tenant_processing(conf, &tenant_path, resources, None, &TENANTS, ctx)
|
||||
.with_context(|| {
|
||||
format!("Failed to schedule tenant processing in path {tenant_path:?}")
|
||||
})?;
|
||||
@@ -674,7 +602,6 @@ pub async fn list_tenants() -> Result<Vec<(TenantId, TenantState)>, TenantMapLis
|
||||
pub async fn attach_tenant(
|
||||
conf: &'static PageServerConf,
|
||||
tenant_id: TenantId,
|
||||
generation: Generation,
|
||||
tenant_conf: TenantConfOpt,
|
||||
broker_client: storage_broker::BrokerClientChannel,
|
||||
remote_storage: GenericRemoteStorage,
|
||||
@@ -696,7 +623,7 @@ pub async fn attach_tenant(
|
||||
broker_client,
|
||||
remote_storage: Some(remote_storage),
|
||||
};
|
||||
let attached_tenant = schedule_local_tenant_processing(conf, tenant_id, &tenant_dir, generation, resources, None, &TENANTS, ctx)?;
|
||||
let attached_tenant = schedule_local_tenant_processing(conf, &tenant_dir, resources, None, &TENANTS, ctx)?;
|
||||
// TODO: tenant object & its background loops remain, untracked in tenant map, if we fail here.
|
||||
// See https://github.com/neondatabase/neon/issues/4233
|
||||
|
||||
|
||||
@@ -4,9 +4,10 @@ use std::{
|
||||
sync::atomic::{AtomicUsize, Ordering},
|
||||
};
|
||||
|
||||
use crate::virtual_file::VirtualFile;
|
||||
|
||||
fn fsync_path(path: &Path) -> io::Result<()> {
|
||||
// TODO use VirtualFile::fsync_all once we fully go async.
|
||||
let file = std::fs::File::open(path)?;
|
||||
let file = VirtualFile::open(path)?;
|
||||
file.sync_all()
|
||||
}
|
||||
|
||||
|
||||
@@ -342,12 +342,7 @@ impl RemoteTimelineClient {
|
||||
) -> RemoteTimelineClient {
|
||||
RemoteTimelineClient {
|
||||
conf,
|
||||
runtime: if cfg!(test) {
|
||||
// remote_timeline_client.rs tests rely on current-thread runtime
|
||||
tokio::runtime::Handle::current()
|
||||
} else {
|
||||
BACKGROUND_RUNTIME.handle().clone()
|
||||
},
|
||||
runtime: BACKGROUND_RUNTIME.handle().to_owned(),
|
||||
tenant_id,
|
||||
timeline_id,
|
||||
generation,
|
||||
@@ -1430,30 +1425,6 @@ pub fn remote_index_path(
|
||||
.expect("Failed to construct path")
|
||||
}
|
||||
|
||||
/// Given the key of an index, parse out the generation part of the name
|
||||
pub(crate) fn parse_remote_index_path(path: RemotePath) -> Option<Generation> {
|
||||
let file_name = match path.get_path().file_name() {
|
||||
Some(f) => f,
|
||||
None => {
|
||||
// Unexpected: we should be seeing index_part.json paths only
|
||||
tracing::warn!("Malformed index key {}", path);
|
||||
return None;
|
||||
}
|
||||
};
|
||||
|
||||
let file_name_str = match file_name.to_str() {
|
||||
Some(s) => s,
|
||||
None => {
|
||||
tracing::warn!("Malformed index key {:?}", path);
|
||||
return None;
|
||||
}
|
||||
};
|
||||
match file_name_str.split_once('-') {
|
||||
Some((_, gen_suffix)) => Generation::parse_suffix(gen_suffix),
|
||||
None => None,
|
||||
}
|
||||
}
|
||||
|
||||
/// Files on the remote storage are stored with paths, relative to the workdir.
|
||||
/// That path includes in itself both tenant and timeline ids, allowing to have a unique remote storage path.
|
||||
///
|
||||
@@ -1492,8 +1463,11 @@ mod tests {
|
||||
},
|
||||
DEFAULT_PG_VERSION,
|
||||
};
|
||||
|
||||
use std::{collections::HashSet, path::Path};
|
||||
use remote_storage::{RemoteStorageConfig, RemoteStorageKind};
|
||||
use std::{
|
||||
collections::HashSet,
|
||||
path::{Path, PathBuf},
|
||||
};
|
||||
use utils::lsn::Lsn;
|
||||
|
||||
pub(super) fn dummy_contents(name: &str) -> Vec<u8> {
|
||||
@@ -1550,6 +1524,8 @@ mod tests {
|
||||
tenant: Arc<Tenant>,
|
||||
timeline: Arc<Timeline>,
|
||||
tenant_ctx: RequestContext,
|
||||
remote_fs_dir: PathBuf,
|
||||
client: Arc<RemoteTimelineClient>,
|
||||
}
|
||||
|
||||
impl TestSetup {
|
||||
@@ -1559,44 +1535,54 @@ mod tests {
|
||||
let harness = TenantHarness::create(test_name)?;
|
||||
let (tenant, ctx) = harness.load().await;
|
||||
|
||||
// create an empty timeline directory
|
||||
let timeline = tenant
|
||||
.create_test_timeline(TIMELINE_ID, Lsn(8), DEFAULT_PG_VERSION, &ctx)
|
||||
.await?;
|
||||
|
||||
let remote_fs_dir = harness.conf.workdir.join("remote_fs");
|
||||
std::fs::create_dir_all(remote_fs_dir)?;
|
||||
let remote_fs_dir = std::fs::canonicalize(harness.conf.workdir.join("remote_fs"))?;
|
||||
|
||||
let storage_config = RemoteStorageConfig {
|
||||
max_concurrent_syncs: std::num::NonZeroUsize::new(
|
||||
remote_storage::DEFAULT_REMOTE_STORAGE_MAX_CONCURRENT_SYNCS,
|
||||
)
|
||||
.unwrap(),
|
||||
max_sync_errors: std::num::NonZeroU32::new(
|
||||
remote_storage::DEFAULT_REMOTE_STORAGE_MAX_SYNC_ERRORS,
|
||||
)
|
||||
.unwrap(),
|
||||
storage: RemoteStorageKind::LocalFs(remote_fs_dir.clone()),
|
||||
};
|
||||
|
||||
let generation = Generation::new(0xdeadbeef);
|
||||
|
||||
let storage = GenericRemoteStorage::from_config(&storage_config).unwrap();
|
||||
|
||||
let client = Arc::new(RemoteTimelineClient {
|
||||
conf: harness.conf,
|
||||
runtime: tokio::runtime::Handle::current(),
|
||||
tenant_id: harness.tenant_id,
|
||||
timeline_id: TIMELINE_ID,
|
||||
generation,
|
||||
storage_impl: storage,
|
||||
upload_queue: Mutex::new(UploadQueue::Uninitialized),
|
||||
metrics: Arc::new(RemoteTimelineClientMetrics::new(
|
||||
&harness.tenant_id,
|
||||
&TIMELINE_ID,
|
||||
)),
|
||||
});
|
||||
|
||||
Ok(Self {
|
||||
harness,
|
||||
tenant,
|
||||
timeline,
|
||||
tenant_ctx: ctx,
|
||||
remote_fs_dir,
|
||||
client,
|
||||
})
|
||||
}
|
||||
|
||||
/// Construct a RemoteTimelineClient in an arbitrary generation
|
||||
fn build_client(&self, generation: Generation) -> Arc<RemoteTimelineClient> {
|
||||
Arc::new(RemoteTimelineClient {
|
||||
conf: self.harness.conf,
|
||||
runtime: tokio::runtime::Handle::current(),
|
||||
tenant_id: self.harness.tenant_id,
|
||||
timeline_id: TIMELINE_ID,
|
||||
generation,
|
||||
storage_impl: self.harness.remote_storage.clone(),
|
||||
upload_queue: Mutex::new(UploadQueue::Uninitialized),
|
||||
metrics: Arc::new(RemoteTimelineClientMetrics::new(
|
||||
&self.harness.tenant_id,
|
||||
&TIMELINE_ID,
|
||||
)),
|
||||
})
|
||||
}
|
||||
|
||||
/// A tracing::Span that satisfies remote_timeline_client methods that assert tenant_id
|
||||
/// and timeline_id are present.
|
||||
fn span(&self) -> tracing::Span {
|
||||
tracing::info_span!(
|
||||
"test",
|
||||
tenant_id = %self.harness.tenant_id,
|
||||
timeline_id = %TIMELINE_ID
|
||||
)
|
||||
}
|
||||
}
|
||||
|
||||
// Test scheduling
|
||||
@@ -1616,44 +1602,29 @@ mod tests {
|
||||
// Schedule another deletion. Check that it's launched immediately.
|
||||
// Schedule index upload. Check that it's queued
|
||||
|
||||
let test_setup = TestSetup::new("upload_scheduling").await.unwrap();
|
||||
let span = test_setup.span();
|
||||
let _guard = span.enter();
|
||||
|
||||
let TestSetup {
|
||||
harness,
|
||||
tenant: _tenant,
|
||||
timeline,
|
||||
timeline: _timeline,
|
||||
tenant_ctx: _tenant_ctx,
|
||||
} = test_setup;
|
||||
|
||||
let client = timeline.remote_client.as_ref().unwrap();
|
||||
|
||||
// Download back the index.json, and check that the list of files is correct
|
||||
let initial_index_part = match client.download_index_file().await.unwrap() {
|
||||
MaybeDeletedIndexPart::IndexPart(index_part) => index_part,
|
||||
MaybeDeletedIndexPart::Deleted(_) => panic!("unexpectedly got deleted index part"),
|
||||
};
|
||||
let initial_layers = initial_index_part
|
||||
.layer_metadata
|
||||
.keys()
|
||||
.map(|f| f.to_owned())
|
||||
.collect::<HashSet<LayerFileName>>();
|
||||
let initial_layer = {
|
||||
assert!(initial_layers.len() == 1);
|
||||
initial_layers.into_iter().next().unwrap()
|
||||
};
|
||||
remote_fs_dir,
|
||||
client,
|
||||
} = TestSetup::new("upload_scheduling").await.unwrap();
|
||||
|
||||
let timeline_path = harness.timeline_path(&TIMELINE_ID);
|
||||
|
||||
println!("workdir: {}", harness.conf.workdir.display());
|
||||
|
||||
let remote_timeline_dir = harness
|
||||
.remote_fs_dir
|
||||
.join(timeline_path.strip_prefix(&harness.conf.workdir).unwrap());
|
||||
let remote_timeline_dir =
|
||||
remote_fs_dir.join(timeline_path.strip_prefix(&harness.conf.workdir).unwrap());
|
||||
println!("remote_timeline_dir: {}", remote_timeline_dir.display());
|
||||
|
||||
let generation = harness.generation;
|
||||
let metadata = dummy_metadata(Lsn(0x10));
|
||||
client
|
||||
.init_upload_queue_for_empty_remote(&metadata)
|
||||
.unwrap();
|
||||
|
||||
let generation = Generation::new(0xdeadbeef);
|
||||
|
||||
// Create a couple of dummy files, schedule upload for them
|
||||
let layer_file_name_1: LayerFileName = "000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D8-00000000016B5A51".parse().unwrap();
|
||||
@@ -1734,7 +1705,6 @@ mod tests {
|
||||
.map(|f| f.to_owned())
|
||||
.collect(),
|
||||
&[
|
||||
&initial_layer.file_name(),
|
||||
&layer_file_name_1.file_name(),
|
||||
&layer_file_name_2.file_name(),
|
||||
],
|
||||
@@ -1764,7 +1734,6 @@ mod tests {
|
||||
}
|
||||
assert_remote_files(
|
||||
&[
|
||||
&initial_layer.file_name(),
|
||||
&layer_file_name_1.file_name(),
|
||||
&layer_file_name_2.file_name(),
|
||||
"index_part.json",
|
||||
@@ -1778,7 +1747,6 @@ mod tests {
|
||||
|
||||
assert_remote_files(
|
||||
&[
|
||||
&initial_layer.file_name(),
|
||||
&layer_file_name_2.file_name(),
|
||||
&layer_file_name_3.file_name(),
|
||||
"index_part.json",
|
||||
@@ -1795,10 +1763,16 @@ mod tests {
|
||||
let TestSetup {
|
||||
harness,
|
||||
tenant: _tenant,
|
||||
timeline,
|
||||
timeline: _timeline,
|
||||
client,
|
||||
..
|
||||
} = TestSetup::new("metrics").await.unwrap();
|
||||
let client = timeline.remote_client.as_ref().unwrap();
|
||||
|
||||
let metadata = dummy_metadata(Lsn(0x10));
|
||||
client
|
||||
.init_upload_queue_for_empty_remote(&metadata)
|
||||
.unwrap();
|
||||
|
||||
let timeline_path = harness.timeline_path(&TIMELINE_ID);
|
||||
|
||||
let layer_file_name_1: LayerFileName = "000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D8-00000000016B5A51".parse().unwrap();
|
||||
@@ -1809,20 +1783,11 @@ mod tests {
|
||||
)
|
||||
.unwrap();
|
||||
|
||||
#[derive(Debug, PartialEq, Clone, Copy)]
|
||||
#[derive(Debug, PartialEq)]
|
||||
struct BytesStartedFinished {
|
||||
started: Option<usize>,
|
||||
finished: Option<usize>,
|
||||
}
|
||||
impl std::ops::Add for BytesStartedFinished {
|
||||
type Output = Self;
|
||||
fn add(self, rhs: Self) -> Self::Output {
|
||||
Self {
|
||||
started: self.started.map(|v| v + rhs.started.unwrap_or(0)),
|
||||
finished: self.finished.map(|v| v + rhs.finished.unwrap_or(0)),
|
||||
}
|
||||
}
|
||||
}
|
||||
let get_bytes_started_stopped = || {
|
||||
let started = client
|
||||
.metrics
|
||||
@@ -1839,140 +1804,47 @@ mod tests {
|
||||
};
|
||||
|
||||
// Test
|
||||
tracing::info!("now doing actual test");
|
||||
|
||||
let actual_a = get_bytes_started_stopped();
|
||||
let generation = Generation::new(0xdeadbeef);
|
||||
|
||||
let init = get_bytes_started_stopped();
|
||||
|
||||
client
|
||||
.schedule_layer_file_upload(
|
||||
&layer_file_name_1,
|
||||
&LayerFileMetadata::new(content_1.len() as u64, harness.generation),
|
||||
&LayerFileMetadata::new(content_1.len() as u64, generation),
|
||||
)
|
||||
.unwrap();
|
||||
|
||||
let actual_b = get_bytes_started_stopped();
|
||||
let pre = get_bytes_started_stopped();
|
||||
|
||||
client.wait_completion().await.unwrap();
|
||||
|
||||
let actual_c = get_bytes_started_stopped();
|
||||
let post = get_bytes_started_stopped();
|
||||
|
||||
// Validate
|
||||
|
||||
let expected_b = actual_a
|
||||
+ BytesStartedFinished {
|
||||
assert_eq!(
|
||||
init,
|
||||
BytesStartedFinished {
|
||||
started: None,
|
||||
finished: None
|
||||
}
|
||||
);
|
||||
assert_eq!(
|
||||
pre,
|
||||
BytesStartedFinished {
|
||||
started: Some(content_1.len()),
|
||||
// assert that the _finished metric is created eagerly so that subtractions work on first sample
|
||||
finished: Some(0),
|
||||
};
|
||||
assert_eq!(actual_b, expected_b);
|
||||
|
||||
let expected_c = actual_a
|
||||
+ BytesStartedFinished {
|
||||
started: Some(content_1.len()),
|
||||
finished: Some(content_1.len()),
|
||||
};
|
||||
assert_eq!(actual_c, expected_c);
|
||||
}
|
||||
|
||||
async fn inject_index_part(test_state: &TestSetup, generation: Generation) -> IndexPart {
|
||||
// An empty IndexPart, just sufficient to ensure deserialization will succeed
|
||||
let example_metadata = TimelineMetadata::example();
|
||||
let example_index_part = IndexPart::new(
|
||||
HashMap::new(),
|
||||
example_metadata.disk_consistent_lsn(),
|
||||
example_metadata,
|
||||
);
|
||||
|
||||
let index_part_bytes = serde_json::to_vec(&example_index_part).unwrap();
|
||||
|
||||
let timeline_path = test_state.harness.timeline_path(&TIMELINE_ID);
|
||||
let remote_timeline_dir = test_state.harness.remote_fs_dir.join(
|
||||
timeline_path
|
||||
.strip_prefix(&test_state.harness.conf.workdir)
|
||||
.unwrap(),
|
||||
);
|
||||
|
||||
std::fs::create_dir_all(remote_timeline_dir).expect("creating test dir should work");
|
||||
|
||||
let index_path = test_state.harness.remote_fs_dir.join(
|
||||
remote_index_path(&test_state.harness.tenant_id, &TIMELINE_ID, generation).get_path(),
|
||||
);
|
||||
eprintln!("Writing {}", index_path.display());
|
||||
std::fs::write(&index_path, index_part_bytes).unwrap();
|
||||
example_index_part
|
||||
}
|
||||
|
||||
/// Assert that when a RemoteTimelineclient in generation `get_generation` fetches its
|
||||
/// index, the IndexPart returned is equal to `expected`
|
||||
async fn assert_got_index_part(
|
||||
test_state: &TestSetup,
|
||||
get_generation: Generation,
|
||||
expected: &IndexPart,
|
||||
) {
|
||||
let client = test_state.build_client(get_generation);
|
||||
|
||||
let download_r = client
|
||||
.download_index_file()
|
||||
.await
|
||||
.expect("download should always succeed");
|
||||
assert!(matches!(download_r, MaybeDeletedIndexPart::IndexPart(_)));
|
||||
match download_r {
|
||||
MaybeDeletedIndexPart::IndexPart(index_part) => {
|
||||
assert_eq!(&index_part, expected);
|
||||
}
|
||||
MaybeDeletedIndexPart::Deleted(_index_part) => panic!("Test doesn't set deleted_at"),
|
||||
}
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn index_part_download_simple() -> anyhow::Result<()> {
|
||||
let test_state = TestSetup::new("index_part_download_simple").await.unwrap();
|
||||
let span = test_state.span();
|
||||
let _guard = span.enter();
|
||||
|
||||
// Simple case: we are in generation N, load the index from generation N - 1
|
||||
let generation_n = 5;
|
||||
let injected = inject_index_part(&test_state, Generation::new(generation_n - 1)).await;
|
||||
|
||||
assert_got_index_part(&test_state, Generation::new(generation_n), &injected).await;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn index_part_download_ordering() -> anyhow::Result<()> {
|
||||
let test_state = TestSetup::new("index_part_download_ordering")
|
||||
.await
|
||||
.unwrap();
|
||||
|
||||
let span = test_state.span();
|
||||
let _guard = span.enter();
|
||||
|
||||
// A generation-less IndexPart exists in the bucket, we should find it
|
||||
let generation_n = 5;
|
||||
let injected_none = inject_index_part(&test_state, Generation::none()).await;
|
||||
assert_got_index_part(&test_state, Generation::new(generation_n), &injected_none).await;
|
||||
|
||||
// If a more recent-than-none generation exists, we should prefer to load that
|
||||
let injected_1 = inject_index_part(&test_state, Generation::new(1)).await;
|
||||
assert_got_index_part(&test_state, Generation::new(generation_n), &injected_1).await;
|
||||
|
||||
// If a more-recent-than-me generation exists, we should ignore it.
|
||||
let _injected_10 = inject_index_part(&test_state, Generation::new(10)).await;
|
||||
assert_got_index_part(&test_state, Generation::new(generation_n), &injected_1).await;
|
||||
|
||||
// If a directly previous generation exists, _and_ an index exists in my own
|
||||
// generation, I should prefer my own generation.
|
||||
let _injected_prev =
|
||||
inject_index_part(&test_state, Generation::new(generation_n - 1)).await;
|
||||
let injected_current = inject_index_part(&test_state, Generation::new(generation_n)).await;
|
||||
assert_got_index_part(
|
||||
&test_state,
|
||||
Generation::new(generation_n),
|
||||
&injected_current,
|
||||
)
|
||||
.await;
|
||||
|
||||
Ok(())
|
||||
);
|
||||
assert_eq!(
|
||||
post,
|
||||
BytesStartedFinished {
|
||||
started: Some(content_1.len()),
|
||||
finished: Some(content_1.len())
|
||||
}
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -24,10 +24,7 @@ use utils::crashsafe::path_with_suffix_extension;
|
||||
use utils::id::{TenantId, TimelineId};
|
||||
|
||||
use super::index::{IndexPart, LayerFileMetadata};
|
||||
use super::{
|
||||
parse_remote_index_path, remote_index_path, FAILED_DOWNLOAD_WARN_THRESHOLD,
|
||||
FAILED_REMOTE_OP_RETRIES,
|
||||
};
|
||||
use super::{remote_index_path, FAILED_DOWNLOAD_WARN_THRESHOLD, FAILED_REMOTE_OP_RETRIES};
|
||||
|
||||
static MAX_DOWNLOAD_DURATION: Duration = Duration::from_secs(120);
|
||||
|
||||
@@ -222,13 +219,13 @@ pub async fn list_remote_timelines(
|
||||
Ok(timeline_ids)
|
||||
}
|
||||
|
||||
async fn do_download_index_part(
|
||||
pub(super) async fn download_index_part(
|
||||
storage: &GenericRemoteStorage,
|
||||
tenant_id: &TenantId,
|
||||
timeline_id: &TimelineId,
|
||||
index_generation: Generation,
|
||||
generation: Generation,
|
||||
) -> Result<IndexPart, DownloadError> {
|
||||
let remote_path = remote_index_path(tenant_id, timeline_id, index_generation);
|
||||
let remote_path = remote_index_path(tenant_id, timeline_id, generation);
|
||||
|
||||
let index_part_bytes = download_retry(
|
||||
|| async {
|
||||
@@ -255,105 +252,6 @@ async fn do_download_index_part(
|
||||
Ok(index_part)
|
||||
}
|
||||
|
||||
/// index_part.json objects are suffixed with a generation number, so we cannot
|
||||
/// directly GET the latest index part without doing some probing.
|
||||
///
|
||||
/// In this function we probe for the most recent index in a generation <= our current generation.
|
||||
/// See "Finding the remote indices for timelines" in docs/rfcs/025-generation-numbers.md
|
||||
#[tracing::instrument(skip_all, fields(generation=?my_generation))]
|
||||
pub(super) async fn download_index_part(
|
||||
storage: &GenericRemoteStorage,
|
||||
tenant_id: &TenantId,
|
||||
timeline_id: &TimelineId,
|
||||
my_generation: Generation,
|
||||
) -> Result<IndexPart, DownloadError> {
|
||||
debug_assert_current_span_has_tenant_and_timeline_id();
|
||||
|
||||
if my_generation.is_none() {
|
||||
// Operating without generations: just fetch the generation-less path
|
||||
return do_download_index_part(storage, tenant_id, timeline_id, my_generation).await;
|
||||
}
|
||||
|
||||
// Stale case: If we were intentionally attached in a stale generation, there may already be a remote
|
||||
// index in our generation.
|
||||
//
|
||||
// This is an optimization to avoid doing the listing for the general case below.
|
||||
let res = do_download_index_part(storage, tenant_id, timeline_id, my_generation).await;
|
||||
match res {
|
||||
Ok(index_part) => {
|
||||
tracing::debug!(
|
||||
"Found index_part from current generation (this is a stale attachment)"
|
||||
);
|
||||
return Ok(index_part);
|
||||
}
|
||||
Err(DownloadError::NotFound) => {}
|
||||
Err(e) => return Err(e),
|
||||
};
|
||||
|
||||
// Typical case: the previous generation of this tenant was running healthily, and had uploaded
|
||||
// and index part. We may safely start from this index without doing a listing, because:
|
||||
// - We checked for current generation case above
|
||||
// - generations > my_generation are to be ignored
|
||||
// - any other indices that exist would have an older generation than `previous_gen`, and
|
||||
// we want to find the most recent index from a previous generation.
|
||||
//
|
||||
// This is an optimization to avoid doing the listing for the general case below.
|
||||
let res =
|
||||
do_download_index_part(storage, tenant_id, timeline_id, my_generation.previous()).await;
|
||||
match res {
|
||||
Ok(index_part) => {
|
||||
tracing::debug!("Found index_part from previous generation");
|
||||
return Ok(index_part);
|
||||
}
|
||||
Err(DownloadError::NotFound) => {
|
||||
tracing::debug!(
|
||||
"No index_part found from previous generation, falling back to listing"
|
||||
);
|
||||
}
|
||||
Err(e) => {
|
||||
return Err(e);
|
||||
}
|
||||
}
|
||||
|
||||
// General case/fallback: if there is no index at my_generation or prev_generation, then list all index_part.json
|
||||
// objects, and select the highest one with a generation <= my_generation.
|
||||
let index_prefix = remote_index_path(tenant_id, timeline_id, Generation::none());
|
||||
let indices = backoff::retry(
|
||||
|| async { storage.list_files(Some(&index_prefix)).await },
|
||||
|_| false,
|
||||
FAILED_DOWNLOAD_WARN_THRESHOLD,
|
||||
FAILED_REMOTE_OP_RETRIES,
|
||||
"listing index_part files",
|
||||
// TODO: use a cancellation token (https://github.com/neondatabase/neon/issues/5066)
|
||||
backoff::Cancel::new(CancellationToken::new(), || -> anyhow::Error {
|
||||
unreachable!()
|
||||
}),
|
||||
)
|
||||
.await
|
||||
.map_err(DownloadError::Other)?;
|
||||
|
||||
// General case logic for which index to use: the latest index whose generation
|
||||
// is <= our own. See "Finding the remote indices for timelines" in docs/rfcs/025-generation-numbers.md
|
||||
let max_previous_generation = indices
|
||||
.into_iter()
|
||||
.filter_map(parse_remote_index_path)
|
||||
.filter(|g| g <= &my_generation)
|
||||
.max();
|
||||
|
||||
match max_previous_generation {
|
||||
Some(g) => {
|
||||
tracing::debug!("Found index_part in generation {g:?}");
|
||||
do_download_index_part(storage, tenant_id, timeline_id, g).await
|
||||
}
|
||||
None => {
|
||||
// Migration from legacy pre-generation state: we have a generation but no prior
|
||||
// attached pageservers did. Try to load from a no-generation path.
|
||||
tracing::info!("No index_part.json* found");
|
||||
do_download_index_part(storage, tenant_id, timeline_id, Generation::none()).await
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Helper function to handle retries for a download operation.
|
||||
///
|
||||
/// Remote operations can fail due to rate limits (IAM, S3), spurious network
|
||||
|
||||
@@ -31,7 +31,7 @@ use crate::config::PageServerConf;
|
||||
use crate::context::RequestContext;
|
||||
use crate::page_cache::PAGE_SZ;
|
||||
use crate::repository::{Key, Value, KEY_SIZE};
|
||||
use crate::tenant::blob_io::BlobWriter;
|
||||
use crate::tenant::blob_io::{BlobWriter, WriteBlobWriter};
|
||||
use crate::tenant::block_io::{BlockBuf, BlockCursor, BlockLease, BlockReader, FileBlockReader};
|
||||
use crate::tenant::disk_btree::{DiskBtreeBuilder, DiskBtreeReader, VisitDirection};
|
||||
use crate::tenant::storage_layer::{
|
||||
@@ -46,6 +46,7 @@ use rand::{distributions::Alphanumeric, Rng};
|
||||
use serde::{Deserialize, Serialize};
|
||||
use std::fs::{self, File};
|
||||
use std::io::SeekFrom;
|
||||
use std::io::{BufWriter, Write};
|
||||
use std::ops::Range;
|
||||
use std::os::unix::fs::FileExt;
|
||||
use std::path::{Path, PathBuf};
|
||||
@@ -218,7 +219,7 @@ pub struct DeltaLayerInner {
|
||||
index_root_blk: u32,
|
||||
|
||||
/// Reader object for reading blocks from the file.
|
||||
file: FileBlockReader,
|
||||
file: FileBlockReader<VirtualFile>,
|
||||
}
|
||||
|
||||
impl AsRef<DeltaLayerInner> for DeltaLayerInner {
|
||||
@@ -582,14 +583,14 @@ struct DeltaLayerWriterInner {
|
||||
|
||||
tree: DiskBtreeBuilder<BlockBuf, DELTA_KEY_SIZE>,
|
||||
|
||||
blob_writer: BlobWriter<true>,
|
||||
blob_writer: WriteBlobWriter<BufWriter<VirtualFile>>,
|
||||
}
|
||||
|
||||
impl DeltaLayerWriterInner {
|
||||
///
|
||||
/// Start building a new delta layer.
|
||||
///
|
||||
async fn new(
|
||||
fn new(
|
||||
conf: &'static PageServerConf,
|
||||
timeline_id: TimelineId,
|
||||
tenant_id: TenantId,
|
||||
@@ -604,10 +605,11 @@ impl DeltaLayerWriterInner {
|
||||
// FIXME: throw an error instead?
|
||||
let path = DeltaLayer::temp_path_for(conf, &tenant_id, &timeline_id, key_start, &lsn_range);
|
||||
|
||||
let mut file = VirtualFile::create(&path).await?;
|
||||
let mut file = VirtualFile::create(&path)?;
|
||||
// make room for the header block
|
||||
file.seek(SeekFrom::Start(PAGE_SZ as u64)).await?;
|
||||
let blob_writer = BlobWriter::new(file, PAGE_SZ as u64);
|
||||
file.seek(SeekFrom::Start(PAGE_SZ as u64))?;
|
||||
let buf_writer = BufWriter::new(file);
|
||||
let blob_writer = WriteBlobWriter::new(buf_writer, PAGE_SZ as u64);
|
||||
|
||||
// Initialize the b-tree index builder
|
||||
let block_buf = BlockBuf::new();
|
||||
@@ -630,12 +632,11 @@ impl DeltaLayerWriterInner {
|
||||
///
|
||||
/// The values must be appended in key, lsn order.
|
||||
///
|
||||
async fn put_value(&mut self, key: Key, lsn: Lsn, val: Value) -> anyhow::Result<()> {
|
||||
fn put_value(&mut self, key: Key, lsn: Lsn, val: Value) -> anyhow::Result<()> {
|
||||
self.put_value_bytes(key, lsn, &Value::ser(&val)?, val.will_init())
|
||||
.await
|
||||
}
|
||||
|
||||
async fn put_value_bytes(
|
||||
fn put_value_bytes(
|
||||
&mut self,
|
||||
key: Key,
|
||||
lsn: Lsn,
|
||||
@@ -644,7 +645,7 @@ impl DeltaLayerWriterInner {
|
||||
) -> anyhow::Result<()> {
|
||||
assert!(self.lsn_range.start <= lsn);
|
||||
|
||||
let off = self.blob_writer.write_blob(val).await?;
|
||||
let off = self.blob_writer.write_blob(val)?;
|
||||
|
||||
let blob_ref = BlobRef::new(off, will_init);
|
||||
|
||||
@@ -661,18 +662,18 @@ impl DeltaLayerWriterInner {
|
||||
///
|
||||
/// Finish writing the delta layer.
|
||||
///
|
||||
async fn finish(self, key_end: Key) -> anyhow::Result<DeltaLayer> {
|
||||
fn finish(self, key_end: Key) -> anyhow::Result<DeltaLayer> {
|
||||
let index_start_blk =
|
||||
((self.blob_writer.size() + PAGE_SZ as u64 - 1) / PAGE_SZ as u64) as u32;
|
||||
|
||||
let mut file = self.blob_writer.into_inner().await?;
|
||||
let buf_writer = self.blob_writer.into_inner();
|
||||
let mut file = buf_writer.into_inner()?;
|
||||
|
||||
// Write out the index
|
||||
let (index_root_blk, block_buf) = self.tree.finish()?;
|
||||
file.seek(SeekFrom::Start(index_start_blk as u64 * PAGE_SZ as u64))
|
||||
.await?;
|
||||
file.seek(SeekFrom::Start(index_start_blk as u64 * PAGE_SZ as u64))?;
|
||||
for buf in block_buf.blocks {
|
||||
file.write_all(buf.as_ref()).await?;
|
||||
file.write_all(buf.as_ref())?;
|
||||
}
|
||||
assert!(self.lsn_range.start < self.lsn_range.end);
|
||||
// Fill in the summary on blk 0
|
||||
@@ -686,22 +687,11 @@ impl DeltaLayerWriterInner {
|
||||
index_start_blk,
|
||||
index_root_blk,
|
||||
};
|
||||
|
||||
let mut buf = smallvec::SmallVec::<[u8; PAGE_SZ]>::new();
|
||||
Summary::ser_into(&summary, &mut buf)?;
|
||||
if buf.spilled() {
|
||||
// This is bad as we only have one free block for the summary
|
||||
warn!(
|
||||
"Used more than one page size for summary buffer: {}",
|
||||
buf.len()
|
||||
);
|
||||
}
|
||||
file.seek(SeekFrom::Start(0)).await?;
|
||||
file.write_all(&buf).await?;
|
||||
file.seek(SeekFrom::Start(0))?;
|
||||
Summary::ser_into(&summary, &mut file)?;
|
||||
|
||||
let metadata = file
|
||||
.metadata()
|
||||
.await
|
||||
.context("get file metadata to determine size")?;
|
||||
|
||||
// 5GB limit for objects without multipart upload (which we don't want to use)
|
||||
@@ -732,7 +722,7 @@ impl DeltaLayerWriterInner {
|
||||
};
|
||||
|
||||
// fsync the file
|
||||
file.sync_all().await?;
|
||||
file.sync_all()?;
|
||||
// Rename the file to its final name
|
||||
//
|
||||
// Note: This overwrites any existing file. There shouldn't be any.
|
||||
@@ -784,7 +774,7 @@ impl DeltaLayerWriter {
|
||||
///
|
||||
/// Start building a new delta layer.
|
||||
///
|
||||
pub async fn new(
|
||||
pub fn new(
|
||||
conf: &'static PageServerConf,
|
||||
timeline_id: TimelineId,
|
||||
tenant_id: TenantId,
|
||||
@@ -792,10 +782,13 @@ impl DeltaLayerWriter {
|
||||
lsn_range: Range<Lsn>,
|
||||
) -> anyhow::Result<Self> {
|
||||
Ok(Self {
|
||||
inner: Some(
|
||||
DeltaLayerWriterInner::new(conf, timeline_id, tenant_id, key_start, lsn_range)
|
||||
.await?,
|
||||
),
|
||||
inner: Some(DeltaLayerWriterInner::new(
|
||||
conf,
|
||||
timeline_id,
|
||||
tenant_id,
|
||||
key_start,
|
||||
lsn_range,
|
||||
)?),
|
||||
})
|
||||
}
|
||||
|
||||
@@ -804,11 +797,11 @@ impl DeltaLayerWriter {
|
||||
///
|
||||
/// The values must be appended in key, lsn order.
|
||||
///
|
||||
pub async fn put_value(&mut self, key: Key, lsn: Lsn, val: Value) -> anyhow::Result<()> {
|
||||
self.inner.as_mut().unwrap().put_value(key, lsn, val).await
|
||||
pub fn put_value(&mut self, key: Key, lsn: Lsn, val: Value) -> anyhow::Result<()> {
|
||||
self.inner.as_mut().unwrap().put_value(key, lsn, val)
|
||||
}
|
||||
|
||||
pub async fn put_value_bytes(
|
||||
pub fn put_value_bytes(
|
||||
&mut self,
|
||||
key: Key,
|
||||
lsn: Lsn,
|
||||
@@ -819,7 +812,6 @@ impl DeltaLayerWriter {
|
||||
.as_mut()
|
||||
.unwrap()
|
||||
.put_value_bytes(key, lsn, val, will_init)
|
||||
.await
|
||||
}
|
||||
|
||||
pub fn size(&self) -> u64 {
|
||||
@@ -829,18 +821,21 @@ impl DeltaLayerWriter {
|
||||
///
|
||||
/// Finish writing the delta layer.
|
||||
///
|
||||
pub async fn finish(mut self, key_end: Key) -> anyhow::Result<DeltaLayer> {
|
||||
self.inner.take().unwrap().finish(key_end).await
|
||||
pub fn finish(mut self, key_end: Key) -> anyhow::Result<DeltaLayer> {
|
||||
self.inner.take().unwrap().finish(key_end)
|
||||
}
|
||||
}
|
||||
|
||||
impl Drop for DeltaLayerWriter {
|
||||
fn drop(&mut self) {
|
||||
if let Some(inner) = self.inner.take() {
|
||||
// We want to remove the virtual file here, so it's fine to not
|
||||
// having completely flushed unwritten data.
|
||||
let vfile = inner.blob_writer.into_inner_no_flush();
|
||||
vfile.remove();
|
||||
match inner.blob_writer.into_inner().into_inner() {
|
||||
Ok(vfile) => vfile.remove(),
|
||||
Err(err) => warn!(
|
||||
"error while flushing buffer of image layer temporary file: {}",
|
||||
err
|
||||
),
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -851,7 +846,6 @@ impl DeltaLayerInner {
|
||||
summary: Option<Summary>,
|
||||
) -> anyhow::Result<Self> {
|
||||
let file = VirtualFile::open(path)
|
||||
.await
|
||||
.with_context(|| format!("Failed to open file '{}'", path.display()))?;
|
||||
let file = FileBlockReader::new(file);
|
||||
|
||||
|
||||
@@ -27,7 +27,7 @@ use crate::config::PageServerConf;
|
||||
use crate::context::RequestContext;
|
||||
use crate::page_cache::PAGE_SZ;
|
||||
use crate::repository::{Key, KEY_SIZE};
|
||||
use crate::tenant::blob_io::BlobWriter;
|
||||
use crate::tenant::blob_io::{BlobWriter, WriteBlobWriter};
|
||||
use crate::tenant::block_io::{BlockBuf, BlockReader, FileBlockReader};
|
||||
use crate::tenant::disk_btree::{DiskBtreeBuilder, DiskBtreeReader, VisitDirection};
|
||||
use crate::tenant::storage_layer::{
|
||||
@@ -43,6 +43,7 @@ use rand::{distributions::Alphanumeric, Rng};
|
||||
use serde::{Deserialize, Serialize};
|
||||
use std::fs::{self, File};
|
||||
use std::io::SeekFrom;
|
||||
use std::io::Write;
|
||||
use std::ops::Range;
|
||||
use std::os::unix::prelude::FileExt;
|
||||
use std::path::{Path, PathBuf};
|
||||
@@ -154,7 +155,7 @@ pub struct ImageLayerInner {
|
||||
lsn: Lsn,
|
||||
|
||||
/// Reader object for reading blocks from the file.
|
||||
file: FileBlockReader,
|
||||
file: FileBlockReader<VirtualFile>,
|
||||
}
|
||||
|
||||
impl std::fmt::Debug for ImageLayerInner {
|
||||
@@ -438,7 +439,6 @@ impl ImageLayerInner {
|
||||
summary: Option<Summary>,
|
||||
) -> anyhow::Result<Self> {
|
||||
let file = VirtualFile::open(path)
|
||||
.await
|
||||
.with_context(|| format!("Failed to open file '{}'", path.display()))?;
|
||||
let file = FileBlockReader::new(file);
|
||||
let summary_blk = file.read_blk(0).await?;
|
||||
@@ -511,7 +511,7 @@ struct ImageLayerWriterInner {
|
||||
key_range: Range<Key>,
|
||||
lsn: Lsn,
|
||||
|
||||
blob_writer: BlobWriter<false>,
|
||||
blob_writer: WriteBlobWriter<VirtualFile>,
|
||||
tree: DiskBtreeBuilder<BlockBuf, KEY_SIZE>,
|
||||
}
|
||||
|
||||
@@ -519,7 +519,7 @@ impl ImageLayerWriterInner {
|
||||
///
|
||||
/// Start building a new image layer.
|
||||
///
|
||||
async fn new(
|
||||
fn new(
|
||||
conf: &'static PageServerConf,
|
||||
timeline_id: TimelineId,
|
||||
tenant_id: TenantId,
|
||||
@@ -541,11 +541,10 @@ impl ImageLayerWriterInner {
|
||||
let mut file = VirtualFile::open_with_options(
|
||||
&path,
|
||||
std::fs::OpenOptions::new().write(true).create_new(true),
|
||||
)
|
||||
.await?;
|
||||
)?;
|
||||
// make room for the header block
|
||||
file.seek(SeekFrom::Start(PAGE_SZ as u64)).await?;
|
||||
let blob_writer = BlobWriter::new(file, PAGE_SZ as u64);
|
||||
file.seek(SeekFrom::Start(PAGE_SZ as u64))?;
|
||||
let blob_writer = WriteBlobWriter::new(file, PAGE_SZ as u64);
|
||||
|
||||
// Initialize the b-tree index builder
|
||||
let block_buf = BlockBuf::new();
|
||||
@@ -570,9 +569,9 @@ impl ImageLayerWriterInner {
|
||||
///
|
||||
/// The page versions must be appended in blknum order.
|
||||
///
|
||||
async fn put_image(&mut self, key: Key, img: &[u8]) -> anyhow::Result<()> {
|
||||
fn put_image(&mut self, key: Key, img: &[u8]) -> anyhow::Result<()> {
|
||||
ensure!(self.key_range.contains(&key));
|
||||
let off = self.blob_writer.write_blob(img).await?;
|
||||
let off = self.blob_writer.write_blob(img)?;
|
||||
|
||||
let mut keybuf: [u8; KEY_SIZE] = [0u8; KEY_SIZE];
|
||||
key.write_to_byte_slice(&mut keybuf);
|
||||
@@ -584,18 +583,17 @@ impl ImageLayerWriterInner {
|
||||
///
|
||||
/// Finish writing the image layer.
|
||||
///
|
||||
async fn finish(self) -> anyhow::Result<ImageLayer> {
|
||||
fn finish(self) -> anyhow::Result<ImageLayer> {
|
||||
let index_start_blk =
|
||||
((self.blob_writer.size() + PAGE_SZ as u64 - 1) / PAGE_SZ as u64) as u32;
|
||||
|
||||
let mut file = self.blob_writer.into_inner();
|
||||
|
||||
// Write out the index
|
||||
file.seek(SeekFrom::Start(index_start_blk as u64 * PAGE_SZ as u64))
|
||||
.await?;
|
||||
file.seek(SeekFrom::Start(index_start_blk as u64 * PAGE_SZ as u64))?;
|
||||
let (index_root_blk, block_buf) = self.tree.finish()?;
|
||||
for buf in block_buf.blocks {
|
||||
file.write_all(buf.as_ref()).await?;
|
||||
file.write_all(buf.as_ref())?;
|
||||
}
|
||||
|
||||
// Fill in the summary on blk 0
|
||||
@@ -609,22 +607,11 @@ impl ImageLayerWriterInner {
|
||||
index_start_blk,
|
||||
index_root_blk,
|
||||
};
|
||||
|
||||
let mut buf = smallvec::SmallVec::<[u8; PAGE_SZ]>::new();
|
||||
Summary::ser_into(&summary, &mut buf)?;
|
||||
if buf.spilled() {
|
||||
// This is bad as we only have one free block for the summary
|
||||
warn!(
|
||||
"Used more than one page size for summary buffer: {}",
|
||||
buf.len()
|
||||
);
|
||||
}
|
||||
file.seek(SeekFrom::Start(0)).await?;
|
||||
file.write_all(&buf).await?;
|
||||
file.seek(SeekFrom::Start(0))?;
|
||||
Summary::ser_into(&summary, &mut file)?;
|
||||
|
||||
let metadata = file
|
||||
.metadata()
|
||||
.await
|
||||
.context("get metadata to determine file size")?;
|
||||
|
||||
let desc = PersistentLayerDesc::new_img(
|
||||
@@ -647,7 +634,7 @@ impl ImageLayerWriterInner {
|
||||
};
|
||||
|
||||
// fsync the file
|
||||
file.sync_all().await?;
|
||||
file.sync_all()?;
|
||||
|
||||
// Rename the file to its final name
|
||||
//
|
||||
@@ -700,7 +687,7 @@ impl ImageLayerWriter {
|
||||
///
|
||||
/// Start building a new image layer.
|
||||
///
|
||||
pub async fn new(
|
||||
pub fn new(
|
||||
conf: &'static PageServerConf,
|
||||
timeline_id: TimelineId,
|
||||
tenant_id: TenantId,
|
||||
@@ -708,9 +695,13 @@ impl ImageLayerWriter {
|
||||
lsn: Lsn,
|
||||
) -> anyhow::Result<ImageLayerWriter> {
|
||||
Ok(Self {
|
||||
inner: Some(
|
||||
ImageLayerWriterInner::new(conf, timeline_id, tenant_id, key_range, lsn).await?,
|
||||
),
|
||||
inner: Some(ImageLayerWriterInner::new(
|
||||
conf,
|
||||
timeline_id,
|
||||
tenant_id,
|
||||
key_range,
|
||||
lsn,
|
||||
)?),
|
||||
})
|
||||
}
|
||||
|
||||
@@ -719,15 +710,15 @@ impl ImageLayerWriter {
|
||||
///
|
||||
/// The page versions must be appended in blknum order.
|
||||
///
|
||||
pub async fn put_image(&mut self, key: Key, img: &[u8]) -> anyhow::Result<()> {
|
||||
self.inner.as_mut().unwrap().put_image(key, img).await
|
||||
pub fn put_image(&mut self, key: Key, img: &[u8]) -> anyhow::Result<()> {
|
||||
self.inner.as_mut().unwrap().put_image(key, img)
|
||||
}
|
||||
|
||||
///
|
||||
/// Finish writing the image layer.
|
||||
///
|
||||
pub async fn finish(mut self) -> anyhow::Result<ImageLayer> {
|
||||
self.inner.take().unwrap().finish().await
|
||||
pub fn finish(mut self) -> anyhow::Result<ImageLayer> {
|
||||
self.inner.take().unwrap().finish()
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -236,7 +236,7 @@ impl InMemoryLayer {
|
||||
///
|
||||
/// Create a new, empty, in-memory layer
|
||||
///
|
||||
pub async fn create(
|
||||
pub fn create(
|
||||
conf: &'static PageServerConf,
|
||||
timeline_id: TimelineId,
|
||||
tenant_id: TenantId,
|
||||
@@ -244,7 +244,7 @@ impl InMemoryLayer {
|
||||
) -> Result<InMemoryLayer> {
|
||||
trace!("initializing new empty InMemoryLayer for writing on timeline {timeline_id} at {start_lsn}");
|
||||
|
||||
let file = EphemeralFile::create(conf, tenant_id, timeline_id).await?;
|
||||
let file = EphemeralFile::create(conf, tenant_id, timeline_id)?;
|
||||
|
||||
Ok(InMemoryLayer {
|
||||
conf,
|
||||
@@ -333,8 +333,7 @@ impl InMemoryLayer {
|
||||
self.tenant_id,
|
||||
Key::MIN,
|
||||
self.start_lsn..end_lsn,
|
||||
)
|
||||
.await?;
|
||||
)?;
|
||||
|
||||
let mut buf = Vec::new();
|
||||
|
||||
@@ -349,13 +348,11 @@ impl InMemoryLayer {
|
||||
for (lsn, pos) in vec_map.as_slice() {
|
||||
cursor.read_blob_into_buf(*pos, &mut buf).await?;
|
||||
let will_init = Value::des(&buf)?.will_init();
|
||||
delta_layer_writer
|
||||
.put_value_bytes(key, *lsn, &buf, will_init)
|
||||
.await?;
|
||||
delta_layer_writer.put_value_bytes(key, *lsn, &buf, will_init)?;
|
||||
}
|
||||
}
|
||||
|
||||
let delta_layer = delta_layer_writer.finish(Key::MAX).await?;
|
||||
let delta_layer = delta_layer_writer.finish(Key::MAX)?;
|
||||
Ok(delta_layer)
|
||||
}
|
||||
}
|
||||
|
||||
@@ -102,7 +102,6 @@ async fn compaction_loop(tenant: Arc<Tenant>, cancel: CancellationToken) {
|
||||
let started_at = Instant::now();
|
||||
|
||||
let sleep_duration = if period == Duration::ZERO {
|
||||
#[cfg(not(feature = "testing"))]
|
||||
info!("automatic compaction is disabled");
|
||||
// check again in 10 seconds, in case it's been enabled again.
|
||||
Duration::from_secs(10)
|
||||
@@ -167,7 +166,6 @@ async fn gc_loop(tenant: Arc<Tenant>, cancel: CancellationToken) {
|
||||
|
||||
let gc_horizon = tenant.get_gc_horizon();
|
||||
let sleep_duration = if period == Duration::ZERO || gc_horizon == 0 {
|
||||
#[cfg(not(feature = "testing"))]
|
||||
info!("automatic GC is disabled");
|
||||
// check again in 10 seconds, in case it's been enabled again.
|
||||
Duration::from_secs(10)
|
||||
|
||||
@@ -90,7 +90,6 @@ use self::logical_size::LogicalSize;
|
||||
use self::walreceiver::{WalReceiver, WalReceiverConf};
|
||||
|
||||
use super::config::TenantConf;
|
||||
use super::debug_assert_current_span_has_tenant_and_timeline_id;
|
||||
use super::remote_timeline_client::index::IndexPart;
|
||||
use super::remote_timeline_client::RemoteTimelineClient;
|
||||
use super::storage_layer::{
|
||||
@@ -934,48 +933,6 @@ impl Timeline {
|
||||
self.launch_eviction_task(background_jobs_can_start);
|
||||
}
|
||||
|
||||
#[instrument(skip_all, fields(timeline_id=%self.timeline_id))]
|
||||
pub async fn shutdown(self: &Arc<Self>, freeze_and_flush: bool) {
|
||||
debug_assert_current_span_has_tenant_and_timeline_id();
|
||||
|
||||
// prevent writes to the InMemoryLayer
|
||||
task_mgr::shutdown_tasks(
|
||||
Some(TaskKind::WalReceiverManager),
|
||||
Some(self.tenant_id),
|
||||
Some(self.timeline_id),
|
||||
)
|
||||
.await;
|
||||
|
||||
// now all writers to InMemory layer are gone, do the final flush if requested
|
||||
if freeze_and_flush {
|
||||
match self.freeze_and_flush().await {
|
||||
Ok(()) => {}
|
||||
Err(e) => {
|
||||
warn!("failed to freeze and flush: {e:#}");
|
||||
return; // TODO: should probably drain remote timeline client anyways?
|
||||
}
|
||||
}
|
||||
|
||||
// drain the upload queue
|
||||
let res = if let Some(client) = self.remote_client.as_ref() {
|
||||
// if we did not wait for completion here, it might be our shutdown process
|
||||
// didn't wait for remote uploads to complete at all, as new tasks can forever
|
||||
// be spawned.
|
||||
//
|
||||
// what is problematic is the shutting down of RemoteTimelineClient, because
|
||||
// obviously it does not make sense to stop while we wait for it, but what
|
||||
// about corner cases like s3 suddenly hanging up?
|
||||
client.wait_completion().await
|
||||
} else {
|
||||
Ok(())
|
||||
};
|
||||
|
||||
if let Err(e) = res {
|
||||
warn!("failed to await for frozen and flushed uploads: {e:#}");
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
pub fn set_state(&self, new_state: TimelineState) {
|
||||
match (self.current_state(), new_state) {
|
||||
(equal_state_1, equal_state_2) if equal_state_1 == equal_state_2 => {
|
||||
@@ -1724,18 +1681,11 @@ impl Timeline {
|
||||
for (name, decision) in decided {
|
||||
let decision = match decision {
|
||||
Ok(UseRemote { local, remote }) => {
|
||||
// Remote is authoritative, but we may still choose to retain
|
||||
// the local file if the contents appear to match
|
||||
if local.file_size() == remote.file_size() {
|
||||
// Use the local file, but take the remote metadata so that we pick up
|
||||
// the correct generation.
|
||||
UseLocal(remote)
|
||||
} else {
|
||||
path.push(name.file_name());
|
||||
init::cleanup_local_file_for_remote(&path, &local, &remote)?;
|
||||
path.pop();
|
||||
UseRemote { local, remote }
|
||||
}
|
||||
path.push(name.file_name());
|
||||
init::cleanup_local_file_for_remote(&path, &local, &remote)?;
|
||||
path.pop();
|
||||
|
||||
UseRemote { local, remote }
|
||||
}
|
||||
Ok(decision) => decision,
|
||||
Err(FutureLayer { local }) => {
|
||||
@@ -2544,15 +2494,13 @@ impl Timeline {
|
||||
///
|
||||
async fn get_layer_for_write(&self, lsn: Lsn) -> anyhow::Result<Arc<InMemoryLayer>> {
|
||||
let mut guard = self.layers.write().await;
|
||||
let layer = guard
|
||||
.get_layer_for_write(
|
||||
lsn,
|
||||
self.get_last_record_lsn(),
|
||||
self.conf,
|
||||
self.timeline_id,
|
||||
self.tenant_id,
|
||||
)
|
||||
.await?;
|
||||
let layer = guard.get_layer_for_write(
|
||||
lsn,
|
||||
self.get_last_record_lsn(),
|
||||
self.conf,
|
||||
self.timeline_id,
|
||||
self.tenant_id,
|
||||
)?;
|
||||
Ok(layer)
|
||||
}
|
||||
|
||||
@@ -2837,9 +2785,15 @@ impl Timeline {
|
||||
x.unwrap()
|
||||
));
|
||||
|
||||
save_metadata(self.conf, &self.tenant_id, &self.timeline_id, &metadata)
|
||||
.await
|
||||
.context("save_metadata")?;
|
||||
save_metadata(
|
||||
self.conf,
|
||||
&self.tenant_id,
|
||||
&self.timeline_id,
|
||||
&metadata,
|
||||
false,
|
||||
)
|
||||
.await
|
||||
.context("save_metadata")?;
|
||||
|
||||
if let Some(remote_client) = &self.remote_client {
|
||||
for (path, layer_metadata) in layer_paths_to_upload {
|
||||
@@ -3042,8 +2996,7 @@ impl Timeline {
|
||||
self.tenant_id,
|
||||
&img_range,
|
||||
lsn,
|
||||
)
|
||||
.await?;
|
||||
)?;
|
||||
|
||||
fail_point!("image-layer-writer-fail-before-finish", |_| {
|
||||
Err(PageReconstructError::Other(anyhow::anyhow!(
|
||||
@@ -3079,11 +3032,11 @@ impl Timeline {
|
||||
}
|
||||
}
|
||||
};
|
||||
image_layer_writer.put_image(key, &img).await?;
|
||||
image_layer_writer.put_image(key, &img)?;
|
||||
key = key.next();
|
||||
}
|
||||
}
|
||||
let image_layer = image_layer_writer.finish().await?;
|
||||
let image_layer = image_layer_writer.finish()?;
|
||||
image_layers.push(image_layer);
|
||||
}
|
||||
}
|
||||
@@ -3628,11 +3581,7 @@ impl Timeline {
|
||||
{
|
||||
// ... if so, flush previous layer and prepare to write new one
|
||||
new_layers.push(Arc::new(
|
||||
writer
|
||||
.take()
|
||||
.unwrap()
|
||||
.finish(prev_key.unwrap().next())
|
||||
.await?,
|
||||
writer.take().unwrap().finish(prev_key.unwrap().next())?,
|
||||
));
|
||||
writer = None;
|
||||
|
||||
@@ -3647,23 +3596,20 @@ impl Timeline {
|
||||
}
|
||||
if writer.is_none() {
|
||||
// Create writer if not initiaized yet
|
||||
writer = Some(
|
||||
DeltaLayerWriter::new(
|
||||
self.conf,
|
||||
self.timeline_id,
|
||||
self.tenant_id,
|
||||
key,
|
||||
if dup_end_lsn.is_valid() {
|
||||
// this is a layer containing slice of values of the same key
|
||||
debug!("Create new dup layer {}..{}", dup_start_lsn, dup_end_lsn);
|
||||
dup_start_lsn..dup_end_lsn
|
||||
} else {
|
||||
debug!("Create new layer {}..{}", lsn_range.start, lsn_range.end);
|
||||
lsn_range.clone()
|
||||
},
|
||||
)
|
||||
.await?,
|
||||
);
|
||||
writer = Some(DeltaLayerWriter::new(
|
||||
self.conf,
|
||||
self.timeline_id,
|
||||
self.tenant_id,
|
||||
key,
|
||||
if dup_end_lsn.is_valid() {
|
||||
// this is a layer containing slice of values of the same key
|
||||
debug!("Create new dup layer {}..{}", dup_start_lsn, dup_end_lsn);
|
||||
dup_start_lsn..dup_end_lsn
|
||||
} else {
|
||||
debug!("Create new layer {}..{}", lsn_range.start, lsn_range.end);
|
||||
lsn_range.clone()
|
||||
},
|
||||
)?);
|
||||
}
|
||||
|
||||
fail_point!("delta-layer-writer-fail-before-finish", |_| {
|
||||
@@ -3672,11 +3618,11 @@ impl Timeline {
|
||||
)))
|
||||
});
|
||||
|
||||
writer.as_mut().unwrap().put_value(key, lsn, value).await?;
|
||||
writer.as_mut().unwrap().put_value(key, lsn, value)?;
|
||||
prev_key = Some(key);
|
||||
}
|
||||
if let Some(writer) = writer {
|
||||
new_layers.push(Arc::new(writer.finish(prev_key.unwrap().next()).await?));
|
||||
new_layers.push(Arc::new(writer.finish(prev_key.unwrap().next())?));
|
||||
}
|
||||
|
||||
// Sync layers
|
||||
@@ -4799,8 +4745,22 @@ mod tests {
|
||||
let harness =
|
||||
TenantHarness::create("two_layer_eviction_attempts_at_the_same_time").unwrap();
|
||||
|
||||
let remote_storage = {
|
||||
// this is never used for anything, because of how the create_test_timeline works, but
|
||||
// it is with us in spirit and a Some.
|
||||
use remote_storage::{GenericRemoteStorage, RemoteStorageConfig, RemoteStorageKind};
|
||||
let path = harness.conf.workdir.join("localfs");
|
||||
std::fs::create_dir_all(&path).unwrap();
|
||||
let config = RemoteStorageConfig {
|
||||
max_concurrent_syncs: std::num::NonZeroUsize::new(2_000_000).unwrap(),
|
||||
max_sync_errors: std::num::NonZeroU32::new(3_000_000).unwrap(),
|
||||
storage: RemoteStorageKind::LocalFs(path),
|
||||
};
|
||||
GenericRemoteStorage::from_config(&config).unwrap()
|
||||
};
|
||||
|
||||
let ctx = any_context();
|
||||
let tenant = harness.try_load(&ctx).await.unwrap();
|
||||
let tenant = harness.try_load(&ctx, Some(remote_storage)).await.unwrap();
|
||||
let timeline = tenant
|
||||
.create_test_timeline(TimelineId::generate(), Lsn(0x10), 14, &ctx)
|
||||
.await
|
||||
@@ -4850,8 +4810,22 @@ mod tests {
|
||||
async fn layer_eviction_aba_fails() {
|
||||
let harness = TenantHarness::create("layer_eviction_aba_fails").unwrap();
|
||||
|
||||
let remote_storage = {
|
||||
// this is never used for anything, because of how the create_test_timeline works, but
|
||||
// it is with us in spirit and a Some.
|
||||
use remote_storage::{GenericRemoteStorage, RemoteStorageConfig, RemoteStorageKind};
|
||||
let path = harness.conf.workdir.join("localfs");
|
||||
std::fs::create_dir_all(&path).unwrap();
|
||||
let config = RemoteStorageConfig {
|
||||
max_concurrent_syncs: std::num::NonZeroUsize::new(2_000_000).unwrap(),
|
||||
max_sync_errors: std::num::NonZeroU32::new(3_000_000).unwrap(),
|
||||
storage: RemoteStorageKind::LocalFs(path),
|
||||
};
|
||||
GenericRemoteStorage::from_config(&config).unwrap()
|
||||
};
|
||||
|
||||
let ctx = any_context();
|
||||
let tenant = harness.try_load(&ctx).await.unwrap();
|
||||
let tenant = harness.try_load(&ctx, Some(remote_storage)).await.unwrap();
|
||||
let timeline = tenant
|
||||
.create_test_timeline(TimelineId::generate(), Lsn(0x10), 14, &ctx)
|
||||
.await
|
||||
|
||||
@@ -147,7 +147,11 @@ pub(super) fn reconcile(
|
||||
Err(FutureLayer { local })
|
||||
} else {
|
||||
Ok(match (local, remote) {
|
||||
(Some(local), Some(remote)) if local != remote => UseRemote { local, remote },
|
||||
(Some(local), Some(remote)) if local != remote => {
|
||||
assert_eq!(local.generation, remote.generation);
|
||||
|
||||
UseRemote { local, remote }
|
||||
}
|
||||
(Some(x), Some(_)) => UseLocal(x),
|
||||
(None, Some(x)) => Evicted(x),
|
||||
(Some(x), None) => NeedsUpload(x),
|
||||
|
||||
@@ -87,7 +87,7 @@ impl LayerManager {
|
||||
|
||||
/// Open a new writable layer to append data if there is no open layer, otherwise return the current open layer,
|
||||
/// called within `get_layer_for_write`.
|
||||
pub(crate) async fn get_layer_for_write(
|
||||
pub(crate) fn get_layer_for_write(
|
||||
&mut self,
|
||||
lsn: Lsn,
|
||||
last_record_lsn: Lsn,
|
||||
@@ -129,7 +129,7 @@ impl LayerManager {
|
||||
lsn
|
||||
);
|
||||
|
||||
let new_layer = InMemoryLayer::create(conf, timeline_id, tenant_id, start_lsn).await?;
|
||||
let new_layer = InMemoryLayer::create(conf, timeline_id, tenant_id, start_lsn)?;
|
||||
let layer = Arc::new(new_layer);
|
||||
|
||||
self.layer_map.open_layer = Some(layer.clone());
|
||||
|
||||
@@ -11,15 +11,13 @@
|
||||
//! src/backend/storage/file/fd.c
|
||||
//!
|
||||
use crate::metrics::{STORAGE_IO_SIZE, STORAGE_IO_TIME};
|
||||
use crate::tenant::TENANTS_SEGMENT_NAME;
|
||||
use futures::Future;
|
||||
use once_cell::sync::OnceCell;
|
||||
use std::fs::{self, File, OpenOptions};
|
||||
use std::io::{Error, ErrorKind, Seek, SeekFrom};
|
||||
use std::io::{Error, ErrorKind, Seek, SeekFrom, Write};
|
||||
use std::os::unix::fs::FileExt;
|
||||
use std::path::{Path, PathBuf};
|
||||
use std::sync::atomic::{AtomicBool, AtomicUsize, Ordering};
|
||||
use tokio::sync::{RwLock, RwLockWriteGuard};
|
||||
use std::sync::{RwLock, RwLockWriteGuard};
|
||||
|
||||
///
|
||||
/// A virtual file descriptor. You can use this just like std::fs::File, but internally
|
||||
@@ -111,7 +109,7 @@ impl OpenFiles {
|
||||
///
|
||||
/// On return, we hold a lock on the slot, and its 'tag' has been updated
|
||||
/// recently_used has been set. It's all ready for reuse.
|
||||
async fn find_victim_slot(&self) -> (SlotHandle, RwLockWriteGuard<SlotInner>) {
|
||||
fn find_victim_slot(&self) -> (SlotHandle, RwLockWriteGuard<SlotInner>) {
|
||||
//
|
||||
// Run the clock algorithm to find a slot to replace.
|
||||
//
|
||||
@@ -143,7 +141,7 @@ impl OpenFiles {
|
||||
}
|
||||
retries += 1;
|
||||
} else {
|
||||
slot_guard = slot.inner.write().await;
|
||||
slot_guard = slot.inner.write().unwrap();
|
||||
index = next;
|
||||
break;
|
||||
}
|
||||
@@ -174,55 +172,19 @@ impl OpenFiles {
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Debug, thiserror::Error)]
|
||||
pub enum CrashsafeOverwriteError {
|
||||
#[error("final path has no parent dir")]
|
||||
FinalPathHasNoParentDir,
|
||||
#[error("remove tempfile: {0}")]
|
||||
RemovePreviousTempfile(#[source] std::io::Error),
|
||||
#[error("create tempfile: {0}")]
|
||||
CreateTempfile(#[source] std::io::Error),
|
||||
#[error("write tempfile: {0}")]
|
||||
WriteContents(#[source] std::io::Error),
|
||||
#[error("sync tempfile: {0}")]
|
||||
SyncTempfile(#[source] std::io::Error),
|
||||
#[error("rename tempfile to final path: {0}")]
|
||||
RenameTempfileToFinalPath(#[source] std::io::Error),
|
||||
#[error("open final path parent dir: {0}")]
|
||||
OpenFinalPathParentDir(#[source] std::io::Error),
|
||||
#[error("sync final path parent dir: {0}")]
|
||||
SyncFinalPathParentDir(#[source] std::io::Error),
|
||||
}
|
||||
impl CrashsafeOverwriteError {
|
||||
/// Returns true iff the new contents are durably stored.
|
||||
pub fn are_new_contents_durable(&self) -> bool {
|
||||
match self {
|
||||
Self::FinalPathHasNoParentDir => false,
|
||||
Self::RemovePreviousTempfile(_) => false,
|
||||
Self::CreateTempfile(_) => false,
|
||||
Self::WriteContents(_) => false,
|
||||
Self::SyncTempfile(_) => false,
|
||||
Self::RenameTempfileToFinalPath(_) => false,
|
||||
Self::OpenFinalPathParentDir(_) => false,
|
||||
Self::SyncFinalPathParentDir(_) => true,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl VirtualFile {
|
||||
/// Open a file in read-only mode. Like File::open.
|
||||
pub async fn open(path: &Path) -> Result<VirtualFile, std::io::Error> {
|
||||
Self::open_with_options(path, OpenOptions::new().read(true)).await
|
||||
pub fn open(path: &Path) -> Result<VirtualFile, std::io::Error> {
|
||||
Self::open_with_options(path, OpenOptions::new().read(true))
|
||||
}
|
||||
|
||||
/// Create a new file for writing. If the file exists, it will be truncated.
|
||||
/// Like File::create.
|
||||
pub async fn create(path: &Path) -> Result<VirtualFile, std::io::Error> {
|
||||
pub fn create(path: &Path) -> Result<VirtualFile, std::io::Error> {
|
||||
Self::open_with_options(
|
||||
path,
|
||||
OpenOptions::new().write(true).create(true).truncate(true),
|
||||
)
|
||||
.await
|
||||
}
|
||||
|
||||
/// Open a file with given options.
|
||||
@@ -230,7 +192,7 @@ impl VirtualFile {
|
||||
/// Note: If any custom flags were set in 'open_options' through OpenOptionsExt,
|
||||
/// they will be applied also when the file is subsequently re-opened, not only
|
||||
/// on the first time. Make sure that's sane!
|
||||
pub async fn open_with_options(
|
||||
pub fn open_with_options(
|
||||
path: &Path,
|
||||
open_options: &OpenOptions,
|
||||
) -> Result<VirtualFile, std::io::Error> {
|
||||
@@ -238,14 +200,14 @@ impl VirtualFile {
|
||||
let parts = path_str.split('/').collect::<Vec<&str>>();
|
||||
let tenant_id;
|
||||
let timeline_id;
|
||||
if parts.len() > 5 && parts[parts.len() - 5] == TENANTS_SEGMENT_NAME {
|
||||
if parts.len() > 5 && parts[parts.len() - 5] == "tenants" {
|
||||
tenant_id = parts[parts.len() - 4].to_string();
|
||||
timeline_id = parts[parts.len() - 2].to_string();
|
||||
} else {
|
||||
tenant_id = "*".to_string();
|
||||
timeline_id = "*".to_string();
|
||||
}
|
||||
let (handle, mut slot_guard) = get_open_files().find_victim_slot().await;
|
||||
let (handle, mut slot_guard) = get_open_files().find_victim_slot();
|
||||
let file = STORAGE_IO_TIME
|
||||
.with_label_values(&["open"])
|
||||
.observe_closure_duration(|| open_options.open(path))?;
|
||||
@@ -274,79 +236,21 @@ impl VirtualFile {
|
||||
Ok(vfile)
|
||||
}
|
||||
|
||||
/// Writes a file to the specified `final_path` in a crash safe fasion
|
||||
///
|
||||
/// The file is first written to the specified tmp_path, and in a second
|
||||
/// step, the tmp path is renamed to the final path. As renames are
|
||||
/// atomic, a crash during the write operation will never leave behind a
|
||||
/// partially written file.
|
||||
pub async fn crashsafe_overwrite(
|
||||
final_path: &Path,
|
||||
tmp_path: &Path,
|
||||
content: &[u8],
|
||||
) -> Result<(), CrashsafeOverwriteError> {
|
||||
let Some(final_path_parent) = final_path.parent() else {
|
||||
return Err(CrashsafeOverwriteError::FinalPathHasNoParentDir);
|
||||
};
|
||||
match std::fs::remove_file(tmp_path) {
|
||||
Ok(()) => {}
|
||||
Err(e) if e.kind() == std::io::ErrorKind::NotFound => {}
|
||||
Err(e) => return Err(CrashsafeOverwriteError::RemovePreviousTempfile(e)),
|
||||
}
|
||||
let mut file = Self::open_with_options(
|
||||
tmp_path,
|
||||
OpenOptions::new()
|
||||
.write(true)
|
||||
// Use `create_new` so that, if we race with ourselves or something else,
|
||||
// we bail out instead of causing damage.
|
||||
.create_new(true),
|
||||
)
|
||||
.await
|
||||
.map_err(CrashsafeOverwriteError::CreateTempfile)?;
|
||||
file.write_all(content)
|
||||
.await
|
||||
.map_err(CrashsafeOverwriteError::WriteContents)?;
|
||||
file.sync_all()
|
||||
.await
|
||||
.map_err(CrashsafeOverwriteError::SyncTempfile)?;
|
||||
drop(file); // before the rename, that's important!
|
||||
// renames are atomic
|
||||
std::fs::rename(tmp_path, final_path)
|
||||
.map_err(CrashsafeOverwriteError::RenameTempfileToFinalPath)?;
|
||||
// Only open final path parent dirfd now, so that this operation only
|
||||
// ever holds one VirtualFile fd at a time. That's important because
|
||||
// the current `find_victim_slot` impl might pick the same slot for both
|
||||
// VirtualFile., and it eventually does a blocking write lock instead of
|
||||
// try_lock.
|
||||
let final_parent_dirfd =
|
||||
Self::open_with_options(final_path_parent, OpenOptions::new().read(true))
|
||||
.await
|
||||
.map_err(CrashsafeOverwriteError::OpenFinalPathParentDir)?;
|
||||
final_parent_dirfd
|
||||
.sync_all()
|
||||
.await
|
||||
.map_err(CrashsafeOverwriteError::SyncFinalPathParentDir)?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Call File::sync_all() on the underlying File.
|
||||
pub async fn sync_all(&self) -> Result<(), Error> {
|
||||
self.with_file("fsync", |file| async move { file.sync_all() })
|
||||
.await?
|
||||
pub fn sync_all(&self) -> Result<(), Error> {
|
||||
self.with_file("fsync", |file| file.sync_all())?
|
||||
}
|
||||
|
||||
pub async fn metadata(&self) -> Result<fs::Metadata, Error> {
|
||||
self.with_file("metadata", |file| async move { file.metadata() })
|
||||
.await?
|
||||
pub fn metadata(&self) -> Result<fs::Metadata, Error> {
|
||||
self.with_file("metadata", |file| file.metadata())?
|
||||
}
|
||||
|
||||
/// Helper function that looks up the underlying File for this VirtualFile,
|
||||
/// opening it and evicting some other File if necessary. It calls 'func'
|
||||
/// with the physical File.
|
||||
async fn with_file<F, R, FR>(&self, _op: &str, func: F) -> Result<R, Error>
|
||||
fn with_file<F, R>(&self, op: &str, mut func: F) -> Result<R, Error>
|
||||
where
|
||||
F: FnOnce(&File) -> FR,
|
||||
FR: Future<Output = R>,
|
||||
F: FnMut(&File) -> R,
|
||||
{
|
||||
let open_files = get_open_files();
|
||||
|
||||
@@ -357,17 +261,19 @@ impl VirtualFile {
|
||||
// We only need to hold the handle lock while we read the current handle. If
|
||||
// another thread closes the file and recycles the slot for a different file,
|
||||
// we will notice that the handle we read is no longer valid and retry.
|
||||
let mut handle = *self.handle.read().await;
|
||||
let mut handle = *self.handle.read().unwrap();
|
||||
loop {
|
||||
// Check if the slot contains our File
|
||||
{
|
||||
let slot = &open_files.slots[handle.index];
|
||||
let slot_guard = slot.inner.read().await;
|
||||
let slot_guard = slot.inner.read().unwrap();
|
||||
if slot_guard.tag == handle.tag {
|
||||
if let Some(file) = &slot_guard.file {
|
||||
// Found a cached file descriptor.
|
||||
slot.recently_used.store(true, Ordering::Relaxed);
|
||||
return Ok(func(file).await);
|
||||
return Ok(STORAGE_IO_TIME
|
||||
.with_label_values(&[op])
|
||||
.observe_closure_duration(|| func(file)));
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -375,7 +281,7 @@ impl VirtualFile {
|
||||
// The slot didn't contain our File. We will have to open it ourselves,
|
||||
// but before that, grab a write lock on handle in the VirtualFile, so
|
||||
// that no other thread will try to concurrently open the same file.
|
||||
let handle_guard = self.handle.write().await;
|
||||
let handle_guard = self.handle.write().unwrap();
|
||||
|
||||
// If another thread changed the handle while we were not holding the lock,
|
||||
// then the handle might now be valid again. Loop back to retry.
|
||||
@@ -389,7 +295,7 @@ impl VirtualFile {
|
||||
|
||||
// We need to open the file ourselves. The handle in the VirtualFile is
|
||||
// now locked in write-mode. Find a free slot to put it in.
|
||||
let (handle, mut slot_guard) = open_files.find_victim_slot().await;
|
||||
let (handle, mut slot_guard) = open_files.find_victim_slot();
|
||||
|
||||
// Open the physical file
|
||||
let file = STORAGE_IO_TIME
|
||||
@@ -397,7 +303,9 @@ impl VirtualFile {
|
||||
.observe_closure_duration(|| self.open_options.open(&self.path))?;
|
||||
|
||||
// Perform the requested operation on it
|
||||
let result = func(&file).await;
|
||||
let result = STORAGE_IO_TIME
|
||||
.with_label_values(&[op])
|
||||
.observe_closure_duration(|| func(&file));
|
||||
|
||||
// Store the File in the slot and update the handle in the VirtualFile
|
||||
// to point to it.
|
||||
@@ -414,17 +322,13 @@ impl VirtualFile {
|
||||
std::fs::remove_file(path).expect("failed to remove the virtual file");
|
||||
}
|
||||
|
||||
pub async fn seek(&mut self, pos: SeekFrom) -> Result<u64, Error> {
|
||||
pub fn seek(&mut self, pos: SeekFrom) -> Result<u64, Error> {
|
||||
match pos {
|
||||
SeekFrom::Start(offset) => {
|
||||
self.pos = offset;
|
||||
}
|
||||
SeekFrom::End(offset) => {
|
||||
self.pos = self
|
||||
.with_file("seek", |mut file| async move {
|
||||
file.seek(SeekFrom::End(offset))
|
||||
})
|
||||
.await??
|
||||
self.pos = self.with_file("seek", |mut file| file.seek(SeekFrom::End(offset)))??
|
||||
}
|
||||
SeekFrom::Current(offset) => {
|
||||
let pos = self.pos as i128 + offset as i128;
|
||||
@@ -467,7 +371,7 @@ impl VirtualFile {
|
||||
// Copied from https://doc.rust-lang.org/1.72.0/src/std/os/unix/fs.rs.html#219-235
|
||||
pub async fn write_all_at(&self, mut buf: &[u8], mut offset: u64) -> Result<(), Error> {
|
||||
while !buf.is_empty() {
|
||||
match self.write_at(buf, offset).await {
|
||||
match self.write_at(buf, offset) {
|
||||
Ok(0) => {
|
||||
return Err(Error::new(
|
||||
std::io::ErrorKind::WriteZero,
|
||||
@@ -485,36 +389,22 @@ impl VirtualFile {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub async fn write_all(&mut self, mut buf: &[u8]) -> Result<(), Error> {
|
||||
while !buf.is_empty() {
|
||||
match self.write(buf).await {
|
||||
Ok(0) => {
|
||||
return Err(Error::new(
|
||||
std::io::ErrorKind::WriteZero,
|
||||
"failed to write whole buffer",
|
||||
));
|
||||
}
|
||||
Ok(n) => {
|
||||
buf = &buf[n..];
|
||||
}
|
||||
Err(ref e) if e.kind() == std::io::ErrorKind::Interrupted => {}
|
||||
Err(e) => return Err(e),
|
||||
}
|
||||
/// Write the given buffer (which has to be below the kernel's internal page size) and fsync
|
||||
///
|
||||
/// This ensures some level of atomicity (not a good one, but it's the best we have).
|
||||
pub fn write_and_fsync(&mut self, buf: &[u8]) -> Result<(), Error> {
|
||||
if self.write(buf)? != buf.len() {
|
||||
return Err(Error::new(
|
||||
std::io::ErrorKind::Other,
|
||||
"Could not write all the bytes in a single call",
|
||||
));
|
||||
}
|
||||
self.sync_all()?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
async fn write(&mut self, buf: &[u8]) -> Result<usize, std::io::Error> {
|
||||
let pos = self.pos;
|
||||
let n = self.write_at(buf, pos).await?;
|
||||
self.pos += n as u64;
|
||||
Ok(n)
|
||||
}
|
||||
|
||||
pub async fn read_at(&self, buf: &mut [u8], offset: u64) -> Result<usize, Error> {
|
||||
let result = self
|
||||
.with_file("read", |file| async move { file.read_at(buf, offset) })
|
||||
.await?;
|
||||
async fn read_at(&self, buf: &mut [u8], offset: u64) -> Result<usize, Error> {
|
||||
let result = self.with_file("read", |file| file.read_at(buf, offset))?;
|
||||
if let Ok(size) = result {
|
||||
STORAGE_IO_SIZE
|
||||
.with_label_values(&["read", &self.tenant_id, &self.timeline_id])
|
||||
@@ -523,10 +413,8 @@ impl VirtualFile {
|
||||
result
|
||||
}
|
||||
|
||||
async fn write_at(&self, buf: &[u8], offset: u64) -> Result<usize, Error> {
|
||||
let result = self
|
||||
.with_file("write", |file| async move { file.write_at(buf, offset) })
|
||||
.await?;
|
||||
pub fn write_at(&self, buf: &[u8], offset: u64) -> Result<usize, Error> {
|
||||
let result = self.with_file("write", |file| file.write_at(buf, offset))?;
|
||||
if let Ok(size) = result {
|
||||
STORAGE_IO_SIZE
|
||||
.with_label_values(&["write", &self.tenant_id, &self.timeline_id])
|
||||
@@ -536,51 +424,15 @@ impl VirtualFile {
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
impl VirtualFile {
|
||||
pub(crate) async fn read_blk(
|
||||
&self,
|
||||
blknum: u32,
|
||||
) -> Result<crate::tenant::block_io::BlockLease<'_>, std::io::Error> {
|
||||
use crate::page_cache::PAGE_SZ;
|
||||
let mut buf = [0; PAGE_SZ];
|
||||
self.read_exact_at(&mut buf, blknum as u64 * (PAGE_SZ as u64))
|
||||
.await?;
|
||||
Ok(std::sync::Arc::new(buf).into())
|
||||
}
|
||||
|
||||
async fn read_to_end(&mut self, buf: &mut Vec<u8>) -> Result<(), Error> {
|
||||
loop {
|
||||
let mut tmp = [0; 128];
|
||||
match self.read_at(&mut tmp, self.pos).await {
|
||||
Ok(0) => return Ok(()),
|
||||
Ok(n) => {
|
||||
self.pos += n as u64;
|
||||
buf.extend_from_slice(&tmp[..n]);
|
||||
}
|
||||
Err(ref e) if e.kind() == std::io::ErrorKind::Interrupted => {}
|
||||
Err(e) => return Err(e),
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl Drop for VirtualFile {
|
||||
/// If a VirtualFile is dropped, close the underlying file if it was open.
|
||||
fn drop(&mut self) {
|
||||
let handle = self.handle.get_mut();
|
||||
let handle = self.handle.get_mut().unwrap();
|
||||
|
||||
// We don't have async drop so we cannot wait for the lock here.
|
||||
// Instead, do a best-effort attempt at closing the underlying
|
||||
// file descriptor by using `try_write`.
|
||||
// This best-effort attempt should be quite good though
|
||||
// as we have `&mut self` access. In other words, if the slot
|
||||
// is still occupied by our file, we should be the only ones
|
||||
// accessing it (and if it has been reassigned since, we don't
|
||||
// need to bother with dropping anyways).
|
||||
// We could check with a read-lock first, to avoid waiting on an
|
||||
// unrelated I/O.
|
||||
let slot = &get_open_files().slots[handle.index];
|
||||
let Ok(mut slot_guard) = slot.inner.try_write() else { return };
|
||||
|
||||
let mut slot_guard = slot.inner.write().unwrap();
|
||||
if slot_guard.tag == handle.tag {
|
||||
slot.recently_used.store(false, Ordering::Relaxed);
|
||||
// there is also operation "close-by-replace" for closes done on eviction for
|
||||
@@ -592,6 +444,21 @@ impl Drop for VirtualFile {
|
||||
}
|
||||
}
|
||||
|
||||
impl Write for VirtualFile {
|
||||
fn write(&mut self, buf: &[u8]) -> Result<usize, std::io::Error> {
|
||||
let pos = self.pos;
|
||||
let n = self.write_at(buf, pos)?;
|
||||
self.pos += n as u64;
|
||||
Ok(n)
|
||||
}
|
||||
|
||||
fn flush(&mut self) -> Result<(), std::io::Error> {
|
||||
// flush is no-op for File (at least on unix), so we don't need to do
|
||||
// anything here either.
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
impl OpenFiles {
|
||||
fn new(num_slots: usize) -> OpenFiles {
|
||||
let mut slots = Box::new(Vec::with_capacity(num_slots));
|
||||
@@ -645,8 +512,6 @@ mod tests {
|
||||
use rand::seq::SliceRandom;
|
||||
use rand::thread_rng;
|
||||
use rand::Rng;
|
||||
use std::future::Future;
|
||||
use std::io::Write;
|
||||
use std::sync::Arc;
|
||||
|
||||
enum MaybeVirtualFile {
|
||||
@@ -667,15 +532,15 @@ mod tests {
|
||||
MaybeVirtualFile::File(file) => file.write_all_at(buf, offset),
|
||||
}
|
||||
}
|
||||
async fn seek(&mut self, pos: SeekFrom) -> Result<u64, Error> {
|
||||
fn seek(&mut self, pos: SeekFrom) -> Result<u64, Error> {
|
||||
match self {
|
||||
MaybeVirtualFile::VirtualFile(file) => file.seek(pos).await,
|
||||
MaybeVirtualFile::VirtualFile(file) => file.seek(pos),
|
||||
MaybeVirtualFile::File(file) => file.seek(pos),
|
||||
}
|
||||
}
|
||||
async fn write_all(&mut self, buf: &[u8]) -> Result<(), Error> {
|
||||
match self {
|
||||
MaybeVirtualFile::VirtualFile(file) => file.write_all(buf).await,
|
||||
MaybeVirtualFile::VirtualFile(file) => file.write_all(buf),
|
||||
MaybeVirtualFile::File(file) => file.write_all(buf),
|
||||
}
|
||||
}
|
||||
@@ -687,9 +552,10 @@ mod tests {
|
||||
let mut buf = String::new();
|
||||
match self {
|
||||
MaybeVirtualFile::VirtualFile(file) => {
|
||||
let mut buf = Vec::new();
|
||||
file.read_to_end(&mut buf).await?;
|
||||
return Ok(String::from_utf8(buf).unwrap());
|
||||
let pos = file.seek(SeekFrom::Current(0))?;
|
||||
let len = file.metadata()?.len().saturating_sub(pos);
|
||||
let len_usize = len.try_into().unwrap();
|
||||
return self.read_string_at(pos, len_usize).await;
|
||||
}
|
||||
MaybeVirtualFile::File(file) => {
|
||||
file.read_to_string(&mut buf)?;
|
||||
@@ -700,7 +566,8 @@ mod tests {
|
||||
|
||||
// Helper function to slurp a portion of a file into a string
|
||||
async fn read_string_at(&mut self, pos: u64, len: usize) -> Result<String, Error> {
|
||||
let mut buf = vec![0; len];
|
||||
let mut buf = Vec::new();
|
||||
buf.resize(len, 0);
|
||||
self.read_exact_at(&mut buf, pos).await?;
|
||||
Ok(String::from_utf8(buf).unwrap())
|
||||
}
|
||||
@@ -715,8 +582,8 @@ mod tests {
|
||||
// results with VirtualFiles as with native Files. (Except that with
|
||||
// native files, you will run out of file descriptors if the ulimit
|
||||
// is low enough.)
|
||||
test_files("virtual_files", |path, open_options| async move {
|
||||
let vf = VirtualFile::open_with_options(&path, &open_options).await?;
|
||||
test_files("virtual_files", |path, open_options| {
|
||||
let vf = VirtualFile::open_with_options(path, open_options)?;
|
||||
Ok(MaybeVirtualFile::VirtualFile(vf))
|
||||
})
|
||||
.await
|
||||
@@ -724,40 +591,34 @@ mod tests {
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_physical_files() -> Result<(), Error> {
|
||||
test_files("physical_files", |path, open_options| async move {
|
||||
test_files("physical_files", |path, open_options| {
|
||||
Ok(MaybeVirtualFile::File(open_options.open(path)?))
|
||||
})
|
||||
.await
|
||||
}
|
||||
|
||||
async fn test_files<OF, FT>(testname: &str, openfunc: OF) -> Result<(), Error>
|
||||
async fn test_files<OF>(testname: &str, openfunc: OF) -> Result<(), Error>
|
||||
where
|
||||
OF: Fn(PathBuf, OpenOptions) -> FT,
|
||||
FT: Future<Output = Result<MaybeVirtualFile, std::io::Error>>,
|
||||
OF: Fn(&Path, &OpenOptions) -> Result<MaybeVirtualFile, std::io::Error>,
|
||||
{
|
||||
let testdir = crate::config::PageServerConf::test_repo_dir(testname);
|
||||
std::fs::create_dir_all(&testdir)?;
|
||||
|
||||
let path_a = testdir.join("file_a");
|
||||
let mut file_a = openfunc(
|
||||
path_a.clone(),
|
||||
OpenOptions::new()
|
||||
.write(true)
|
||||
.create(true)
|
||||
.truncate(true)
|
||||
.to_owned(),
|
||||
)
|
||||
.await?;
|
||||
&path_a,
|
||||
OpenOptions::new().write(true).create(true).truncate(true),
|
||||
)?;
|
||||
file_a.write_all(b"foobar").await?;
|
||||
|
||||
// cannot read from a file opened in write-only mode
|
||||
let _ = file_a.read_string().await.unwrap_err();
|
||||
assert!(file_a.read_string().await.is_err());
|
||||
|
||||
// Close the file and re-open for reading
|
||||
let mut file_a = openfunc(path_a, OpenOptions::new().read(true).to_owned()).await?;
|
||||
let mut file_a = openfunc(&path_a, OpenOptions::new().read(true))?;
|
||||
|
||||
// cannot write to a file opened in read-only mode
|
||||
let _ = file_a.write_all(b"bar").await.unwrap_err();
|
||||
assert!(file_a.write_all(b"bar").await.is_err());
|
||||
|
||||
// Try simple read
|
||||
assert_eq!("foobar", file_a.read_string().await?);
|
||||
@@ -766,23 +627,23 @@ mod tests {
|
||||
assert_eq!("", file_a.read_string().await?);
|
||||
|
||||
// Test seeks.
|
||||
assert_eq!(file_a.seek(SeekFrom::Start(1)).await?, 1);
|
||||
assert_eq!(file_a.seek(SeekFrom::Start(1))?, 1);
|
||||
assert_eq!("oobar", file_a.read_string().await?);
|
||||
|
||||
assert_eq!(file_a.seek(SeekFrom::End(-2)).await?, 4);
|
||||
assert_eq!(file_a.seek(SeekFrom::End(-2))?, 4);
|
||||
assert_eq!("ar", file_a.read_string().await?);
|
||||
|
||||
assert_eq!(file_a.seek(SeekFrom::Start(1)).await?, 1);
|
||||
assert_eq!(file_a.seek(SeekFrom::Current(2)).await?, 3);
|
||||
assert_eq!(file_a.seek(SeekFrom::Start(1))?, 1);
|
||||
assert_eq!(file_a.seek(SeekFrom::Current(2))?, 3);
|
||||
assert_eq!("bar", file_a.read_string().await?);
|
||||
|
||||
assert_eq!(file_a.seek(SeekFrom::Current(-5)).await?, 1);
|
||||
assert_eq!(file_a.seek(SeekFrom::Current(-5))?, 1);
|
||||
assert_eq!("oobar", file_a.read_string().await?);
|
||||
|
||||
// Test erroneous seeks to before byte 0
|
||||
file_a.seek(SeekFrom::End(-7)).await.unwrap_err();
|
||||
assert_eq!(file_a.seek(SeekFrom::Start(1)).await?, 1);
|
||||
file_a.seek(SeekFrom::Current(-2)).await.unwrap_err();
|
||||
assert!(file_a.seek(SeekFrom::End(-7)).is_err());
|
||||
assert_eq!(file_a.seek(SeekFrom::Start(1))?, 1);
|
||||
assert!(file_a.seek(SeekFrom::Current(-2)).is_err());
|
||||
|
||||
// the erroneous seek should have left the position unchanged
|
||||
assert_eq!("oobar", file_a.read_string().await?);
|
||||
@@ -790,15 +651,13 @@ mod tests {
|
||||
// Create another test file, and try FileExt functions on it.
|
||||
let path_b = testdir.join("file_b");
|
||||
let mut file_b = openfunc(
|
||||
path_b.clone(),
|
||||
&path_b,
|
||||
OpenOptions::new()
|
||||
.read(true)
|
||||
.write(true)
|
||||
.create(true)
|
||||
.truncate(true)
|
||||
.to_owned(),
|
||||
)
|
||||
.await?;
|
||||
.truncate(true),
|
||||
)?;
|
||||
file_b.write_all_at(b"BAR", 3).await?;
|
||||
file_b.write_all_at(b"FOO", 0).await?;
|
||||
|
||||
@@ -808,12 +667,11 @@ mod tests {
|
||||
// open the same file many times. The effect is the same.)
|
||||
//
|
||||
// leave file_a positioned at offset 1 before we start
|
||||
assert_eq!(file_a.seek(SeekFrom::Start(1)).await?, 1);
|
||||
assert_eq!(file_a.seek(SeekFrom::Start(1))?, 1);
|
||||
|
||||
let mut vfiles = Vec::new();
|
||||
for _ in 0..100 {
|
||||
let mut vfile =
|
||||
openfunc(path_b.clone(), OpenOptions::new().read(true).to_owned()).await?;
|
||||
let mut vfile = openfunc(&path_b, OpenOptions::new().read(true))?;
|
||||
assert_eq!("FOOBAR", vfile.read_string().await?);
|
||||
vfiles.push(vfile);
|
||||
}
|
||||
@@ -838,8 +696,8 @@ mod tests {
|
||||
/// Test using VirtualFiles from many threads concurrently. This tests both using
|
||||
/// a lot of VirtualFiles concurrently, causing evictions, and also using the same
|
||||
/// VirtualFile from multiple threads concurrently.
|
||||
#[tokio::test]
|
||||
async fn test_vfile_concurrency() -> Result<(), Error> {
|
||||
#[test]
|
||||
fn test_vfile_concurrency() -> Result<(), Error> {
|
||||
const SIZE: usize = 8 * 1024;
|
||||
const VIRTUAL_FILES: usize = 100;
|
||||
const THREADS: usize = 100;
|
||||
@@ -858,8 +716,7 @@ mod tests {
|
||||
// Open the file many times.
|
||||
let mut files = Vec::new();
|
||||
for _ in 0..VIRTUAL_FILES {
|
||||
let f = VirtualFile::open_with_options(&test_file_path, OpenOptions::new().read(true))
|
||||
.await?;
|
||||
let f = VirtualFile::open_with_options(&test_file_path, OpenOptions::new().read(true))?;
|
||||
files.push(f);
|
||||
}
|
||||
let files = Arc::new(files);
|
||||
@@ -870,10 +727,9 @@ mod tests {
|
||||
.thread_name("test_vfile_concurrency thread")
|
||||
.build()
|
||||
.unwrap();
|
||||
let mut hdls = Vec::new();
|
||||
for _threadno in 0..THREADS {
|
||||
let files = files.clone();
|
||||
let hdl = rt.spawn(async move {
|
||||
rt.spawn(async move {
|
||||
let mut buf = [0u8; SIZE];
|
||||
let mut rng = rand::rngs::OsRng;
|
||||
for _ in 1..1000 {
|
||||
@@ -882,12 +738,7 @@ mod tests {
|
||||
assert!(buf == SAMPLE);
|
||||
}
|
||||
});
|
||||
hdls.push(hdl);
|
||||
}
|
||||
for hdl in hdls {
|
||||
hdl.await?;
|
||||
}
|
||||
std::mem::forget(rt);
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
@@ -83,12 +83,11 @@ typedef struct FileCacheControl
|
||||
} FileCacheControl;
|
||||
|
||||
static HTAB* lfc_hash;
|
||||
static int lfc_desc = 0;
|
||||
static int lfc_desc;
|
||||
static LWLockId lfc_lock;
|
||||
static int lfc_max_size;
|
||||
static int lfc_size_limit;
|
||||
static int lfc_free_space_watermark;
|
||||
static bool lfc_disabled_by_failure = false;
|
||||
static char* lfc_path;
|
||||
static FileCacheControl* lfc_ctl;
|
||||
static shmem_startup_hook_type prev_shmem_startup_hook;
|
||||
@@ -97,8 +96,6 @@ static shmem_request_hook_type prev_shmem_request_hook;
|
||||
#endif
|
||||
static int lfc_shrinking_factor; /* power of two by which local cache size will be shrinked when lfc_free_space_watermark is reached */
|
||||
|
||||
#define DISABLE_LFC() (lfc_max_size = 0, lfc_disabled_by_failure = true, lfc_desc = -1)
|
||||
|
||||
void FileCacheMonitorMain(Datum main_arg);
|
||||
|
||||
static void
|
||||
@@ -171,7 +168,7 @@ lfc_change_limit_hook(int newval, void *extra)
|
||||
return;
|
||||
|
||||
/* Open cache file if not done yet */
|
||||
if (lfc_desc <= 0)
|
||||
if (lfc_desc == 0)
|
||||
{
|
||||
lfc_desc = BasicOpenFile(lfc_path, O_RDWR|O_CREAT);
|
||||
if (lfc_desc < 0) {
|
||||
@@ -331,7 +328,7 @@ lfc_init(void)
|
||||
NULL,
|
||||
NULL);
|
||||
|
||||
if (lfc_max_size == 0 || lfc_disabled_by_failure)
|
||||
if (lfc_max_size == 0)
|
||||
return;
|
||||
|
||||
if (lfc_free_space_watermark != 0)
|
||||
@@ -360,7 +357,7 @@ lfc_cache_contains(RelFileNode rnode, ForkNumber forkNum, BlockNumber blkno)
|
||||
bool found;
|
||||
uint32 hash;
|
||||
|
||||
if (lfc_size_limit == 0 || lfc_disabled_by_failure) /* fast exit if file cache is disabled */
|
||||
if (lfc_size_limit == 0) /* fast exit if file cache is disabled */
|
||||
return false;
|
||||
|
||||
tag.rnode = rnode;
|
||||
@@ -387,7 +384,7 @@ lfc_evict(RelFileNode rnode, ForkNumber forkNum, BlockNumber blkno)
|
||||
int chunk_offs = blkno & (BLOCKS_PER_CHUNK-1);
|
||||
uint32 hash;
|
||||
|
||||
if (lfc_size_limit == 0 || lfc_disabled_by_failure) /* fast exit if file cache is disabled */
|
||||
if (lfc_size_limit == 0) /* fast exit if file cache is disabled */
|
||||
return;
|
||||
|
||||
INIT_BUFFERTAG(tag, rnode, forkNum, (blkno & ~(BLOCKS_PER_CHUNK-1)));
|
||||
@@ -458,7 +455,7 @@ lfc_read(RelFileNode rnode, ForkNumber forkNum, BlockNumber blkno,
|
||||
bool result = true;
|
||||
uint32 hash;
|
||||
|
||||
if (lfc_size_limit == 0 || lfc_disabled_by_failure) /* fast exit if file cache is disabled */
|
||||
if (lfc_size_limit == 0) /* fast exit if file cache is disabled */
|
||||
return false;
|
||||
|
||||
tag.rnode = rnode;
|
||||
@@ -480,25 +477,23 @@ lfc_read(RelFileNode rnode, ForkNumber forkNum, BlockNumber blkno,
|
||||
LWLockRelease(lfc_lock);
|
||||
|
||||
/* Open cache file if not done yet */
|
||||
if (lfc_desc <= 0)
|
||||
if (lfc_desc == 0)
|
||||
{
|
||||
lfc_desc = BasicOpenFile(lfc_path, O_RDWR|O_CREAT);
|
||||
|
||||
if (lfc_desc < 0) {
|
||||
elog(LOG, "Failed to open file cache %s: %m", lfc_path);
|
||||
DISABLE_LFC();
|
||||
lfc_size_limit = 0; /* disable file cache */
|
||||
result = false;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
if (lfc_desc > 0)
|
||||
{
|
||||
rc = pread(lfc_desc, buffer, BLCKSZ, ((off_t)entry->offset*BLOCKS_PER_CHUNK + chunk_offs)*BLCKSZ);
|
||||
if (rc != BLCKSZ)
|
||||
{
|
||||
elog(INFO, "Failed to read file cache: %m");
|
||||
DISABLE_LFC();
|
||||
lfc_size_limit = 0; /* disable file cache */
|
||||
result = false;
|
||||
}
|
||||
}
|
||||
@@ -528,7 +523,7 @@ lfc_write(RelFileNode rnode, ForkNumber forkNum, BlockNumber blkno,
|
||||
int chunk_offs = blkno & (BLOCKS_PER_CHUNK-1);
|
||||
uint32 hash;
|
||||
|
||||
if (lfc_size_limit == 0 || lfc_disabled_by_failure) /* fast exit if file cache is disabled */
|
||||
if (lfc_size_limit == 0) /* fast exit if file cache is disabled */
|
||||
return;
|
||||
|
||||
tag.rnode = rnode;
|
||||
@@ -575,12 +570,12 @@ lfc_write(RelFileNode rnode, ForkNumber forkNum, BlockNumber blkno,
|
||||
LWLockRelease(lfc_lock);
|
||||
|
||||
/* Open cache file if not done yet */
|
||||
if (lfc_desc <= 0)
|
||||
if (lfc_desc == 0)
|
||||
{
|
||||
lfc_desc = BasicOpenFile(lfc_path, O_RDWR|O_CREAT);
|
||||
if (lfc_desc < 0) {
|
||||
elog(WARNING, "Failed to open file cache %s: %m, disabling file cache", lfc_path);
|
||||
DISABLE_LFC(); /* disable file cache */
|
||||
lfc_size_limit = 0; /* disable file cache */
|
||||
}
|
||||
}
|
||||
if (lfc_desc > 0)
|
||||
@@ -589,7 +584,7 @@ lfc_write(RelFileNode rnode, ForkNumber forkNum, BlockNumber blkno,
|
||||
if (rc != BLCKSZ)
|
||||
{
|
||||
elog(WARNING, "Failed to write file cache: %m, disabling file cache");
|
||||
DISABLE_LFC(); /* disable file cache */
|
||||
lfc_size_limit = 0; /* disable file cache */
|
||||
}
|
||||
}
|
||||
/* Place entry to the head of LRU list */
|
||||
|
||||
@@ -64,7 +64,7 @@ pub mod errors {
|
||||
}
|
||||
http::StatusCode::LOCKED => {
|
||||
// Status 423: project might be in maintenance mode (or bad state), or quotas exceeded.
|
||||
format!("{REQUEST_FAILED}: endpoint is temporary unavailable. check your quotas and/or contact our support")
|
||||
format!("{REQUEST_FAILED}: endpoint is temporary unavailable. check your quotas and/or contract our support")
|
||||
}
|
||||
_ => REQUEST_FAILED.to_owned(),
|
||||
},
|
||||
|
||||
@@ -8,7 +8,6 @@ use super::{
|
||||
use crate::{auth::ClientCredentials, compute, http, scram};
|
||||
use async_trait::async_trait;
|
||||
use futures::TryFutureExt;
|
||||
use std::net::SocketAddr;
|
||||
use tokio::time::Instant;
|
||||
use tokio_postgres::config::SslMode;
|
||||
use tracing::{error, info, info_span, warn, Instrument};
|
||||
@@ -118,7 +117,7 @@ impl Api {
|
||||
// We'll set username and such later using the startup message.
|
||||
// TODO: add more type safety (in progress).
|
||||
let mut config = compute::ConnCfg::new();
|
||||
config.host(&host).port(port).ssl_mode(SslMode::Disable); // TLS is not configured on compute nodes.
|
||||
config.host(host).port(port).ssl_mode(SslMode::Disable); // TLS is not configured on compute nodes.
|
||||
|
||||
let node = NodeInfo {
|
||||
config,
|
||||
@@ -195,9 +194,9 @@ async fn parse_body<T: for<'a> serde::Deserialize<'a>>(
|
||||
Err(ApiError::Console { status, text })
|
||||
}
|
||||
|
||||
fn parse_host_port(input: &str) -> Option<(String, u16)> {
|
||||
let parsed: SocketAddr = input.parse().ok()?;
|
||||
Some((parsed.ip().to_string(), parsed.port()))
|
||||
fn parse_host_port(input: &str) -> Option<(&str, u16)> {
|
||||
let (host, port) = input.split_once(':')?;
|
||||
Some((host, port.parse().ok()?))
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
|
||||
@@ -22,10 +22,6 @@ serde_json.workspace = true
|
||||
serde_with.workspace = true
|
||||
workspace_hack.workspace = true
|
||||
utils.workspace = true
|
||||
async-stream.workspace = true
|
||||
tokio-stream.workspace = true
|
||||
futures-util.workspace = true
|
||||
itertools.workspace = true
|
||||
|
||||
tokio = { workspace = true, features = ["macros", "rt-multi-thread"] }
|
||||
chrono = { workspace = true, default-features = false, features = ["clock", "serde"] }
|
||||
@@ -34,8 +30,10 @@ aws-config = { workspace = true, default-features = false, features = ["rustls",
|
||||
|
||||
pageserver = {path="../pageserver"}
|
||||
|
||||
|
||||
tracing.workspace = true
|
||||
tracing-subscriber.workspace = true
|
||||
clap.workspace = true
|
||||
tracing-appender = "0.2"
|
||||
histogram = "0.7"
|
||||
|
||||
atty = "0.2"
|
||||
tracing-appender = "0.2"
|
||||
@@ -4,12 +4,13 @@ use std::time::Duration;
|
||||
|
||||
use anyhow::Context;
|
||||
use aws_sdk_s3::Client;
|
||||
use tokio::io::AsyncReadExt;
|
||||
use tokio::task::JoinSet;
|
||||
use tracing::{error, info, info_span, warn, Instrument};
|
||||
|
||||
use crate::cloud_admin_api::{BranchData, CloudAdminApiClient, ProjectId};
|
||||
use crate::delete_batch_producer::DeleteProducerStats;
|
||||
use crate::{download_object_with_retries, list_objects_with_retries, RootTarget, MAX_RETRIES};
|
||||
use crate::{list_objects_with_retries, RootTarget, MAX_RETRIES};
|
||||
use pageserver::tenant::storage_layer::LayerFileName;
|
||||
use pageserver::tenant::IndexPart;
|
||||
use utils::id::TenantTimelineId;
|
||||
@@ -91,9 +92,9 @@ pub async fn validate_pageserver_active_tenant_and_timelines(
|
||||
branch_checks.spawn(
|
||||
async move {
|
||||
let check_errors = branch_cleanup_and_check_errors(
|
||||
&id,
|
||||
id,
|
||||
&s3_root,
|
||||
Some(&s3_active_branch),
|
||||
&s3_active_branch,
|
||||
console_branch,
|
||||
s3_data,
|
||||
)
|
||||
@@ -106,13 +107,13 @@ pub async fn validate_pageserver_active_tenant_and_timelines(
|
||||
}
|
||||
|
||||
let mut total_stats = BranchCheckStats::default();
|
||||
while let Some((id, analysis)) = branch_checks
|
||||
while let Some((id, branch_check_errors)) = branch_checks
|
||||
.join_next()
|
||||
.await
|
||||
.transpose()
|
||||
.context("branch check task join")?
|
||||
{
|
||||
total_stats.add(id, analysis.errors);
|
||||
total_stats.add(id, branch_check_errors);
|
||||
}
|
||||
Ok(total_stats)
|
||||
}
|
||||
@@ -159,59 +160,35 @@ impl BranchCheckStats {
|
||||
}
|
||||
}
|
||||
|
||||
pub struct TimelineAnalysis {
|
||||
/// Anomalies detected
|
||||
pub errors: Vec<String>,
|
||||
|
||||
/// Healthy-but-noteworthy, like old-versioned structures that are readable but
|
||||
/// worth reporting for awareness that we must not remove that old version decoding
|
||||
/// yet.
|
||||
pub warnings: Vec<String>,
|
||||
|
||||
/// Keys not referenced in metadata: candidates for removal
|
||||
pub garbage_keys: Vec<String>,
|
||||
}
|
||||
|
||||
impl TimelineAnalysis {
|
||||
fn new() -> Self {
|
||||
Self {
|
||||
errors: Vec::new(),
|
||||
warnings: Vec::new(),
|
||||
garbage_keys: Vec::new(),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
pub async fn branch_cleanup_and_check_errors(
|
||||
id: &TenantTimelineId,
|
||||
async fn branch_cleanup_and_check_errors(
|
||||
id: TenantTimelineId,
|
||||
s3_root: &RootTarget,
|
||||
s3_active_branch: Option<&BranchData>,
|
||||
s3_active_branch: &BranchData,
|
||||
console_branch: Option<BranchData>,
|
||||
s3_data: Option<S3TimelineBlobData>,
|
||||
) -> TimelineAnalysis {
|
||||
let mut result = TimelineAnalysis::new();
|
||||
) -> Vec<String> {
|
||||
info!(
|
||||
"Checking timeline for branch branch {:?}/{:?}",
|
||||
s3_active_branch.project_id, s3_active_branch.id
|
||||
);
|
||||
let mut branch_check_errors = Vec::new();
|
||||
|
||||
info!("Checking timeline {id}");
|
||||
|
||||
if let Some(s3_active_branch) = s3_active_branch {
|
||||
info!(
|
||||
"Checking console status for timeline for branch {:?}/{:?}",
|
||||
s3_active_branch.project_id, s3_active_branch.id
|
||||
);
|
||||
match console_branch {
|
||||
Some(_) => {result.errors.push(format!("Timeline has deleted branch data in the console (id = {:?}, project_id = {:?}), recheck whether it got removed during the check",
|
||||
s3_active_branch.id, s3_active_branch.project_id))
|
||||
},
|
||||
None => {
|
||||
result.errors.push(format!("Timeline has no branch data in the console (id = {:?}, project_id = {:?}), recheck whether it got removed during the check",
|
||||
s3_active_branch.id, s3_active_branch.project_id))
|
||||
match console_branch {
|
||||
Some(console_active_branch) => {
|
||||
if console_active_branch.deleted {
|
||||
branch_check_errors.push(format!("Timeline has deleted branch data in the console (id = {:?}, project_id = {:?}), recheck whether if it got removed during the check",
|
||||
s3_active_branch.id, s3_active_branch.project_id))
|
||||
}
|
||||
};
|
||||
},
|
||||
None => branch_check_errors.push(format!("Timeline has no branch data in the console (id = {:?}, project_id = {:?}), recheck whether if it got removed during the check",
|
||||
s3_active_branch.id, s3_active_branch.project_id))
|
||||
}
|
||||
|
||||
let mut keys_to_remove = Vec::new();
|
||||
|
||||
match s3_data {
|
||||
Some(s3_data) => {
|
||||
result.garbage_keys.extend(s3_data.keys_to_remove);
|
||||
keys_to_remove.extend(s3_data.keys_to_remove);
|
||||
|
||||
match s3_data.blob_data {
|
||||
BlobDataParseResult::Parsed {
|
||||
@@ -219,23 +196,16 @@ pub async fn branch_cleanup_and_check_errors(
|
||||
mut s3_layers,
|
||||
} => {
|
||||
if !IndexPart::KNOWN_VERSIONS.contains(&index_part.get_version()) {
|
||||
result.errors.push(format!(
|
||||
branch_check_errors.push(format!(
|
||||
"index_part.json version: {}",
|
||||
index_part.get_version()
|
||||
))
|
||||
}
|
||||
|
||||
if &index_part.get_version() != IndexPart::KNOWN_VERSIONS.last().unwrap() {
|
||||
result.warnings.push(format!(
|
||||
"index_part.json version is not latest: {}",
|
||||
index_part.get_version()
|
||||
))
|
||||
}
|
||||
|
||||
if index_part.metadata.disk_consistent_lsn()
|
||||
!= index_part.get_disk_consistent_lsn()
|
||||
{
|
||||
result.errors.push(format!(
|
||||
branch_check_errors.push(format!(
|
||||
"Mismatching disk_consistent_lsn in TimelineMetadata ({}) and in the index_part ({})",
|
||||
index_part.metadata.disk_consistent_lsn(),
|
||||
index_part.get_disk_consistent_lsn(),
|
||||
@@ -250,13 +220,13 @@ pub async fn branch_cleanup_and_check_errors(
|
||||
|
||||
for (layer, metadata) in index_part.layer_metadata {
|
||||
if metadata.file_size == 0 {
|
||||
result.errors.push(format!(
|
||||
branch_check_errors.push(format!(
|
||||
"index_part.json contains a layer {} that has 0 size in its layer metadata", layer.file_name(),
|
||||
))
|
||||
}
|
||||
|
||||
if !s3_layers.remove(&layer) {
|
||||
result.errors.push(format!(
|
||||
branch_check_errors.push(format!(
|
||||
"index_part.json contains a layer {} that is not present in S3",
|
||||
layer.file_name(),
|
||||
))
|
||||
@@ -264,66 +234,55 @@ pub async fn branch_cleanup_and_check_errors(
|
||||
}
|
||||
|
||||
if !s3_layers.is_empty() {
|
||||
result.errors.push(format!(
|
||||
branch_check_errors.push(format!(
|
||||
"index_part.json does not contain layers from S3: {:?}",
|
||||
s3_layers
|
||||
.iter()
|
||||
.map(|layer_name| layer_name.file_name())
|
||||
.collect::<Vec<_>>(),
|
||||
));
|
||||
result
|
||||
.garbage_keys
|
||||
.extend(s3_layers.iter().map(|layer_name| {
|
||||
let mut key = s3_root.timeline_root(id).prefix_in_bucket;
|
||||
let delimiter = s3_root.delimiter();
|
||||
if !key.ends_with(delimiter) {
|
||||
key.push_str(delimiter);
|
||||
}
|
||||
key.push_str(&layer_name.file_name());
|
||||
key
|
||||
}));
|
||||
keys_to_remove.extend(s3_layers.iter().map(|layer_name| {
|
||||
let mut key = s3_root.timeline_root(id).prefix_in_bucket;
|
||||
let delimiter = s3_root.delimiter();
|
||||
if !key.ends_with(delimiter) {
|
||||
key.push_str(delimiter);
|
||||
}
|
||||
key.push_str(&layer_name.file_name());
|
||||
key
|
||||
}));
|
||||
}
|
||||
}
|
||||
BlobDataParseResult::Incorrect(parse_errors) => result.errors.extend(
|
||||
BlobDataParseResult::Incorrect(parse_errors) => branch_check_errors.extend(
|
||||
parse_errors
|
||||
.into_iter()
|
||||
.map(|error| format!("parse error: {error}")),
|
||||
),
|
||||
}
|
||||
}
|
||||
None => result
|
||||
.errors
|
||||
.push("Timeline has no data on S3 at all".to_string()),
|
||||
None => branch_check_errors.push("Timeline has no data on S3 at all".to_string()),
|
||||
}
|
||||
|
||||
if result.errors.is_empty() {
|
||||
if branch_check_errors.is_empty() {
|
||||
info!("No check errors found");
|
||||
} else {
|
||||
warn!("Timeline metadata errors: {0:?}", result.errors);
|
||||
warn!("Found check errors: {branch_check_errors:?}");
|
||||
}
|
||||
|
||||
if !result.warnings.is_empty() {
|
||||
warn!("Timeline metadata warnings: {0:?}", result.warnings);
|
||||
if !keys_to_remove.is_empty() {
|
||||
error!("The following keys should be removed from S3: {keys_to_remove:?}")
|
||||
}
|
||||
|
||||
if !result.garbage_keys.is_empty() {
|
||||
error!(
|
||||
"The following keys should be removed from S3: {0:?}",
|
||||
result.garbage_keys
|
||||
)
|
||||
}
|
||||
|
||||
result
|
||||
branch_check_errors
|
||||
}
|
||||
|
||||
#[derive(Debug)]
|
||||
pub struct S3TimelineBlobData {
|
||||
pub blob_data: BlobDataParseResult,
|
||||
pub keys_to_remove: Vec<String>,
|
||||
struct S3TimelineBlobData {
|
||||
blob_data: BlobDataParseResult,
|
||||
keys_to_remove: Vec<String>,
|
||||
}
|
||||
|
||||
#[derive(Debug)]
|
||||
pub enum BlobDataParseResult {
|
||||
enum BlobDataParseResult {
|
||||
Parsed {
|
||||
index_part: IndexPart,
|
||||
s3_layers: HashSet<LayerFileName>,
|
||||
@@ -331,7 +290,7 @@ pub enum BlobDataParseResult {
|
||||
Incorrect(Vec<String>),
|
||||
}
|
||||
|
||||
pub async fn list_timeline_blobs(
|
||||
async fn list_timeline_blobs(
|
||||
s3_client: &Client,
|
||||
id: TenantTimelineId,
|
||||
s3_root: &RootTarget,
|
||||
@@ -339,7 +298,7 @@ pub async fn list_timeline_blobs(
|
||||
let mut s3_layers = HashSet::new();
|
||||
let mut index_part_object = None;
|
||||
|
||||
let timeline_dir_target = s3_root.timeline_root(&id);
|
||||
let timeline_dir_target = s3_root.timeline_root(id);
|
||||
let mut continuation_token = None;
|
||||
|
||||
let mut errors = Vec::new();
|
||||
@@ -435,3 +394,45 @@ pub async fn list_timeline_blobs(
|
||||
keys_to_remove,
|
||||
})
|
||||
}
|
||||
|
||||
async fn download_object_with_retries(
|
||||
s3_client: &Client,
|
||||
bucket_name: &str,
|
||||
key: &str,
|
||||
) -> anyhow::Result<Vec<u8>> {
|
||||
for _ in 0..MAX_RETRIES {
|
||||
let mut body_buf = Vec::new();
|
||||
let response_stream = match s3_client
|
||||
.get_object()
|
||||
.bucket(bucket_name)
|
||||
.key(key)
|
||||
.send()
|
||||
.await
|
||||
{
|
||||
Ok(response) => response,
|
||||
Err(e) => {
|
||||
error!("Failed to download object for key {key}: {e}");
|
||||
tokio::time::sleep(Duration::from_secs(1)).await;
|
||||
continue;
|
||||
}
|
||||
};
|
||||
|
||||
match response_stream
|
||||
.body
|
||||
.into_async_read()
|
||||
.read_to_end(&mut body_buf)
|
||||
.await
|
||||
{
|
||||
Ok(bytes_read) => {
|
||||
info!("Downloaded {bytes_read} bytes for object object with key {key}");
|
||||
return Ok(body_buf);
|
||||
}
|
||||
Err(e) => {
|
||||
error!("Failed to stream object body for key {key}: {e}");
|
||||
tokio::time::sleep(Duration::from_secs(1)).await;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
anyhow::bail!("Failed to download objects with key {key} {MAX_RETRIES} times")
|
||||
}
|
||||
|
||||
@@ -41,7 +41,7 @@ pub async fn schedule_cleanup_deleted_timelines(
|
||||
|
||||
let new_stats = async move {
|
||||
let tenant_id_to_check = project_to_check.tenant;
|
||||
let check_target = check_root.timelines_root(&tenant_id_to_check);
|
||||
let check_target = check_root.timelines_root(tenant_id_to_check);
|
||||
let stats = super::process_s3_target_recursively(
|
||||
&check_s3_client,
|
||||
&check_target,
|
||||
|
||||
@@ -1,15 +1,12 @@
|
||||
pub mod checks;
|
||||
pub mod cloud_admin_api;
|
||||
pub mod delete_batch_producer;
|
||||
pub mod metadata_stream;
|
||||
mod s3_deletion;
|
||||
pub mod scan_metadata;
|
||||
|
||||
use std::env;
|
||||
use std::fmt::Display;
|
||||
use std::time::Duration;
|
||||
|
||||
use anyhow::Context;
|
||||
use aws_config::environment::EnvironmentVariableCredentialsProvider;
|
||||
use aws_config::imds::credentials::ImdsCredentialsProvider;
|
||||
use aws_config::meta::credentials::CredentialsProviderChain;
|
||||
@@ -17,9 +14,7 @@ use aws_config::sso::SsoCredentialsProvider;
|
||||
use aws_sdk_s3::config::Region;
|
||||
use aws_sdk_s3::{Client, Config};
|
||||
|
||||
use reqwest::Url;
|
||||
pub use s3_deletion::S3Deleter;
|
||||
use tokio::io::AsyncReadExt;
|
||||
use tracing::error;
|
||||
use tracing_appender::non_blocking::WorkerGuard;
|
||||
use tracing_subscriber::{fmt, prelude::*, EnvFilter};
|
||||
@@ -28,8 +23,6 @@ use utils::id::{TenantId, TenantTimelineId};
|
||||
const MAX_RETRIES: usize = 20;
|
||||
const CLOUD_ADMIN_API_TOKEN_ENV_VAR: &str = "CLOUD_ADMIN_API_TOKEN";
|
||||
|
||||
pub const CLI_NAME: &str = "s3-scrubber";
|
||||
|
||||
#[derive(Debug, Clone)]
|
||||
pub struct S3Target {
|
||||
pub bucket_name: String,
|
||||
@@ -76,19 +69,19 @@ impl RootTarget {
|
||||
}
|
||||
}
|
||||
|
||||
pub fn tenant_root(&self, tenant_id: &TenantId) -> S3Target {
|
||||
pub fn tenant_root(&self, tenant_id: TenantId) -> S3Target {
|
||||
self.tenants_root().with_sub_segment(&tenant_id.to_string())
|
||||
}
|
||||
|
||||
pub fn timelines_root(&self, tenant_id: &TenantId) -> S3Target {
|
||||
pub fn timelines_root(&self, tenant_id: TenantId) -> S3Target {
|
||||
match self {
|
||||
Self::Pageserver(_) => self.tenant_root(tenant_id).with_sub_segment("timelines"),
|
||||
Self::Safekeeper(_) => self.tenant_root(tenant_id),
|
||||
}
|
||||
}
|
||||
|
||||
pub fn timeline_root(&self, id: &TenantTimelineId) -> S3Target {
|
||||
self.timelines_root(&id.tenant_id)
|
||||
pub fn timeline_root(&self, id: TenantTimelineId) -> S3Target {
|
||||
self.timelines_root(id.tenant_id)
|
||||
.with_sub_segment(&id.timeline_id.to_string())
|
||||
}
|
||||
|
||||
@@ -107,55 +100,6 @@ impl RootTarget {
|
||||
}
|
||||
}
|
||||
|
||||
pub struct BucketConfig {
|
||||
pub region: String,
|
||||
pub bucket: String,
|
||||
|
||||
/// Use SSO if this is set, else rely on AWS_* environment vars
|
||||
pub sso_account_id: Option<String>,
|
||||
}
|
||||
|
||||
impl Display for BucketConfig {
|
||||
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||
write!(
|
||||
f,
|
||||
"{}/{}/{}",
|
||||
self.sso_account_id.as_deref().unwrap_or("<none>"),
|
||||
self.region,
|
||||
self.bucket
|
||||
)
|
||||
}
|
||||
}
|
||||
|
||||
impl BucketConfig {
|
||||
pub fn from_env() -> anyhow::Result<Self> {
|
||||
let sso_account_id = env::var("SSO_ACCOUNT_ID").ok();
|
||||
let region = env::var("REGION").context("'REGION' param retrieval")?;
|
||||
let bucket = env::var("BUCKET").context("'BUCKET' param retrieval")?;
|
||||
|
||||
Ok(Self {
|
||||
region,
|
||||
bucket,
|
||||
sso_account_id,
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
pub struct ConsoleConfig {
|
||||
pub admin_api_url: Url,
|
||||
}
|
||||
|
||||
impl ConsoleConfig {
|
||||
pub fn from_env() -> anyhow::Result<Self> {
|
||||
let admin_api_url: Url = env::var("CLOUD_ADMIN_API_URL")
|
||||
.context("'CLOUD_ADMIN_API_URL' param retrieval")?
|
||||
.parse()
|
||||
.context("'CLOUD_ADMIN_API_URL' param parsing")?;
|
||||
|
||||
Ok(Self { admin_api_url })
|
||||
}
|
||||
}
|
||||
|
||||
pub fn get_cloud_admin_api_token_or_exit() -> String {
|
||||
match env::var(CLOUD_ADMIN_API_TOKEN_ENV_VAR) {
|
||||
Ok(token) => token,
|
||||
@@ -170,7 +114,23 @@ pub fn get_cloud_admin_api_token_or_exit() -> String {
|
||||
}
|
||||
}
|
||||
|
||||
pub fn init_logging(file_name: &str) -> WorkerGuard {
|
||||
pub fn init_logging(binary_name: &str, dry_run: bool, node_kind: &str) -> WorkerGuard {
|
||||
let file_name = if dry_run {
|
||||
format!(
|
||||
"{}_{}_{}__dry.log",
|
||||
binary_name,
|
||||
node_kind,
|
||||
chrono::Utc::now().format("%Y_%m_%d__%H_%M_%S")
|
||||
)
|
||||
} else {
|
||||
format!(
|
||||
"{}_{}_{}.log",
|
||||
binary_name,
|
||||
node_kind,
|
||||
chrono::Utc::now().format("%Y_%m_%d__%H_%M_%S")
|
||||
)
|
||||
};
|
||||
|
||||
let (file_writer, guard) =
|
||||
tracing_appender::non_blocking(tracing_appender::rolling::never("./logs/", file_name));
|
||||
|
||||
@@ -180,6 +140,7 @@ pub fn init_logging(file_name: &str) -> WorkerGuard {
|
||||
.with_writer(file_writer);
|
||||
let stdout_logs = fmt::Layer::new()
|
||||
.with_target(false)
|
||||
.with_ansi(atty::is(atty::Stream::Stdout))
|
||||
.with_writer(std::io::stdout);
|
||||
tracing_subscriber::registry()
|
||||
.with(EnvFilter::try_from_default_env().unwrap_or_else(|_| EnvFilter::new("info")))
|
||||
@@ -190,43 +151,30 @@ pub fn init_logging(file_name: &str) -> WorkerGuard {
|
||||
guard
|
||||
}
|
||||
|
||||
pub fn init_s3_client(account_id: Option<String>, bucket_region: Region) -> Client {
|
||||
pub fn init_s3_client(account_id: String, bucket_region: Region) -> Client {
|
||||
let credentials_provider = {
|
||||
// uses "AWS_ACCESS_KEY_ID", "AWS_SECRET_ACCESS_KEY"
|
||||
let chain = CredentialsProviderChain::first_try(
|
||||
"env",
|
||||
EnvironmentVariableCredentialsProvider::new(),
|
||||
);
|
||||
|
||||
// Use SSO if we were given an account ID
|
||||
match account_id {
|
||||
Some(sso_account) => chain.or_else(
|
||||
CredentialsProviderChain::first_try("env", EnvironmentVariableCredentialsProvider::new())
|
||||
// uses sso
|
||||
.or_else(
|
||||
"sso",
|
||||
SsoCredentialsProvider::builder()
|
||||
.account_id(sso_account)
|
||||
.account_id(account_id)
|
||||
.role_name("PowerUserAccess")
|
||||
.start_url("https://neondb.awsapps.com/start")
|
||||
.region(Region::from_static("eu-central-1"))
|
||||
.build(),
|
||||
),
|
||||
None => chain,
|
||||
}
|
||||
.or_else(
|
||||
// Finally try IMDS
|
||||
"imds",
|
||||
ImdsCredentialsProvider::builder().build(),
|
||||
)
|
||||
)
|
||||
// uses imds v2
|
||||
.or_else("imds", ImdsCredentialsProvider::builder().build())
|
||||
};
|
||||
|
||||
let mut builder = Config::builder()
|
||||
let config = Config::builder()
|
||||
.region(bucket_region)
|
||||
.credentials_provider(credentials_provider);
|
||||
.credentials_provider(credentials_provider)
|
||||
.build();
|
||||
|
||||
if let Ok(endpoint) = env::var("AWS_ENDPOINT_URL") {
|
||||
builder = builder.endpoint_url(endpoint)
|
||||
}
|
||||
|
||||
Client::from_conf(builder.build())
|
||||
Client::from_conf(config)
|
||||
}
|
||||
|
||||
async fn list_objects_with_retries(
|
||||
@@ -254,45 +202,3 @@ async fn list_objects_with_retries(
|
||||
|
||||
anyhow::bail!("Failed to list objects {MAX_RETRIES} times")
|
||||
}
|
||||
|
||||
async fn download_object_with_retries(
|
||||
s3_client: &Client,
|
||||
bucket_name: &str,
|
||||
key: &str,
|
||||
) -> anyhow::Result<Vec<u8>> {
|
||||
for _ in 0..MAX_RETRIES {
|
||||
let mut body_buf = Vec::new();
|
||||
let response_stream = match s3_client
|
||||
.get_object()
|
||||
.bucket(bucket_name)
|
||||
.key(key)
|
||||
.send()
|
||||
.await
|
||||
{
|
||||
Ok(response) => response,
|
||||
Err(e) => {
|
||||
error!("Failed to download object for key {key}: {e}");
|
||||
tokio::time::sleep(Duration::from_secs(1)).await;
|
||||
continue;
|
||||
}
|
||||
};
|
||||
|
||||
match response_stream
|
||||
.body
|
||||
.into_async_read()
|
||||
.read_to_end(&mut body_buf)
|
||||
.await
|
||||
{
|
||||
Ok(bytes_read) => {
|
||||
tracing::info!("Downloaded {bytes_read} bytes for object object with key {key}");
|
||||
return Ok(body_buf);
|
||||
}
|
||||
Err(e) => {
|
||||
error!("Failed to stream object body for key {key}: {e}");
|
||||
tokio::time::sleep(Duration::from_secs(1)).await;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
anyhow::bail!("Failed to download objects with key {key} {MAX_RETRIES} times")
|
||||
}
|
||||
|
||||
@@ -1,18 +1,19 @@
|
||||
use std::collections::HashMap;
|
||||
use std::env;
|
||||
use std::fmt::Display;
|
||||
use std::num::NonZeroUsize;
|
||||
use std::sync::Arc;
|
||||
|
||||
use anyhow::Context;
|
||||
use aws_sdk_s3::config::Region;
|
||||
use reqwest::Url;
|
||||
use s3_scrubber::cloud_admin_api::CloudAdminApiClient;
|
||||
use s3_scrubber::delete_batch_producer::DeleteBatchProducer;
|
||||
use s3_scrubber::scan_metadata::scan_metadata;
|
||||
use s3_scrubber::{
|
||||
checks, get_cloud_admin_api_token_or_exit, init_logging, init_s3_client, BucketConfig,
|
||||
ConsoleConfig, RootTarget, S3Deleter, S3Target, TraversingDepth, CLI_NAME,
|
||||
checks, get_cloud_admin_api_token_or_exit, init_logging, init_s3_client, RootTarget, S3Deleter,
|
||||
S3Target, TraversingDepth,
|
||||
};
|
||||
use tracing::{info, warn};
|
||||
use tracing::{info, info_span, warn};
|
||||
|
||||
use clap::{Parser, Subcommand, ValueEnum};
|
||||
|
||||
@@ -58,7 +59,48 @@ enum Command {
|
||||
#[arg(short, long, default_value_t = false)]
|
||||
skip_validation: bool,
|
||||
},
|
||||
ScanMetadata {},
|
||||
}
|
||||
|
||||
struct BucketConfig {
|
||||
region: String,
|
||||
bucket: String,
|
||||
sso_account_id: String,
|
||||
}
|
||||
|
||||
impl Display for BucketConfig {
|
||||
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||
write!(f, "{}/{}/{}", self.sso_account_id, self.region, self.bucket)
|
||||
}
|
||||
}
|
||||
|
||||
impl BucketConfig {
|
||||
fn from_env() -> anyhow::Result<Self> {
|
||||
let sso_account_id =
|
||||
env::var("SSO_ACCOUNT_ID").context("'SSO_ACCOUNT_ID' param retrieval")?;
|
||||
let region = env::var("REGION").context("'REGION' param retrieval")?;
|
||||
let bucket = env::var("BUCKET").context("'BUCKET' param retrieval")?;
|
||||
|
||||
Ok(Self {
|
||||
region,
|
||||
bucket,
|
||||
sso_account_id,
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
struct ConsoleConfig {
|
||||
admin_api_url: Url,
|
||||
}
|
||||
|
||||
impl ConsoleConfig {
|
||||
fn from_env() -> anyhow::Result<Self> {
|
||||
let admin_api_url: Url = env::var("CLOUD_ADMIN_API_URL")
|
||||
.context("'CLOUD_ADMIN_API_URL' param retrieval")?
|
||||
.parse()
|
||||
.context("'CLOUD_ADMIN_API_URL' param parsing")?;
|
||||
|
||||
Ok(Self { admin_api_url })
|
||||
}
|
||||
}
|
||||
|
||||
async fn tidy(
|
||||
@@ -69,24 +111,13 @@ async fn tidy(
|
||||
depth: TraversingDepth,
|
||||
skip_validation: bool,
|
||||
) -> anyhow::Result<()> {
|
||||
let dry_run = !cli.delete;
|
||||
let file_name = if dry_run {
|
||||
format!(
|
||||
"{}_{}_{}__dry.log",
|
||||
CLI_NAME,
|
||||
node_kind,
|
||||
chrono::Utc::now().format("%Y_%m_%d__%H_%M_%S")
|
||||
)
|
||||
} else {
|
||||
format!(
|
||||
"{}_{}_{}.log",
|
||||
CLI_NAME,
|
||||
node_kind,
|
||||
chrono::Utc::now().format("%Y_%m_%d__%H_%M_%S")
|
||||
)
|
||||
};
|
||||
let binary_name = env::args()
|
||||
.next()
|
||||
.context("binary name in not the first argument")?;
|
||||
|
||||
let _guard = init_logging(&file_name);
|
||||
let dry_run = !cli.delete;
|
||||
let _guard = init_logging(&binary_name, dry_run, node_kind.as_str());
|
||||
let _main_span = info_span!("tidy", binary = %binary_name, %dry_run).entered();
|
||||
|
||||
if dry_run {
|
||||
info!("Dry run, not removing items for real");
|
||||
@@ -216,7 +247,7 @@ async fn main() -> anyhow::Result<()> {
|
||||
|
||||
let bucket_config = BucketConfig::from_env()?;
|
||||
|
||||
match cli.command {
|
||||
match &cli.command {
|
||||
Command::Tidy {
|
||||
node_kind,
|
||||
depth,
|
||||
@@ -227,25 +258,11 @@ async fn main() -> anyhow::Result<()> {
|
||||
&cli,
|
||||
bucket_config,
|
||||
console_config,
|
||||
node_kind,
|
||||
depth,
|
||||
skip_validation,
|
||||
*node_kind,
|
||||
*depth,
|
||||
*skip_validation,
|
||||
)
|
||||
.await
|
||||
}
|
||||
Command::ScanMetadata {} => match scan_metadata(bucket_config).await {
|
||||
Err(e) => {
|
||||
tracing::error!("Failed: {e}");
|
||||
Err(e)
|
||||
}
|
||||
Ok(summary) => {
|
||||
println!("{}", summary.summary_string());
|
||||
if summary.is_fatal() {
|
||||
Err(anyhow::anyhow!("Fatal scrub errors detected"))
|
||||
} else {
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
},
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1,106 +0,0 @@
|
||||
use anyhow::Context;
|
||||
use async_stream::{stream, try_stream};
|
||||
use aws_sdk_s3::Client;
|
||||
use tokio_stream::Stream;
|
||||
|
||||
use crate::{list_objects_with_retries, RootTarget, TenantId};
|
||||
use utils::id::{TenantTimelineId, TimelineId};
|
||||
|
||||
/// Given an S3 bucket, output a stream of TenantIds discovered via ListObjectsv2
|
||||
pub fn stream_tenants<'a>(
|
||||
s3_client: &'a Client,
|
||||
target: &'a RootTarget,
|
||||
) -> impl Stream<Item = anyhow::Result<TenantId>> + 'a {
|
||||
try_stream! {
|
||||
let mut continuation_token = None;
|
||||
loop {
|
||||
let tenants_target = target.tenants_root();
|
||||
let fetch_response =
|
||||
list_objects_with_retries(s3_client, tenants_target, continuation_token.clone()).await?;
|
||||
|
||||
let new_entry_ids = fetch_response
|
||||
.common_prefixes()
|
||||
.unwrap_or_default()
|
||||
.iter()
|
||||
.filter_map(|prefix| prefix.prefix())
|
||||
.filter_map(|prefix| -> Option<&str> {
|
||||
prefix
|
||||
.strip_prefix(&tenants_target.prefix_in_bucket)?
|
||||
.strip_suffix('/')
|
||||
}).map(|entry_id_str| {
|
||||
entry_id_str
|
||||
.parse()
|
||||
.with_context(|| format!("Incorrect entry id str: {entry_id_str}"))
|
||||
});
|
||||
|
||||
for i in new_entry_ids {
|
||||
yield i?;
|
||||
}
|
||||
|
||||
match fetch_response.next_continuation_token {
|
||||
Some(new_token) => continuation_token = Some(new_token),
|
||||
None => break,
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Given a TenantId, output a stream of the timelines within that tenant, discovered
|
||||
/// using ListObjectsv2. The listing is done before the stream is built, so that this
|
||||
/// function can be used to generate concurrency on a stream using buffer_unordered.
|
||||
pub async fn stream_tenant_timelines<'a>(
|
||||
s3_client: &'a Client,
|
||||
target: &'a RootTarget,
|
||||
tenant: TenantId,
|
||||
) -> anyhow::Result<impl Stream<Item = Result<TenantTimelineId, anyhow::Error>> + 'a> {
|
||||
let mut timeline_ids: Vec<Result<TimelineId, anyhow::Error>> = Vec::new();
|
||||
let mut continuation_token = None;
|
||||
let timelines_target = target.timelines_root(&tenant);
|
||||
|
||||
loop {
|
||||
tracing::info!("Listing in {}", tenant);
|
||||
let fetch_response =
|
||||
list_objects_with_retries(s3_client, &timelines_target, continuation_token.clone())
|
||||
.await;
|
||||
let fetch_response = match fetch_response {
|
||||
Err(e) => {
|
||||
timeline_ids.push(Err(e));
|
||||
break;
|
||||
}
|
||||
Ok(r) => r,
|
||||
};
|
||||
|
||||
let new_entry_ids = fetch_response
|
||||
.common_prefixes()
|
||||
.unwrap_or_default()
|
||||
.iter()
|
||||
.filter_map(|prefix| prefix.prefix())
|
||||
.filter_map(|prefix| -> Option<&str> {
|
||||
prefix
|
||||
.strip_prefix(&timelines_target.prefix_in_bucket)?
|
||||
.strip_suffix('/')
|
||||
})
|
||||
.map(|entry_id_str| {
|
||||
entry_id_str
|
||||
.parse::<TimelineId>()
|
||||
.with_context(|| format!("Incorrect entry id str: {entry_id_str}"))
|
||||
});
|
||||
|
||||
for i in new_entry_ids {
|
||||
timeline_ids.push(i);
|
||||
}
|
||||
|
||||
match fetch_response.next_continuation_token {
|
||||
Some(new_token) => continuation_token = Some(new_token),
|
||||
None => break,
|
||||
}
|
||||
}
|
||||
|
||||
tracing::info!("Yielding for {}", tenant);
|
||||
Ok(stream! {
|
||||
for i in timeline_ids {
|
||||
let id = i?;
|
||||
yield Ok(TenantTimelineId::new(tenant, id));
|
||||
}
|
||||
})
|
||||
}
|
||||
@@ -164,7 +164,7 @@ async fn delete_tenants_batch(
|
||||
s3_target,
|
||||
s3_client,
|
||||
dry_run,
|
||||
|root_target, tenant_to_delete| root_target.tenant_root(&tenant_to_delete),
|
||||
|root_target, tenant_to_delete| root_target.tenant_root(tenant_to_delete),
|
||||
)
|
||||
.await?;
|
||||
|
||||
@@ -215,7 +215,7 @@ async fn delete_timelines_batch(
|
||||
s3_target,
|
||||
s3_client,
|
||||
dry_run,
|
||||
|root_target, timeline_to_delete| root_target.timeline_root(&timeline_to_delete),
|
||||
|root_target, timeline_to_delete| root_target.timeline_root(timeline_to_delete),
|
||||
)
|
||||
.await?;
|
||||
|
||||
@@ -386,7 +386,7 @@ async fn ensure_tenant_batch_deleted(
|
||||
|
||||
for &tenant_id in batch {
|
||||
let fetch_response =
|
||||
list_objects_with_retries(s3_client, &s3_target.tenant_root(&tenant_id), None).await?;
|
||||
list_objects_with_retries(s3_client, &s3_target.tenant_root(tenant_id), None).await?;
|
||||
|
||||
if fetch_response.is_truncated()
|
||||
|| fetch_response.contents().is_some()
|
||||
@@ -415,7 +415,7 @@ async fn ensure_timeline_batch_deleted(
|
||||
|
||||
for &id in batch {
|
||||
let fetch_response =
|
||||
list_objects_with_retries(s3_client, &s3_target.timeline_root(&id), None).await?;
|
||||
list_objects_with_retries(s3_client, &s3_target.timeline_root(id), None).await?;
|
||||
|
||||
if fetch_response.is_truncated()
|
||||
|| fetch_response.contents().is_some()
|
||||
|
||||
@@ -1,234 +0,0 @@
|
||||
use std::collections::{HashMap, HashSet};
|
||||
use std::sync::Arc;
|
||||
|
||||
use crate::checks::{
|
||||
branch_cleanup_and_check_errors, list_timeline_blobs, BlobDataParseResult, S3TimelineBlobData,
|
||||
TimelineAnalysis,
|
||||
};
|
||||
use crate::metadata_stream::{stream_tenant_timelines, stream_tenants};
|
||||
use crate::{init_logging, init_s3_client, BucketConfig, RootTarget, S3Target, CLI_NAME};
|
||||
use aws_sdk_s3::Client;
|
||||
use aws_types::region::Region;
|
||||
use futures_util::{pin_mut, StreamExt, TryStreamExt};
|
||||
use histogram::Histogram;
|
||||
use pageserver::tenant::{IndexPart, TENANTS_SEGMENT_NAME};
|
||||
use utils::id::TenantTimelineId;
|
||||
|
||||
pub struct MetadataSummary {
|
||||
count: usize,
|
||||
with_errors: HashSet<TenantTimelineId>,
|
||||
with_warnings: HashSet<TenantTimelineId>,
|
||||
with_garbage: HashSet<TenantTimelineId>,
|
||||
indices_by_version: HashMap<usize, usize>,
|
||||
|
||||
layer_count: MinMaxHisto,
|
||||
timeline_size_bytes: MinMaxHisto,
|
||||
layer_size_bytes: MinMaxHisto,
|
||||
}
|
||||
|
||||
/// A histogram plus minimum and maximum tracking
|
||||
struct MinMaxHisto {
|
||||
histo: Histogram,
|
||||
min: u64,
|
||||
max: u64,
|
||||
}
|
||||
|
||||
impl MinMaxHisto {
|
||||
fn new() -> Self {
|
||||
Self {
|
||||
histo: histogram::Histogram::builder()
|
||||
.build()
|
||||
.expect("Bad histogram params"),
|
||||
min: u64::MAX,
|
||||
max: 0,
|
||||
}
|
||||
}
|
||||
|
||||
fn sample(&mut self, v: u64) -> Result<(), histogram::Error> {
|
||||
self.min = std::cmp::min(self.min, v);
|
||||
self.max = std::cmp::max(self.max, v);
|
||||
let r = self.histo.increment(v, 1);
|
||||
|
||||
if r.is_err() {
|
||||
tracing::warn!("Bad histogram sample: {v}");
|
||||
}
|
||||
|
||||
r
|
||||
}
|
||||
|
||||
fn oneline(&self) -> String {
|
||||
let percentiles = match self.histo.percentiles(&[1.0, 10.0, 50.0, 90.0, 99.0]) {
|
||||
Ok(p) => p,
|
||||
Err(e) => return format!("No data: {}", e),
|
||||
};
|
||||
|
||||
let percentiles: Vec<u64> = percentiles
|
||||
.iter()
|
||||
.map(|p| p.bucket().low() + p.bucket().high() / 2)
|
||||
.collect();
|
||||
|
||||
format!(
|
||||
"min {}, 1% {}, 10% {}, 50% {}, 90% {}, 99% {}, max {}",
|
||||
self.min,
|
||||
percentiles[0],
|
||||
percentiles[1],
|
||||
percentiles[2],
|
||||
percentiles[3],
|
||||
percentiles[4],
|
||||
self.max,
|
||||
)
|
||||
}
|
||||
}
|
||||
|
||||
impl MetadataSummary {
|
||||
fn new() -> Self {
|
||||
Self {
|
||||
count: 0,
|
||||
with_errors: HashSet::new(),
|
||||
with_warnings: HashSet::new(),
|
||||
with_garbage: HashSet::new(),
|
||||
indices_by_version: HashMap::new(),
|
||||
layer_count: MinMaxHisto::new(),
|
||||
timeline_size_bytes: MinMaxHisto::new(),
|
||||
layer_size_bytes: MinMaxHisto::new(),
|
||||
}
|
||||
}
|
||||
|
||||
fn update_histograms(&mut self, index_part: &IndexPart) -> Result<(), histogram::Error> {
|
||||
self.layer_count
|
||||
.sample(index_part.layer_metadata.len() as u64)?;
|
||||
let mut total_size: u64 = 0;
|
||||
for meta in index_part.layer_metadata.values() {
|
||||
total_size += meta.file_size;
|
||||
self.layer_size_bytes.sample(meta.file_size)?;
|
||||
}
|
||||
self.timeline_size_bytes.sample(total_size)?;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn update_data(&mut self, data: &S3TimelineBlobData) {
|
||||
self.count += 1;
|
||||
if let BlobDataParseResult::Parsed {
|
||||
index_part,
|
||||
s3_layers: _,
|
||||
} = &data.blob_data
|
||||
{
|
||||
*self
|
||||
.indices_by_version
|
||||
.entry(index_part.get_version())
|
||||
.or_insert(0) += 1;
|
||||
|
||||
if let Err(e) = self.update_histograms(index_part) {
|
||||
// Value out of range? Warn that the results are untrustworthy
|
||||
tracing::warn!(
|
||||
"Error updating histograms, summary stats may be wrong: {}",
|
||||
e
|
||||
);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
fn update_analysis(&mut self, id: &TenantTimelineId, analysis: &TimelineAnalysis) {
|
||||
if !analysis.errors.is_empty() {
|
||||
self.with_errors.insert(*id);
|
||||
}
|
||||
|
||||
if !analysis.warnings.is_empty() {
|
||||
self.with_warnings.insert(*id);
|
||||
}
|
||||
}
|
||||
|
||||
/// Long-form output for printing at end of a scan
|
||||
pub fn summary_string(&self) -> String {
|
||||
let version_summary: String = itertools::join(
|
||||
self.indices_by_version
|
||||
.iter()
|
||||
.map(|(k, v)| format!("{k}: {v}")),
|
||||
", ",
|
||||
);
|
||||
|
||||
format!(
|
||||
"Timelines: {0}
|
||||
With errors: {1}
|
||||
With warnings: {2}
|
||||
With garbage: {3}
|
||||
Index versions: {version_summary}
|
||||
Timeline size bytes: {4}
|
||||
Layer size bytes: {5}
|
||||
Timeline layer count: {6}
|
||||
",
|
||||
self.count,
|
||||
self.with_errors.len(),
|
||||
self.with_warnings.len(),
|
||||
self.with_garbage.len(),
|
||||
self.timeline_size_bytes.oneline(),
|
||||
self.layer_size_bytes.oneline(),
|
||||
self.layer_count.oneline(),
|
||||
)
|
||||
}
|
||||
|
||||
pub fn is_fatal(&self) -> bool {
|
||||
!self.with_errors.is_empty()
|
||||
}
|
||||
}
|
||||
|
||||
/// Scan the pageserver metadata in an S3 bucket, reporting errors and statistics.
|
||||
pub async fn scan_metadata(bucket_config: BucketConfig) -> anyhow::Result<MetadataSummary> {
|
||||
let file_name = format!(
|
||||
"{}_scan_metadata_{}_{}.log",
|
||||
CLI_NAME,
|
||||
bucket_config.bucket,
|
||||
chrono::Utc::now().format("%Y_%m_%d__%H_%M_%S")
|
||||
);
|
||||
|
||||
let _guard = init_logging(&file_name);
|
||||
|
||||
let s3_client = Arc::new(init_s3_client(
|
||||
bucket_config.sso_account_id,
|
||||
Region::new(bucket_config.region),
|
||||
));
|
||||
let delimiter = "/";
|
||||
let target = RootTarget::Pageserver(S3Target {
|
||||
bucket_name: bucket_config.bucket.to_string(),
|
||||
prefix_in_bucket: ["pageserver", "v1", TENANTS_SEGMENT_NAME, ""].join(delimiter),
|
||||
delimiter: delimiter.to_string(),
|
||||
});
|
||||
|
||||
let tenants = stream_tenants(&s3_client, &target);
|
||||
|
||||
// How many tenants to process in parallel. We need to be mindful of pageservers
|
||||
// accessing the same per tenant prefixes, so use a lower setting than pageservers.
|
||||
const CONCURRENCY: usize = 32;
|
||||
|
||||
// Generate a stream of TenantTimelineId
|
||||
let timelines = tenants.map_ok(|t| stream_tenant_timelines(&s3_client, &target, t));
|
||||
let timelines = timelines.try_buffer_unordered(CONCURRENCY);
|
||||
let timelines = timelines.try_flatten();
|
||||
|
||||
// Generate a stream of S3TimelineBlobData
|
||||
async fn report_on_timeline(
|
||||
s3_client: &Client,
|
||||
target: &RootTarget,
|
||||
ttid: TenantTimelineId,
|
||||
) -> anyhow::Result<(TenantTimelineId, S3TimelineBlobData)> {
|
||||
let data = list_timeline_blobs(s3_client, ttid, target).await?;
|
||||
Ok((ttid, data))
|
||||
}
|
||||
let timelines = timelines.map_ok(|ttid| report_on_timeline(&s3_client, &target, ttid));
|
||||
let timelines = timelines.try_buffer_unordered(CONCURRENCY);
|
||||
|
||||
let mut summary = MetadataSummary::new();
|
||||
pin_mut!(timelines);
|
||||
while let Some(i) = timelines.next().await {
|
||||
let (ttid, data) = i?;
|
||||
summary.update_data(&data);
|
||||
|
||||
let analysis =
|
||||
branch_cleanup_and_check_errors(&ttid, &target, None, None, Some(data)).await;
|
||||
|
||||
summary.update_analysis(&ttid, &analysis);
|
||||
}
|
||||
|
||||
Ok(summary)
|
||||
}
|
||||
@@ -18,10 +18,6 @@
|
||||
// reportUrl: "...",
|
||||
// reportJsonUrl: "...",
|
||||
// },
|
||||
// coverage: {
|
||||
// coverageUrl: "...",
|
||||
// summaryJsonUrl: "...",
|
||||
// }
|
||||
// })
|
||||
//
|
||||
|
||||
@@ -187,24 +183,7 @@ const reportSummary = async (params) => {
|
||||
return summary
|
||||
}
|
||||
|
||||
const parseCoverageSummary = async ({ summaryJsonUrl, coverageUrl, fetch }) => {
|
||||
let summary = `### Code coverage [full report](${coverageUrl})\n`
|
||||
|
||||
const coverage = await (await fetch(summaryJsonUrl)).json()
|
||||
for (const covType of Object.keys(coverage).sort()) {
|
||||
if (!coverage.hasOwnProperty(covType)) {
|
||||
continue
|
||||
}
|
||||
|
||||
summary += `- \`${covType}s\`: \`${coverage[covType]["_summary"]}\`\n`
|
||||
}
|
||||
|
||||
summary += `\n___\n`
|
||||
|
||||
return summary
|
||||
}
|
||||
|
||||
module.exports = async ({ github, context, fetch, report, coverage }) => {
|
||||
module.exports = async ({ github, context, fetch, report }) => {
|
||||
// Marker to find the comment in the subsequent runs
|
||||
const startMarker = `<!--AUTOMATIC COMMENT START #${context.payload.number}-->`
|
||||
// If we run the script in the PR or in the branch (main/release/...)
|
||||
@@ -225,6 +204,7 @@ module.exports = async ({ github, context, fetch, report, coverage }) => {
|
||||
}
|
||||
|
||||
const {reportUrl, reportJsonUrl} = report
|
||||
|
||||
if (reportUrl && reportJsonUrl) {
|
||||
try {
|
||||
const parsed = await parseReportJson({ reportJsonUrl, fetch })
|
||||
@@ -243,22 +223,6 @@ module.exports = async ({ github, context, fetch, report, coverage }) => {
|
||||
} else {
|
||||
commentBody += `#### No tests were run or test report is not available\n`
|
||||
}
|
||||
|
||||
const { coverageUrl, summaryJsonUrl } = coverage
|
||||
if (coverageUrl && summaryJsonUrl) {
|
||||
try {
|
||||
commentBody += await parseCoverageSummary({ summaryJsonUrl, coverageUrl, fetch })
|
||||
} catch (error) {
|
||||
commentBody += `### [full report](${coverageUrl})\n___\n`
|
||||
commentBody += `#### Failed to create a coverage summary for the test run: \n`
|
||||
commentBody += "```\n"
|
||||
commentBody += `${error.stack}\n`
|
||||
commentBody += "```\n"
|
||||
}
|
||||
} else {
|
||||
commentBody += `#### Test coverage report is not avaibale\n`
|
||||
}
|
||||
|
||||
commentBody += autoupdateNotice
|
||||
|
||||
let createCommentFn, listCommentsFn, updateCommentFn, issueNumberOrSha
|
||||
|
||||
@@ -414,8 +414,6 @@ class NeonEnvBuilder:
|
||||
neon_binpath: Path,
|
||||
pg_distrib_dir: Path,
|
||||
pg_version: PgVersion,
|
||||
test_name: str,
|
||||
test_output_dir: Path,
|
||||
remote_storage: Optional[RemoteStorage] = None,
|
||||
remote_storage_users: RemoteStorageUsers = RemoteStorageUsers.PAGESERVER,
|
||||
pageserver_config_override: Optional[str] = None,
|
||||
@@ -456,14 +454,6 @@ class NeonEnvBuilder:
|
||||
self.preserve_database_files = preserve_database_files
|
||||
self.initial_tenant = initial_tenant or TenantId.generate()
|
||||
self.initial_timeline = initial_timeline or TimelineId.generate()
|
||||
self.enable_generations = False
|
||||
self.scrub_on_exit = False
|
||||
self.test_output_dir = test_output_dir
|
||||
|
||||
assert test_name.startswith(
|
||||
"test_"
|
||||
), "Unexpectedly instantiated from outside a test function"
|
||||
self.test_name = test_name
|
||||
|
||||
def init_configs(self) -> NeonEnv:
|
||||
# Cannot create more than one environment from one builder
|
||||
@@ -493,44 +483,26 @@ class NeonEnvBuilder:
|
||||
|
||||
return env
|
||||
|
||||
def enable_scrub_on_exit(self):
|
||||
"""
|
||||
Call this if you would like the fixture to automatically run
|
||||
s3_scrubber at the end of the test, as a bidirectional test
|
||||
that the scrubber is working properly, and that the code within
|
||||
the test didn't produce any invalid remote state.
|
||||
"""
|
||||
|
||||
if not isinstance(self.remote_storage, S3Storage):
|
||||
# The scrubber can't talk to e.g. LocalFS -- it needs
|
||||
# an HTTP endpoint (mock is fine) to connect to.
|
||||
raise RuntimeError(
|
||||
"Cannot scrub with remote_storage={self.remote_storage}, require an S3 endpoint"
|
||||
)
|
||||
|
||||
self.scrub_on_exit = True
|
||||
|
||||
def enable_remote_storage(
|
||||
self,
|
||||
remote_storage_kind: RemoteStorageKind,
|
||||
test_name: str,
|
||||
force_enable: bool = True,
|
||||
enable_remote_extensions: bool = False,
|
||||
):
|
||||
bucket_name = re.sub(r"[_\[\]]", "-", self.test_name)[:63]
|
||||
|
||||
if remote_storage_kind == RemoteStorageKind.NOOP:
|
||||
return
|
||||
elif remote_storage_kind == RemoteStorageKind.LOCAL_FS:
|
||||
self.enable_local_fs_remote_storage(force_enable=force_enable)
|
||||
elif remote_storage_kind == RemoteStorageKind.MOCK_S3:
|
||||
self.enable_mock_s3_remote_storage(
|
||||
bucket_name=bucket_name,
|
||||
bucket_name=test_name,
|
||||
force_enable=force_enable,
|
||||
enable_remote_extensions=enable_remote_extensions,
|
||||
)
|
||||
elif remote_storage_kind == RemoteStorageKind.REAL_S3:
|
||||
self.enable_real_s3_remote_storage(
|
||||
test_name=bucket_name,
|
||||
test_name=test_name,
|
||||
force_enable=force_enable,
|
||||
enable_remote_extensions=enable_remote_extensions,
|
||||
)
|
||||
@@ -741,24 +713,12 @@ class NeonEnvBuilder:
|
||||
sk.stop(immediate=True)
|
||||
self.env.pageserver.stop(immediate=True)
|
||||
|
||||
if self.env.attachment_service is not None:
|
||||
self.env.attachment_service.stop(immediate=True)
|
||||
|
||||
cleanup_error = None
|
||||
|
||||
if self.scrub_on_exit:
|
||||
try:
|
||||
S3Scrubber(self.test_output_dir, self).scan_metadata()
|
||||
except Exception as e:
|
||||
log.error(f"Error during remote storage scrub: {e}")
|
||||
cleanup_error = e
|
||||
|
||||
try:
|
||||
self.cleanup_remote_storage()
|
||||
except Exception as e:
|
||||
log.error(f"Error during remote storage cleanup: {e}")
|
||||
if cleanup_error is not None:
|
||||
cleanup_error = e
|
||||
cleanup_error = e
|
||||
|
||||
try:
|
||||
self.cleanup_local_storage()
|
||||
@@ -806,8 +766,6 @@ class NeonEnv:
|
||||
the tenant id
|
||||
"""
|
||||
|
||||
PAGESERVER_ID = 1
|
||||
|
||||
def __init__(self, config: NeonEnvBuilder):
|
||||
self.repo_dir = config.repo_dir
|
||||
self.rust_log_override = config.rust_log_override
|
||||
@@ -831,14 +789,6 @@ class NeonEnv:
|
||||
self.initial_tenant = config.initial_tenant
|
||||
self.initial_timeline = config.initial_timeline
|
||||
|
||||
if config.enable_generations:
|
||||
attachment_service_port = self.port_distributor.get_port()
|
||||
self.control_plane_api: Optional[str] = f"http://127.0.0.1:{attachment_service_port}"
|
||||
self.attachment_service: Optional[NeonAttachmentService] = NeonAttachmentService(self)
|
||||
else:
|
||||
self.control_plane_api = None
|
||||
self.attachment_service = None
|
||||
|
||||
# Create a config file corresponding to the options
|
||||
toml = textwrap.dedent(
|
||||
f"""
|
||||
@@ -864,7 +814,7 @@ class NeonEnv:
|
||||
toml += textwrap.dedent(
|
||||
f"""
|
||||
[pageserver]
|
||||
id={self.PAGESERVER_ID}
|
||||
id=1
|
||||
listen_pg_addr = 'localhost:{pageserver_port.pg}'
|
||||
listen_http_addr = 'localhost:{pageserver_port.http}'
|
||||
pg_auth_type = '{pg_auth_type}'
|
||||
@@ -872,13 +822,6 @@ class NeonEnv:
|
||||
"""
|
||||
)
|
||||
|
||||
if self.control_plane_api is not None:
|
||||
toml += textwrap.dedent(
|
||||
f"""
|
||||
control_plane_api = '{self.control_plane_api}'
|
||||
"""
|
||||
)
|
||||
|
||||
# Create a corresponding NeonPageserver object
|
||||
self.pageserver = NeonPageserver(
|
||||
self, port=pageserver_port, config_override=config.pageserver_config_override
|
||||
@@ -925,9 +868,6 @@ class NeonEnv:
|
||||
def start(self):
|
||||
# Start up broker, pageserver and all safekeepers
|
||||
self.broker.try_start()
|
||||
|
||||
if self.attachment_service is not None:
|
||||
self.attachment_service.start()
|
||||
self.pageserver.start()
|
||||
|
||||
for safekeeper in self.safekeepers:
|
||||
@@ -982,7 +922,6 @@ def _shared_simple_env(
|
||||
default_broker: NeonBroker,
|
||||
run_id: uuid.UUID,
|
||||
top_output_dir: Path,
|
||||
test_output_dir: Path,
|
||||
neon_binpath: Path,
|
||||
pg_distrib_dir: Path,
|
||||
pg_version: PgVersion,
|
||||
@@ -1010,8 +949,6 @@ def _shared_simple_env(
|
||||
pg_version=pg_version,
|
||||
run_id=run_id,
|
||||
preserve_database_files=pytestconfig.getoption("--preserve-database-files"),
|
||||
test_name=request.node.name,
|
||||
test_output_dir=test_output_dir,
|
||||
) as builder:
|
||||
env = builder.init_start()
|
||||
|
||||
@@ -1039,7 +976,7 @@ def neon_simple_env(_shared_simple_env: NeonEnv) -> Iterator[NeonEnv]:
|
||||
@pytest.fixture(scope="function")
|
||||
def neon_env_builder(
|
||||
pytestconfig: Config,
|
||||
test_output_dir: Path,
|
||||
test_output_dir: str,
|
||||
port_distributor: PortDistributor,
|
||||
mock_s3_server: MockS3Server,
|
||||
neon_binpath: Path,
|
||||
@@ -1047,7 +984,6 @@ def neon_env_builder(
|
||||
pg_version: PgVersion,
|
||||
default_broker: NeonBroker,
|
||||
run_id: uuid.UUID,
|
||||
request: FixtureRequest,
|
||||
) -> Iterator[NeonEnvBuilder]:
|
||||
"""
|
||||
Fixture to create a Neon environment for test.
|
||||
@@ -1076,8 +1012,6 @@ def neon_env_builder(
|
||||
broker=default_broker,
|
||||
run_id=run_id,
|
||||
preserve_database_files=pytestconfig.getoption("--preserve-database-files"),
|
||||
test_name=request.node.name,
|
||||
test_output_dir=test_output_dir,
|
||||
) as builder:
|
||||
yield builder
|
||||
|
||||
@@ -1355,16 +1289,6 @@ class NeonCli(AbstractNeonCli):
|
||||
res.check_returncode()
|
||||
return res
|
||||
|
||||
def attachment_service_start(self):
|
||||
cmd = ["attachment_service", "start"]
|
||||
return self.raw_cli(cmd)
|
||||
|
||||
def attachment_service_stop(self, immediate: bool):
|
||||
cmd = ["attachment_service", "stop"]
|
||||
if immediate:
|
||||
cmd.extend(["-m", "immediate"])
|
||||
return self.raw_cli(cmd)
|
||||
|
||||
def pageserver_start(
|
||||
self,
|
||||
overrides: Tuple[str, ...] = (),
|
||||
@@ -1546,35 +1470,6 @@ class ComputeCtl(AbstractNeonCli):
|
||||
COMMAND = "compute_ctl"
|
||||
|
||||
|
||||
class NeonAttachmentService:
|
||||
def __init__(self, env: NeonEnv):
|
||||
self.env = env
|
||||
self.running = False
|
||||
|
||||
def start(self):
|
||||
assert not self.running
|
||||
self.env.neon_cli.attachment_service_start()
|
||||
self.running = True
|
||||
return self
|
||||
|
||||
def stop(self, immediate: bool = False) -> "NeonAttachmentService":
|
||||
if self.running:
|
||||
self.env.neon_cli.attachment_service_stop(immediate)
|
||||
self.running = False
|
||||
return self
|
||||
|
||||
def __enter__(self) -> "NeonAttachmentService":
|
||||
return self
|
||||
|
||||
def __exit__(
|
||||
self,
|
||||
exc_type: Optional[Type[BaseException]],
|
||||
exc: Optional[BaseException],
|
||||
tb: Optional[TracebackType],
|
||||
):
|
||||
self.stop(immediate=True)
|
||||
|
||||
|
||||
class NeonPageserver(PgProtocol):
|
||||
"""
|
||||
An object representing a running pageserver.
|
||||
@@ -1738,26 +1633,6 @@ class NeonPageserver(PgProtocol):
|
||||
|
||||
return None
|
||||
|
||||
def tenant_attach(
|
||||
self, tenant_id: TenantId, config: None | Dict[str, Any] = None, config_null: bool = False
|
||||
):
|
||||
"""
|
||||
Tenant attachment passes through here to acquire a generation number before proceeding
|
||||
to call into the pageserver HTTP client.
|
||||
"""
|
||||
if self.env.attachment_service is not None:
|
||||
response = requests.post(
|
||||
f"{self.env.control_plane_api}/attach_hook",
|
||||
json={"tenant_id": str(tenant_id), "pageserver_id": self.env.PAGESERVER_ID},
|
||||
)
|
||||
response.raise_for_status()
|
||||
generation = response.json()["gen"]
|
||||
else:
|
||||
generation = None
|
||||
|
||||
client = self.env.pageserver.http_client()
|
||||
return client.tenant_attach(tenant_id, config, config_null, generation=generation)
|
||||
|
||||
|
||||
def append_pageserver_param_overrides(
|
||||
params_to_update: List[str],
|
||||
@@ -1843,10 +1718,7 @@ class PgBin:
|
||||
self._fixpath(command)
|
||||
log.info(f"Running command '{' '.join(command)}'")
|
||||
env = self._build_env(env)
|
||||
base_path, _, _ = subprocess_capture(
|
||||
self.log_dir, command, env=env, cwd=cwd, check=True, **kwargs
|
||||
)
|
||||
return base_path
|
||||
return subprocess_capture(self.log_dir, command, env=env, cwd=cwd, check=True, **kwargs)
|
||||
|
||||
|
||||
@pytest.fixture(scope="function")
|
||||
@@ -2852,41 +2724,6 @@ class SafekeeperHttpClient(requests.Session):
|
||||
return metrics
|
||||
|
||||
|
||||
class S3Scrubber:
|
||||
def __init__(self, log_dir: Path, env: NeonEnvBuilder):
|
||||
self.env = env
|
||||
self.log_dir = log_dir
|
||||
|
||||
def scrubber_cli(self, args, timeout):
|
||||
assert isinstance(self.env.remote_storage, S3Storage)
|
||||
s3_storage = self.env.remote_storage
|
||||
|
||||
env = {
|
||||
"REGION": s3_storage.bucket_region,
|
||||
"BUCKET": s3_storage.bucket_name,
|
||||
}
|
||||
env.update(s3_storage.access_env_vars())
|
||||
|
||||
if s3_storage.endpoint is not None:
|
||||
env.update({"AWS_ENDPOINT_URL": s3_storage.endpoint})
|
||||
|
||||
base_args = [self.env.neon_binpath / "s3_scrubber"]
|
||||
args = base_args + args
|
||||
|
||||
(output_path, _, status_code) = subprocess_capture(
|
||||
self.log_dir, args, echo_stderr=True, echo_stdout=True, env=env, check=False
|
||||
)
|
||||
if status_code:
|
||||
log.warning(f"Scrub command {args} failed")
|
||||
log.warning(f"Scrub environment: {env}")
|
||||
log.warning(f"Output at: {output_path}")
|
||||
|
||||
raise RuntimeError("Remote storage scrub failed")
|
||||
|
||||
def scan_metadata(self):
|
||||
self.scrubber_cli(["scan-metadata"], timeout=30)
|
||||
|
||||
|
||||
def get_test_output_dir(request: FixtureRequest, top_output_dir: Path) -> Path:
|
||||
"""Compute the working directory for an individual test."""
|
||||
test_name = request.node.name
|
||||
|
||||
@@ -186,25 +186,18 @@ class PageserverHttpClient(requests.Session):
|
||||
return TenantId(new_tenant_id)
|
||||
|
||||
def tenant_attach(
|
||||
self,
|
||||
tenant_id: TenantId,
|
||||
config: None | Dict[str, Any] = None,
|
||||
config_null: bool = False,
|
||||
generation: Optional[int] = None,
|
||||
self, tenant_id: TenantId, config: None | Dict[str, Any] = None, config_null: bool = False
|
||||
):
|
||||
if config_null:
|
||||
assert config is None
|
||||
body: Any = None
|
||||
body = "null"
|
||||
else:
|
||||
# null-config is prohibited by the API
|
||||
config = config or {}
|
||||
body = {"config": config}
|
||||
if generation is not None:
|
||||
body.update({"generation": generation})
|
||||
|
||||
body = json.dumps({"config": config})
|
||||
res = self.post(
|
||||
f"http://localhost:{self.port}/v1/tenant/{tenant_id}/attach",
|
||||
data=json.dumps(body),
|
||||
data=body,
|
||||
headers={"Content-Type": "application/json"},
|
||||
)
|
||||
self.verbose_error(res)
|
||||
|
||||
@@ -88,19 +88,6 @@ def available_s3_storages() -> List[RemoteStorageKind]:
|
||||
return remote_storages
|
||||
|
||||
|
||||
def s3_storage() -> RemoteStorageKind:
|
||||
"""
|
||||
For tests that require a remote storage impl that exposes an S3
|
||||
endpoint, but don't want to parametrize over multiple storage types.
|
||||
|
||||
Use real S3 if available, else use MockS3
|
||||
"""
|
||||
if os.getenv("ENABLE_REAL_S3_REMOTE_STORAGE") is not None:
|
||||
return RemoteStorageKind.REAL_S3
|
||||
else:
|
||||
return RemoteStorageKind.MOCK_S3
|
||||
|
||||
|
||||
@dataclass
|
||||
class LocalFsStorage:
|
||||
root: Path
|
||||
|
||||
@@ -4,10 +4,9 @@ import os
|
||||
import re
|
||||
import subprocess
|
||||
import tarfile
|
||||
import threading
|
||||
import time
|
||||
from pathlib import Path
|
||||
from typing import TYPE_CHECKING, Any, Callable, Dict, List, Optional, Tuple, TypeVar
|
||||
from typing import TYPE_CHECKING, Any, Callable, Dict, List, Tuple, TypeVar
|
||||
from urllib.parse import urlencode
|
||||
|
||||
import allure
|
||||
@@ -27,100 +26,34 @@ def get_self_dir() -> Path:
|
||||
return Path(__file__).resolve().parent
|
||||
|
||||
|
||||
def subprocess_capture(
|
||||
capture_dir: Path,
|
||||
cmd: List[str],
|
||||
*,
|
||||
check=False,
|
||||
echo_stderr=False,
|
||||
echo_stdout=False,
|
||||
capture_stdout=False,
|
||||
**kwargs: Any,
|
||||
) -> Tuple[str, Optional[str], int]:
|
||||
"""Run a process and bifurcate its output to files and the `log` logger
|
||||
def subprocess_capture(capture_dir: Path, cmd: List[str], **kwargs: Any) -> str:
|
||||
"""Run a process and capture its output
|
||||
|
||||
stderr and stdout are always captured in files. They are also optionally
|
||||
echoed to the log (echo_stderr, echo_stdout), and/or captured and returned
|
||||
(capture_stdout).
|
||||
|
||||
File output will go to files named "cmd_NNN.stdout" and "cmd_NNN.stderr"
|
||||
Output will go to files named "cmd_NNN.stdout" and "cmd_NNN.stderr"
|
||||
where "cmd" is the name of the program and NNN is an incrementing
|
||||
counter.
|
||||
|
||||
If those files already exist, we will overwrite them.
|
||||
|
||||
Returns 3-tuple of:
|
||||
- The base path for output files
|
||||
- Captured stdout, or None
|
||||
- The exit status of the process
|
||||
Returns basepath for files with captured output.
|
||||
"""
|
||||
assert isinstance(cmd, list)
|
||||
base_cmd = os.path.basename(cmd[0])
|
||||
base = f"{base_cmd}_{global_counter()}"
|
||||
base = f"{os.path.basename(cmd[0])}_{global_counter()}"
|
||||
basepath = os.path.join(capture_dir, base)
|
||||
stdout_filename = f"{basepath}.stdout"
|
||||
stderr_filename = f"{basepath}.stderr"
|
||||
|
||||
# Since we will stream stdout and stderr concurrently, need to do it in a thread.
|
||||
class OutputHandler(threading.Thread):
|
||||
def __init__(self, in_file, out_file, echo: bool, capture: bool):
|
||||
super().__init__()
|
||||
self.in_file = in_file
|
||||
self.out_file = out_file
|
||||
self.echo = echo
|
||||
self.capture = capture
|
||||
self.captured = ""
|
||||
|
||||
def run(self):
|
||||
for line in self.in_file:
|
||||
# Only bother decoding if we are going to do something more than stream to a file
|
||||
if self.echo or self.capture:
|
||||
string = line.decode(encoding="utf-8", errors="replace")
|
||||
|
||||
if self.echo:
|
||||
log.info(string)
|
||||
|
||||
if self.capture:
|
||||
self.captured += string
|
||||
|
||||
self.out_file.write(line)
|
||||
|
||||
captured = None
|
||||
try:
|
||||
with open(stdout_filename, "wb") as stdout_f:
|
||||
with open(stderr_filename, "wb") as stderr_f:
|
||||
with open(stdout_filename, "w") as stdout_f:
|
||||
with open(stderr_filename, "w") as stderr_f:
|
||||
log.info(f'Capturing stdout to "{base}.stdout" and stderr to "{base}.stderr"')
|
||||
|
||||
p = subprocess.Popen(
|
||||
cmd,
|
||||
**kwargs,
|
||||
stdout=subprocess.PIPE,
|
||||
stderr=subprocess.PIPE,
|
||||
)
|
||||
stdout_handler = OutputHandler(
|
||||
p.stdout, stdout_f, echo=echo_stdout, capture=capture_stdout
|
||||
)
|
||||
stdout_handler.start()
|
||||
stderr_handler = OutputHandler(p.stderr, stderr_f, echo=echo_stderr, capture=False)
|
||||
stderr_handler.start()
|
||||
|
||||
r = p.wait()
|
||||
|
||||
stdout_handler.join()
|
||||
stderr_handler.join()
|
||||
|
||||
if check and r != 0:
|
||||
raise subprocess.CalledProcessError(r, " ".join(cmd))
|
||||
|
||||
if capture_stdout:
|
||||
captured = stdout_handler.captured
|
||||
subprocess.run(cmd, **kwargs, stdout=stdout_f, stderr=stderr_f)
|
||||
finally:
|
||||
# Remove empty files if there is no output
|
||||
for filename in (stdout_filename, stderr_filename):
|
||||
if os.stat(filename).st_size == 0:
|
||||
os.remove(filename)
|
||||
|
||||
return (basepath, captured, r)
|
||||
return basepath
|
||||
|
||||
|
||||
_global_counter = 0
|
||||
|
||||
@@ -1,6 +1,5 @@
|
||||
import os
|
||||
from dataclasses import dataclass
|
||||
from typing import Dict, List, Tuple
|
||||
from typing import Dict, Tuple
|
||||
|
||||
import pytest
|
||||
from _pytest.mark import ParameterSet
|
||||
@@ -79,15 +78,6 @@ QUERIES: Tuple[LabelledQuery, ...] = (
|
||||
)
|
||||
|
||||
|
||||
def get_scale() -> List[str]:
|
||||
# We parametrize each tpc-h and clickbench test with scale
|
||||
# to distinguish them from each other, but don't really use it inside.
|
||||
# Databases are pre-created and passed through BENCHMARK_CONNSTR env variable.
|
||||
|
||||
scale = os.getenv("TEST_OLAP_SCALE", "noscale")
|
||||
return [scale]
|
||||
|
||||
|
||||
def run_psql(env: RemoteCompare, labelled_query: LabelledQuery, times: int) -> None:
|
||||
# prepare connstr:
|
||||
# - cut out password from connstr to pass it via env
|
||||
@@ -110,10 +100,9 @@ def run_psql(env: RemoteCompare, labelled_query: LabelledQuery, times: int) -> N
|
||||
env.pg_bin.run_capture(["psql", connstr, "-c", query], env=environ)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("scale", get_scale())
|
||||
@pytest.mark.parametrize("query", QUERIES)
|
||||
@pytest.mark.remote_cluster
|
||||
def test_clickbench(query: LabelledQuery, remote_compare: RemoteCompare, scale: str):
|
||||
def test_clickbench(query: LabelledQuery, remote_compare: RemoteCompare):
|
||||
"""
|
||||
An OLAP-style ClickHouse benchmark
|
||||
|
||||
@@ -139,10 +128,9 @@ def tpch_queuies() -> Tuple[ParameterSet, ...]:
|
||||
)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("scale", get_scale())
|
||||
@pytest.mark.parametrize("query", tpch_queuies())
|
||||
@pytest.mark.remote_cluster
|
||||
def test_tpch(query: LabelledQuery, remote_compare: RemoteCompare, scale: str):
|
||||
def test_tpch(query: LabelledQuery, remote_compare: RemoteCompare):
|
||||
"""
|
||||
TCP-H Benchmark
|
||||
|
||||
|
||||
@@ -16,6 +16,7 @@ from fixtures.utils import wait_until
|
||||
def positive_env(neon_env_builder: NeonEnvBuilder) -> NeonEnv:
|
||||
neon_env_builder.enable_remote_storage(
|
||||
remote_storage_kind=RemoteStorageKind.LOCAL_FS,
|
||||
test_name="test_attach_tenant_config",
|
||||
)
|
||||
env = neon_env_builder.init_start()
|
||||
|
||||
@@ -38,6 +39,7 @@ class NegativeTests:
|
||||
def negative_env(neon_env_builder: NeonEnvBuilder) -> Generator[NegativeTests, None, None]:
|
||||
neon_env_builder.enable_remote_storage(
|
||||
remote_storage_kind=RemoteStorageKind.LOCAL_FS,
|
||||
test_name="test_attach_tenant_config",
|
||||
)
|
||||
env = neon_env_builder.init_start()
|
||||
assert isinstance(env.remote_storage, LocalFsStorage)
|
||||
|
||||
@@ -135,7 +135,7 @@ def eviction_env(request, neon_env_builder: NeonEnvBuilder, pg_bin: PgBin) -> Ev
|
||||
|
||||
log.info(f"setting up eviction_env for test {request.node.name}")
|
||||
|
||||
neon_env_builder.enable_remote_storage(RemoteStorageKind.LOCAL_FS)
|
||||
neon_env_builder.enable_remote_storage(RemoteStorageKind.LOCAL_FS, f"{request.node.name}")
|
||||
|
||||
# initial tenant will not be present on this pageserver
|
||||
env = neon_env_builder.init_configs()
|
||||
|
||||
@@ -90,6 +90,7 @@ def test_remote_extensions(
|
||||
):
|
||||
neon_env_builder.enable_remote_storage(
|
||||
remote_storage_kind=remote_storage_kind,
|
||||
test_name="test_remote_extensions",
|
||||
enable_remote_extensions=True,
|
||||
)
|
||||
env = neon_env_builder.init_start()
|
||||
@@ -156,6 +157,7 @@ def test_remote_library(
|
||||
):
|
||||
neon_env_builder.enable_remote_storage(
|
||||
remote_storage_kind=remote_storage_kind,
|
||||
test_name="test_remote_library",
|
||||
enable_remote_extensions=True,
|
||||
)
|
||||
env = neon_env_builder.init_start()
|
||||
@@ -216,6 +218,7 @@ def test_multiple_extensions_one_archive(
|
||||
):
|
||||
neon_env_builder.enable_remote_storage(
|
||||
remote_storage_kind=RemoteStorageKind.REAL_S3,
|
||||
test_name="test_multiple_extensions_one_archive",
|
||||
enable_remote_extensions=True,
|
||||
)
|
||||
env = neon_env_builder.init_start()
|
||||
@@ -263,6 +266,7 @@ def test_extension_download_after_restart(
|
||||
|
||||
neon_env_builder.enable_remote_storage(
|
||||
remote_storage_kind=RemoteStorageKind.MOCK_S3,
|
||||
test_name="test_extension_download_after_restart",
|
||||
enable_remote_extensions=True,
|
||||
)
|
||||
env = neon_env_builder.init_start()
|
||||
|
||||
@@ -102,6 +102,7 @@ def test_gc_index_upload(neon_env_builder: NeonEnvBuilder, remote_storage_kind:
|
||||
|
||||
neon_env_builder.enable_remote_storage(
|
||||
remote_storage_kind=remote_storage_kind,
|
||||
test_name="test_gc_index_upload",
|
||||
)
|
||||
|
||||
env = neon_env_builder.init_start()
|
||||
|
||||
@@ -21,6 +21,7 @@ def test_basic_eviction(
|
||||
):
|
||||
neon_env_builder.enable_remote_storage(
|
||||
remote_storage_kind=remote_storage_kind,
|
||||
test_name="test_download_remote_layers_api",
|
||||
)
|
||||
|
||||
env = neon_env_builder.init_start(
|
||||
@@ -156,6 +157,7 @@ def test_basic_eviction(
|
||||
def test_gc_of_remote_layers(neon_env_builder: NeonEnvBuilder):
|
||||
neon_env_builder.enable_remote_storage(
|
||||
remote_storage_kind=RemoteStorageKind.LOCAL_FS,
|
||||
test_name="test_gc_of_remote_layers",
|
||||
)
|
||||
|
||||
env = neon_env_builder.init_start()
|
||||
|
||||
@@ -96,6 +96,7 @@ def test_metric_collection(
|
||||
|
||||
neon_env_builder.enable_remote_storage(
|
||||
remote_storage_kind=remote_storage_kind,
|
||||
test_name="test_metric_collection",
|
||||
)
|
||||
|
||||
log.info(f"test_metric_collection endpoint is {metric_collection_endpoint}")
|
||||
|
||||
@@ -54,6 +54,7 @@ def test_ondemand_download_large_rel(
|
||||
):
|
||||
neon_env_builder.enable_remote_storage(
|
||||
remote_storage_kind=remote_storage_kind,
|
||||
test_name="test_ondemand_download_large_rel",
|
||||
)
|
||||
|
||||
# thinking about using a shared environment? the test assumes that global
|
||||
@@ -156,6 +157,7 @@ def test_ondemand_download_timetravel(
|
||||
):
|
||||
neon_env_builder.enable_remote_storage(
|
||||
remote_storage_kind=remote_storage_kind,
|
||||
test_name="test_ondemand_download_timetravel",
|
||||
)
|
||||
|
||||
# thinking about using a shared environment? the test assumes that global
|
||||
@@ -317,6 +319,7 @@ def test_download_remote_layers_api(
|
||||
):
|
||||
neon_env_builder.enable_remote_storage(
|
||||
remote_storage_kind=remote_storage_kind,
|
||||
test_name="test_download_remote_layers_api",
|
||||
)
|
||||
|
||||
##### First start, insert data and upload it to the remote storage
|
||||
@@ -478,6 +481,7 @@ def test_compaction_downloads_on_demand_without_image_creation(
|
||||
"""
|
||||
neon_env_builder.enable_remote_storage(
|
||||
remote_storage_kind=remote_storage_kind,
|
||||
test_name="test_compaction_downloads_on_demand_without_image_creation",
|
||||
)
|
||||
|
||||
conf = {
|
||||
@@ -565,6 +569,7 @@ def test_compaction_downloads_on_demand_with_image_creation(
|
||||
"""
|
||||
neon_env_builder.enable_remote_storage(
|
||||
remote_storage_kind=remote_storage_kind,
|
||||
test_name="test_compaction_downloads_on_demand",
|
||||
)
|
||||
|
||||
conf = {
|
||||
@@ -665,6 +670,7 @@ def test_ondemand_download_failure_to_replace(
|
||||
|
||||
neon_env_builder.enable_remote_storage(
|
||||
remote_storage_kind=remote_storage_kind,
|
||||
test_name="test_ondemand_download_failure_to_replace",
|
||||
)
|
||||
|
||||
# disable gc and compaction via default tenant config because config is lost while detaching
|
||||
|
||||
@@ -3,17 +3,11 @@ from contextlib import closing
|
||||
import pytest
|
||||
from fixtures.log_helper import log
|
||||
from fixtures.neon_fixtures import NeonEnvBuilder
|
||||
from fixtures.remote_storage import s3_storage
|
||||
|
||||
|
||||
# Test restarting page server, while safekeeper and compute node keep
|
||||
# running.
|
||||
@pytest.mark.parametrize("generations", [True, False])
|
||||
def test_pageserver_restart(neon_env_builder: NeonEnvBuilder, generations: bool):
|
||||
neon_env_builder.enable_generations = generations
|
||||
neon_env_builder.enable_remote_storage(remote_storage_kind=s3_storage())
|
||||
neon_env_builder.enable_scrub_on_exit()
|
||||
|
||||
def test_pageserver_restart(neon_env_builder: NeonEnvBuilder):
|
||||
env = neon_env_builder.init_start()
|
||||
|
||||
env.neon_cli.create_branch("test_pageserver_restart")
|
||||
@@ -115,9 +109,6 @@ def test_pageserver_restart(neon_env_builder: NeonEnvBuilder, generations: bool)
|
||||
# safekeeper and compute node keep running.
|
||||
@pytest.mark.timeout(540)
|
||||
def test_pageserver_chaos(neon_env_builder: NeonEnvBuilder):
|
||||
neon_env_builder.enable_remote_storage(remote_storage_kind=s3_storage())
|
||||
neon_env_builder.enable_scrub_on_exit()
|
||||
|
||||
env = neon_env_builder.init_start()
|
||||
|
||||
# Use a tiny checkpoint distance, to create a lot of layers quickly.
|
||||
|
||||
@@ -52,9 +52,9 @@ from requests import ReadTimeout
|
||||
#
|
||||
# The tests are done for all types of remote storage pageserver supports.
|
||||
@pytest.mark.parametrize("remote_storage_kind", available_remote_storages())
|
||||
@pytest.mark.parametrize("generations", [True, False])
|
||||
def test_remote_storage_backup_and_restore(
|
||||
neon_env_builder: NeonEnvBuilder, remote_storage_kind: RemoteStorageKind, generations: bool
|
||||
neon_env_builder: NeonEnvBuilder,
|
||||
remote_storage_kind: RemoteStorageKind,
|
||||
):
|
||||
# Use this test to check more realistic SK ids: some etcd key parsing bugs were related,
|
||||
# and this test needs SK to write data to pageserver, so it will be visible
|
||||
@@ -62,10 +62,9 @@ def test_remote_storage_backup_and_restore(
|
||||
|
||||
neon_env_builder.enable_remote_storage(
|
||||
remote_storage_kind=remote_storage_kind,
|
||||
test_name="test_remote_storage_backup_and_restore",
|
||||
)
|
||||
|
||||
neon_env_builder.enable_generations = generations
|
||||
|
||||
# Exercise retry code path by making all uploads and downloads fail for the
|
||||
# first time. The retries print INFO-messages to the log; we will check
|
||||
# that they are present after the test.
|
||||
@@ -156,7 +155,7 @@ def test_remote_storage_backup_and_restore(
|
||||
# background task to load the tenant. In that background task,
|
||||
# listing the remote timelines will fail because of the failpoint,
|
||||
# and the tenant will be marked as Broken.
|
||||
env.pageserver.tenant_attach(tenant_id)
|
||||
client.tenant_attach(tenant_id)
|
||||
|
||||
tenant_info = wait_until_tenant_state(pageserver_http, tenant_id, "Broken", 15)
|
||||
assert tenant_info["attachment_status"] == {
|
||||
@@ -166,7 +165,7 @@ def test_remote_storage_backup_and_restore(
|
||||
|
||||
# Ensure that even though the tenant is broken, we can't attach it again.
|
||||
with pytest.raises(Exception, match=f"tenant {tenant_id} already exists, state: Broken"):
|
||||
env.pageserver.tenant_attach(tenant_id)
|
||||
client.tenant_attach(tenant_id)
|
||||
|
||||
# Restart again, this implicitly clears the failpoint.
|
||||
# test_remote_failures=1 remains active, though, as it's in the pageserver config.
|
||||
@@ -184,7 +183,7 @@ def test_remote_storage_backup_and_restore(
|
||||
# Ensure that the pageserver remembers that the tenant was attaching, by
|
||||
# trying to attach it again. It should fail.
|
||||
with pytest.raises(Exception, match=f"tenant {tenant_id} already exists, state:"):
|
||||
env.pageserver.tenant_attach(tenant_id)
|
||||
client.tenant_attach(tenant_id)
|
||||
log.info("waiting for tenant to become active. this should be quick with on-demand download")
|
||||
|
||||
wait_until_tenant_active(
|
||||
@@ -226,6 +225,7 @@ def test_remote_storage_upload_queue_retries(
|
||||
):
|
||||
neon_env_builder.enable_remote_storage(
|
||||
remote_storage_kind=remote_storage_kind,
|
||||
test_name="test_remote_storage_upload_queue_retries",
|
||||
)
|
||||
|
||||
env = neon_env_builder.init_start()
|
||||
@@ -364,7 +364,7 @@ def test_remote_storage_upload_queue_retries(
|
||||
env.pageserver.start()
|
||||
client = env.pageserver.http_client()
|
||||
|
||||
env.pageserver.tenant_attach(tenant_id)
|
||||
client.tenant_attach(tenant_id)
|
||||
|
||||
wait_until_tenant_active(client, tenant_id)
|
||||
|
||||
@@ -381,6 +381,7 @@ def test_remote_timeline_client_calls_started_metric(
|
||||
):
|
||||
neon_env_builder.enable_remote_storage(
|
||||
remote_storage_kind=remote_storage_kind,
|
||||
test_name="test_remote_timeline_client_metrics",
|
||||
)
|
||||
|
||||
# thinking about using a shared environment? the test assumes that global
|
||||
@@ -501,7 +502,7 @@ def test_remote_timeline_client_calls_started_metric(
|
||||
env.pageserver.start()
|
||||
client = env.pageserver.http_client()
|
||||
|
||||
env.pageserver.tenant_attach(tenant_id)
|
||||
client.tenant_attach(tenant_id)
|
||||
|
||||
wait_until_tenant_active(client, tenant_id)
|
||||
|
||||
@@ -523,6 +524,7 @@ def test_timeline_deletion_with_files_stuck_in_upload_queue(
|
||||
):
|
||||
neon_env_builder.enable_remote_storage(
|
||||
remote_storage_kind=remote_storage_kind,
|
||||
test_name="test_timeline_deletion_with_files_stuck_in_upload_queue",
|
||||
)
|
||||
|
||||
env = neon_env_builder.init_start(
|
||||
@@ -640,6 +642,7 @@ def test_empty_branch_remote_storage_upload(
|
||||
):
|
||||
neon_env_builder.enable_remote_storage(
|
||||
remote_storage_kind=remote_storage_kind,
|
||||
test_name="test_empty_branch_remote_storage_upload",
|
||||
)
|
||||
|
||||
env = neon_env_builder.init_start()
|
||||
@@ -691,6 +694,7 @@ def test_empty_branch_remote_storage_upload_on_restart(
|
||||
"""
|
||||
neon_env_builder.enable_remote_storage(
|
||||
remote_storage_kind=remote_storage_kind,
|
||||
test_name="test_empty_branch_remote_storage_upload_on_restart",
|
||||
)
|
||||
|
||||
env = neon_env_builder.init_start()
|
||||
@@ -788,6 +792,7 @@ def test_compaction_delete_before_upload(
|
||||
):
|
||||
neon_env_builder.enable_remote_storage(
|
||||
remote_storage_kind=remote_storage_kind,
|
||||
test_name="test_compaction_delete_before_upload",
|
||||
)
|
||||
|
||||
env = neon_env_builder.init_start(
|
||||
|
||||
@@ -294,6 +294,7 @@ eviction_policy = { "kind" = "LayerAccessThreshold", period = "20s", threshold =
|
||||
def test_creating_tenant_conf_after_attach(neon_env_builder: NeonEnvBuilder):
|
||||
neon_env_builder.enable_remote_storage(
|
||||
remote_storage_kind=RemoteStorageKind.LOCAL_FS,
|
||||
test_name="test_creating_tenant_conf_after_attach",
|
||||
)
|
||||
|
||||
env = neon_env_builder.init_start()
|
||||
@@ -338,6 +339,7 @@ def test_live_reconfig_get_evictions_low_residence_duration_metric_threshold(
|
||||
):
|
||||
neon_env_builder.enable_remote_storage(
|
||||
remote_storage_kind=RemoteStorageKind.LOCAL_FS,
|
||||
test_name="test_live_reconfig_get_evictions_low_residence_duration_metric_threshold",
|
||||
)
|
||||
|
||||
env = neon_env_builder.init_start()
|
||||
|
||||
@@ -43,6 +43,7 @@ def test_tenant_delete_smoke(
|
||||
|
||||
neon_env_builder.enable_remote_storage(
|
||||
remote_storage_kind=remote_storage_kind,
|
||||
test_name="test_tenant_delete_smoke",
|
||||
)
|
||||
|
||||
env = neon_env_builder.init_start()
|
||||
@@ -176,7 +177,9 @@ def test_delete_tenant_exercise_crash_safety_failpoints(
|
||||
if simulate_failures:
|
||||
neon_env_builder.pageserver_config_override = "test_remote_failures=1"
|
||||
|
||||
neon_env_builder.enable_remote_storage(remote_storage_kind)
|
||||
neon_env_builder.enable_remote_storage(
|
||||
remote_storage_kind, "test_delete_tenant_exercise_crash_safety_failpoints"
|
||||
)
|
||||
|
||||
env = neon_env_builder.init_start(initial_tenant_conf=MANY_SMALL_LAYERS_TENANT_CONFIG)
|
||||
|
||||
@@ -189,7 +192,7 @@ def test_delete_tenant_exercise_crash_safety_failpoints(
|
||||
# allow errors caused by failpoints
|
||||
f".*failpoint: {failpoint}",
|
||||
# It appears when we stopped flush loop during deletion (attempt) and then pageserver is stopped
|
||||
".*shutdown_all_tenants:shutdown.*tenant_id.*shutdown.*timeline_id.*: failed to freeze and flush: cannot flush frozen layers when flush_loop is not running, state is Exited",
|
||||
".*freeze_and_flush_on_shutdown.*failed to freeze and flush: cannot flush frozen layers when flush_loop is not running, state is Exited",
|
||||
# We may leave some upload tasks in the queue. They're likely deletes.
|
||||
# For uploads we explicitly wait with `last_flush_lsn_upload` below.
|
||||
# So by ignoring these instead of waiting for empty upload queue
|
||||
@@ -297,6 +300,7 @@ def test_tenant_delete_is_resumed_on_attach(
|
||||
):
|
||||
neon_env_builder.enable_remote_storage(
|
||||
remote_storage_kind=remote_storage_kind,
|
||||
test_name="test_deleted_tenant_ignored_on_attach",
|
||||
)
|
||||
|
||||
env = neon_env_builder.init_start(initial_tenant_conf=MANY_SMALL_LAYERS_TENANT_CONFIG)
|
||||
@@ -334,7 +338,7 @@ def test_tenant_delete_is_resumed_on_attach(
|
||||
# From deletion polling
|
||||
f".*NotFound: tenant {env.initial_tenant}.*",
|
||||
# It appears when we stopped flush loop during deletion (attempt) and then pageserver is stopped
|
||||
".*shutdown_all_tenants:shutdown.*tenant_id.*shutdown.*timeline_id.*: failed to freeze and flush: cannot flush frozen layers when flush_loop is not running, state is Exited",
|
||||
".*freeze_and_flush_on_shutdown.*failed to freeze and flush: cannot flush frozen layers when flush_loop is not running, state is Exited",
|
||||
# error from http response is also logged
|
||||
".*InternalServerError\\(Tenant is marked as deleted on remote storage.*",
|
||||
'.*shutdown_pageserver{exit_code=0}: stopping left-over name="remote upload".*',
|
||||
|
||||
@@ -46,6 +46,7 @@ def test_tenant_reattach(
|
||||
):
|
||||
neon_env_builder.enable_remote_storage(
|
||||
remote_storage_kind=remote_storage_kind,
|
||||
test_name="test_tenant_reattach",
|
||||
)
|
||||
|
||||
# Exercise retry code path by making all uploads and downloads fail for the
|
||||
@@ -230,6 +231,7 @@ def test_tenant_reattach_while_busy(
|
||||
):
|
||||
neon_env_builder.enable_remote_storage(
|
||||
remote_storage_kind=remote_storage_kind,
|
||||
test_name="test_tenant_reattach_while_busy",
|
||||
)
|
||||
env = neon_env_builder.init_start()
|
||||
|
||||
@@ -451,6 +453,7 @@ def test_detach_while_attaching(
|
||||
):
|
||||
neon_env_builder.enable_remote_storage(
|
||||
remote_storage_kind=remote_storage_kind,
|
||||
test_name="test_detach_while_attaching",
|
||||
)
|
||||
|
||||
##### First start, insert secret data and upload it to the remote storage
|
||||
@@ -534,6 +537,7 @@ def test_ignored_tenant_reattach(
|
||||
):
|
||||
neon_env_builder.enable_remote_storage(
|
||||
remote_storage_kind=remote_storage_kind,
|
||||
test_name="test_ignored_tenant_reattach",
|
||||
)
|
||||
env = neon_env_builder.init_start()
|
||||
pageserver_http = env.pageserver.http_client()
|
||||
@@ -605,6 +609,7 @@ def test_ignored_tenant_download_missing_layers(
|
||||
):
|
||||
neon_env_builder.enable_remote_storage(
|
||||
remote_storage_kind=remote_storage_kind,
|
||||
test_name="test_ignored_tenant_download_and_attach",
|
||||
)
|
||||
env = neon_env_builder.init_start()
|
||||
pageserver_http = env.pageserver.http_client()
|
||||
@@ -670,6 +675,7 @@ def test_ignored_tenant_stays_broken_without_metadata(
|
||||
):
|
||||
neon_env_builder.enable_remote_storage(
|
||||
remote_storage_kind=remote_storage_kind,
|
||||
test_name="test_ignored_tenant_stays_broken_without_metadata",
|
||||
)
|
||||
env = neon_env_builder.init_start()
|
||||
pageserver_http = env.pageserver.http_client()
|
||||
@@ -713,6 +719,7 @@ def test_load_attach_negatives(
|
||||
):
|
||||
neon_env_builder.enable_remote_storage(
|
||||
remote_storage_kind=remote_storage_kind,
|
||||
test_name="test_load_attach_negatives",
|
||||
)
|
||||
env = neon_env_builder.init_start()
|
||||
pageserver_http = env.pageserver.http_client()
|
||||
@@ -757,6 +764,7 @@ def test_ignore_while_attaching(
|
||||
):
|
||||
neon_env_builder.enable_remote_storage(
|
||||
remote_storage_kind=remote_storage_kind,
|
||||
test_name="test_ignore_while_attaching",
|
||||
)
|
||||
|
||||
env = neon_env_builder.init_start()
|
||||
@@ -860,6 +868,7 @@ def test_metrics_while_ignoring_broken_tenant_and_reloading(
|
||||
):
|
||||
neon_env_builder.enable_remote_storage(
|
||||
remote_storage_kind=remote_storage_kind,
|
||||
test_name="test_metrics_while_ignoring_broken_tenant_and_reloading",
|
||||
)
|
||||
|
||||
env = neon_env_builder.init_start()
|
||||
|
||||
@@ -526,6 +526,7 @@ def test_emergency_relocate_with_branches_slow_replay(
|
||||
):
|
||||
neon_env_builder.enable_remote_storage(
|
||||
remote_storage_kind=remote_storage_kind,
|
||||
test_name="test_emergency_relocate_with_branches_slow_replay",
|
||||
)
|
||||
|
||||
env = neon_env_builder.init_start()
|
||||
@@ -682,6 +683,7 @@ def test_emergency_relocate_with_branches_createdb(
|
||||
):
|
||||
neon_env_builder.enable_remote_storage(
|
||||
remote_storage_kind=remote_storage_kind,
|
||||
test_name="test_emergency_relocate_with_branches_createdb",
|
||||
)
|
||||
|
||||
env = neon_env_builder.init_start()
|
||||
|
||||
@@ -244,6 +244,7 @@ def test_pageserver_metrics_removed_after_detach(
|
||||
|
||||
neon_env_builder.enable_remote_storage(
|
||||
remote_storage_kind=remote_storage_kind,
|
||||
test_name="test_pageserver_metrics_removed_after_detach",
|
||||
)
|
||||
|
||||
neon_env_builder.num_safekeepers = 3
|
||||
@@ -304,6 +305,7 @@ def test_pageserver_with_empty_tenants(
|
||||
):
|
||||
neon_env_builder.enable_remote_storage(
|
||||
remote_storage_kind=remote_storage_kind,
|
||||
test_name="test_pageserver_with_empty_tenants",
|
||||
)
|
||||
|
||||
env = neon_env_builder.init_start()
|
||||
|
||||
@@ -64,6 +64,7 @@ async def all_tenants_workload(env: NeonEnv, tenants_endpoints):
|
||||
def test_tenants_many(neon_env_builder: NeonEnvBuilder, remote_storage_kind: RemoteStorageKind):
|
||||
neon_env_builder.enable_remote_storage(
|
||||
remote_storage_kind=remote_storage_kind,
|
||||
test_name="test_tenants_many",
|
||||
)
|
||||
|
||||
env = neon_env_builder.init_start()
|
||||
@@ -116,6 +117,7 @@ def test_tenants_attached_after_download(
|
||||
):
|
||||
neon_env_builder.enable_remote_storage(
|
||||
remote_storage_kind=remote_storage_kind,
|
||||
test_name="remote_storage_kind",
|
||||
)
|
||||
|
||||
data_id = 1
|
||||
@@ -230,6 +232,7 @@ def test_tenant_redownloads_truncated_file_on_startup(
|
||||
# since we now store the layer file length metadata, we notice on startup that a layer file is of wrong size, and proceed to redownload it.
|
||||
neon_env_builder.enable_remote_storage(
|
||||
remote_storage_kind=remote_storage_kind,
|
||||
test_name="test_tenant_redownloads_truncated_file_on_startup",
|
||||
)
|
||||
|
||||
env = neon_env_builder.init_start()
|
||||
|
||||
@@ -16,12 +16,13 @@ from pytest_httpserver import HTTPServer
|
||||
|
||||
|
||||
def test_threshold_based_eviction(
|
||||
request,
|
||||
httpserver: HTTPServer,
|
||||
httpserver_listen_address,
|
||||
pg_bin: PgBin,
|
||||
neon_env_builder: NeonEnvBuilder,
|
||||
):
|
||||
neon_env_builder.enable_remote_storage(RemoteStorageKind.LOCAL_FS)
|
||||
neon_env_builder.enable_remote_storage(RemoteStorageKind.LOCAL_FS, f"{request.node.name}")
|
||||
|
||||
# Start with metrics collection enabled, so that the eviction task
|
||||
# imitates its accesses. We'll use a non-existent endpoint to make it fail.
|
||||
|
||||
@@ -191,7 +191,9 @@ def test_delete_timeline_exercise_crash_safety_failpoints(
|
||||
8. Retry or restart without the failpoint and check the result.
|
||||
"""
|
||||
|
||||
neon_env_builder.enable_remote_storage(remote_storage_kind)
|
||||
neon_env_builder.enable_remote_storage(
|
||||
remote_storage_kind, "test_delete_timeline_exercise_crash_safety_failpoints"
|
||||
)
|
||||
|
||||
env = neon_env_builder.init_start(
|
||||
initial_tenant_conf={
|
||||
@@ -229,7 +231,7 @@ def test_delete_timeline_exercise_crash_safety_failpoints(
|
||||
env.pageserver.allowed_errors.append(f".*{timeline_id}.*failpoint: {failpoint}")
|
||||
# It appears when we stopped flush loop during deletion and then pageserver is stopped
|
||||
env.pageserver.allowed_errors.append(
|
||||
".*shutdown_all_tenants:shutdown.*tenant_id.*shutdown.*timeline_id.*: failed to freeze and flush: cannot flush frozen layers when flush_loop is not running, state is Exited",
|
||||
".*freeze_and_flush_on_shutdown.*failed to freeze and flush: cannot flush frozen layers when flush_loop is not running, state is Exited"
|
||||
)
|
||||
# This happens when we fail before scheduling background operation.
|
||||
# Timeline is left in stopping state and retry tries to stop it again.
|
||||
@@ -348,6 +350,7 @@ def test_timeline_resurrection_on_attach(
|
||||
|
||||
neon_env_builder.enable_remote_storage(
|
||||
remote_storage_kind=remote_storage_kind,
|
||||
test_name="test_timeline_resurrection_on_attach",
|
||||
)
|
||||
|
||||
##### First start, insert data and upload it to the remote storage
|
||||
@@ -435,6 +438,7 @@ def test_timeline_delete_fail_before_local_delete(neon_env_builder: NeonEnvBuild
|
||||
|
||||
neon_env_builder.enable_remote_storage(
|
||||
remote_storage_kind=RemoteStorageKind.MOCK_S3,
|
||||
test_name="test_timeline_delete_fail_before_local_delete",
|
||||
)
|
||||
|
||||
env = neon_env_builder.init_start()
|
||||
@@ -445,7 +449,7 @@ def test_timeline_delete_fail_before_local_delete(neon_env_builder: NeonEnvBuild
|
||||
)
|
||||
# this happens, because the stuck timeline is visible to shutdown
|
||||
env.pageserver.allowed_errors.append(
|
||||
".*shutdown_all_tenants:shutdown.*tenant_id.*shutdown.*timeline_id.*: failed to freeze and flush: cannot flush frozen layers when flush_loop is not running, state is Exited",
|
||||
".*freeze_and_flush_on_shutdown.+: failed to freeze and flush: cannot flush frozen layers when flush_loop is not running, state is Exited"
|
||||
)
|
||||
|
||||
ps_http = env.pageserver.http_client()
|
||||
@@ -554,6 +558,7 @@ def test_concurrent_timeline_delete_stuck_on(
|
||||
|
||||
neon_env_builder.enable_remote_storage(
|
||||
remote_storage_kind=RemoteStorageKind.MOCK_S3,
|
||||
test_name=f"concurrent_timeline_delete_stuck_on_{stuck_failpoint}",
|
||||
)
|
||||
|
||||
env = neon_env_builder.init_start()
|
||||
@@ -631,6 +636,7 @@ def test_delete_timeline_client_hangup(neon_env_builder: NeonEnvBuilder):
|
||||
"""
|
||||
neon_env_builder.enable_remote_storage(
|
||||
remote_storage_kind=RemoteStorageKind.MOCK_S3,
|
||||
test_name="test_delete_timeline_client_hangup",
|
||||
)
|
||||
|
||||
env = neon_env_builder.init_start()
|
||||
@@ -700,6 +706,7 @@ def test_timeline_delete_works_for_remote_smoke(
|
||||
):
|
||||
neon_env_builder.enable_remote_storage(
|
||||
remote_storage_kind=remote_storage_kind,
|
||||
test_name="test_timeline_delete_works_for_remote_smoke",
|
||||
)
|
||||
|
||||
env = neon_env_builder.init_start()
|
||||
@@ -773,7 +780,7 @@ def test_delete_orphaned_objects(
|
||||
pg_bin: PgBin,
|
||||
):
|
||||
remote_storage_kind = RemoteStorageKind.LOCAL_FS
|
||||
neon_env_builder.enable_remote_storage(remote_storage_kind)
|
||||
neon_env_builder.enable_remote_storage(remote_storage_kind, "test_delete_orphaned_objects")
|
||||
|
||||
env = neon_env_builder.init_start(
|
||||
initial_tenant_conf={
|
||||
@@ -837,6 +844,7 @@ def test_timeline_delete_resumed_on_attach(
|
||||
):
|
||||
neon_env_builder.enable_remote_storage(
|
||||
remote_storage_kind=remote_storage_kind,
|
||||
test_name="test_deleted_tenant_ignored_on_attach",
|
||||
)
|
||||
|
||||
env = neon_env_builder.init_start(initial_tenant_conf=MANY_SMALL_LAYERS_TENANT_CONFIG)
|
||||
@@ -873,7 +881,7 @@ def test_timeline_delete_resumed_on_attach(
|
||||
# allow errors caused by failpoints
|
||||
f".*failpoint: {failpoint}",
|
||||
# It appears when we stopped flush loop during deletion (attempt) and then pageserver is stopped
|
||||
".*shutdown_all_tenants:shutdown.*tenant_id.*shutdown.*timeline_id.*: failed to freeze and flush: cannot flush frozen layers when flush_loop is not running, state is Exited",
|
||||
".*freeze_and_flush_on_shutdown.*failed to freeze and flush: cannot flush frozen layers when flush_loop is not running, state is Exited",
|
||||
# error from http response is also logged
|
||||
".*InternalServerError\\(Tenant is marked as deleted on remote storage.*",
|
||||
# Polling after attach may fail with this
|
||||
|
||||
@@ -306,7 +306,9 @@ def test_timeline_physical_size_init(
|
||||
neon_env_builder: NeonEnvBuilder, remote_storage_kind: Optional[RemoteStorageKind]
|
||||
):
|
||||
if remote_storage_kind is not None:
|
||||
neon_env_builder.enable_remote_storage(remote_storage_kind)
|
||||
neon_env_builder.enable_remote_storage(
|
||||
remote_storage_kind, "test_timeline_physical_size_init"
|
||||
)
|
||||
|
||||
env = neon_env_builder.init_start()
|
||||
|
||||
@@ -347,7 +349,9 @@ def test_timeline_physical_size_post_checkpoint(
|
||||
neon_env_builder: NeonEnvBuilder, remote_storage_kind: Optional[RemoteStorageKind]
|
||||
):
|
||||
if remote_storage_kind is not None:
|
||||
neon_env_builder.enable_remote_storage(remote_storage_kind)
|
||||
neon_env_builder.enable_remote_storage(
|
||||
remote_storage_kind, "test_timeline_physical_size_init"
|
||||
)
|
||||
|
||||
env = neon_env_builder.init_start()
|
||||
|
||||
@@ -378,7 +382,9 @@ def test_timeline_physical_size_post_compaction(
|
||||
neon_env_builder: NeonEnvBuilder, remote_storage_kind: Optional[RemoteStorageKind]
|
||||
):
|
||||
if remote_storage_kind is not None:
|
||||
neon_env_builder.enable_remote_storage(remote_storage_kind)
|
||||
neon_env_builder.enable_remote_storage(
|
||||
remote_storage_kind, "test_timeline_physical_size_init"
|
||||
)
|
||||
|
||||
# Disable background compaction as we don't want it to happen after `get_physical_size` request
|
||||
# and before checking the expected size on disk, which makes the assertion failed
|
||||
@@ -431,7 +437,9 @@ def test_timeline_physical_size_post_gc(
|
||||
neon_env_builder: NeonEnvBuilder, remote_storage_kind: Optional[RemoteStorageKind]
|
||||
):
|
||||
if remote_storage_kind is not None:
|
||||
neon_env_builder.enable_remote_storage(remote_storage_kind)
|
||||
neon_env_builder.enable_remote_storage(
|
||||
remote_storage_kind, "test_timeline_physical_size_init"
|
||||
)
|
||||
|
||||
# Disable background compaction and GC as we don't want it to happen after `get_physical_size` request
|
||||
# and before checking the expected size on disk, which makes the assertion failed
|
||||
@@ -564,7 +572,9 @@ def test_tenant_physical_size(
|
||||
random.seed(100)
|
||||
|
||||
if remote_storage_kind is not None:
|
||||
neon_env_builder.enable_remote_storage(remote_storage_kind)
|
||||
neon_env_builder.enable_remote_storage(
|
||||
remote_storage_kind, "test_timeline_physical_size_init"
|
||||
)
|
||||
|
||||
env = neon_env_builder.init_start()
|
||||
|
||||
|
||||
@@ -439,6 +439,7 @@ def test_wal_backup(neon_env_builder: NeonEnvBuilder, remote_storage_kind: Remot
|
||||
|
||||
neon_env_builder.enable_remote_storage(
|
||||
remote_storage_kind=remote_storage_kind,
|
||||
test_name="test_safekeepers_wal_backup",
|
||||
)
|
||||
|
||||
neon_env_builder.remote_storage_users = RemoteStorageUsers.SAFEKEEPER
|
||||
@@ -490,6 +491,7 @@ def test_s3_wal_replay(neon_env_builder: NeonEnvBuilder, remote_storage_kind: Re
|
||||
|
||||
neon_env_builder.enable_remote_storage(
|
||||
remote_storage_kind=remote_storage_kind,
|
||||
test_name="test_s3_wal_replay",
|
||||
)
|
||||
|
||||
neon_env_builder.remote_storage_users = RemoteStorageUsers.SAFEKEEPER
|
||||
|
||||
Reference in New Issue
Block a user