mirror of
https://github.com/neondatabase/neon.git
synced 2026-05-16 04:30:38 +00:00
Compare commits
89 Commits
vlad/fix-v
...
bayandin/t
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
d31e272919 | ||
|
|
ebea319d64 | ||
|
|
a26cc29d92 | ||
|
|
5f2f31e879 | ||
|
|
938b163b42 | ||
|
|
5cbf5b45ae | ||
|
|
af5c54ed14 | ||
|
|
523cf71721 | ||
|
|
c47f355ec1 | ||
|
|
4f67b0225b | ||
|
|
2f7cecaf6a | ||
|
|
589594c2e1 | ||
|
|
70fe007519 | ||
|
|
b224a5a377 | ||
|
|
a65d437930 | ||
|
|
fc67f8dc60 | ||
|
|
2b65a2b53e | ||
|
|
9490360df4 | ||
|
|
91d947654e | ||
|
|
37aa6fd953 | ||
|
|
3ad567290c | ||
|
|
3a110e45ed | ||
|
|
e7e6319e20 | ||
|
|
d865881d59 | ||
|
|
1c5d6e59a0 | ||
|
|
263dfba6ee | ||
|
|
df3996265f | ||
|
|
29699529df | ||
|
|
f446e08fb8 | ||
|
|
4d5add9ca0 | ||
|
|
59b4c2eaf9 | ||
|
|
5432155b0d | ||
|
|
e16e82749f | ||
|
|
9f653893b9 | ||
|
|
913af44219 | ||
|
|
ecd615ab6d | ||
|
|
c9b2ec9ff1 | ||
|
|
a3800dcb0c | ||
|
|
9a32aa828d | ||
|
|
f03f7b3868 | ||
|
|
ec5dce04eb | ||
|
|
6014f15157 | ||
|
|
e675a21346 | ||
|
|
6b93230270 | ||
|
|
797aa4ffaa | ||
|
|
c45b56e0bb | ||
|
|
3104f0f250 | ||
|
|
f2c08195f0 | ||
|
|
d0cbfda15c | ||
|
|
1708743e78 | ||
|
|
0a1ca7670c | ||
|
|
ff9f065c43 | ||
|
|
21eeafaaa5 | ||
|
|
32a0e759bd | ||
|
|
7c489092b7 | ||
|
|
06d55a3b12 | ||
|
|
5c68e6a172 | ||
|
|
2753abc0d8 | ||
|
|
a523548ed1 | ||
|
|
2d4e5af18b | ||
|
|
5da2340e74 | ||
|
|
7b34c2d7af | ||
|
|
15ae1fc3df | ||
|
|
728b79b9dd | ||
|
|
9d1c6f23d3 | ||
|
|
035a49a6b2 | ||
|
|
794bd4b866 | ||
|
|
ac6a1151ae | ||
|
|
2f37f0384c | ||
|
|
e161a2fa42 | ||
|
|
c5cd8577ff | ||
|
|
3454ef7507 | ||
|
|
135e7e4306 | ||
|
|
3cd2a3f931 | ||
|
|
d78f5ce6da | ||
|
|
a1b71b73fe | ||
|
|
6138eb50e9 | ||
|
|
d211f00f05 | ||
|
|
cd4276fd65 | ||
|
|
b719d58863 | ||
|
|
2db840d8b8 | ||
|
|
4295ff0f07 | ||
|
|
c6f56b8462 | ||
|
|
fec9321fc0 | ||
|
|
3a52e356c1 | ||
|
|
5e16c7bb0b | ||
|
|
2bbb4d3e1c | ||
|
|
c8bedca582 | ||
|
|
5876c441ab |
@@ -13,6 +13,7 @@
|
||||
# Directories
|
||||
!.cargo/
|
||||
!.config/
|
||||
!compute/
|
||||
!compute_tools/
|
||||
!control_plane/
|
||||
!libs/
|
||||
|
||||
2
.github/workflows/_push-to-acr.yml
vendored
2
.github/workflows/_push-to-acr.yml
vendored
@@ -52,5 +52,5 @@ jobs:
|
||||
for image in ${images}; do
|
||||
docker buildx imagetools create \
|
||||
-t ${{ inputs.registry_name }}.azurecr.io/neondatabase/${image}:${{ inputs.image_tag }} \
|
||||
neondatabase/${image}:${{ inputs.image_tag }}
|
||||
neondatabase/${image}:${{ inputs.image_tag }}
|
||||
done
|
||||
|
||||
@@ -36,7 +36,7 @@ jobs:
|
||||
|
||||
strategy:
|
||||
matrix:
|
||||
arch: [ x64, arm64 ]
|
||||
arch: [ x64 ]
|
||||
|
||||
runs-on: ${{ fromJson(format('["self-hosted", "{0}"]', matrix.arch == 'arm64' && 'large-arm64' || 'large')) }}
|
||||
|
||||
@@ -79,7 +79,9 @@ jobs:
|
||||
push: true
|
||||
pull: true
|
||||
file: Dockerfile.build-tools
|
||||
cache-from: type=registry,ref=cache.neon.build/build-tools:cache-${{ matrix.arch }}
|
||||
cache-from: |
|
||||
type=registry,ref=cache.neon.build/build-tools:cache-${{ matrix.arch }}
|
||||
type=registry,ref=cache.neon.build/build-tools:cache-${{ matrix.arch }}-does-not-exist-2
|
||||
cache-to: ${{ github.ref_name == 'main' && format('type=registry,ref=cache.neon.build/build-tools:cache-{0},mode=max', matrix.arch) || '' }}
|
||||
tags: neondatabase/build-tools:${{ inputs.image-tag }}-${{ matrix.arch }}
|
||||
|
||||
|
||||
107
.github/workflows/build_and_test.yml
vendored
107
.github/workflows/build_and_test.yml
vendored
@@ -6,6 +6,7 @@ on:
|
||||
- main
|
||||
- release
|
||||
- release-proxy
|
||||
- bayandin/test
|
||||
pull_request:
|
||||
|
||||
defaults:
|
||||
@@ -27,7 +28,7 @@ env:
|
||||
|
||||
jobs:
|
||||
check-permissions:
|
||||
if: ${{ !contains(github.event.pull_request.labels.*.name, 'run-no-ci') }}
|
||||
if: false
|
||||
uses: ./.github/workflows/check-permissions.yml
|
||||
with:
|
||||
github-event-name: ${{ github.event_name }}
|
||||
@@ -54,8 +55,8 @@ jobs:
|
||||
build-tag: ${{steps.build-tag.outputs.tag}}
|
||||
|
||||
steps:
|
||||
- name: Checkout
|
||||
uses: actions/checkout@v4
|
||||
# Need `fetch-depth: 0` to count the number of commits in the branch
|
||||
- uses: actions/checkout@v4
|
||||
with:
|
||||
fetch-depth: 0
|
||||
|
||||
@@ -78,7 +79,6 @@ jobs:
|
||||
id: build-tag
|
||||
|
||||
check-build-tools-image:
|
||||
needs: [ check-permissions ]
|
||||
uses: ./.github/workflows/check-build-tools-image.yml
|
||||
|
||||
build-build-tools-image:
|
||||
@@ -120,6 +120,59 @@ jobs:
|
||||
- name: Run mypy to check types
|
||||
run: poetry run mypy .
|
||||
|
||||
# Check that the vendor/postgres-* submodules point to the
|
||||
# corresponding REL_*_STABLE_neon branches.
|
||||
check-submodules:
|
||||
runs-on: ubuntu-22.04
|
||||
steps:
|
||||
- name: Checkout
|
||||
uses: actions/checkout@v4
|
||||
with:
|
||||
submodules: true
|
||||
|
||||
- uses: dorny/paths-filter@v3
|
||||
id: check-if-submodules-changed
|
||||
with:
|
||||
filters: |
|
||||
vendor:
|
||||
- 'vendor/**'
|
||||
|
||||
- name: Check vendor/postgres-v14 submodule reference
|
||||
if: steps.check-if-submodules-changed.outputs.vendor == 'true'
|
||||
uses: jtmullen/submodule-branch-check-action@v1
|
||||
with:
|
||||
path: "vendor/postgres-v14"
|
||||
fetch_depth: "50"
|
||||
sub_fetch_depth: "50"
|
||||
pass_if_unchanged: true
|
||||
|
||||
- name: Check vendor/postgres-v15 submodule reference
|
||||
if: steps.check-if-submodules-changed.outputs.vendor == 'true'
|
||||
uses: jtmullen/submodule-branch-check-action@v1
|
||||
with:
|
||||
path: "vendor/postgres-v15"
|
||||
fetch_depth: "50"
|
||||
sub_fetch_depth: "50"
|
||||
pass_if_unchanged: true
|
||||
|
||||
- name: Check vendor/postgres-v16 submodule reference
|
||||
if: steps.check-if-submodules-changed.outputs.vendor == 'true'
|
||||
uses: jtmullen/submodule-branch-check-action@v1
|
||||
with:
|
||||
path: "vendor/postgres-v16"
|
||||
fetch_depth: "50"
|
||||
sub_fetch_depth: "50"
|
||||
pass_if_unchanged: true
|
||||
|
||||
- name: Check vendor/postgres-v17 submodule reference
|
||||
if: steps.check-if-submodules-changed.outputs.vendor == 'true'
|
||||
uses: jtmullen/submodule-branch-check-action@v1
|
||||
with:
|
||||
path: "vendor/postgres-v17"
|
||||
fetch_depth: "50"
|
||||
sub_fetch_depth: "50"
|
||||
pass_if_unchanged: true
|
||||
|
||||
check-codestyle-rust:
|
||||
needs: [ check-permissions, build-build-tools-image ]
|
||||
strategy:
|
||||
@@ -159,6 +212,10 @@ jobs:
|
||||
# This will catch compiler & clippy warnings in all feature combinations.
|
||||
# TODO: use cargo hack for build and test as well, but, that's quite expensive.
|
||||
# NB: keep clippy args in sync with ./run_clippy.sh
|
||||
#
|
||||
# The only difference between "clippy --debug" and "clippy --release" is that in --release mode,
|
||||
# #[cfg(debug_assertions)] blocks are not built. It's not worth building everything for second
|
||||
# time just for that, so skip "clippy --release".
|
||||
- run: |
|
||||
CLIPPY_COMMON_ARGS="$( source .neon_clippy_args; echo "$CLIPPY_COMMON_ARGS")"
|
||||
if [ "$CLIPPY_COMMON_ARGS" = "" ]; then
|
||||
@@ -168,8 +225,6 @@ jobs:
|
||||
echo "CLIPPY_COMMON_ARGS=${CLIPPY_COMMON_ARGS}" >> $GITHUB_ENV
|
||||
- name: Run cargo clippy (debug)
|
||||
run: cargo hack --feature-powerset clippy $CLIPPY_COMMON_ARGS
|
||||
- name: Run cargo clippy (release)
|
||||
run: cargo hack --feature-powerset clippy --release $CLIPPY_COMMON_ARGS
|
||||
|
||||
- name: Check documentation generation
|
||||
run: cargo doc --workspace --no-deps --document-private-items
|
||||
@@ -357,6 +412,7 @@ jobs:
|
||||
})
|
||||
|
||||
coverage-report:
|
||||
if: ${{ !startsWith(github.ref_name, 'release') }}
|
||||
needs: [ check-permissions, build-build-tools-image, build-and-test-locally ]
|
||||
runs-on: [ self-hosted, small ]
|
||||
container:
|
||||
@@ -373,8 +429,8 @@ jobs:
|
||||
coverage-html: ${{ steps.upload-coverage-report-new.outputs.report-url }}
|
||||
coverage-json: ${{ steps.upload-coverage-report-new.outputs.summary-json }}
|
||||
steps:
|
||||
- name: Checkout
|
||||
uses: actions/checkout@v4
|
||||
# Need `fetch-depth: 0` for differential coverage (to get diff between two commits)
|
||||
- uses: actions/checkout@v4
|
||||
with:
|
||||
submodules: true
|
||||
fetch-depth: 0
|
||||
@@ -475,11 +531,9 @@ jobs:
|
||||
runs-on: ${{ fromJson(format('["self-hosted", "{0}"]', matrix.arch == 'arm64' && 'large-arm64' || 'large')) }}
|
||||
|
||||
steps:
|
||||
- name: Checkout
|
||||
uses: actions/checkout@v4
|
||||
- uses: actions/checkout@v4
|
||||
with:
|
||||
submodules: true
|
||||
fetch-depth: 0
|
||||
|
||||
- uses: ./.github/actions/set-docker-config-dir
|
||||
- uses: docker/setup-buildx-action@v3
|
||||
@@ -554,11 +608,9 @@ jobs:
|
||||
runs-on: ${{ fromJson(format('["self-hosted", "{0}"]', matrix.arch == 'arm64' && 'large-arm64' || 'large')) }}
|
||||
|
||||
steps:
|
||||
- name: Checkout
|
||||
uses: actions/checkout@v4
|
||||
- uses: actions/checkout@v4
|
||||
with:
|
||||
submodules: true
|
||||
fetch-depth: 0
|
||||
|
||||
- uses: ./.github/actions/set-docker-config-dir
|
||||
- uses: docker/setup-buildx-action@v3
|
||||
@@ -599,7 +651,7 @@ jobs:
|
||||
provenance: false
|
||||
push: true
|
||||
pull: true
|
||||
file: Dockerfile.compute-node
|
||||
file: compute/Dockerfile.compute-node
|
||||
cache-from: type=registry,ref=cache.neon.build/compute-node-${{ matrix.version }}:cache-${{ matrix.arch }}
|
||||
cache-to: ${{ github.ref_name == 'main' && format('type=registry,ref=cache.neon.build/compute-node-{0}:cache-{1},mode=max', matrix.version, matrix.arch) || '' }}
|
||||
tags: |
|
||||
@@ -618,7 +670,7 @@ jobs:
|
||||
provenance: false
|
||||
push: true
|
||||
pull: true
|
||||
file: Dockerfile.compute-node
|
||||
file: compute/Dockerfile.compute-node
|
||||
target: neon-pg-ext-test
|
||||
cache-from: type=registry,ref=cache.neon.build/neon-test-extensions-${{ matrix.version }}:cache-${{ matrix.arch }}
|
||||
cache-to: ${{ github.ref_name == 'main' && format('type=registry,ref=cache.neon.build/neon-test-extensions-{0}:cache-{1},mode=max', matrix.version, matrix.arch) || '' }}
|
||||
@@ -639,7 +691,7 @@ jobs:
|
||||
provenance: false
|
||||
push: true
|
||||
pull: true
|
||||
file: Dockerfile.compute-node
|
||||
file: compute/Dockerfile.compute-node
|
||||
tags: |
|
||||
neondatabase/compute-tools:${{ needs.tag.outputs.build-tag }}-${{ matrix.arch }}
|
||||
|
||||
@@ -705,10 +757,7 @@ jobs:
|
||||
VM_BUILDER_VERSION: v0.29.3
|
||||
|
||||
steps:
|
||||
- name: Checkout
|
||||
uses: actions/checkout@v4
|
||||
with:
|
||||
fetch-depth: 0
|
||||
- uses: actions/checkout@v4
|
||||
|
||||
- name: Downloading vm-builder
|
||||
run: |
|
||||
@@ -730,7 +779,7 @@ jobs:
|
||||
- name: Build vm image
|
||||
run: |
|
||||
./vm-builder \
|
||||
-spec=vm-image-spec.yaml \
|
||||
-spec=compute/vm-image-spec.yaml \
|
||||
-src=neondatabase/compute-node-${{ matrix.version }}:${{ needs.tag.outputs.build-tag }} \
|
||||
-dst=neondatabase/vm-compute-node-${{ matrix.version }}:${{ needs.tag.outputs.build-tag }}
|
||||
|
||||
@@ -748,10 +797,7 @@ jobs:
|
||||
runs-on: ${{ fromJson(format('["self-hosted", "{0}"]', matrix.arch == 'arm64' && 'small-arm64' || 'small')) }}
|
||||
|
||||
steps:
|
||||
- name: Checkout
|
||||
uses: actions/checkout@v4
|
||||
with:
|
||||
fetch-depth: 0
|
||||
- uses: actions/checkout@v4
|
||||
|
||||
- uses: ./.github/actions/set-docker-config-dir
|
||||
- uses: docker/login-action@v3
|
||||
@@ -957,6 +1003,7 @@ jobs:
|
||||
|
||||
deploy:
|
||||
needs: [ check-permissions, promote-images, tag, build-and-test-locally, trigger-custom-extensions-build-and-wait, push-to-acr-dev, push-to-acr-prod ]
|
||||
# `!failure() && !cancelled()` is required because the workflow depends on the job that can be skipped: `push-to-acr-dev` and `push-to-acr-prod`
|
||||
if: (github.ref_name == 'main' || github.ref_name == 'release' || github.ref_name == 'release-proxy') && !failure() && !cancelled()
|
||||
|
||||
runs-on: [ self-hosted, small ]
|
||||
@@ -976,10 +1023,7 @@ jobs:
|
||||
git config --global --add safe.directory "${GITHUB_WORKSPACE}/vendor/postgres-v$r"
|
||||
done
|
||||
|
||||
- name: Checkout
|
||||
uses: actions/checkout@v4
|
||||
with:
|
||||
fetch-depth: 0
|
||||
- uses: actions/checkout@v4
|
||||
|
||||
- name: Trigger deploy workflow
|
||||
env:
|
||||
@@ -1058,7 +1102,8 @@ jobs:
|
||||
# The job runs on `release` branch and copies compatibility data and Neon artifact from the last *release PR* to the latest directory
|
||||
promote-compatibility-data:
|
||||
needs: [ deploy ]
|
||||
if: github.ref_name == 'release'
|
||||
# `!failure() && !cancelled()` is required because the workflow transitively depends on the job that can be skipped: `push-to-acr-dev` and `push-to-acr-prod`
|
||||
if: github.ref_name == 'release' && !failure() && !cancelled()
|
||||
|
||||
runs-on: ubuntu-22.04
|
||||
steps:
|
||||
|
||||
@@ -33,7 +33,7 @@ jobs:
|
||||
IMAGE_TAG: |
|
||||
${{ hashFiles('Dockerfile.build-tools',
|
||||
'.github/workflows/check-build-tools-image.yml',
|
||||
'.github/workflows/build-build-tools-image.yml') }}
|
||||
'.github/workflows/build-build-tools-image.yml') }}-test
|
||||
run: |
|
||||
echo "image-tag=${IMAGE_TAG}" | tee -a $GITHUB_OUTPUT
|
||||
|
||||
|
||||
102
.github/workflows/cloud-regress.yml
vendored
Normal file
102
.github/workflows/cloud-regress.yml
vendored
Normal file
@@ -0,0 +1,102 @@
|
||||
name: Cloud Regression Test
|
||||
on:
|
||||
schedule:
|
||||
# * is a special character in YAML so you have to quote this string
|
||||
# ┌───────────── minute (0 - 59)
|
||||
# │ ┌───────────── hour (0 - 23)
|
||||
# │ │ ┌───────────── day of the month (1 - 31)
|
||||
# │ │ │ ┌───────────── month (1 - 12 or JAN-DEC)
|
||||
# │ │ │ │ ┌───────────── day of the week (0 - 6 or SUN-SAT)
|
||||
- cron: '45 1 * * *' # run once a day, timezone is utc
|
||||
workflow_dispatch: # adds ability to run this manually
|
||||
|
||||
defaults:
|
||||
run:
|
||||
shell: bash -euxo pipefail {0}
|
||||
|
||||
concurrency:
|
||||
# Allow only one workflow
|
||||
group: ${{ github.workflow }}
|
||||
cancel-in-progress: true
|
||||
|
||||
jobs:
|
||||
regress:
|
||||
env:
|
||||
POSTGRES_DISTRIB_DIR: /tmp/neon/pg_install
|
||||
DEFAULT_PG_VERSION: 16
|
||||
TEST_OUTPUT: /tmp/test_output
|
||||
BUILD_TYPE: remote
|
||||
AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_DEV }}
|
||||
AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_KEY_DEV }}
|
||||
|
||||
runs-on: us-east-2
|
||||
container:
|
||||
image: neondatabase/build-tools:pinned
|
||||
options: --init
|
||||
|
||||
steps:
|
||||
- uses: actions/checkout@v4
|
||||
with:
|
||||
submodules: true
|
||||
|
||||
- name: Patch the test
|
||||
run: |
|
||||
cd "vendor/postgres-v${DEFAULT_PG_VERSION}"
|
||||
patch -p1 < "../../compute/patches/cloud_regress_pg${DEFAULT_PG_VERSION}.patch"
|
||||
|
||||
- name: Generate a random password
|
||||
id: pwgen
|
||||
run: |
|
||||
set +x
|
||||
DBPASS=$(dd if=/dev/random bs=48 count=1 2>/dev/null | base64)
|
||||
echo "::add-mask::${DBPASS//\//}"
|
||||
echo DBPASS="${DBPASS//\//}" >> "${GITHUB_OUTPUT}"
|
||||
|
||||
- name: Change tests according to the generated password
|
||||
env:
|
||||
DBPASS: ${{ steps.pwgen.outputs.DBPASS }}
|
||||
run: |
|
||||
cd vendor/postgres-v"${DEFAULT_PG_VERSION}"/src/test/regress
|
||||
for fname in sql/*.sql expected/*.out; do
|
||||
sed -i.bak s/NEON_PASSWORD_PLACEHOLDER/"'${DBPASS}'"/ "${fname}"
|
||||
done
|
||||
for ph in $(grep NEON_MD5_PLACEHOLDER expected/password.out | awk '{print $3;}' | sort | uniq); do
|
||||
USER=$(echo "${ph}" | cut -c 22-)
|
||||
MD5=md5$(echo -n "${DBPASS}${USER}" | md5sum | awk '{print $1;}')
|
||||
sed -i.bak "s/${ph}/${MD5}/" expected/password.out
|
||||
done
|
||||
|
||||
- name: Download Neon artifact
|
||||
uses: ./.github/actions/download
|
||||
with:
|
||||
name: neon-${{ runner.os }}-${{ runner.arch }}-release-artifact
|
||||
path: /tmp/neon/
|
||||
prefix: latest
|
||||
|
||||
- name: Run the regression tests
|
||||
uses: ./.github/actions/run-python-test-set
|
||||
with:
|
||||
build_type: ${{ env.BUILD_TYPE }}
|
||||
test_selection: cloud_regress
|
||||
pg_version: ${{ env.DEFAULT_PG_VERSION }}
|
||||
extra_params: -m remote_cluster
|
||||
env:
|
||||
BENCHMARK_CONNSTR: ${{ secrets.PG_REGRESS_CONNSTR }}
|
||||
|
||||
- name: Create Allure report
|
||||
id: create-allure-report
|
||||
if: ${{ !cancelled() }}
|
||||
uses: ./.github/actions/allure-report-generate
|
||||
|
||||
- name: Post to a Slack channel
|
||||
if: ${{ github.event.schedule && failure() }}
|
||||
uses: slackapi/slack-github-action@v1
|
||||
with:
|
||||
channel-id: "C033QLM5P7D" # on-call-staging-stream
|
||||
slack-message: |
|
||||
Periodic pg_regress on staging: ${{ job.status }}
|
||||
<${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}|GitHub Run>
|
||||
<${{ steps.create-allure-report.outputs.report-url }}|Allure report>
|
||||
env:
|
||||
SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }}
|
||||
|
||||
6
.github/workflows/trigger-e2e-tests.yml
vendored
6
.github/workflows/trigger-e2e-tests.yml
vendored
@@ -34,8 +34,8 @@ jobs:
|
||||
build-tag: ${{ steps.build-tag.outputs.tag }}
|
||||
|
||||
steps:
|
||||
- name: Checkout
|
||||
uses: actions/checkout@v4
|
||||
# Need `fetch-depth: 0` to count the number of commits in the branch
|
||||
- uses: actions/checkout@v4
|
||||
with:
|
||||
fetch-depth: 0
|
||||
|
||||
@@ -107,7 +107,7 @@ jobs:
|
||||
if [ "$GITHUB_EVENT_NAME" == "pull_request" ]; then
|
||||
for f in $(gh api "/repos/${GITHUB_REPOSITORY}/pulls/${PR_NUMBER}/files" --paginate --jq '.[].filename'); do
|
||||
case "$f" in
|
||||
vendor/*|pgxn/*|libs/vm_monitor/*|Dockerfile.compute-node)
|
||||
vendor/*|pgxn/*|libs/vm_monitor/*|compute/Dockerfile.compute-node)
|
||||
platforms=$(echo "${platforms}" | jq --compact-output '. += ["k8s-neonvm"] | unique')
|
||||
;;
|
||||
*)
|
||||
|
||||
451
Cargo.lock
generated
451
Cargo.lock
generated
File diff suppressed because it is too large
Load Diff
@@ -76,8 +76,6 @@ clap = { version = "4.0", features = ["derive"] }
|
||||
comfy-table = "7.1"
|
||||
const_format = "0.2"
|
||||
crc32c = "0.6"
|
||||
crossbeam-deque = "0.8.5"
|
||||
crossbeam-utils = "0.8.5"
|
||||
dashmap = { version = "5.5.0", features = ["raw-api"] }
|
||||
either = "1.8"
|
||||
enum-map = "2.4.2"
|
||||
@@ -95,7 +93,7 @@ hdrhistogram = "7.5.2"
|
||||
hex = "0.4"
|
||||
hex-literal = "0.4"
|
||||
hmac = "0.12.1"
|
||||
hostname = "0.3.1"
|
||||
hostname = "0.4"
|
||||
http = {version = "1.1.0", features = ["std"]}
|
||||
http-types = { version = "2", default-features = false }
|
||||
humantime = "2.1"
|
||||
@@ -104,7 +102,6 @@ hyper = "0.14"
|
||||
tokio-tungstenite = "0.20.0"
|
||||
indexmap = "2"
|
||||
indoc = "2"
|
||||
inotify = "0.10.2"
|
||||
ipnet = "2.9.0"
|
||||
itertools = "0.10"
|
||||
jsonwebtoken = "9"
|
||||
@@ -113,7 +110,7 @@ libc = "0.2"
|
||||
md5 = "0.7.0"
|
||||
measured = { version = "0.0.22", features=["lasso"] }
|
||||
measured-process = { version = "0.0.22" }
|
||||
memoffset = "0.8"
|
||||
memoffset = "0.9"
|
||||
nix = { version = "0.27", features = ["dir", "fs", "process", "socket", "signal", "poll"] }
|
||||
notify = "6.0.0"
|
||||
num_cpus = "1.15"
|
||||
@@ -142,7 +139,6 @@ rpds = "0.13"
|
||||
rustc-hash = "1.1.0"
|
||||
rustls = "0.22"
|
||||
rustls-pemfile = "2"
|
||||
rustls-split = "0.3"
|
||||
scopeguard = "1.1"
|
||||
sysinfo = "0.29.2"
|
||||
sd-notify = "0.4.1"
|
||||
@@ -164,7 +160,6 @@ strum_macros = "0.26"
|
||||
svg_fmt = "0.4.3"
|
||||
sync_wrapper = "0.1.2"
|
||||
tar = "0.4"
|
||||
task-local-extensions = "0.1.4"
|
||||
test-context = "0.3"
|
||||
thiserror = "1.0"
|
||||
tikv-jemallocator = "0.5"
|
||||
|
||||
@@ -55,22 +55,27 @@ RUN cd postgres && \
|
||||
# We could add the additional grant statements to the postgres repository but it would be hard to maintain,
|
||||
# whenever we need to pick up a new postgres version and we want to limit the changes in our postgres fork,
|
||||
# so we do it here.
|
||||
old_list="pg_stat_statements--1.0--1.1.sql pg_stat_statements--1.1--1.2.sql pg_stat_statements--1.2--1.3.sql pg_stat_statements--1.3--1.4.sql pg_stat_statements--1.4--1.5.sql pg_stat_statements--1.4.sql pg_stat_statements--1.5--1.6.sql"; \
|
||||
# the first loop is for pg_stat_statement extension version <= 1.6
|
||||
for file in /usr/local/pgsql/share/extension/pg_stat_statements--*.sql; do \
|
||||
filename=$(basename "$file"); \
|
||||
if echo "$old_list" | grep -q -F "$filename"; then \
|
||||
# Note that there are no downgrade scripts for pg_stat_statements, so we \
|
||||
# don't have to modify any downgrade paths or (much) older versions: we only \
|
||||
# have to make sure every creation of the pg_stat_statements_reset function \
|
||||
# also adds execute permissions to the neon_superuser.
|
||||
case $filename in \
|
||||
pg_stat_statements--1.4.sql) \
|
||||
# pg_stat_statements_reset is first created with 1.4
|
||||
echo 'GRANT EXECUTE ON FUNCTION pg_stat_statements_reset() TO neon_superuser;' >> $file; \
|
||||
fi; \
|
||||
done; \
|
||||
# the second loop is for pg_stat_statement extension versions >= 1.7,
|
||||
# where pg_stat_statement_reset() got 3 additional arguments
|
||||
for file in /usr/local/pgsql/share/extension/pg_stat_statements--*.sql; do \
|
||||
filename=$(basename "$file"); \
|
||||
if ! echo "$old_list" | grep -q -F "$filename"; then \
|
||||
;; \
|
||||
pg_stat_statements--1.6--1.7.sql) \
|
||||
# Then with the 1.6-1.7 migration it is re-created with a new signature, thus add the permissions back
|
||||
echo 'GRANT EXECUTE ON FUNCTION pg_stat_statements_reset(Oid, Oid, bigint) TO neon_superuser;' >> $file; \
|
||||
fi; \
|
||||
done
|
||||
;; \
|
||||
pg_stat_statements--1.10--1.11.sql) \
|
||||
# Then with the 1.10-1.11 migration it is re-created with a new signature again, thus add the permissions back
|
||||
echo 'GRANT EXECUTE ON FUNCTION pg_stat_statements_reset(Oid, Oid, bigint, boolean) TO neon_superuser;' >> $file; \
|
||||
;; \
|
||||
esac; \
|
||||
done;
|
||||
|
||||
#########################################################################################
|
||||
#
|
||||
@@ -275,7 +280,7 @@ FROM build-deps AS vector-pg-build
|
||||
ARG PG_VERSION
|
||||
COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
|
||||
|
||||
COPY patches/pgvector.patch /pgvector.patch
|
||||
COPY compute/patches/pgvector.patch /pgvector.patch
|
||||
|
||||
# By default, pgvector Makefile uses `-march=native`. We don't want that,
|
||||
# because we build the images on different machines than where we run them.
|
||||
@@ -361,7 +366,7 @@ FROM build-deps AS rum-pg-build
|
||||
ARG PG_VERSION
|
||||
COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
|
||||
|
||||
COPY patches/rum.patch /rum.patch
|
||||
COPY compute/patches/rum.patch /rum.patch
|
||||
|
||||
RUN case "${PG_VERSION}" in "v17") \
|
||||
echo "v17 extensions are not supported yet. Quit" && exit 0;; \
|
||||
@@ -1026,6 +1031,41 @@ FROM debian:bullseye-slim AS compute-tools-image
|
||||
|
||||
COPY --from=compute-tools /home/nonroot/target/release-line-debug-size-lto/compute_ctl /usr/local/bin/compute_ctl
|
||||
|
||||
#########################################################################################
|
||||
#
|
||||
# Layer "pgbouncer"
|
||||
#
|
||||
#########################################################################################
|
||||
|
||||
FROM debian:bullseye-slim AS pgbouncer
|
||||
RUN set -e \
|
||||
&& apt-get update \
|
||||
&& apt-get install -y \
|
||||
build-essential \
|
||||
git \
|
||||
libevent-dev \
|
||||
libtool \
|
||||
pkg-config
|
||||
|
||||
# Use `dist_man_MANS=` to skip manpage generation (which requires python3/pandoc)
|
||||
ENV PGBOUNCER_TAG=pgbouncer_1_22_1
|
||||
RUN set -e \
|
||||
&& git clone --recurse-submodules --depth 1 --branch ${PGBOUNCER_TAG} https://github.com/pgbouncer/pgbouncer.git pgbouncer \
|
||||
&& cd pgbouncer \
|
||||
&& ./autogen.sh \
|
||||
&& LDFLAGS=-static ./configure --prefix=/usr/local/pgbouncer --without-openssl \
|
||||
&& make -j $(nproc) dist_man_MANS= \
|
||||
&& make install dist_man_MANS=
|
||||
|
||||
#########################################################################################
|
||||
#
|
||||
# Layers "postgres-exporter" and "sql-exporter"
|
||||
#
|
||||
#########################################################################################
|
||||
|
||||
FROM quay.io/prometheuscommunity/postgres-exporter:v0.12.1 AS postgres-exporter
|
||||
FROM burningalchemist/sql_exporter:0.13 AS sql-exporter
|
||||
|
||||
#########################################################################################
|
||||
#
|
||||
# Clean up postgres folder before inclusion
|
||||
@@ -1073,7 +1113,7 @@ COPY --from=pgjwt-pg-build /pgjwt.tar.gz /ext-src
|
||||
COPY --from=hypopg-pg-build /hypopg.tar.gz /ext-src
|
||||
COPY --from=pg-hashids-pg-build /pg_hashids.tar.gz /ext-src
|
||||
COPY --from=rum-pg-build /rum.tar.gz /ext-src
|
||||
COPY patches/rum.patch /ext-src
|
||||
COPY compute/patches/rum.patch /ext-src
|
||||
#COPY --from=pgtap-pg-build /pgtap.tar.gz /ext-src
|
||||
COPY --from=ip4r-pg-build /ip4r.tar.gz /ext-src
|
||||
COPY --from=prefix-pg-build /prefix.tar.gz /ext-src
|
||||
@@ -1081,9 +1121,9 @@ COPY --from=hll-pg-build /hll.tar.gz /ext-src
|
||||
COPY --from=plpgsql-check-pg-build /plpgsql_check.tar.gz /ext-src
|
||||
#COPY --from=timescaledb-pg-build /timescaledb.tar.gz /ext-src
|
||||
COPY --from=pg-hint-plan-pg-build /pg_hint_plan.tar.gz /ext-src
|
||||
COPY patches/pg_hint_plan.patch /ext-src
|
||||
COPY compute/patches/pg_hint_plan.patch /ext-src
|
||||
COPY --from=pg-cron-pg-build /pg_cron.tar.gz /ext-src
|
||||
COPY patches/pg_cron.patch /ext-src
|
||||
COPY compute/patches/pg_cron.patch /ext-src
|
||||
#COPY --from=pg-pgx-ulid-build /home/nonroot/pgx_ulid.tar.gz /ext-src
|
||||
#COPY --from=rdkit-pg-build /rdkit.tar.gz /ext-src
|
||||
COPY --from=pg-uuidv7-pg-build /pg_uuidv7.tar.gz /ext-src
|
||||
@@ -1092,7 +1132,7 @@ COPY --from=pg-semver-pg-build /pg_semver.tar.gz /ext-src
|
||||
#COPY --from=pg-embedding-pg-build /home/nonroot/pg_embedding-src/ /ext-src
|
||||
#COPY --from=wal2json-pg-build /wal2json_2_5.tar.gz /ext-src
|
||||
COPY --from=pg-anon-pg-build /pg_anon.tar.gz /ext-src
|
||||
COPY patches/pg_anon.patch /ext-src
|
||||
COPY compute/patches/pg_anon.patch /ext-src
|
||||
COPY --from=pg-ivm-build /pg_ivm.tar.gz /ext-src
|
||||
COPY --from=pg-partman-build /pg_partman.tar.gz /ext-src
|
||||
RUN case "${PG_VERSION}" in "v17") \
|
||||
@@ -1155,9 +1195,23 @@ RUN mkdir /var/db && useradd -m -d /var/db/postgres postgres && \
|
||||
COPY --from=postgres-cleanup-layer --chown=postgres /usr/local/pgsql /usr/local
|
||||
COPY --from=compute-tools --chown=postgres /home/nonroot/target/release-line-debug-size-lto/compute_ctl /usr/local/bin/compute_ctl
|
||||
|
||||
# pgbouncer and its config
|
||||
COPY --from=pgbouncer /usr/local/pgbouncer/bin/pgbouncer /usr/local/bin/pgbouncer
|
||||
COPY --chmod=0666 --chown=postgres compute/etc/pgbouncer.ini /etc/pgbouncer.ini
|
||||
|
||||
# Metrics exporter binaries and configuration files
|
||||
COPY --from=postgres-exporter /bin/postgres_exporter /bin/postgres_exporter
|
||||
COPY --from=sql-exporter /bin/sql_exporter /bin/sql_exporter
|
||||
|
||||
COPY --chmod=0644 compute/etc/sql_exporter.yml /etc/sql_exporter.yml
|
||||
COPY --chmod=0644 compute/etc/neon_collector.yml /etc/neon_collector.yml
|
||||
COPY --chmod=0644 compute/etc/sql_exporter_autoscaling.yml /etc/sql_exporter_autoscaling.yml
|
||||
COPY --chmod=0644 compute/etc/neon_collector_autoscaling.yml /etc/neon_collector_autoscaling.yml
|
||||
|
||||
# Create remote extension download directory
|
||||
RUN mkdir /usr/local/download_extensions && chown -R postgres:postgres /usr/local/download_extensions
|
||||
|
||||
|
||||
# Install:
|
||||
# libreadline8 for psql
|
||||
# libicu67, locales for collations (including ICU and plpgsql_check)
|
||||
21
compute/README.md
Normal file
21
compute/README.md
Normal file
@@ -0,0 +1,21 @@
|
||||
This directory contains files that are needed to build the compute
|
||||
images, or included in the compute images.
|
||||
|
||||
Dockerfile.compute-node
|
||||
To build the compute image
|
||||
|
||||
vm-image-spec.yaml
|
||||
Instructions for vm-builder, to turn the compute-node image into
|
||||
corresponding vm-compute-node image.
|
||||
|
||||
etc/
|
||||
Configuration files included in /etc in the compute image
|
||||
|
||||
patches/
|
||||
Some extensions need to be patched to work with Neon. This
|
||||
directory contains such patches. They are applied to the extension
|
||||
sources in Dockerfile.compute-node
|
||||
|
||||
In addition to these, postgres itself, the neon postgres extension,
|
||||
and compute_ctl are built and copied into the compute image by
|
||||
Dockerfile.compute-node.
|
||||
247
compute/etc/neon_collector.yml
Normal file
247
compute/etc/neon_collector.yml
Normal file
@@ -0,0 +1,247 @@
|
||||
collector_name: neon_collector
|
||||
metrics:
|
||||
- metric_name: lfc_misses
|
||||
type: gauge
|
||||
help: 'lfc_misses'
|
||||
key_labels:
|
||||
values: [lfc_misses]
|
||||
query: |
|
||||
select lfc_value as lfc_misses from neon.neon_lfc_stats where lfc_key='file_cache_misses';
|
||||
|
||||
- metric_name: lfc_used
|
||||
type: gauge
|
||||
help: 'LFC chunks used (chunk = 1MB)'
|
||||
key_labels:
|
||||
values: [lfc_used]
|
||||
query: |
|
||||
select lfc_value as lfc_used from neon.neon_lfc_stats where lfc_key='file_cache_used';
|
||||
|
||||
- metric_name: lfc_hits
|
||||
type: gauge
|
||||
help: 'lfc_hits'
|
||||
key_labels:
|
||||
values: [lfc_hits]
|
||||
query: |
|
||||
select lfc_value as lfc_hits from neon.neon_lfc_stats where lfc_key='file_cache_hits';
|
||||
|
||||
- metric_name: lfc_writes
|
||||
type: gauge
|
||||
help: 'lfc_writes'
|
||||
key_labels:
|
||||
values: [lfc_writes]
|
||||
query: |
|
||||
select lfc_value as lfc_writes from neon.neon_lfc_stats where lfc_key='file_cache_writes';
|
||||
|
||||
- metric_name: lfc_cache_size_limit
|
||||
type: gauge
|
||||
help: 'LFC cache size limit in bytes'
|
||||
key_labels:
|
||||
values: [lfc_cache_size_limit]
|
||||
query: |
|
||||
select pg_size_bytes(current_setting('neon.file_cache_size_limit')) as lfc_cache_size_limit;
|
||||
|
||||
- metric_name: connection_counts
|
||||
type: gauge
|
||||
help: 'Connection counts'
|
||||
key_labels:
|
||||
- datname
|
||||
- state
|
||||
values: [count]
|
||||
query: |
|
||||
select datname, state, count(*) as count from pg_stat_activity where state <> '' group by datname, state;
|
||||
|
||||
- metric_name: pg_stats_userdb
|
||||
type: gauge
|
||||
help: 'Stats for several oldest non-system dbs'
|
||||
key_labels:
|
||||
- datname
|
||||
value_label: kind
|
||||
values:
|
||||
- db_size
|
||||
- deadlocks
|
||||
# Rows
|
||||
- inserted
|
||||
- updated
|
||||
- deleted
|
||||
# We export stats for 10 non-system database. Without this limit
|
||||
# it is too easy to abuse the system by creating lots of databases.
|
||||
query: |
|
||||
select pg_database_size(datname) as db_size, deadlocks,
|
||||
tup_inserted as inserted, tup_updated as updated, tup_deleted as deleted,
|
||||
datname
|
||||
from pg_stat_database
|
||||
where datname IN (
|
||||
select datname
|
||||
from pg_database
|
||||
where datname <> 'postgres' and not datistemplate
|
||||
order by oid
|
||||
limit 10
|
||||
);
|
||||
|
||||
- metric_name: max_cluster_size
|
||||
type: gauge
|
||||
help: 'neon.max_cluster_size setting'
|
||||
key_labels:
|
||||
values: [max_cluster_size]
|
||||
query: |
|
||||
select setting::int as max_cluster_size from pg_settings where name = 'neon.max_cluster_size';
|
||||
|
||||
- metric_name: db_total_size
|
||||
type: gauge
|
||||
help: 'Size of all databases'
|
||||
key_labels:
|
||||
values: [total]
|
||||
query: |
|
||||
select sum(pg_database_size(datname)) as total from pg_database;
|
||||
|
||||
# DEPRECATED
|
||||
- metric_name: lfc_approximate_working_set_size
|
||||
type: gauge
|
||||
help: 'Approximate working set size in pages of 8192 bytes'
|
||||
key_labels:
|
||||
values: [approximate_working_set_size]
|
||||
query: |
|
||||
select neon.approximate_working_set_size(false) as approximate_working_set_size;
|
||||
|
||||
- metric_name: lfc_approximate_working_set_size_windows
|
||||
type: gauge
|
||||
help: 'Approximate working set size in pages of 8192 bytes'
|
||||
key_labels: [duration]
|
||||
values: [size]
|
||||
# NOTE: This is the "public" / "human-readable" version. Here, we supply a small selection
|
||||
# of durations in a pretty-printed form.
|
||||
query: |
|
||||
select
|
||||
x as duration,
|
||||
neon.approximate_working_set_size_seconds(extract('epoch' from x::interval)::int) as size
|
||||
from
|
||||
(values ('5m'),('15m'),('1h')) as t (x);
|
||||
|
||||
- metric_name: compute_current_lsn
|
||||
type: gauge
|
||||
help: 'Current LSN of the database'
|
||||
key_labels:
|
||||
values: [lsn]
|
||||
query: |
|
||||
select
|
||||
case
|
||||
when pg_catalog.pg_is_in_recovery()
|
||||
then (pg_last_wal_replay_lsn() - '0/0')::FLOAT8
|
||||
else (pg_current_wal_lsn() - '0/0')::FLOAT8
|
||||
end as lsn;
|
||||
|
||||
- metric_name: compute_receive_lsn
|
||||
type: gauge
|
||||
help: 'Returns the last write-ahead log location that has been received and synced to disk by streaming replication'
|
||||
key_labels:
|
||||
values: [lsn]
|
||||
query: |
|
||||
SELECT
|
||||
CASE
|
||||
WHEN pg_catalog.pg_is_in_recovery()
|
||||
THEN (pg_last_wal_receive_lsn() - '0/0')::FLOAT8
|
||||
ELSE 0
|
||||
END AS lsn;
|
||||
|
||||
- metric_name: replication_delay_bytes
|
||||
type: gauge
|
||||
help: 'Bytes between received and replayed LSN'
|
||||
key_labels:
|
||||
values: [replication_delay_bytes]
|
||||
# We use a GREATEST call here because this calculation can be negative.
|
||||
# The calculation is not atomic, meaning after we've gotten the receive
|
||||
# LSN, the replay LSN may have advanced past the receive LSN we
|
||||
# are using for the calculation.
|
||||
query: |
|
||||
SELECT GREATEST(0, pg_wal_lsn_diff(pg_last_wal_receive_lsn(), pg_last_wal_replay_lsn())) AS replication_delay_bytes;
|
||||
|
||||
- metric_name: replication_delay_seconds
|
||||
type: gauge
|
||||
help: 'Time since last LSN was replayed'
|
||||
key_labels:
|
||||
values: [replication_delay_seconds]
|
||||
query: |
|
||||
SELECT
|
||||
CASE
|
||||
WHEN pg_last_wal_receive_lsn() = pg_last_wal_replay_lsn() THEN 0
|
||||
ELSE GREATEST (0, EXTRACT (EPOCH FROM now() - pg_last_xact_replay_timestamp()))
|
||||
END AS replication_delay_seconds;
|
||||
|
||||
- metric_name: checkpoints_req
|
||||
type: gauge
|
||||
help: 'Number of requested checkpoints'
|
||||
key_labels:
|
||||
values: [checkpoints_req]
|
||||
query: |
|
||||
SELECT checkpoints_req FROM pg_stat_bgwriter;
|
||||
|
||||
- metric_name: checkpoints_timed
|
||||
type: gauge
|
||||
help: 'Number of scheduled checkpoints'
|
||||
key_labels:
|
||||
values: [checkpoints_timed]
|
||||
query: |
|
||||
SELECT checkpoints_timed FROM pg_stat_bgwriter;
|
||||
|
||||
- metric_name: compute_logical_snapshot_files
|
||||
type: gauge
|
||||
help: 'Number of snapshot files in pg_logical/snapshot'
|
||||
key_labels:
|
||||
- timeline_id
|
||||
values: [num_logical_snapshot_files]
|
||||
query: |
|
||||
SELECT
|
||||
(SELECT setting FROM pg_settings WHERE name = 'neon.timeline_id') AS timeline_id,
|
||||
-- Postgres creates temporary snapshot files of the form %X-%X.snap.%d.tmp. These
|
||||
-- temporary snapshot files are renamed to the actual snapshot files after they are
|
||||
-- completely built. We only WAL-log the completely built snapshot files.
|
||||
(SELECT COUNT(*) FROM pg_ls_logicalsnapdir() WHERE name LIKE '%.snap') AS num_logical_snapshot_files;
|
||||
|
||||
# In all the below metrics, we cast LSNs to floats because Prometheus only supports floats.
|
||||
# It's probably fine because float64 can store integers from -2^53 to +2^53 exactly.
|
||||
|
||||
# Number of slots is limited by max_replication_slots, so collecting position for all of them shouldn't be bad.
|
||||
- metric_name: logical_slot_restart_lsn
|
||||
type: gauge
|
||||
help: 'restart_lsn of logical slots'
|
||||
key_labels:
|
||||
- slot_name
|
||||
values: [restart_lsn]
|
||||
query: |
|
||||
select slot_name, (restart_lsn - '0/0')::FLOAT8 as restart_lsn
|
||||
from pg_replication_slots
|
||||
where slot_type = 'logical';
|
||||
|
||||
- metric_name: compute_subscriptions_count
|
||||
type: gauge
|
||||
help: 'Number of logical replication subscriptions grouped by enabled/disabled'
|
||||
key_labels:
|
||||
- enabled
|
||||
values: [subscriptions_count]
|
||||
query: |
|
||||
select subenabled::text as enabled, count(*) as subscriptions_count
|
||||
from pg_subscription
|
||||
group by subenabled;
|
||||
|
||||
- metric_name: retained_wal
|
||||
type: gauge
|
||||
help: 'Retained WAL in inactive replication slots'
|
||||
key_labels:
|
||||
- slot_name
|
||||
values: [retained_wal]
|
||||
query: |
|
||||
SELECT slot_name, pg_wal_lsn_diff(pg_current_wal_lsn(), restart_lsn)::FLOAT8 AS retained_wal
|
||||
FROM pg_replication_slots
|
||||
WHERE active = false;
|
||||
|
||||
- metric_name: wal_is_lost
|
||||
type: gauge
|
||||
help: 'Whether or not the replication slot wal_status is lost'
|
||||
key_labels:
|
||||
- slot_name
|
||||
values: [wal_is_lost]
|
||||
query: |
|
||||
SELECT slot_name,
|
||||
CASE WHEN wal_status = 'lost' THEN 1 ELSE 0 END AS wal_is_lost
|
||||
FROM pg_replication_slots;
|
||||
|
||||
55
compute/etc/neon_collector_autoscaling.yml
Normal file
55
compute/etc/neon_collector_autoscaling.yml
Normal file
@@ -0,0 +1,55 @@
|
||||
collector_name: neon_collector_autoscaling
|
||||
metrics:
|
||||
- metric_name: lfc_misses
|
||||
type: gauge
|
||||
help: 'lfc_misses'
|
||||
key_labels:
|
||||
values: [lfc_misses]
|
||||
query: |
|
||||
select lfc_value as lfc_misses from neon.neon_lfc_stats where lfc_key='file_cache_misses';
|
||||
|
||||
- metric_name: lfc_used
|
||||
type: gauge
|
||||
help: 'LFC chunks used (chunk = 1MB)'
|
||||
key_labels:
|
||||
values: [lfc_used]
|
||||
query: |
|
||||
select lfc_value as lfc_used from neon.neon_lfc_stats where lfc_key='file_cache_used';
|
||||
|
||||
- metric_name: lfc_hits
|
||||
type: gauge
|
||||
help: 'lfc_hits'
|
||||
key_labels:
|
||||
values: [lfc_hits]
|
||||
query: |
|
||||
select lfc_value as lfc_hits from neon.neon_lfc_stats where lfc_key='file_cache_hits';
|
||||
|
||||
- metric_name: lfc_writes
|
||||
type: gauge
|
||||
help: 'lfc_writes'
|
||||
key_labels:
|
||||
values: [lfc_writes]
|
||||
query: |
|
||||
select lfc_value as lfc_writes from neon.neon_lfc_stats where lfc_key='file_cache_writes';
|
||||
|
||||
- metric_name: lfc_cache_size_limit
|
||||
type: gauge
|
||||
help: 'LFC cache size limit in bytes'
|
||||
key_labels:
|
||||
values: [lfc_cache_size_limit]
|
||||
query: |
|
||||
select pg_size_bytes(current_setting('neon.file_cache_size_limit')) as lfc_cache_size_limit;
|
||||
|
||||
- metric_name: lfc_approximate_working_set_size_windows
|
||||
type: gauge
|
||||
help: 'Approximate working set size in pages of 8192 bytes'
|
||||
key_labels: [duration_seconds]
|
||||
values: [size]
|
||||
# NOTE: This is the "internal" / "machine-readable" version. This outputs the working set
|
||||
# size looking back 1..60 minutes, labeled with the number of minutes.
|
||||
query: |
|
||||
select
|
||||
x::text as duration_seconds,
|
||||
neon.approximate_working_set_size_seconds(x) as size
|
||||
from
|
||||
(select generate_series * 60 as x from generate_series(1, 60)) as t (x);
|
||||
17
compute/etc/pgbouncer.ini
Normal file
17
compute/etc/pgbouncer.ini
Normal file
@@ -0,0 +1,17 @@
|
||||
[databases]
|
||||
*=host=localhost port=5432 auth_user=cloud_admin
|
||||
[pgbouncer]
|
||||
listen_port=6432
|
||||
listen_addr=0.0.0.0
|
||||
auth_type=scram-sha-256
|
||||
auth_user=cloud_admin
|
||||
auth_dbname=postgres
|
||||
client_tls_sslmode=disable
|
||||
server_tls_sslmode=disable
|
||||
pool_mode=transaction
|
||||
max_client_conn=10000
|
||||
default_pool_size=64
|
||||
max_prepared_statements=0
|
||||
admin_users=postgres
|
||||
unix_socket_dir=/tmp/
|
||||
unix_socket_mode=0777
|
||||
33
compute/etc/sql_exporter.yml
Normal file
33
compute/etc/sql_exporter.yml
Normal file
@@ -0,0 +1,33 @@
|
||||
# Configuration for sql_exporter
|
||||
# Global defaults.
|
||||
global:
|
||||
# If scrape_timeout <= 0, no timeout is set unless Prometheus provides one. The default is 10s.
|
||||
scrape_timeout: 10s
|
||||
# Subtracted from Prometheus' scrape_timeout to give us some headroom and prevent Prometheus from timing out first.
|
||||
scrape_timeout_offset: 500ms
|
||||
# Minimum interval between collector runs: by default (0s) collectors are executed on every scrape.
|
||||
min_interval: 0s
|
||||
# Maximum number of open connections to any one target. Metric queries will run concurrently on multiple connections,
|
||||
# as will concurrent scrapes.
|
||||
max_connections: 1
|
||||
# Maximum number of idle connections to any one target. Unless you use very long collection intervals, this should
|
||||
# always be the same as max_connections.
|
||||
max_idle_connections: 1
|
||||
# Maximum number of maximum amount of time a connection may be reused. Expired connections may be closed lazily before reuse.
|
||||
# If 0, connections are not closed due to a connection's age.
|
||||
max_connection_lifetime: 5m
|
||||
|
||||
# The target to monitor and the collectors to execute on it.
|
||||
target:
|
||||
# Data source name always has a URI schema that matches the driver name. In some cases (e.g. MySQL)
|
||||
# the schema gets dropped or replaced to match the driver expected DSN format.
|
||||
data_source_name: 'postgresql://cloud_admin@127.0.0.1:5432/postgres?sslmode=disable&application_name=sql_exporter'
|
||||
|
||||
# Collectors (referenced by name) to execute on the target.
|
||||
# Glob patterns are supported (see <https://pkg.go.dev/path/filepath#Match> for syntax).
|
||||
collectors: [neon_collector]
|
||||
|
||||
# Collector files specifies a list of globs. One collector definition is read from each matching file.
|
||||
# Glob patterns are supported (see <https://pkg.go.dev/path/filepath#Match> for syntax).
|
||||
collector_files:
|
||||
- "neon_collector.yml"
|
||||
33
compute/etc/sql_exporter_autoscaling.yml
Normal file
33
compute/etc/sql_exporter_autoscaling.yml
Normal file
@@ -0,0 +1,33 @@
|
||||
# Configuration for sql_exporter for autoscaling-agent
|
||||
# Global defaults.
|
||||
global:
|
||||
# If scrape_timeout <= 0, no timeout is set unless Prometheus provides one. The default is 10s.
|
||||
scrape_timeout: 10s
|
||||
# Subtracted from Prometheus' scrape_timeout to give us some headroom and prevent Prometheus from timing out first.
|
||||
scrape_timeout_offset: 500ms
|
||||
# Minimum interval between collector runs: by default (0s) collectors are executed on every scrape.
|
||||
min_interval: 0s
|
||||
# Maximum number of open connections to any one target. Metric queries will run concurrently on multiple connections,
|
||||
# as will concurrent scrapes.
|
||||
max_connections: 1
|
||||
# Maximum number of idle connections to any one target. Unless you use very long collection intervals, this should
|
||||
# always be the same as max_connections.
|
||||
max_idle_connections: 1
|
||||
# Maximum number of maximum amount of time a connection may be reused. Expired connections may be closed lazily before reuse.
|
||||
# If 0, connections are not closed due to a connection's age.
|
||||
max_connection_lifetime: 5m
|
||||
|
||||
# The target to monitor and the collectors to execute on it.
|
||||
target:
|
||||
# Data source name always has a URI schema that matches the driver name. In some cases (e.g. MySQL)
|
||||
# the schema gets dropped or replaced to match the driver expected DSN format.
|
||||
data_source_name: 'postgresql://cloud_admin@127.0.0.1:5432/postgres?sslmode=disable&application_name=sql_exporter_autoscaling'
|
||||
|
||||
# Collectors (referenced by name) to execute on the target.
|
||||
# Glob patterns are supported (see <https://pkg.go.dev/path/filepath#Match> for syntax).
|
||||
collectors: [neon_collector_autoscaling]
|
||||
|
||||
# Collector files specifies a list of globs. One collector definition is read from each matching file.
|
||||
# Glob patterns are supported (see <https://pkg.go.dev/path/filepath#Match> for syntax).
|
||||
collector_files:
|
||||
- "neon_collector_autoscaling.yml"
|
||||
3949
compute/patches/cloud_regress_pg16.patch
Normal file
3949
compute/patches/cloud_regress_pg16.patch
Normal file
File diff suppressed because it is too large
Load Diff
112
compute/vm-image-spec.yaml
Normal file
112
compute/vm-image-spec.yaml
Normal file
@@ -0,0 +1,112 @@
|
||||
# Supplemental file for neondatabase/autoscaling's vm-builder, for producing the VM compute image.
|
||||
---
|
||||
commands:
|
||||
- name: cgconfigparser
|
||||
user: root
|
||||
sysvInitAction: sysinit
|
||||
shell: 'cgconfigparser -l /etc/cgconfig.conf -s 1664'
|
||||
# restrict permissions on /neonvm/bin/resize-swap, because we grant access to compute_ctl for
|
||||
# running it as root.
|
||||
- name: chmod-resize-swap
|
||||
user: root
|
||||
sysvInitAction: sysinit
|
||||
shell: 'chmod 711 /neonvm/bin/resize-swap'
|
||||
- name: pgbouncer
|
||||
user: postgres
|
||||
sysvInitAction: respawn
|
||||
shell: '/usr/local/bin/pgbouncer /etc/pgbouncer.ini'
|
||||
- name: postgres-exporter
|
||||
user: nobody
|
||||
sysvInitAction: respawn
|
||||
shell: 'DATA_SOURCE_NAME="user=cloud_admin sslmode=disable dbname=postgres application_name=postgres-exporter" /bin/postgres_exporter'
|
||||
- name: sql-exporter
|
||||
user: nobody
|
||||
sysvInitAction: respawn
|
||||
shell: '/bin/sql_exporter -config.file=/etc/sql_exporter.yml -web.listen-address=:9399'
|
||||
- name: sql-exporter-autoscaling
|
||||
user: nobody
|
||||
sysvInitAction: respawn
|
||||
shell: '/bin/sql_exporter -config.file=/etc/sql_exporter_autoscaling.yml -web.listen-address=:9499'
|
||||
shutdownHook: |
|
||||
su -p postgres --session-command '/usr/local/bin/pg_ctl stop -D /var/db/postgres/compute/pgdata -m fast --wait -t 10'
|
||||
files:
|
||||
- filename: compute_ctl-resize-swap
|
||||
content: |
|
||||
# Allow postgres user (which is what compute_ctl runs as) to run /neonvm/bin/resize-swap
|
||||
# as root without requiring entering a password (NOPASSWD), regardless of hostname (ALL)
|
||||
postgres ALL=(root) NOPASSWD: /neonvm/bin/resize-swap
|
||||
- filename: cgconfig.conf
|
||||
content: |
|
||||
# Configuration for cgroups in VM compute nodes
|
||||
group neon-postgres {
|
||||
perm {
|
||||
admin {
|
||||
uid = postgres;
|
||||
}
|
||||
task {
|
||||
gid = users;
|
||||
}
|
||||
}
|
||||
memory {}
|
||||
}
|
||||
build: |
|
||||
# Build cgroup-tools
|
||||
#
|
||||
# At time of writing (2023-03-14), debian bullseye has a version of cgroup-tools (technically
|
||||
# libcgroup) that doesn't support cgroup v2 (version 0.41-11). Unfortunately, the vm-monitor
|
||||
# requires cgroup v2, so we'll build cgroup-tools ourselves.
|
||||
FROM debian:bullseye-slim as libcgroup-builder
|
||||
ENV LIBCGROUP_VERSION=v2.0.3
|
||||
|
||||
RUN set -exu \
|
||||
&& apt update \
|
||||
&& apt install --no-install-recommends -y \
|
||||
git \
|
||||
ca-certificates \
|
||||
automake \
|
||||
cmake \
|
||||
make \
|
||||
gcc \
|
||||
byacc \
|
||||
flex \
|
||||
libtool \
|
||||
libpam0g-dev \
|
||||
&& git clone --depth 1 -b $LIBCGROUP_VERSION https://github.com/libcgroup/libcgroup \
|
||||
&& INSTALL_DIR="/libcgroup-install" \
|
||||
&& mkdir -p "$INSTALL_DIR/bin" "$INSTALL_DIR/include" \
|
||||
&& cd libcgroup \
|
||||
# extracted from bootstrap.sh, with modified flags:
|
||||
&& (test -d m4 || mkdir m4) \
|
||||
&& autoreconf -fi \
|
||||
&& rm -rf autom4te.cache \
|
||||
&& CFLAGS="-O3" ./configure --prefix="$INSTALL_DIR" --sysconfdir=/etc --localstatedir=/var --enable-opaque-hierarchy="name=systemd" \
|
||||
# actually build the thing...
|
||||
&& make install
|
||||
merge: |
|
||||
# tweak nofile limits
|
||||
RUN set -e \
|
||||
&& echo 'fs.file-max = 1048576' >>/etc/sysctl.conf \
|
||||
&& test ! -e /etc/security || ( \
|
||||
echo '* - nofile 1048576' >>/etc/security/limits.conf \
|
||||
&& echo 'root - nofile 1048576' >>/etc/security/limits.conf \
|
||||
)
|
||||
|
||||
# Allow postgres user (compute_ctl) to run swap resizer.
|
||||
# Need to install sudo in order to allow this.
|
||||
#
|
||||
# Also, remove the 'read' permission from group/other on /neonvm/bin/resize-swap, just to be safe.
|
||||
RUN set -e \
|
||||
&& apt update \
|
||||
&& apt install --no-install-recommends -y \
|
||||
sudo \
|
||||
&& rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/*
|
||||
COPY compute_ctl-resize-swap /etc/sudoers.d/compute_ctl-resize-swap
|
||||
|
||||
COPY cgconfig.conf /etc/cgconfig.conf
|
||||
|
||||
RUN set -e \
|
||||
&& chmod 0644 /etc/cgconfig.conf
|
||||
|
||||
COPY --from=libcgroup-builder /libcgroup-install/bin/* /usr/bin/
|
||||
COPY --from=libcgroup-builder /libcgroup-install/lib/* /usr/lib/
|
||||
COPY --from=libcgroup-builder /libcgroup-install/sbin/* /usr/sbin/
|
||||
@@ -11,7 +11,6 @@ testing = []
|
||||
|
||||
[dependencies]
|
||||
anyhow.workspace = true
|
||||
async-compression.workspace = true
|
||||
chrono.workspace = true
|
||||
cfg-if.workspace = true
|
||||
clap.workspace = true
|
||||
@@ -24,7 +23,6 @@ num_cpus.workspace = true
|
||||
opentelemetry.workspace = true
|
||||
postgres.workspace = true
|
||||
regex.workspace = true
|
||||
serde.workspace = true
|
||||
serde_json.workspace = true
|
||||
signal-hook.workspace = true
|
||||
tar.workspace = true
|
||||
@@ -43,7 +41,6 @@ url.workspace = true
|
||||
compute_api.workspace = true
|
||||
utils.workspace = true
|
||||
workspace_hack.workspace = true
|
||||
toml_edit.workspace = true
|
||||
remote_storage = { version = "0.1", path = "../libs/remote_storage/" }
|
||||
vm_monitor = { version = "0.1", path = "../libs/vm_monitor/" }
|
||||
zstd = "0.13"
|
||||
|
||||
@@ -0,0 +1 @@
|
||||
GRANT EXECUTE ON FUNCTION pg_show_replication_origin_status TO neon_superuser;
|
||||
@@ -793,6 +793,9 @@ pub fn handle_migrations(client: &mut Client) -> Result<()> {
|
||||
include_str!(
|
||||
"./migrations/0010-grant_snapshot_synchronization_funcs_to_neon_superuser.sql"
|
||||
),
|
||||
include_str!(
|
||||
"./migrations/0011-grant_pg_show_replication_origin_status_to_neon_superuser.sql"
|
||||
),
|
||||
];
|
||||
|
||||
MigrationRunner::new(client, &migrations).run_migrations()?;
|
||||
|
||||
@@ -9,13 +9,10 @@ anyhow.workspace = true
|
||||
camino.workspace = true
|
||||
clap.workspace = true
|
||||
comfy-table.workspace = true
|
||||
futures.workspace = true
|
||||
git-version.workspace = true
|
||||
humantime.workspace = true
|
||||
nix.workspace = true
|
||||
once_cell.workspace = true
|
||||
postgres.workspace = true
|
||||
hex.workspace = true
|
||||
humantime-serde.workspace = true
|
||||
hyper.workspace = true
|
||||
regex.workspace = true
|
||||
@@ -23,8 +20,6 @@ reqwest = { workspace = true, features = ["blocking", "json"] }
|
||||
scopeguard.workspace = true
|
||||
serde.workspace = true
|
||||
serde_json.workspace = true
|
||||
serde_with.workspace = true
|
||||
tar.workspace = true
|
||||
thiserror.workspace = true
|
||||
toml.workspace = true
|
||||
toml_edit.workspace = true
|
||||
|
||||
@@ -151,7 +151,7 @@ where
|
||||
print!(".");
|
||||
io::stdout().flush().unwrap();
|
||||
}
|
||||
thread::sleep(RETRY_INTERVAL);
|
||||
tokio::time::sleep(RETRY_INTERVAL).await;
|
||||
}
|
||||
Err(e) => {
|
||||
println!("error starting process {process_name:?}: {e:#}");
|
||||
|
||||
@@ -34,12 +34,14 @@ use safekeeper_api::{
|
||||
DEFAULT_HTTP_LISTEN_PORT as DEFAULT_SAFEKEEPER_HTTP_PORT,
|
||||
DEFAULT_PG_LISTEN_PORT as DEFAULT_SAFEKEEPER_PG_PORT,
|
||||
};
|
||||
use std::borrow::Cow;
|
||||
use std::collections::{BTreeSet, HashMap};
|
||||
use std::path::PathBuf;
|
||||
use std::process::exit;
|
||||
use std::str::FromStr;
|
||||
use std::time::Duration;
|
||||
use storage_broker::DEFAULT_LISTEN_ADDR as DEFAULT_BROKER_ADDR;
|
||||
use tokio::task::JoinSet;
|
||||
use url::Host;
|
||||
use utils::{
|
||||
auth::{Claims, Scope},
|
||||
@@ -87,34 +89,35 @@ fn main() -> Result<()> {
|
||||
|
||||
// Check for 'neon init' command first.
|
||||
let subcommand_result = if sub_name == "init" {
|
||||
handle_init(sub_args).map(Some)
|
||||
handle_init(sub_args).map(|env| Some(Cow::Owned(env)))
|
||||
} else {
|
||||
// all other commands need an existing config
|
||||
let mut env =
|
||||
LocalEnv::load_config(&local_env::base_path()).context("Error loading config")?;
|
||||
let original_env = env.clone();
|
||||
|
||||
let env = LocalEnv::load_config(&local_env::base_path()).context("Error loading config")?;
|
||||
let original_env = env.clone();
|
||||
let env = Box::leak(Box::new(env));
|
||||
let rt = tokio::runtime::Builder::new_current_thread()
|
||||
.enable_all()
|
||||
.build()
|
||||
.unwrap();
|
||||
|
||||
let subcommand_result = match sub_name {
|
||||
"tenant" => rt.block_on(handle_tenant(sub_args, &mut env)),
|
||||
"timeline" => rt.block_on(handle_timeline(sub_args, &mut env)),
|
||||
"start" => rt.block_on(handle_start_all(&env, get_start_timeout(sub_args))),
|
||||
"stop" => rt.block_on(handle_stop_all(sub_args, &env)),
|
||||
"pageserver" => rt.block_on(handle_pageserver(sub_args, &env)),
|
||||
"storage_controller" => rt.block_on(handle_storage_controller(sub_args, &env)),
|
||||
"safekeeper" => rt.block_on(handle_safekeeper(sub_args, &env)),
|
||||
"endpoint" => rt.block_on(handle_endpoint(sub_args, &env)),
|
||||
"mappings" => handle_mappings(sub_args, &mut env),
|
||||
"tenant" => rt.block_on(handle_tenant(sub_args, env)),
|
||||
"timeline" => rt.block_on(handle_timeline(sub_args, env)),
|
||||
"start" => rt.block_on(handle_start_all(env, get_start_timeout(sub_args))),
|
||||
"stop" => rt.block_on(handle_stop_all(sub_args, env)),
|
||||
"pageserver" => rt.block_on(handle_pageserver(sub_args, env)),
|
||||
"storage_controller" => rt.block_on(handle_storage_controller(sub_args, env)),
|
||||
"storage_broker" => rt.block_on(handle_storage_broker(sub_args, env)),
|
||||
"safekeeper" => rt.block_on(handle_safekeeper(sub_args, env)),
|
||||
"endpoint" => rt.block_on(handle_endpoint(sub_args, env)),
|
||||
"mappings" => handle_mappings(sub_args, env),
|
||||
"pg" => bail!("'pg' subcommand has been renamed to 'endpoint'"),
|
||||
_ => bail!("unexpected subcommand {sub_name}"),
|
||||
};
|
||||
|
||||
if original_env != env {
|
||||
subcommand_result.map(|()| Some(env))
|
||||
if &original_env != env {
|
||||
subcommand_result.map(|()| Some(Cow::Borrowed(env)))
|
||||
} else {
|
||||
subcommand_result.map(|()| None)
|
||||
}
|
||||
@@ -1245,49 +1248,122 @@ async fn handle_safekeeper(sub_match: &ArgMatches, env: &local_env::LocalEnv) ->
|
||||
Ok(())
|
||||
}
|
||||
|
||||
async fn handle_storage_broker(sub_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<()> {
|
||||
let (sub_name, sub_args) = match sub_match.subcommand() {
|
||||
Some(broker_command_data) => broker_command_data,
|
||||
None => bail!("no broker subcommand provided"),
|
||||
};
|
||||
|
||||
match sub_name {
|
||||
"start" => {
|
||||
if let Err(e) = broker::start_broker_process(env, get_start_timeout(sub_args)).await {
|
||||
eprintln!("broker start failed: {e}");
|
||||
exit(1);
|
||||
}
|
||||
}
|
||||
|
||||
"stop" => {
|
||||
if let Err(e) = broker::stop_broker_process(env) {
|
||||
eprintln!("broker stop failed: {e}");
|
||||
exit(1);
|
||||
}
|
||||
}
|
||||
|
||||
_ => bail!("Unexpected broker subcommand '{}'", sub_name),
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
async fn handle_start_all(
|
||||
env: &local_env::LocalEnv,
|
||||
env: &'static local_env::LocalEnv,
|
||||
retry_timeout: &Duration,
|
||||
) -> anyhow::Result<()> {
|
||||
let Err(errors) = handle_start_all_impl(env, *retry_timeout).await else {
|
||||
neon_start_status_check(env, retry_timeout)
|
||||
.await
|
||||
.context("status check after successful startup of all services")?;
|
||||
return Ok(());
|
||||
};
|
||||
|
||||
eprintln!("startup failed because one or more services could not be started");
|
||||
|
||||
for e in errors {
|
||||
eprintln!("{e}");
|
||||
let debug_repr = format!("{e:?}");
|
||||
for line in debug_repr.lines() {
|
||||
eprintln!(" {line}");
|
||||
}
|
||||
}
|
||||
|
||||
try_stop_all(env, true).await;
|
||||
|
||||
exit(2);
|
||||
}
|
||||
|
||||
/// Returns Ok() if and only if all services could be started successfully.
|
||||
/// Otherwise, returns the list of errors that occurred during startup.
|
||||
async fn handle_start_all_impl(
|
||||
env: &'static local_env::LocalEnv,
|
||||
retry_timeout: Duration,
|
||||
) -> Result<(), Vec<anyhow::Error>> {
|
||||
// Endpoints are not started automatically
|
||||
|
||||
broker::start_broker_process(env, retry_timeout).await?;
|
||||
let mut js = JoinSet::new();
|
||||
|
||||
// Only start the storage controller if the pageserver is configured to need it
|
||||
if env.control_plane_api.is_some() {
|
||||
let storage_controller = StorageController::from_env(env);
|
||||
if let Err(e) = storage_controller
|
||||
.start(NeonStorageControllerStartArgs::with_default_instance_id(
|
||||
(*retry_timeout).into(),
|
||||
))
|
||||
.await
|
||||
{
|
||||
eprintln!("storage_controller start failed: {:#}", e);
|
||||
try_stop_all(env, true).await;
|
||||
exit(1);
|
||||
// force infalliblity through closure
|
||||
#[allow(clippy::redundant_closure_call)]
|
||||
(|| {
|
||||
js.spawn(async move {
|
||||
let retry_timeout = retry_timeout;
|
||||
broker::start_broker_process(env, &retry_timeout).await
|
||||
});
|
||||
|
||||
// Only start the storage controller if the pageserver is configured to need it
|
||||
if env.control_plane_api.is_some() {
|
||||
js.spawn(async move {
|
||||
let storage_controller = StorageController::from_env(env);
|
||||
storage_controller
|
||||
.start(NeonStorageControllerStartArgs::with_default_instance_id(
|
||||
retry_timeout.into(),
|
||||
))
|
||||
.await
|
||||
.map_err(|e| e.context("start storage_controller"))
|
||||
});
|
||||
}
|
||||
|
||||
for ps_conf in &env.pageservers {
|
||||
js.spawn(async move {
|
||||
let pageserver = PageServerNode::from_env(env, ps_conf);
|
||||
pageserver
|
||||
.start(&retry_timeout)
|
||||
.await
|
||||
.map_err(|e| e.context(format!("start pageserver {}", ps_conf.id)))
|
||||
});
|
||||
}
|
||||
|
||||
for node in env.safekeepers.iter() {
|
||||
js.spawn(async move {
|
||||
let safekeeper = SafekeeperNode::from_env(env, node);
|
||||
safekeeper
|
||||
.start(vec![], &retry_timeout)
|
||||
.await
|
||||
.map_err(|e| e.context(format!("start safekeeper {}", safekeeper.id)))
|
||||
});
|
||||
}
|
||||
})();
|
||||
|
||||
let mut errors = Vec::new();
|
||||
while let Some(result) = js.join_next().await {
|
||||
let result = result.expect("we don't panic or cancel the tasks");
|
||||
if let Err(e) = result {
|
||||
errors.push(e);
|
||||
}
|
||||
}
|
||||
|
||||
for ps_conf in &env.pageservers {
|
||||
let pageserver = PageServerNode::from_env(env, ps_conf);
|
||||
if let Err(e) = pageserver.start(retry_timeout).await {
|
||||
eprintln!("pageserver {} start failed: {:#}", ps_conf.id, e);
|
||||
try_stop_all(env, true).await;
|
||||
exit(1);
|
||||
}
|
||||
if !errors.is_empty() {
|
||||
return Err(errors);
|
||||
}
|
||||
|
||||
for node in env.safekeepers.iter() {
|
||||
let safekeeper = SafekeeperNode::from_env(env, node);
|
||||
if let Err(e) = safekeeper.start(vec![], retry_timeout).await {
|
||||
eprintln!("safekeeper {} start failed: {:#}", safekeeper.id, e);
|
||||
try_stop_all(env, false).await;
|
||||
exit(1);
|
||||
}
|
||||
}
|
||||
|
||||
neon_start_status_check(env, retry_timeout).await?;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
@@ -1672,6 +1748,19 @@ fn cli() -> Command {
|
||||
.arg(stop_mode_arg.clone())
|
||||
.arg(instance_id))
|
||||
)
|
||||
.subcommand(
|
||||
Command::new("storage_broker")
|
||||
.arg_required_else_help(true)
|
||||
.about("Manage broker")
|
||||
.subcommand(Command::new("start")
|
||||
.about("Start broker")
|
||||
.arg(timeout_arg.clone())
|
||||
)
|
||||
.subcommand(Command::new("stop")
|
||||
.about("Stop broker")
|
||||
.arg(stop_mode_arg.clone())
|
||||
)
|
||||
)
|
||||
.subcommand(
|
||||
Command::new("safekeeper")
|
||||
.arg_required_else_help(true)
|
||||
|
||||
@@ -702,7 +702,7 @@ impl Endpoint {
|
||||
}
|
||||
}
|
||||
}
|
||||
std::thread::sleep(ATTEMPT_INTERVAL);
|
||||
tokio::time::sleep(ATTEMPT_INTERVAL).await;
|
||||
}
|
||||
|
||||
// disarm the scopeguard, let the child outlive this function (and neon_local invoction)
|
||||
|
||||
@@ -17,9 +17,7 @@ use std::time::Duration;
|
||||
|
||||
use anyhow::{bail, Context};
|
||||
use camino::Utf8PathBuf;
|
||||
use pageserver_api::models::{
|
||||
self, AuxFilePolicy, LocationConfig, TenantHistorySize, TenantInfo, TimelineInfo,
|
||||
};
|
||||
use pageserver_api::models::{self, AuxFilePolicy, TenantInfo, TimelineInfo};
|
||||
use pageserver_api::shard::TenantShardId;
|
||||
use pageserver_client::mgmt_api;
|
||||
use postgres_backend::AuthType;
|
||||
@@ -324,22 +322,6 @@ impl PageServerNode {
|
||||
background_process::stop_process(immediate, "pageserver", &self.pid_file())
|
||||
}
|
||||
|
||||
pub async fn page_server_psql_client(
|
||||
&self,
|
||||
) -> anyhow::Result<(
|
||||
tokio_postgres::Client,
|
||||
tokio_postgres::Connection<tokio_postgres::Socket, tokio_postgres::tls::NoTlsStream>,
|
||||
)> {
|
||||
let mut config = self.pg_connection_config.clone();
|
||||
if self.conf.pg_auth_type == AuthType::NeonJWT {
|
||||
let token = self
|
||||
.env
|
||||
.generate_auth_token(&Claims::new(None, Scope::PageServerApi))?;
|
||||
config = config.set_password(Some(token));
|
||||
}
|
||||
Ok(config.connect_no_tls().await?)
|
||||
}
|
||||
|
||||
pub async fn check_status(&self) -> mgmt_api::Result<()> {
|
||||
self.http_client.status().await
|
||||
}
|
||||
@@ -540,19 +522,6 @@ impl PageServerNode {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub async fn location_config(
|
||||
&self,
|
||||
tenant_shard_id: TenantShardId,
|
||||
config: LocationConfig,
|
||||
flush_ms: Option<Duration>,
|
||||
lazy: bool,
|
||||
) -> anyhow::Result<()> {
|
||||
Ok(self
|
||||
.http_client
|
||||
.location_config(tenant_shard_id, config, flush_ms, lazy)
|
||||
.await?)
|
||||
}
|
||||
|
||||
pub async fn timeline_list(
|
||||
&self,
|
||||
tenant_shard_id: &TenantShardId,
|
||||
@@ -636,14 +605,4 @@ impl PageServerNode {
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub async fn tenant_synthetic_size(
|
||||
&self,
|
||||
tenant_shard_id: TenantShardId,
|
||||
) -> anyhow::Result<TenantHistorySize> {
|
||||
Ok(self
|
||||
.http_client
|
||||
.tenant_synthetic_size(tenant_shard_id)
|
||||
.await?)
|
||||
}
|
||||
}
|
||||
|
||||
@@ -4,13 +4,10 @@
|
||||
/// NOTE: This doesn't implement the full, correct postgresql.conf syntax. Just
|
||||
/// enough to extract a few settings we need in Neon, assuming you don't do
|
||||
/// funny stuff like include-directives or funny escaping.
|
||||
use anyhow::{bail, Context, Result};
|
||||
use once_cell::sync::Lazy;
|
||||
use regex::Regex;
|
||||
use std::collections::HashMap;
|
||||
use std::fmt;
|
||||
use std::io::BufRead;
|
||||
use std::str::FromStr;
|
||||
|
||||
/// In-memory representation of a postgresql.conf file
|
||||
#[derive(Default, Debug)]
|
||||
@@ -19,84 +16,16 @@ pub struct PostgresConf {
|
||||
hash: HashMap<String, String>,
|
||||
}
|
||||
|
||||
static CONF_LINE_RE: Lazy<Regex> = Lazy::new(|| Regex::new(r"^((?:\w|\.)+)\s*=\s*(\S+)$").unwrap());
|
||||
|
||||
impl PostgresConf {
|
||||
pub fn new() -> PostgresConf {
|
||||
PostgresConf::default()
|
||||
}
|
||||
|
||||
/// Read file into memory
|
||||
pub fn read(read: impl std::io::Read) -> Result<PostgresConf> {
|
||||
let mut result = Self::new();
|
||||
|
||||
for line in std::io::BufReader::new(read).lines() {
|
||||
let line = line?;
|
||||
|
||||
// Store each line in a vector, in original format
|
||||
result.lines.push(line.clone());
|
||||
|
||||
// Also parse each line and insert key=value lines into a hash map.
|
||||
//
|
||||
// FIXME: This doesn't match exactly the flex/bison grammar in PostgreSQL.
|
||||
// But it's close enough for our usage.
|
||||
let line = line.trim();
|
||||
if line.starts_with('#') {
|
||||
// comment, ignore
|
||||
continue;
|
||||
} else if let Some(caps) = CONF_LINE_RE.captures(line) {
|
||||
let name = caps.get(1).unwrap().as_str();
|
||||
let raw_val = caps.get(2).unwrap().as_str();
|
||||
|
||||
if let Ok(val) = deescape_str(raw_val) {
|
||||
// Note: if there's already an entry in the hash map for
|
||||
// this key, this will replace it. That's the behavior what
|
||||
// we want; when PostgreSQL reads the file, each line
|
||||
// overrides any previous value for the same setting.
|
||||
result.hash.insert(name.to_string(), val.to_string());
|
||||
}
|
||||
}
|
||||
}
|
||||
Ok(result)
|
||||
}
|
||||
|
||||
/// Return the current value of 'option'
|
||||
pub fn get(&self, option: &str) -> Option<&str> {
|
||||
self.hash.get(option).map(|x| x.as_ref())
|
||||
}
|
||||
|
||||
/// Return the current value of a field, parsed to the right datatype.
|
||||
///
|
||||
/// This calls the FromStr::parse() function on the value of the field. If
|
||||
/// the field does not exist, or parsing fails, returns an error.
|
||||
///
|
||||
pub fn parse_field<T>(&self, field_name: &str, context: &str) -> Result<T>
|
||||
where
|
||||
T: FromStr,
|
||||
<T as FromStr>::Err: std::error::Error + Send + Sync + 'static,
|
||||
{
|
||||
self.get(field_name)
|
||||
.with_context(|| format!("could not find '{}' option {}", field_name, context))?
|
||||
.parse::<T>()
|
||||
.with_context(|| format!("could not parse '{}' option {}", field_name, context))
|
||||
}
|
||||
|
||||
pub fn parse_field_optional<T>(&self, field_name: &str, context: &str) -> Result<Option<T>>
|
||||
where
|
||||
T: FromStr,
|
||||
<T as FromStr>::Err: std::error::Error + Send + Sync + 'static,
|
||||
{
|
||||
if let Some(val) = self.get(field_name) {
|
||||
let result = val
|
||||
.parse::<T>()
|
||||
.with_context(|| format!("could not parse '{}' option {}", field_name, context))?;
|
||||
|
||||
Ok(Some(result))
|
||||
} else {
|
||||
Ok(None)
|
||||
}
|
||||
}
|
||||
|
||||
///
|
||||
/// Note: if you call this multiple times for the same option, the config
|
||||
/// file will a line for each call. It would be nice to have a function
|
||||
@@ -154,48 +83,8 @@ fn escape_str(s: &str) -> String {
|
||||
}
|
||||
}
|
||||
|
||||
/// De-escape a possibly-quoted value.
|
||||
///
|
||||
/// See `DeescapeQuotedString` function in PostgreSQL sources for how PostgreSQL
|
||||
/// does this.
|
||||
fn deescape_str(s: &str) -> Result<String> {
|
||||
// If the string has a quote at the beginning and end, strip them out.
|
||||
if s.len() >= 2 && s.starts_with('\'') && s.ends_with('\'') {
|
||||
let mut result = String::new();
|
||||
|
||||
let mut iter = s[1..(s.len() - 1)].chars().peekable();
|
||||
while let Some(c) = iter.next() {
|
||||
let newc = if c == '\\' {
|
||||
match iter.next() {
|
||||
Some('b') => '\x08',
|
||||
Some('f') => '\x0c',
|
||||
Some('n') => '\n',
|
||||
Some('r') => '\r',
|
||||
Some('t') => '\t',
|
||||
Some('0'..='7') => {
|
||||
// TODO
|
||||
bail!("octal escapes not supported");
|
||||
}
|
||||
Some(n) => n,
|
||||
None => break,
|
||||
}
|
||||
} else if c == '\'' && iter.peek() == Some(&'\'') {
|
||||
// doubled quote becomes just one quote
|
||||
iter.next().unwrap()
|
||||
} else {
|
||||
c
|
||||
};
|
||||
|
||||
result.push(newc);
|
||||
}
|
||||
Ok(result)
|
||||
} else {
|
||||
Ok(s.to_string())
|
||||
}
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_postgresql_conf_escapes() -> Result<()> {
|
||||
fn test_postgresql_conf_escapes() -> anyhow::Result<()> {
|
||||
assert_eq!(escape_str("foo bar"), "'foo bar'");
|
||||
// these don't need to be quoted
|
||||
assert_eq!(escape_str("foo"), "foo");
|
||||
@@ -214,13 +103,5 @@ fn test_postgresql_conf_escapes() -> Result<()> {
|
||||
assert_eq!(escape_str("fo\\o"), "'fo\\\\o'");
|
||||
assert_eq!(escape_str("10 cats"), "'10 cats'");
|
||||
|
||||
// Test de-escaping
|
||||
assert_eq!(deescape_str(&escape_str("foo"))?, "foo");
|
||||
assert_eq!(deescape_str(&escape_str("fo'o\nba\\r"))?, "fo'o\nba\\r");
|
||||
assert_eq!(deescape_str("'\\b\\f\\n\\r\\t'")?, "\x08\x0c\n\r\t");
|
||||
|
||||
// octal-escapes are currently not supported
|
||||
assert!(deescape_str("'foo\\7\\07\\007'").is_err());
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
@@ -346,7 +346,14 @@ impl StorageController {
|
||||
let pg_log_path = pg_data_path.join("postgres.log");
|
||||
|
||||
if !tokio::fs::try_exists(&pg_data_path).await? {
|
||||
let initdb_args = ["-D", pg_data_path.as_ref(), "--username", &username()];
|
||||
let initdb_args = [
|
||||
"-D",
|
||||
pg_data_path.as_ref(),
|
||||
"--username",
|
||||
&username(),
|
||||
"--no-sync",
|
||||
"--no-instructions",
|
||||
];
|
||||
tracing::info!(
|
||||
"Initializing storage controller database with args: {:?}",
|
||||
initdb_args
|
||||
|
||||
@@ -11,14 +11,11 @@ clap.workspace = true
|
||||
comfy-table.workspace = true
|
||||
futures.workspace = true
|
||||
humantime.workspace = true
|
||||
hyper.workspace = true
|
||||
pageserver_api.workspace = true
|
||||
pageserver_client.workspace = true
|
||||
reqwest.workspace = true
|
||||
serde.workspace = true
|
||||
serde_json = { workspace = true, features = ["raw_value"] }
|
||||
storage_controller_client.workspace = true
|
||||
thiserror.workspace = true
|
||||
tokio.workspace = true
|
||||
tracing.workspace = true
|
||||
utils.workspace = true
|
||||
|
||||
@@ -2,8 +2,8 @@
|
||||
# Example docker compose configuration
|
||||
|
||||
The configuration in this directory is used for testing Neon docker images: it is
|
||||
not intended for deploying a usable system. To run a development environment where
|
||||
you can experiment with a minature Neon system, use `cargo neon` rather than container images.
|
||||
not intended for deploying a usable system. To run a development environment where
|
||||
you can experiment with a miniature Neon system, use `cargo neon` rather than container images.
|
||||
|
||||
This configuration does not start the storage controller, because the controller
|
||||
needs a way to reconfigure running computes, and no such thing exists in this setup.
|
||||
|
||||
@@ -8,7 +8,6 @@ license.workspace = true
|
||||
anyhow.workspace = true
|
||||
chrono.workspace = true
|
||||
serde.workspace = true
|
||||
serde_with.workspace = true
|
||||
serde_json.workspace = true
|
||||
regex.workspace = true
|
||||
|
||||
|
||||
@@ -5,9 +5,6 @@ edition = "2021"
|
||||
license = "Apache-2.0"
|
||||
|
||||
[dependencies]
|
||||
anyhow.workspace = true
|
||||
chrono = { workspace = true, features = ["serde"] }
|
||||
rand.workspace = true
|
||||
serde.workspace = true
|
||||
serde_with.workspace = true
|
||||
utils.workspace = true
|
||||
|
||||
@@ -5,7 +5,7 @@ use chrono::{DateTime, Utc};
|
||||
use rand::Rng;
|
||||
use serde::{Deserialize, Serialize};
|
||||
|
||||
#[derive(Serialize, serde::Deserialize, Debug, Clone, Copy, Eq, PartialEq, Ord, PartialOrd)]
|
||||
#[derive(Serialize, Deserialize, Debug, Clone, Copy, Eq, PartialEq, Ord, PartialOrd)]
|
||||
#[serde(tag = "type")]
|
||||
pub enum EventType {
|
||||
#[serde(rename = "absolute")]
|
||||
@@ -107,7 +107,7 @@ pub const CHUNK_SIZE: usize = 1000;
|
||||
|
||||
// Just a wrapper around a slice of events
|
||||
// to serialize it as `{"events" : [ ] }
|
||||
#[derive(serde::Serialize, serde::Deserialize)]
|
||||
#[derive(serde::Serialize, Deserialize)]
|
||||
pub struct EventChunk<'a, T: Clone> {
|
||||
pub events: std::borrow::Cow<'a, [T]>,
|
||||
}
|
||||
|
||||
@@ -12,5 +12,4 @@ bytes.workspace = true
|
||||
utils.workspace = true
|
||||
parking_lot.workspace = true
|
||||
hex.workspace = true
|
||||
scopeguard.workspace = true
|
||||
smallvec = { workspace = true, features = ["write"] }
|
||||
|
||||
@@ -104,9 +104,6 @@ pub struct ConfigToml {
|
||||
pub image_compression: ImageCompressionAlgorithm,
|
||||
pub ephemeral_bytes_per_memory_kb: usize,
|
||||
pub l0_flush: Option<crate::models::L0FlushConfig>,
|
||||
#[serde(skip_serializing)]
|
||||
// TODO(https://github.com/neondatabase/neon/issues/8184): remove after this field is removed from all pageserver.toml's
|
||||
pub compact_level0_phase1_value_access: serde::de::IgnoredAny,
|
||||
pub virtual_file_direct_io: crate::models::virtual_file::DirectIoMode,
|
||||
pub io_buffer_alignment: usize,
|
||||
}
|
||||
@@ -173,40 +170,6 @@ impl Default for EvictionOrder {
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(
|
||||
Eq,
|
||||
PartialEq,
|
||||
Debug,
|
||||
Copy,
|
||||
Clone,
|
||||
strum_macros::EnumString,
|
||||
strum_macros::Display,
|
||||
serde_with::DeserializeFromStr,
|
||||
serde_with::SerializeDisplay,
|
||||
)]
|
||||
#[strum(serialize_all = "kebab-case")]
|
||||
pub enum GetVectoredImpl {
|
||||
Sequential,
|
||||
Vectored,
|
||||
}
|
||||
|
||||
#[derive(
|
||||
Eq,
|
||||
PartialEq,
|
||||
Debug,
|
||||
Copy,
|
||||
Clone,
|
||||
strum_macros::EnumString,
|
||||
strum_macros::Display,
|
||||
serde_with::DeserializeFromStr,
|
||||
serde_with::SerializeDisplay,
|
||||
)]
|
||||
#[strum(serialize_all = "kebab-case")]
|
||||
pub enum GetImpl {
|
||||
Legacy,
|
||||
Vectored,
|
||||
}
|
||||
|
||||
#[derive(Copy, Clone, Debug, PartialEq, Eq, serde::Serialize, serde::Deserialize)]
|
||||
#[serde(transparent)]
|
||||
pub struct MaxVectoredReadBytes(pub NonZeroUsize);
|
||||
@@ -338,8 +301,6 @@ pub mod defaults {
|
||||
pub const DEFAULT_IMAGE_COMPRESSION: ImageCompressionAlgorithm =
|
||||
ImageCompressionAlgorithm::Zstd { level: Some(1) };
|
||||
|
||||
pub const DEFAULT_VALIDATE_VECTORED_GET: bool = false;
|
||||
|
||||
pub const DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB: usize = 0;
|
||||
|
||||
pub const DEFAULT_IO_BUFFER_ALIGNMENT: usize = 512;
|
||||
@@ -376,7 +337,10 @@ impl Default for ConfigToml {
|
||||
|
||||
concurrent_tenant_warmup: (NonZeroUsize::new(DEFAULT_CONCURRENT_TENANT_WARMUP)
|
||||
.expect("Invalid default constant")),
|
||||
concurrent_tenant_size_logical_size_queries: NonZeroUsize::new(1).unwrap(),
|
||||
concurrent_tenant_size_logical_size_queries: NonZeroUsize::new(
|
||||
DEFAULT_CONCURRENT_TENANT_SIZE_LOGICAL_SIZE_QUERIES,
|
||||
)
|
||||
.unwrap(),
|
||||
metric_collection_interval: (humantime::parse_duration(
|
||||
DEFAULT_METRIC_COLLECTION_INTERVAL,
|
||||
)
|
||||
@@ -417,7 +381,6 @@ impl Default for ConfigToml {
|
||||
image_compression: (DEFAULT_IMAGE_COMPRESSION),
|
||||
ephemeral_bytes_per_memory_kb: (DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB),
|
||||
l0_flush: None,
|
||||
compact_level0_phase1_value_access: Default::default(),
|
||||
virtual_file_direct_io: crate::models::virtual_file::DirectIoMode::default(),
|
||||
|
||||
io_buffer_alignment: DEFAULT_IO_BUFFER_ALIGNMENT,
|
||||
@@ -467,8 +430,6 @@ pub mod tenant_conf_defaults {
|
||||
// By default ingest enough WAL for two new L0 layers before checking if new image
|
||||
// image layers should be created.
|
||||
pub const DEFAULT_IMAGE_LAYER_CREATION_CHECK_THRESHOLD: u8 = 2;
|
||||
|
||||
pub const DEFAULT_INGEST_BATCH_SIZE: u64 = 100;
|
||||
}
|
||||
|
||||
impl Default for TenantConfigToml {
|
||||
|
||||
@@ -37,14 +37,11 @@ use bytes::{Buf, BufMut, Bytes, BytesMut};
|
||||
/// ```mermaid
|
||||
/// stateDiagram-v2
|
||||
///
|
||||
/// [*] --> Loading: spawn_load()
|
||||
/// [*] --> Attaching: spawn_attach()
|
||||
///
|
||||
/// Loading --> Activating: activate()
|
||||
/// Attaching --> Activating: activate()
|
||||
/// Activating --> Active: infallible
|
||||
///
|
||||
/// Loading --> Broken: load() failure
|
||||
/// Attaching --> Broken: attach() failure
|
||||
///
|
||||
/// Active --> Stopping: set_stopping(), part of shutdown & detach
|
||||
@@ -68,10 +65,6 @@ use bytes::{Buf, BufMut, Bytes, BytesMut};
|
||||
)]
|
||||
#[serde(tag = "slug", content = "data")]
|
||||
pub enum TenantState {
|
||||
/// This tenant is being loaded from local disk.
|
||||
///
|
||||
/// `set_stopping()` and `set_broken()` do not work in this state and wait for it to pass.
|
||||
Loading,
|
||||
/// This tenant is being attached to the pageserver.
|
||||
///
|
||||
/// `set_stopping()` and `set_broken()` do not work in this state and wait for it to pass.
|
||||
@@ -121,8 +114,6 @@ impl TenantState {
|
||||
// But, our attach task might still be fetching the remote timelines, etc.
|
||||
// So, return `Maybe` while Attaching, making Console wait for the attach task to finish.
|
||||
Self::Attaching | Self::Activating(ActivatingFrom::Attaching) => Maybe,
|
||||
// tenant mgr startup distinguishes attaching from loading via marker file.
|
||||
Self::Loading | Self::Activating(ActivatingFrom::Loading) => Attached,
|
||||
// We only reach Active after successful load / attach.
|
||||
// So, call atttachment status Attached.
|
||||
Self::Active => Attached,
|
||||
@@ -191,10 +182,11 @@ impl LsnLease {
|
||||
}
|
||||
|
||||
/// The only [`TenantState`] variants we could be `TenantState::Activating` from.
|
||||
///
|
||||
/// XXX: We used to have more variants here, but now it's just one, which makes this rather
|
||||
/// useless. Remove, once we've checked that there's no client code left that looks at this.
|
||||
#[derive(Clone, Copy, Debug, PartialEq, Eq, serde::Serialize, serde::Deserialize)]
|
||||
pub enum ActivatingFrom {
|
||||
/// Arrived to [`TenantState::Activating`] from [`TenantState::Loading`]
|
||||
Loading,
|
||||
/// Arrived to [`TenantState::Activating`] from [`TenantState::Attaching`]
|
||||
Attaching,
|
||||
}
|
||||
@@ -495,7 +487,7 @@ pub struct CompactionAlgorithmSettings {
|
||||
pub kind: CompactionAlgorithm,
|
||||
}
|
||||
|
||||
#[derive(Debug, PartialEq, Eq, Clone, serde::Deserialize, serde::Serialize)]
|
||||
#[derive(Debug, PartialEq, Eq, Clone, Deserialize, Serialize)]
|
||||
#[serde(tag = "mode", rename_all = "kebab-case", deny_unknown_fields)]
|
||||
pub enum L0FlushConfig {
|
||||
#[serde(rename_all = "snake_case")]
|
||||
@@ -1562,11 +1554,8 @@ mod tests {
|
||||
|
||||
#[test]
|
||||
fn tenantstatus_activating_serde() {
|
||||
let states = [
|
||||
TenantState::Activating(ActivatingFrom::Loading),
|
||||
TenantState::Activating(ActivatingFrom::Attaching),
|
||||
];
|
||||
let expected = "[{\"slug\":\"Activating\",\"data\":\"Loading\"},{\"slug\":\"Activating\",\"data\":\"Attaching\"}]";
|
||||
let states = [TenantState::Activating(ActivatingFrom::Attaching)];
|
||||
let expected = "[{\"slug\":\"Activating\",\"data\":\"Attaching\"}]";
|
||||
|
||||
let actual = serde_json::to_string(&states).unwrap();
|
||||
|
||||
@@ -1581,13 +1570,7 @@ mod tests {
|
||||
fn tenantstatus_activating_strum() {
|
||||
// tests added, because we use these for metrics
|
||||
let examples = [
|
||||
(line!(), TenantState::Loading, "Loading"),
|
||||
(line!(), TenantState::Attaching, "Attaching"),
|
||||
(
|
||||
line!(),
|
||||
TenantState::Activating(ActivatingFrom::Loading),
|
||||
"Activating",
|
||||
),
|
||||
(
|
||||
line!(),
|
||||
TenantState::Activating(ActivatingFrom::Attaching),
|
||||
|
||||
@@ -5,10 +5,8 @@ edition.workspace = true
|
||||
license.workspace = true
|
||||
|
||||
[dependencies]
|
||||
async-trait.workspace = true
|
||||
anyhow.workspace = true
|
||||
bytes.workspace = true
|
||||
futures.workspace = true
|
||||
rustls.workspace = true
|
||||
serde.workspace = true
|
||||
thiserror.workspace = true
|
||||
|
||||
@@ -280,16 +280,6 @@ pub struct PostgresBackend<IO> {
|
||||
|
||||
pub type PostgresBackendTCP = PostgresBackend<tokio::net::TcpStream>;
|
||||
|
||||
pub fn query_from_cstring(query_string: Bytes) -> Vec<u8> {
|
||||
let mut query_string = query_string.to_vec();
|
||||
if let Some(ch) = query_string.last() {
|
||||
if *ch == 0 {
|
||||
query_string.pop();
|
||||
}
|
||||
}
|
||||
query_string
|
||||
}
|
||||
|
||||
/// Cast a byte slice to a string slice, dropping null terminator if there's one.
|
||||
fn cstr_to_str(bytes: &[u8]) -> anyhow::Result<&str> {
|
||||
let without_null = bytes.strip_suffix(&[0]).unwrap_or(bytes);
|
||||
|
||||
@@ -5,13 +5,10 @@ edition.workspace = true
|
||||
license.workspace = true
|
||||
|
||||
[dependencies]
|
||||
rand.workspace = true
|
||||
regex.workspace = true
|
||||
bytes.workspace = true
|
||||
byteorder.workspace = true
|
||||
anyhow.workspace = true
|
||||
crc32c.workspace = true
|
||||
hex.workspace = true
|
||||
once_cell.workspace = true
|
||||
log.workspace = true
|
||||
memoffset.workspace = true
|
||||
|
||||
@@ -9,8 +9,8 @@
|
||||
//! comments on them.
|
||||
//!
|
||||
|
||||
use crate::PageHeaderData;
|
||||
use crate::BLCKSZ;
|
||||
use crate::{PageHeaderData, XLogRecord};
|
||||
|
||||
//
|
||||
// From pg_tablespace_d.h
|
||||
@@ -194,8 +194,6 @@ pub const XLR_RMGR_INFO_MASK: u8 = 0xF0;
|
||||
pub const XLOG_TBLSPC_CREATE: u8 = 0x00;
|
||||
pub const XLOG_TBLSPC_DROP: u8 = 0x10;
|
||||
|
||||
pub const SIZEOF_XLOGRECORD: u32 = size_of::<XLogRecord>() as u32;
|
||||
|
||||
//
|
||||
// from xlogrecord.h
|
||||
//
|
||||
@@ -219,8 +217,6 @@ pub const BKPIMAGE_HAS_HOLE: u8 = 0x01; /* page image has "hole" */
|
||||
/* From transam.h */
|
||||
pub const FIRST_NORMAL_TRANSACTION_ID: u32 = 3;
|
||||
pub const INVALID_TRANSACTION_ID: u32 = 0;
|
||||
pub const FIRST_BOOTSTRAP_OBJECT_ID: u32 = 12000;
|
||||
pub const FIRST_NORMAL_OBJECT_ID: u32 = 16384;
|
||||
|
||||
/* pg_control.h */
|
||||
pub const XLOG_CHECKPOINT_SHUTDOWN: u8 = 0x00;
|
||||
|
||||
@@ -26,11 +26,12 @@ use bytes::{Buf, Bytes};
|
||||
use log::*;
|
||||
|
||||
use serde::Serialize;
|
||||
use std::ffi::OsStr;
|
||||
use std::fs::File;
|
||||
use std::io::prelude::*;
|
||||
use std::io::ErrorKind;
|
||||
use std::io::SeekFrom;
|
||||
use std::path::{Path, PathBuf};
|
||||
use std::path::Path;
|
||||
use std::time::SystemTime;
|
||||
use utils::bin_ser::DeserializeError;
|
||||
use utils::bin_ser::SerializeError;
|
||||
@@ -78,19 +79,34 @@ pub fn XLogFileName(tli: TimeLineID, logSegNo: XLogSegNo, wal_segsz_bytes: usize
|
||||
)
|
||||
}
|
||||
|
||||
pub fn XLogFromFileName(fname: &str, wal_seg_size: usize) -> (XLogSegNo, TimeLineID) {
|
||||
let tli = u32::from_str_radix(&fname[0..8], 16).unwrap();
|
||||
let log = u32::from_str_radix(&fname[8..16], 16).unwrap() as XLogSegNo;
|
||||
let seg = u32::from_str_radix(&fname[16..24], 16).unwrap() as XLogSegNo;
|
||||
(log * XLogSegmentsPerXLogId(wal_seg_size) + seg, tli)
|
||||
pub fn XLogFromFileName(
|
||||
fname: &OsStr,
|
||||
wal_seg_size: usize,
|
||||
) -> anyhow::Result<(XLogSegNo, TimeLineID)> {
|
||||
if let Some(fname_str) = fname.to_str() {
|
||||
let tli = u32::from_str_radix(&fname_str[0..8], 16)?;
|
||||
let log = u32::from_str_radix(&fname_str[8..16], 16)? as XLogSegNo;
|
||||
let seg = u32::from_str_radix(&fname_str[16..24], 16)? as XLogSegNo;
|
||||
Ok((log * XLogSegmentsPerXLogId(wal_seg_size) + seg, tli))
|
||||
} else {
|
||||
anyhow::bail!("non-ut8 filename: {:?}", fname);
|
||||
}
|
||||
}
|
||||
|
||||
pub fn IsXLogFileName(fname: &str) -> bool {
|
||||
return fname.len() == XLOG_FNAME_LEN && fname.chars().all(|c| c.is_ascii_hexdigit());
|
||||
pub fn IsXLogFileName(fname: &OsStr) -> bool {
|
||||
if let Some(fname) = fname.to_str() {
|
||||
fname.len() == XLOG_FNAME_LEN && fname.chars().all(|c| c.is_ascii_hexdigit())
|
||||
} else {
|
||||
false
|
||||
}
|
||||
}
|
||||
|
||||
pub fn IsPartialXLogFileName(fname: &str) -> bool {
|
||||
fname.ends_with(".partial") && IsXLogFileName(&fname[0..fname.len() - 8])
|
||||
pub fn IsPartialXLogFileName(fname: &OsStr) -> bool {
|
||||
if let Some(fname) = fname.to_str() {
|
||||
fname.ends_with(".partial") && IsXLogFileName(OsStr::new(&fname[0..fname.len() - 8]))
|
||||
} else {
|
||||
false
|
||||
}
|
||||
}
|
||||
|
||||
/// If LSN points to the beginning of the page, then shift it to first record,
|
||||
@@ -260,13 +276,6 @@ fn open_wal_segment(seg_file_path: &Path) -> anyhow::Result<Option<File>> {
|
||||
}
|
||||
}
|
||||
|
||||
pub fn main() {
|
||||
let mut data_dir = PathBuf::new();
|
||||
data_dir.push(".");
|
||||
let wal_end = find_end_of_wal(&data_dir, WAL_SEGMENT_SIZE, Lsn(0)).unwrap();
|
||||
println!("wal_end={:?}", wal_end);
|
||||
}
|
||||
|
||||
impl XLogRecord {
|
||||
pub fn from_slice(buf: &[u8]) -> Result<XLogRecord, DeserializeError> {
|
||||
use utils::bin_ser::LeSer;
|
||||
|
||||
@@ -9,7 +9,6 @@ anyhow.workspace = true
|
||||
clap.workspace = true
|
||||
env_logger.workspace = true
|
||||
log.workspace = true
|
||||
once_cell.workspace = true
|
||||
postgres.workspace = true
|
||||
postgres_ffi.workspace = true
|
||||
camino-tempfile.workspace = true
|
||||
|
||||
@@ -7,6 +7,7 @@ use postgres_ffi::{WAL_SEGMENT_SIZE, XLOG_BLCKSZ};
|
||||
use postgres_ffi::{
|
||||
XLOG_SIZE_OF_XLOG_LONG_PHD, XLOG_SIZE_OF_XLOG_RECORD, XLOG_SIZE_OF_XLOG_SHORT_PHD,
|
||||
};
|
||||
use std::ffi::OsStr;
|
||||
use std::path::{Path, PathBuf};
|
||||
use std::process::Command;
|
||||
use std::time::{Duration, Instant};
|
||||
@@ -26,7 +27,6 @@ macro_rules! xlog_utils_test {
|
||||
|
||||
postgres_ffi::for_all_postgres_versions! { xlog_utils_test }
|
||||
|
||||
#[derive(Debug, Clone, PartialEq, Eq)]
|
||||
pub struct Conf {
|
||||
pub pg_version: u32,
|
||||
pub pg_distrib_dir: PathBuf,
|
||||
@@ -136,8 +136,8 @@ impl Conf {
|
||||
|
||||
pub fn pg_waldump(
|
||||
&self,
|
||||
first_segment_name: &str,
|
||||
last_segment_name: &str,
|
||||
first_segment_name: &OsStr,
|
||||
last_segment_name: &OsStr,
|
||||
) -> anyhow::Result<std::process::Output> {
|
||||
let first_segment_file = self.datadir.join(first_segment_name);
|
||||
let last_segment_file = self.datadir.join(last_segment_name);
|
||||
|
||||
@@ -4,6 +4,7 @@ use super::*;
|
||||
use crate::{error, info};
|
||||
use regex::Regex;
|
||||
use std::cmp::min;
|
||||
use std::ffi::OsStr;
|
||||
use std::fs::{self, File};
|
||||
use std::io::Write;
|
||||
use std::{env, str::FromStr};
|
||||
@@ -54,7 +55,7 @@ fn test_end_of_wal<C: crate::Crafter>(test_name: &str) {
|
||||
.wal_dir()
|
||||
.read_dir()
|
||||
.unwrap()
|
||||
.map(|f| f.unwrap().file_name().into_string().unwrap())
|
||||
.map(|f| f.unwrap().file_name())
|
||||
.filter(|fname| IsXLogFileName(fname))
|
||||
.max()
|
||||
.unwrap();
|
||||
@@ -70,11 +71,11 @@ fn test_end_of_wal<C: crate::Crafter>(test_name: &str) {
|
||||
start_lsn
|
||||
);
|
||||
for file in fs::read_dir(cfg.wal_dir()).unwrap().flatten() {
|
||||
let fname = file.file_name().into_string().unwrap();
|
||||
let fname = file.file_name();
|
||||
if !IsXLogFileName(&fname) {
|
||||
continue;
|
||||
}
|
||||
let (segno, _) = XLogFromFileName(&fname, WAL_SEGMENT_SIZE);
|
||||
let (segno, _) = XLogFromFileName(&fname, WAL_SEGMENT_SIZE).unwrap();
|
||||
let seg_start_lsn = XLogSegNoOffsetToRecPtr(segno, 0, WAL_SEGMENT_SIZE);
|
||||
if seg_start_lsn > u64::from(*start_lsn) {
|
||||
continue;
|
||||
@@ -93,10 +94,10 @@ fn test_end_of_wal<C: crate::Crafter>(test_name: &str) {
|
||||
}
|
||||
}
|
||||
|
||||
fn find_pg_waldump_end_of_wal(cfg: &crate::Conf, last_segment: &str) -> Lsn {
|
||||
fn find_pg_waldump_end_of_wal(cfg: &crate::Conf, last_segment: &OsStr) -> Lsn {
|
||||
// Get the actual end of WAL by pg_waldump
|
||||
let waldump_output = cfg
|
||||
.pg_waldump("000000010000000000000001", last_segment)
|
||||
.pg_waldump(OsStr::new("000000010000000000000001"), last_segment)
|
||||
.unwrap()
|
||||
.stderr;
|
||||
let waldump_output = std::str::from_utf8(&waldump_output).unwrap();
|
||||
@@ -117,7 +118,7 @@ fn find_pg_waldump_end_of_wal(cfg: &crate::Conf, last_segment: &str) -> Lsn {
|
||||
|
||||
fn check_end_of_wal(
|
||||
cfg: &crate::Conf,
|
||||
last_segment: &str,
|
||||
last_segment: &OsStr,
|
||||
start_lsn: Lsn,
|
||||
expected_end_of_wal: Lsn,
|
||||
) {
|
||||
@@ -132,7 +133,8 @@ fn check_end_of_wal(
|
||||
// Rename file to partial to actually find last valid lsn, then rename it back.
|
||||
fs::rename(
|
||||
cfg.wal_dir().join(last_segment),
|
||||
cfg.wal_dir().join(format!("{}.partial", last_segment)),
|
||||
cfg.wal_dir()
|
||||
.join(format!("{}.partial", last_segment.to_str().unwrap())),
|
||||
)
|
||||
.unwrap();
|
||||
let wal_end = find_end_of_wal(&cfg.wal_dir(), WAL_SEGMENT_SIZE, start_lsn).unwrap();
|
||||
@@ -142,7 +144,8 @@ fn check_end_of_wal(
|
||||
);
|
||||
assert_eq!(wal_end, expected_end_of_wal);
|
||||
fs::rename(
|
||||
cfg.wal_dir().join(format!("{}.partial", last_segment)),
|
||||
cfg.wal_dir()
|
||||
.join(format!("{}.partial", last_segment.to_str().unwrap())),
|
||||
cfg.wal_dir().join(last_segment),
|
||||
)
|
||||
.unwrap();
|
||||
|
||||
@@ -8,10 +8,8 @@ license.workspace = true
|
||||
bytes.workspace = true
|
||||
byteorder.workspace = true
|
||||
itertools.workspace = true
|
||||
pin-project-lite.workspace = true
|
||||
postgres-protocol.workspace = true
|
||||
rand.workspace = true
|
||||
tokio = { workspace = true, features = ["io-util"] }
|
||||
tracing.workspace = true
|
||||
thiserror.workspace = true
|
||||
serde.workspace = true
|
||||
|
||||
@@ -13,14 +13,11 @@ aws-smithy-async.workspace = true
|
||||
aws-smithy-types.workspace = true
|
||||
aws-config.workspace = true
|
||||
aws-sdk-s3.workspace = true
|
||||
aws-credential-types.workspace = true
|
||||
bytes.workspace = true
|
||||
camino = { workspace = true, features = ["serde1"] }
|
||||
humantime.workspace = true
|
||||
humantime-serde.workspace = true
|
||||
hyper = { workspace = true, features = ["stream"] }
|
||||
futures.workspace = true
|
||||
rand.workspace = true
|
||||
serde.workspace = true
|
||||
serde_json.workspace = true
|
||||
tokio = { workspace = true, features = ["sync", "fs", "io-util"] }
|
||||
|
||||
@@ -127,10 +127,6 @@ impl RemotePath {
|
||||
&self.0
|
||||
}
|
||||
|
||||
pub fn extension(&self) -> Option<&str> {
|
||||
self.0.extension()
|
||||
}
|
||||
|
||||
pub fn strip_prefix(&self, p: &RemotePath) -> Result<&Utf8Path, std::path::StripPrefixError> {
|
||||
self.0.strip_prefix(&p.0)
|
||||
}
|
||||
|
||||
@@ -6,6 +6,5 @@ license.workspace = true
|
||||
|
||||
[dependencies]
|
||||
serde.workspace = true
|
||||
serde_with.workspace = true
|
||||
const_format.workspace = true
|
||||
utils.workspace = true
|
||||
|
||||
@@ -9,8 +9,9 @@ hyper.workspace = true
|
||||
opentelemetry = { workspace = true, features=["rt-tokio"] }
|
||||
opentelemetry-otlp = { workspace = true, default-features=false, features = ["http-proto", "trace", "http", "reqwest-client"] }
|
||||
opentelemetry-semantic-conventions.workspace = true
|
||||
reqwest = { workspace = true, default-features = false, features = ["rustls-tls"] }
|
||||
tokio = { workspace = true, features = ["rt", "rt-multi-thread"] }
|
||||
tracing.workspace = true
|
||||
tracing-opentelemetry.workspace = true
|
||||
tracing-subscriber.workspace = true
|
||||
|
||||
[dev-dependencies]
|
||||
tracing-subscriber.workspace = true # For examples in docs
|
||||
|
||||
@@ -42,7 +42,6 @@ tracing.workspace = true
|
||||
tracing-error.workspace = true
|
||||
tracing-subscriber = { workspace = true, features = ["json", "registry"] }
|
||||
rand.workspace = true
|
||||
serde_with.workspace = true
|
||||
strum.workspace = true
|
||||
strum_macros.workspace = true
|
||||
url.workspace = true
|
||||
|
||||
@@ -1,33 +0,0 @@
|
||||
/// A helper to "accumulate" a value similar to `Iterator::reduce`, but lets you
|
||||
/// feed the accumulated values by calling the 'accum' function, instead of having an
|
||||
/// iterator.
|
||||
///
|
||||
/// For example, to calculate the smallest value among some integers:
|
||||
///
|
||||
/// ```
|
||||
/// use utils::accum::Accum;
|
||||
///
|
||||
/// let values = [1, 2, 3];
|
||||
///
|
||||
/// let mut min_value: Accum<u32> = Accum(None);
|
||||
/// for new_value in &values {
|
||||
/// min_value.accum(std::cmp::min, *new_value);
|
||||
/// }
|
||||
///
|
||||
/// assert_eq!(min_value.0.unwrap(), 1);
|
||||
/// ```
|
||||
pub struct Accum<T>(pub Option<T>);
|
||||
impl<T: Copy> Accum<T> {
|
||||
pub fn accum<F>(&mut self, func: F, new_value: T)
|
||||
where
|
||||
F: FnOnce(T, T) -> T,
|
||||
{
|
||||
// If there is no previous value, just store the new value.
|
||||
// Otherwise call the function to decide which one to keep.
|
||||
self.0 = Some(if let Some(accum) = self.0 {
|
||||
func(accum, new_value)
|
||||
} else {
|
||||
new_value
|
||||
});
|
||||
}
|
||||
}
|
||||
@@ -82,7 +82,7 @@ impl ApiError {
|
||||
StatusCode::INTERNAL_SERVER_ERROR,
|
||||
),
|
||||
ApiError::InternalServerError(err) => HttpErrorBody::response_from_msg_and_status(
|
||||
err.to_string(),
|
||||
format!("{err:#}"), // use alternative formatting so that we give the cause without backtrace
|
||||
StatusCode::INTERNAL_SERVER_ERROR,
|
||||
),
|
||||
}
|
||||
|
||||
@@ -88,12 +88,6 @@ impl<'de> Deserialize<'de> for Id {
|
||||
}
|
||||
|
||||
impl Id {
|
||||
pub fn get_from_buf(buf: &mut impl bytes::Buf) -> Id {
|
||||
let mut arr = [0u8; 16];
|
||||
buf.copy_to_slice(&mut arr);
|
||||
Id::from(arr)
|
||||
}
|
||||
|
||||
pub fn from_slice(src: &[u8]) -> Result<Id, IdError> {
|
||||
if src.len() != 16 {
|
||||
return Err(IdError::SliceParseError(src.len()));
|
||||
@@ -179,10 +173,6 @@ impl fmt::Debug for Id {
|
||||
macro_rules! id_newtype {
|
||||
($t:ident) => {
|
||||
impl $t {
|
||||
pub fn get_from_buf(buf: &mut impl bytes::Buf) -> $t {
|
||||
$t(Id::get_from_buf(buf))
|
||||
}
|
||||
|
||||
pub fn from_slice(src: &[u8]) -> Result<$t, IdError> {
|
||||
Ok($t(Id::from_slice(src)?))
|
||||
}
|
||||
|
||||
@@ -21,7 +21,13 @@
|
||||
//!
|
||||
//! Another explaination can be found here: <https://brandur.org/rate-limiting>
|
||||
|
||||
use std::{sync::Mutex, time::Duration};
|
||||
use std::{
|
||||
sync::{
|
||||
atomic::{AtomicU64, Ordering},
|
||||
Mutex,
|
||||
},
|
||||
time::Duration,
|
||||
};
|
||||
|
||||
use tokio::{sync::Notify, time::Instant};
|
||||
|
||||
@@ -128,6 +134,7 @@ impl LeakyBucketState {
|
||||
|
||||
pub struct RateLimiter {
|
||||
pub config: LeakyBucketConfig,
|
||||
pub sleep_counter: AtomicU64,
|
||||
pub state: Mutex<LeakyBucketState>,
|
||||
/// a queue to provide this fair ordering.
|
||||
pub queue: Notify,
|
||||
@@ -144,6 +151,7 @@ impl Drop for Requeue<'_> {
|
||||
impl RateLimiter {
|
||||
pub fn with_initial_tokens(config: LeakyBucketConfig, initial_tokens: f64) -> Self {
|
||||
RateLimiter {
|
||||
sleep_counter: AtomicU64::new(0),
|
||||
state: Mutex::new(LeakyBucketState::with_initial_tokens(
|
||||
&config,
|
||||
initial_tokens,
|
||||
@@ -163,15 +171,16 @@ impl RateLimiter {
|
||||
|
||||
/// returns true if we did throttle
|
||||
pub async fn acquire(&self, count: usize) -> bool {
|
||||
let mut throttled = false;
|
||||
|
||||
let start = tokio::time::Instant::now();
|
||||
|
||||
let start_count = self.sleep_counter.load(Ordering::Acquire);
|
||||
let mut end_count = start_count;
|
||||
|
||||
// wait until we are the first in the queue
|
||||
let mut notified = std::pin::pin!(self.queue.notified());
|
||||
if !notified.as_mut().enable() {
|
||||
throttled = true;
|
||||
notified.await;
|
||||
end_count = self.sleep_counter.load(Ordering::Acquire);
|
||||
}
|
||||
|
||||
// notify the next waiter in the queue when we are done.
|
||||
@@ -184,9 +193,22 @@ impl RateLimiter {
|
||||
.unwrap()
|
||||
.add_tokens(&self.config, start, count as f64);
|
||||
match res {
|
||||
Ok(()) => return throttled,
|
||||
Ok(()) => return end_count > start_count,
|
||||
Err(ready_at) => {
|
||||
throttled = true;
|
||||
struct Increment<'a>(&'a AtomicU64);
|
||||
|
||||
impl Drop for Increment<'_> {
|
||||
fn drop(&mut self) {
|
||||
self.0.fetch_add(1, Ordering::AcqRel);
|
||||
}
|
||||
}
|
||||
|
||||
// increment the counter after we finish sleeping (or cancel this task).
|
||||
// this ensures that tasks that have already started the acquire will observe
|
||||
// the new sleep count when they are allowed to resume on the notify.
|
||||
let _inc = Increment(&self.sleep_counter);
|
||||
end_count += 1;
|
||||
|
||||
tokio::time::sleep_until(ready_at).await;
|
||||
}
|
||||
}
|
||||
|
||||
@@ -43,16 +43,9 @@ pub mod logging;
|
||||
pub mod lock_file;
|
||||
pub mod pid_file;
|
||||
|
||||
// Misc
|
||||
pub mod accum;
|
||||
pub mod shutdown;
|
||||
|
||||
// Utility for binding TcpListeners with proper socket options.
|
||||
pub mod tcp_listener;
|
||||
|
||||
// Utility for putting a raw file descriptor into non-blocking mode
|
||||
pub mod nonblock;
|
||||
|
||||
// Default signal handling
|
||||
pub mod sentry_init;
|
||||
pub mod signals;
|
||||
|
||||
@@ -1,6 +1,5 @@
|
||||
#![warn(missing_docs)]
|
||||
|
||||
use camino::Utf8Path;
|
||||
use serde::{de::Visitor, Deserialize, Serialize};
|
||||
use std::fmt;
|
||||
use std::ops::{Add, AddAssign};
|
||||
@@ -145,14 +144,6 @@ impl Lsn {
|
||||
i128::from(self.0) - i128::from(other)
|
||||
}
|
||||
|
||||
/// Parse an LSN from a filename in the form `0000000000000000`
|
||||
pub fn from_filename<F>(filename: F) -> Result<Self, LsnParseError>
|
||||
where
|
||||
F: AsRef<Utf8Path>,
|
||||
{
|
||||
Lsn::from_hex(filename.as_ref().as_str())
|
||||
}
|
||||
|
||||
/// Parse an LSN from a string in the form `0000000000000000`
|
||||
pub fn from_hex<S>(s: S) -> Result<Self, LsnParseError>
|
||||
where
|
||||
|
||||
@@ -1,17 +0,0 @@
|
||||
use nix::fcntl::{fcntl, OFlag, F_GETFL, F_SETFL};
|
||||
use std::os::unix::io::RawFd;
|
||||
|
||||
/// Put a file descriptor into non-blocking mode
|
||||
pub fn set_nonblock(fd: RawFd) -> Result<(), std::io::Error> {
|
||||
let bits = fcntl(fd, F_GETFL)?;
|
||||
|
||||
// If F_GETFL returns some unknown bits, they should be valid
|
||||
// for passing back to F_SETFL, too. If we left them out, the F_SETFL
|
||||
// would effectively clear them, which is not what we want.
|
||||
let mut flags = OFlag::from_bits_retain(bits);
|
||||
flags |= OFlag::O_NONBLOCK;
|
||||
|
||||
fcntl(fd, F_SETFL(flags))?;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
@@ -1,7 +0,0 @@
|
||||
/// Immediately terminate the calling process without calling
|
||||
/// atexit callbacks, C runtime destructors etc. We mainly use
|
||||
/// this to protect coverage data from concurrent writes.
|
||||
pub fn exit_now(code: u8) -> ! {
|
||||
// SAFETY: exiting is safe, the ffi is not safe
|
||||
unsafe { nix::libc::_exit(code as _) };
|
||||
}
|
||||
@@ -120,32 +120,6 @@ impl<K: Ord, V> VecMap<K, V> {
|
||||
Ok((None, delta_size))
|
||||
}
|
||||
|
||||
/// Split the map into two.
|
||||
///
|
||||
/// The left map contains everything before `cutoff` (exclusive).
|
||||
/// Right map contains `cutoff` and everything after (inclusive).
|
||||
pub fn split_at(&self, cutoff: &K) -> (Self, Self)
|
||||
where
|
||||
K: Clone,
|
||||
V: Clone,
|
||||
{
|
||||
let split_idx = self
|
||||
.data
|
||||
.binary_search_by_key(&cutoff, extract_key)
|
||||
.unwrap_or_else(std::convert::identity);
|
||||
|
||||
(
|
||||
VecMap {
|
||||
data: self.data[..split_idx].to_vec(),
|
||||
ordering: self.ordering,
|
||||
},
|
||||
VecMap {
|
||||
data: self.data[split_idx..].to_vec(),
|
||||
ordering: self.ordering,
|
||||
},
|
||||
)
|
||||
}
|
||||
|
||||
/// Move items from `other` to the end of `self`, leaving `other` empty.
|
||||
/// If the `other` ordering is different from `self` ordering
|
||||
/// `ExtendOrderingError` error will be returned.
|
||||
|
||||
@@ -15,13 +15,11 @@ anyhow.workspace = true
|
||||
axum.workspace = true
|
||||
clap.workspace = true
|
||||
futures.workspace = true
|
||||
inotify.workspace = true
|
||||
serde.workspace = true
|
||||
serde_json.workspace = true
|
||||
sysinfo.workspace = true
|
||||
tokio = { workspace = true, features = ["rt-multi-thread"] }
|
||||
tokio-postgres.workspace = true
|
||||
tokio-stream.workspace = true
|
||||
tokio-util.workspace = true
|
||||
tracing.workspace = true
|
||||
tracing-subscriber.workspace = true
|
||||
|
||||
@@ -15,7 +15,6 @@ anyhow.workspace = true
|
||||
arc-swap.workspace = true
|
||||
async-compression.workspace = true
|
||||
async-stream.workspace = true
|
||||
async-trait.workspace = true
|
||||
bit_field.workspace = true
|
||||
byteorder.workspace = true
|
||||
bytes.workspace = true
|
||||
@@ -23,12 +22,9 @@ camino.workspace = true
|
||||
camino-tempfile.workspace = true
|
||||
chrono = { workspace = true, features = ["serde"] }
|
||||
clap = { workspace = true, features = ["string"] }
|
||||
const_format.workspace = true
|
||||
consumption_metrics.workspace = true
|
||||
crc32c.workspace = true
|
||||
crossbeam-utils.workspace = true
|
||||
either.workspace = true
|
||||
flate2.workspace = true
|
||||
fail.workspace = true
|
||||
futures.workspace = true
|
||||
git-version.workspace = true
|
||||
@@ -57,10 +53,6 @@ serde.workspace = true
|
||||
serde_json = { workspace = true, features = ["raw_value"] }
|
||||
serde_path_to_error.workspace = true
|
||||
serde_with.workspace = true
|
||||
signal-hook.workspace = true
|
||||
smallvec = { workspace = true, features = ["write"] }
|
||||
svg_fmt.workspace = true
|
||||
sync_wrapper.workspace = true
|
||||
sysinfo.workspace = true
|
||||
tokio-tar.workspace = true
|
||||
thiserror.workspace = true
|
||||
@@ -73,7 +65,6 @@ tokio-stream.workspace = true
|
||||
tokio-util.workspace = true
|
||||
toml_edit = { workspace = true, features = [ "serde" ] }
|
||||
tracing.workspace = true
|
||||
twox-hash.workspace = true
|
||||
url.workspace = true
|
||||
walkdir.workspace = true
|
||||
metrics.workspace = true
|
||||
|
||||
@@ -1,7 +1,7 @@
|
||||
//! Quantify a single walredo manager's throughput under N concurrent callers.
|
||||
//!
|
||||
//! The benchmark implementation ([`bench_impl`]) is parametrized by
|
||||
//! - `redo_work` => [`Request::short_request`] or [`Request::medium_request`]
|
||||
//! - `redo_work` => an async closure that takes a `PostgresRedoManager` and performs one redo
|
||||
//! - `n_redos` => number of times the benchmark shell execute the `redo_work`
|
||||
//! - `nclients` => number of clients (more on this shortly).
|
||||
//!
|
||||
@@ -10,7 +10,7 @@
|
||||
//! Each task executes the `redo_work` `n_redos/nclients` times.
|
||||
//!
|
||||
//! We exercise the following combinations:
|
||||
//! - `redo_work = short / medium``
|
||||
//! - `redo_work = ping / short / medium``
|
||||
//! - `nclients = [1, 2, 4, 8, 16, 32, 64, 128]`
|
||||
//!
|
||||
//! We let `criterion` determine the `n_redos` using `iter_custom`.
|
||||
@@ -27,33 +27,43 @@
|
||||
//!
|
||||
//! # Reference Numbers
|
||||
//!
|
||||
//! 2024-04-15 on i3en.3xlarge
|
||||
//! 2024-09-18 on im4gn.2xlarge
|
||||
//!
|
||||
//! ```text
|
||||
//! short/1 time: [24.584 µs 24.737 µs 24.922 µs]
|
||||
//! short/2 time: [33.479 µs 33.660 µs 33.888 µs]
|
||||
//! short/4 time: [42.713 µs 43.046 µs 43.440 µs]
|
||||
//! short/8 time: [71.814 µs 72.478 µs 73.240 µs]
|
||||
//! short/16 time: [132.73 µs 134.45 µs 136.22 µs]
|
||||
//! short/32 time: [258.31 µs 260.73 µs 263.27 µs]
|
||||
//! short/64 time: [511.61 µs 514.44 µs 517.51 µs]
|
||||
//! short/128 time: [992.64 µs 998.23 µs 1.0042 ms]
|
||||
//! medium/1 time: [110.11 µs 110.50 µs 110.96 µs]
|
||||
//! medium/2 time: [153.06 µs 153.85 µs 154.99 µs]
|
||||
//! medium/4 time: [317.51 µs 319.92 µs 322.85 µs]
|
||||
//! medium/8 time: [638.30 µs 644.68 µs 652.12 µs]
|
||||
//! medium/16 time: [1.2651 ms 1.2773 ms 1.2914 ms]
|
||||
//! medium/32 time: [2.5117 ms 2.5410 ms 2.5720 ms]
|
||||
//! medium/64 time: [4.8088 ms 4.8555 ms 4.9047 ms]
|
||||
//! medium/128 time: [8.8311 ms 8.9849 ms 9.1263 ms]
|
||||
//! ping/1 time: [21.789 µs 21.918 µs 22.078 µs]
|
||||
//! ping/2 time: [27.686 µs 27.812 µs 27.970 µs]
|
||||
//! ping/4 time: [35.468 µs 35.671 µs 35.926 µs]
|
||||
//! ping/8 time: [59.682 µs 59.987 µs 60.363 µs]
|
||||
//! ping/16 time: [101.79 µs 102.37 µs 103.08 µs]
|
||||
//! ping/32 time: [184.18 µs 185.15 µs 186.36 µs]
|
||||
//! ping/64 time: [349.86 µs 351.45 µs 353.47 µs]
|
||||
//! ping/128 time: [684.53 µs 687.98 µs 692.17 µs]
|
||||
//! short/1 time: [31.833 µs 32.126 µs 32.428 µs]
|
||||
//! short/2 time: [35.558 µs 35.756 µs 35.992 µs]
|
||||
//! short/4 time: [44.850 µs 45.138 µs 45.484 µs]
|
||||
//! short/8 time: [65.985 µs 66.379 µs 66.853 µs]
|
||||
//! short/16 time: [127.06 µs 127.90 µs 128.87 µs]
|
||||
//! short/32 time: [252.98 µs 254.70 µs 256.73 µs]
|
||||
//! short/64 time: [497.13 µs 499.86 µs 503.26 µs]
|
||||
//! short/128 time: [987.46 µs 993.45 µs 1.0004 ms]
|
||||
//! medium/1 time: [137.91 µs 138.55 µs 139.35 µs]
|
||||
//! medium/2 time: [192.00 µs 192.91 µs 194.07 µs]
|
||||
//! medium/4 time: [389.62 µs 391.55 µs 394.01 µs]
|
||||
//! medium/8 time: [776.80 µs 780.33 µs 784.77 µs]
|
||||
//! medium/16 time: [1.5323 ms 1.5383 ms 1.5459 ms]
|
||||
//! medium/32 time: [3.0120 ms 3.0226 ms 3.0350 ms]
|
||||
//! medium/64 time: [5.7405 ms 5.7787 ms 5.8166 ms]
|
||||
//! medium/128 time: [10.412 ms 10.574 ms 10.718 ms]
|
||||
//! ```
|
||||
|
||||
use anyhow::Context;
|
||||
use bytes::{Buf, Bytes};
|
||||
use criterion::{BenchmarkId, Criterion};
|
||||
use once_cell::sync::Lazy;
|
||||
use pageserver::{config::PageServerConf, walrecord::NeonWalRecord, walredo::PostgresRedoManager};
|
||||
use pageserver_api::{key::Key, shard::TenantShardId};
|
||||
use std::{
|
||||
future::Future,
|
||||
sync::Arc,
|
||||
time::{Duration, Instant},
|
||||
};
|
||||
@@ -61,40 +71,59 @@ use tokio::{sync::Barrier, task::JoinSet};
|
||||
use utils::{id::TenantId, lsn::Lsn};
|
||||
|
||||
fn bench(c: &mut Criterion) {
|
||||
{
|
||||
let nclients = [1, 2, 4, 8, 16, 32, 64, 128];
|
||||
for nclients in nclients {
|
||||
let mut group = c.benchmark_group("short");
|
||||
group.bench_with_input(
|
||||
BenchmarkId::from_parameter(nclients),
|
||||
&nclients,
|
||||
|b, nclients| {
|
||||
let redo_work = Arc::new(Request::short_input());
|
||||
b.iter_custom(|iters| bench_impl(Arc::clone(&redo_work), iters, *nclients));
|
||||
},
|
||||
);
|
||||
}
|
||||
}
|
||||
{
|
||||
let nclients = [1, 2, 4, 8, 16, 32, 64, 128];
|
||||
for nclients in nclients {
|
||||
let mut group = c.benchmark_group("medium");
|
||||
group.bench_with_input(
|
||||
BenchmarkId::from_parameter(nclients),
|
||||
&nclients,
|
||||
|b, nclients| {
|
||||
let redo_work = Arc::new(Request::medium_input());
|
||||
b.iter_custom(|iters| bench_impl(Arc::clone(&redo_work), iters, *nclients));
|
||||
},
|
||||
);
|
||||
}
|
||||
macro_rules! bench_group {
|
||||
($name:expr, $redo_work:expr) => {{
|
||||
let name: &str = $name;
|
||||
let nclients = [1, 2, 4, 8, 16, 32, 64, 128];
|
||||
for nclients in nclients {
|
||||
let mut group = c.benchmark_group(name);
|
||||
group.bench_with_input(
|
||||
BenchmarkId::from_parameter(nclients),
|
||||
&nclients,
|
||||
|b, nclients| {
|
||||
b.iter_custom(|iters| bench_impl($redo_work, iters, *nclients));
|
||||
},
|
||||
);
|
||||
}
|
||||
}};
|
||||
}
|
||||
//
|
||||
// benchmark the protocol implementation
|
||||
//
|
||||
let pg_version = 14;
|
||||
bench_group!(
|
||||
"ping",
|
||||
Arc::new(move |mgr: Arc<PostgresRedoManager>| async move {
|
||||
let _: () = mgr.ping(pg_version).await.unwrap();
|
||||
})
|
||||
);
|
||||
//
|
||||
// benchmarks with actual record redo
|
||||
//
|
||||
let make_redo_work = |req: &'static Request| {
|
||||
Arc::new(move |mgr: Arc<PostgresRedoManager>| async move {
|
||||
let page = req.execute(&mgr).await.unwrap();
|
||||
assert_eq!(page.remaining(), 8192);
|
||||
})
|
||||
};
|
||||
bench_group!("short", {
|
||||
static REQUEST: Lazy<Request> = Lazy::new(Request::short_input);
|
||||
make_redo_work(&REQUEST)
|
||||
});
|
||||
bench_group!("medium", {
|
||||
static REQUEST: Lazy<Request> = Lazy::new(Request::medium_input);
|
||||
make_redo_work(&REQUEST)
|
||||
});
|
||||
}
|
||||
criterion::criterion_group!(benches, bench);
|
||||
criterion::criterion_main!(benches);
|
||||
|
||||
// Returns the sum of each client's wall-clock time spent executing their share of the n_redos.
|
||||
fn bench_impl(redo_work: Arc<Request>, n_redos: u64, nclients: u64) -> Duration {
|
||||
fn bench_impl<F, Fut>(redo_work: Arc<F>, n_redos: u64, nclients: u64) -> Duration
|
||||
where
|
||||
F: Fn(Arc<PostgresRedoManager>) -> Fut + Send + Sync + 'static,
|
||||
Fut: Future<Output = ()> + Send + 'static,
|
||||
{
|
||||
let repo_dir = camino_tempfile::tempdir_in(env!("CARGO_TARGET_TMPDIR")).unwrap();
|
||||
|
||||
let conf = PageServerConf::dummy_conf(repo_dir.path().to_path_buf());
|
||||
@@ -135,17 +164,20 @@ fn bench_impl(redo_work: Arc<Request>, n_redos: u64, nclients: u64) -> Duration
|
||||
})
|
||||
}
|
||||
|
||||
async fn client(
|
||||
async fn client<F, Fut>(
|
||||
mgr: Arc<PostgresRedoManager>,
|
||||
start: Arc<Barrier>,
|
||||
redo_work: Arc<Request>,
|
||||
redo_work: Arc<F>,
|
||||
n_redos: u64,
|
||||
) -> Duration {
|
||||
) -> Duration
|
||||
where
|
||||
F: Fn(Arc<PostgresRedoManager>) -> Fut + Send + Sync + 'static,
|
||||
Fut: Future<Output = ()> + Send + 'static,
|
||||
{
|
||||
start.wait().await;
|
||||
let start = Instant::now();
|
||||
for _ in 0..n_redos {
|
||||
let page = redo_work.execute(&mgr).await.unwrap();
|
||||
assert_eq!(page.remaining(), 8192);
|
||||
redo_work(Arc::clone(&mgr)).await;
|
||||
// The real pageserver will rarely if ever do 2 walredos in a row without
|
||||
// yielding to the executor.
|
||||
tokio::task::yield_now().await;
|
||||
|
||||
@@ -432,7 +432,7 @@ impl Client {
|
||||
self.mgmt_api_endpoint
|
||||
);
|
||||
|
||||
self.request(Method::POST, &uri, req)
|
||||
self.request(Method::PUT, &uri, req)
|
||||
.await?
|
||||
.json()
|
||||
.await
|
||||
|
||||
@@ -9,41 +9,19 @@ default = []
|
||||
|
||||
[dependencies]
|
||||
anyhow.workspace = true
|
||||
async-compression.workspace = true
|
||||
async-stream.workspace = true
|
||||
byteorder.workspace = true
|
||||
bytes.workspace = true
|
||||
chrono = { workspace = true, features = ["serde"] }
|
||||
clap = { workspace = true, features = ["string"] }
|
||||
const_format.workspace = true
|
||||
consumption_metrics.workspace = true
|
||||
crossbeam-utils.workspace = true
|
||||
either.workspace = true
|
||||
flate2.workspace = true
|
||||
fail.workspace = true
|
||||
futures.workspace = true
|
||||
git-version.workspace = true
|
||||
hex.workspace = true
|
||||
humantime.workspace = true
|
||||
humantime-serde.workspace = true
|
||||
itertools.workspace = true
|
||||
once_cell.workspace = true
|
||||
pageserver_api.workspace = true
|
||||
pin-project-lite.workspace = true
|
||||
rand.workspace = true
|
||||
smallvec = { workspace = true, features = ["write"] }
|
||||
svg_fmt.workspace = true
|
||||
sync_wrapper.workspace = true
|
||||
thiserror.workspace = true
|
||||
tokio = { workspace = true, features = ["process", "sync", "fs", "rt", "io-util", "time"] }
|
||||
tokio-io-timeout.workspace = true
|
||||
tokio-util.workspace = true
|
||||
tracing.workspace = true
|
||||
tracing-error.workspace = true
|
||||
tracing-subscriber.workspace = true
|
||||
url.workspace = true
|
||||
walkdir.workspace = true
|
||||
metrics.workspace = true
|
||||
utils.workspace = true
|
||||
workspace_hack.workspace = true
|
||||
|
||||
|
||||
@@ -8,7 +8,6 @@ license.workspace = true
|
||||
|
||||
[dependencies]
|
||||
anyhow.workspace = true
|
||||
bytes.workspace = true
|
||||
camino.workspace = true
|
||||
clap = { workspace = true, features = ["string"] }
|
||||
git-version.workspace = true
|
||||
@@ -24,5 +23,4 @@ toml_edit.workspace = true
|
||||
utils.workspace = true
|
||||
svg_fmt.workspace = true
|
||||
workspace_hack.workspace = true
|
||||
serde.workspace = true
|
||||
serde_json.workspace = true
|
||||
|
||||
@@ -13,7 +13,6 @@ use pageserver_api::{
|
||||
use remote_storage::{RemotePath, RemoteStorageConfig};
|
||||
use std::env;
|
||||
use storage_broker::Uri;
|
||||
use utils::crashsafe::path_with_suffix_extension;
|
||||
use utils::logging::SecretString;
|
||||
|
||||
use once_cell::sync::OnceCell;
|
||||
@@ -33,7 +32,7 @@ use crate::tenant::storage_layer::inmemory_layer::IndexEntry;
|
||||
use crate::tenant::{TENANTS_SEGMENT_NAME, TIMELINES_SEGMENT_NAME};
|
||||
use crate::virtual_file;
|
||||
use crate::virtual_file::io_engine;
|
||||
use crate::{TENANT_HEATMAP_BASENAME, TENANT_LOCATION_CONFIG_NAME, TIMELINE_DELETE_MARK_SUFFIX};
|
||||
use crate::{TENANT_HEATMAP_BASENAME, TENANT_LOCATION_CONFIG_NAME};
|
||||
|
||||
/// Global state of pageserver.
|
||||
///
|
||||
@@ -257,17 +256,6 @@ impl PageServerConf {
|
||||
.join(timeline_id.to_string())
|
||||
}
|
||||
|
||||
pub(crate) fn timeline_delete_mark_file_path(
|
||||
&self,
|
||||
tenant_shard_id: TenantShardId,
|
||||
timeline_id: TimelineId,
|
||||
) -> Utf8PathBuf {
|
||||
path_with_suffix_extension(
|
||||
self.timeline_path(&tenant_shard_id, &timeline_id),
|
||||
TIMELINE_DELETE_MARK_SUFFIX,
|
||||
)
|
||||
}
|
||||
|
||||
/// Turns storage remote path of a file into its local path.
|
||||
pub fn local_path(&self, remote_path: &RemotePath) -> Utf8PathBuf {
|
||||
remote_path.with_base(&self.workdir)
|
||||
@@ -336,7 +324,6 @@ impl PageServerConf {
|
||||
max_vectored_read_bytes,
|
||||
image_compression,
|
||||
ephemeral_bytes_per_memory_kb,
|
||||
compact_level0_phase1_value_access: _,
|
||||
l0_flush,
|
||||
virtual_file_direct_io,
|
||||
concurrent_tenant_warmup,
|
||||
@@ -491,11 +478,6 @@ pub struct ConfigurableSemaphore {
|
||||
}
|
||||
|
||||
impl ConfigurableSemaphore {
|
||||
pub const DEFAULT_INITIAL: NonZeroUsize = match NonZeroUsize::new(1) {
|
||||
Some(x) => x,
|
||||
None => panic!("const unwrap is not yet stable"),
|
||||
};
|
||||
|
||||
/// Initializse using a non-zero amount of permits.
|
||||
///
|
||||
/// Require a non-zero initial permits, because using permits == 0 is a crude way to disable a
|
||||
@@ -516,12 +498,6 @@ impl ConfigurableSemaphore {
|
||||
}
|
||||
}
|
||||
|
||||
impl Default for ConfigurableSemaphore {
|
||||
fn default() -> Self {
|
||||
Self::new(Self::DEFAULT_INITIAL)
|
||||
}
|
||||
}
|
||||
|
||||
impl PartialEq for ConfigurableSemaphore {
|
||||
fn eq(&self, other: &Self) -> bool {
|
||||
// the number of permits can be increased at runtime, so we cannot really fulfill the
|
||||
@@ -558,16 +534,6 @@ mod tests {
|
||||
.expect("parse_and_validate");
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_compactl0_phase1_access_mode_is_ignored_silently() {
|
||||
let input = indoc::indoc! {r#"
|
||||
[compact_level0_phase1_value_access]
|
||||
mode = "streaming-kmerge"
|
||||
validate = "key-lsn-value"
|
||||
"#};
|
||||
toml_edit::de::from_str::<pageserver_api::config::ConfigToml>(input).unwrap();
|
||||
}
|
||||
|
||||
/// If there's a typo in the pageserver config, we'd rather catch that typo
|
||||
/// and fail pageserver startup than silently ignoring the typo, leaving whoever
|
||||
/// made it in the believe that their config change is effective.
|
||||
|
||||
@@ -178,7 +178,7 @@ async fn collect_metrics(
|
||||
)
|
||||
.await;
|
||||
if let Err(e) = res {
|
||||
tracing::error!("failed to upload to S3: {e:#}");
|
||||
tracing::error!("failed to upload to remote storage: {e:#}");
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
@@ -2955,7 +2955,7 @@ pub fn make_router(
|
||||
"/v1/tenant/:tenant_shard_id/timeline/:timeline_id/preserve_initdb_archive",
|
||||
|r| api_handler(r, timeline_preserve_initdb_handler),
|
||||
)
|
||||
.post(
|
||||
.put(
|
||||
"/v1/tenant/:tenant_shard_id/timeline/:timeline_id/archival_config",
|
||||
|r| api_handler(r, timeline_archival_config_handler),
|
||||
)
|
||||
|
||||
@@ -1177,10 +1177,10 @@ pub(crate) mod virtual_file_io_engine {
|
||||
}
|
||||
|
||||
struct GlobalAndPerTimelineHistogramTimer<'a, 'c> {
|
||||
global_metric: &'a Histogram,
|
||||
global_latency_histo: &'a Histogram,
|
||||
|
||||
// Optional because not all op types are tracked per-timeline
|
||||
timeline_metric: Option<&'a Histogram>,
|
||||
per_timeline_latency_histo: Option<&'a Histogram>,
|
||||
|
||||
ctx: &'c RequestContext,
|
||||
start: std::time::Instant,
|
||||
@@ -1212,9 +1212,10 @@ impl<'a, 'c> Drop for GlobalAndPerTimelineHistogramTimer<'a, 'c> {
|
||||
elapsed
|
||||
}
|
||||
};
|
||||
self.global_metric.observe(ex_throttled.as_secs_f64());
|
||||
if let Some(timeline_metric) = self.timeline_metric {
|
||||
timeline_metric.observe(ex_throttled.as_secs_f64());
|
||||
self.global_latency_histo
|
||||
.observe(ex_throttled.as_secs_f64());
|
||||
if let Some(per_timeline_getpage_histo) = self.per_timeline_latency_histo {
|
||||
per_timeline_getpage_histo.observe(ex_throttled.as_secs_f64());
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -1240,10 +1241,32 @@ pub enum SmgrQueryType {
|
||||
|
||||
#[derive(Debug)]
|
||||
pub(crate) struct SmgrQueryTimePerTimeline {
|
||||
global_metrics: [Histogram; SmgrQueryType::COUNT],
|
||||
per_timeline_getpage: Histogram,
|
||||
global_started: [IntCounter; SmgrQueryType::COUNT],
|
||||
global_latency: [Histogram; SmgrQueryType::COUNT],
|
||||
per_timeline_getpage_started: IntCounter,
|
||||
per_timeline_getpage_latency: Histogram,
|
||||
}
|
||||
|
||||
static SMGR_QUERY_STARTED_GLOBAL: Lazy<IntCounterVec> = Lazy::new(|| {
|
||||
register_int_counter_vec!(
|
||||
// it's a counter, but, name is prepared to extend it to a histogram of queue depth
|
||||
"pageserver_smgr_query_started_global_count",
|
||||
"Number of smgr queries started, aggregated by query type.",
|
||||
&["smgr_query_type"],
|
||||
)
|
||||
.expect("failed to define a metric")
|
||||
});
|
||||
|
||||
static SMGR_QUERY_STARTED_PER_TENANT_TIMELINE: Lazy<IntCounterVec> = Lazy::new(|| {
|
||||
register_int_counter_vec!(
|
||||
// it's a counter, but, name is prepared to extend it to a histogram of queue depth
|
||||
"pageserver_smgr_query_started_count",
|
||||
"Number of smgr queries started, aggregated by query type and tenant/timeline.",
|
||||
&["smgr_query_type", "tenant_id", "shard_id", "timeline_id"],
|
||||
)
|
||||
.expect("failed to define a metric")
|
||||
});
|
||||
|
||||
static SMGR_QUERY_TIME_PER_TENANT_TIMELINE: Lazy<HistogramVec> = Lazy::new(|| {
|
||||
register_histogram_vec!(
|
||||
"pageserver_smgr_query_seconds",
|
||||
@@ -1319,14 +1342,20 @@ impl SmgrQueryTimePerTimeline {
|
||||
let tenant_id = tenant_shard_id.tenant_id.to_string();
|
||||
let shard_slug = format!("{}", tenant_shard_id.shard_slug());
|
||||
let timeline_id = timeline_id.to_string();
|
||||
let global_metrics = std::array::from_fn(|i| {
|
||||
let global_started = std::array::from_fn(|i| {
|
||||
let op = SmgrQueryType::from_repr(i).unwrap();
|
||||
SMGR_QUERY_STARTED_GLOBAL
|
||||
.get_metric_with_label_values(&[op.into()])
|
||||
.unwrap()
|
||||
});
|
||||
let global_latency = std::array::from_fn(|i| {
|
||||
let op = SmgrQueryType::from_repr(i).unwrap();
|
||||
SMGR_QUERY_TIME_GLOBAL
|
||||
.get_metric_with_label_values(&[op.into()])
|
||||
.unwrap()
|
||||
});
|
||||
|
||||
let per_timeline_getpage = SMGR_QUERY_TIME_PER_TENANT_TIMELINE
|
||||
let per_timeline_getpage_started = SMGR_QUERY_STARTED_PER_TENANT_TIMELINE
|
||||
.get_metric_with_label_values(&[
|
||||
SmgrQueryType::GetPageAtLsn.into(),
|
||||
&tenant_id,
|
||||
@@ -1334,18 +1363,32 @@ impl SmgrQueryTimePerTimeline {
|
||||
&timeline_id,
|
||||
])
|
||||
.unwrap();
|
||||
let per_timeline_getpage_latency = SMGR_QUERY_TIME_PER_TENANT_TIMELINE
|
||||
.get_metric_with_label_values(&[
|
||||
SmgrQueryType::GetPageAtLsn.into(),
|
||||
&tenant_id,
|
||||
&shard_slug,
|
||||
&timeline_id,
|
||||
])
|
||||
.unwrap();
|
||||
|
||||
Self {
|
||||
global_metrics,
|
||||
per_timeline_getpage,
|
||||
global_started,
|
||||
global_latency,
|
||||
per_timeline_getpage_latency,
|
||||
per_timeline_getpage_started,
|
||||
}
|
||||
}
|
||||
pub(crate) fn start_timer<'c: 'a, 'a>(
|
||||
&'a self,
|
||||
op: SmgrQueryType,
|
||||
ctx: &'c RequestContext,
|
||||
) -> Option<impl Drop + '_> {
|
||||
let global_metric = &self.global_metrics[op as usize];
|
||||
) -> Option<impl Drop + 'a> {
|
||||
let start = Instant::now();
|
||||
|
||||
self.global_started[op as usize].inc();
|
||||
|
||||
// We subtract time spent throttled from the observed latency.
|
||||
match ctx.micros_spent_throttled.open() {
|
||||
Ok(()) => (),
|
||||
Err(error) => {
|
||||
@@ -1364,15 +1407,16 @@ impl SmgrQueryTimePerTimeline {
|
||||
}
|
||||
}
|
||||
|
||||
let timeline_metric = if matches!(op, SmgrQueryType::GetPageAtLsn) {
|
||||
Some(&self.per_timeline_getpage)
|
||||
let per_timeline_latency_histo = if matches!(op, SmgrQueryType::GetPageAtLsn) {
|
||||
self.per_timeline_getpage_started.inc();
|
||||
Some(&self.per_timeline_getpage_latency)
|
||||
} else {
|
||||
None
|
||||
};
|
||||
|
||||
Some(GlobalAndPerTimelineHistogramTimer {
|
||||
global_metric,
|
||||
timeline_metric,
|
||||
global_latency_histo: &self.global_latency[op as usize],
|
||||
per_timeline_latency_histo,
|
||||
ctx,
|
||||
start,
|
||||
op,
|
||||
@@ -1423,9 +1467,12 @@ mod smgr_query_time_tests {
|
||||
let get_counts = || {
|
||||
let global: u64 = ops
|
||||
.iter()
|
||||
.map(|op| metrics.global_metrics[*op as usize].get_sample_count())
|
||||
.map(|op| metrics.global_latency[*op as usize].get_sample_count())
|
||||
.sum();
|
||||
(global, metrics.per_timeline_getpage.get_sample_count())
|
||||
(
|
||||
global,
|
||||
metrics.per_timeline_getpage_latency.get_sample_count(),
|
||||
)
|
||||
};
|
||||
|
||||
let (pre_global, pre_per_tenant_timeline) = get_counts();
|
||||
@@ -1487,7 +1534,7 @@ impl BasebackupQueryTime {
|
||||
pub(crate) fn start_recording<'c: 'a, 'a>(
|
||||
&'a self,
|
||||
ctx: &'c RequestContext,
|
||||
) -> BasebackupQueryTimeOngoingRecording<'_, '_> {
|
||||
) -> BasebackupQueryTimeOngoingRecording<'a, 'a> {
|
||||
let start = Instant::now();
|
||||
match ctx.micros_spent_throttled.open() {
|
||||
Ok(()) => (),
|
||||
@@ -1777,7 +1824,7 @@ pub(crate) static SECONDARY_MODE: Lazy<SecondaryModeMetrics> = Lazy::new(|| {
|
||||
.expect("failed to define a metric"),
|
||||
upload_heatmap_duration: register_histogram!(
|
||||
"pageserver_secondary_upload_heatmap_duration",
|
||||
"Time to build and upload a heatmap, including any waiting inside the S3 client"
|
||||
"Time to build and upload a heatmap, including any waiting inside the remote storage client"
|
||||
)
|
||||
.expect("failed to define a metric"),
|
||||
download_heatmap: register_int_counter!(
|
||||
@@ -2576,6 +2623,12 @@ impl TimelineMetrics {
|
||||
let _ = STORAGE_IO_SIZE.remove_label_values(&[op, tenant_id, shard_id, timeline_id]);
|
||||
}
|
||||
|
||||
let _ = SMGR_QUERY_STARTED_PER_TENANT_TIMELINE.remove_label_values(&[
|
||||
SmgrQueryType::GetPageAtLsn.into(),
|
||||
tenant_id,
|
||||
shard_id,
|
||||
timeline_id,
|
||||
]);
|
||||
let _ = SMGR_QUERY_TIME_PER_TENANT_TIMELINE.remove_label_values(&[
|
||||
SmgrQueryType::GetPageAtLsn.into(),
|
||||
tenant_id,
|
||||
@@ -2592,6 +2645,8 @@ pub(crate) fn remove_tenant_metrics(tenant_shard_id: &TenantShardId) {
|
||||
let _ = TENANT_SYNTHETIC_SIZE_METRIC.remove_label_values(&[&tid]);
|
||||
}
|
||||
|
||||
tenant_throttling::remove_tenant_metrics(tenant_shard_id);
|
||||
|
||||
// we leave the BROKEN_TENANTS_SET entry if any
|
||||
}
|
||||
|
||||
@@ -3055,41 +3110,173 @@ pub mod tokio_epoll_uring {
|
||||
pub(crate) mod tenant_throttling {
|
||||
use metrics::{register_int_counter_vec, IntCounter};
|
||||
use once_cell::sync::Lazy;
|
||||
use utils::shard::TenantShardId;
|
||||
|
||||
use crate::tenant::{self, throttle::Metric};
|
||||
|
||||
pub(crate) struct TimelineGet {
|
||||
wait_time: IntCounter,
|
||||
count: IntCounter,
|
||||
struct GlobalAndPerTenantIntCounter {
|
||||
global: IntCounter,
|
||||
per_tenant: IntCounter,
|
||||
}
|
||||
|
||||
pub(crate) static TIMELINE_GET: Lazy<TimelineGet> = Lazy::new(|| {
|
||||
static WAIT_USECS: Lazy<metrics::IntCounterVec> = Lazy::new(|| {
|
||||
register_int_counter_vec!(
|
||||
"pageserver_tenant_throttling_wait_usecs_sum_global",
|
||||
"Sum of microseconds that tenants spent waiting for a tenant throttle of a given kind.",
|
||||
impl GlobalAndPerTenantIntCounter {
|
||||
#[inline(always)]
|
||||
pub(crate) fn inc(&self) {
|
||||
self.inc_by(1)
|
||||
}
|
||||
#[inline(always)]
|
||||
pub(crate) fn inc_by(&self, n: u64) {
|
||||
self.global.inc_by(n);
|
||||
self.per_tenant.inc_by(n);
|
||||
}
|
||||
}
|
||||
|
||||
pub(crate) struct TimelineGet {
|
||||
count_accounted_start: GlobalAndPerTenantIntCounter,
|
||||
count_accounted_finish: GlobalAndPerTenantIntCounter,
|
||||
wait_time: GlobalAndPerTenantIntCounter,
|
||||
count_throttled: GlobalAndPerTenantIntCounter,
|
||||
}
|
||||
|
||||
static COUNT_ACCOUNTED_START: Lazy<metrics::IntCounterVec> = Lazy::new(|| {
|
||||
register_int_counter_vec!(
|
||||
"pageserver_tenant_throttling_count_accounted_start_global",
|
||||
"Count of tenant throttling starts, by kind of throttle.",
|
||||
&["kind"]
|
||||
)
|
||||
.unwrap()
|
||||
});
|
||||
|
||||
static WAIT_COUNT: Lazy<metrics::IntCounterVec> = Lazy::new(|| {
|
||||
register_int_counter_vec!(
|
||||
"pageserver_tenant_throttling_count_global",
|
||||
"Count of tenant throttlings, by kind of throttle.",
|
||||
&["kind"]
|
||||
)
|
||||
.unwrap()
|
||||
});
|
||||
|
||||
let kind = "timeline_get";
|
||||
TimelineGet {
|
||||
wait_time: WAIT_USECS.with_label_values(&[kind]),
|
||||
count: WAIT_COUNT.with_label_values(&[kind]),
|
||||
}
|
||||
.unwrap()
|
||||
});
|
||||
static COUNT_ACCOUNTED_START_PER_TENANT: Lazy<metrics::IntCounterVec> = Lazy::new(|| {
|
||||
register_int_counter_vec!(
|
||||
"pageserver_tenant_throttling_count_accounted_start",
|
||||
"Count of tenant throttling starts, by kind of throttle.",
|
||||
&["kind", "tenant_id", "shard_id"]
|
||||
)
|
||||
.unwrap()
|
||||
});
|
||||
static COUNT_ACCOUNTED_FINISH: Lazy<metrics::IntCounterVec> = Lazy::new(|| {
|
||||
register_int_counter_vec!(
|
||||
"pageserver_tenant_throttling_count_accounted_finish_global",
|
||||
"Count of tenant throttling finishes, by kind of throttle.",
|
||||
&["kind"]
|
||||
)
|
||||
.unwrap()
|
||||
});
|
||||
static COUNT_ACCOUNTED_FINISH_PER_TENANT: Lazy<metrics::IntCounterVec> = Lazy::new(|| {
|
||||
register_int_counter_vec!(
|
||||
"pageserver_tenant_throttling_count_accounted_finish",
|
||||
"Count of tenant throttling finishes, by kind of throttle.",
|
||||
&["kind", "tenant_id", "shard_id"]
|
||||
)
|
||||
.unwrap()
|
||||
});
|
||||
static WAIT_USECS: Lazy<metrics::IntCounterVec> = Lazy::new(|| {
|
||||
register_int_counter_vec!(
|
||||
"pageserver_tenant_throttling_wait_usecs_sum_global",
|
||||
"Sum of microseconds that spent waiting throttle by kind of throttle.",
|
||||
&["kind"]
|
||||
)
|
||||
.unwrap()
|
||||
});
|
||||
static WAIT_USECS_PER_TENANT: Lazy<metrics::IntCounterVec> = Lazy::new(|| {
|
||||
register_int_counter_vec!(
|
||||
"pageserver_tenant_throttling_wait_usecs_sum",
|
||||
"Sum of microseconds that spent waiting throttle by kind of throttle.",
|
||||
&["kind", "tenant_id", "shard_id"]
|
||||
)
|
||||
.unwrap()
|
||||
});
|
||||
|
||||
impl Metric for &'static TimelineGet {
|
||||
static WAIT_COUNT: Lazy<metrics::IntCounterVec> = Lazy::new(|| {
|
||||
register_int_counter_vec!(
|
||||
"pageserver_tenant_throttling_count_global",
|
||||
"Count of tenant throttlings, by kind of throttle.",
|
||||
&["kind"]
|
||||
)
|
||||
.unwrap()
|
||||
});
|
||||
static WAIT_COUNT_PER_TENANT: Lazy<metrics::IntCounterVec> = Lazy::new(|| {
|
||||
register_int_counter_vec!(
|
||||
"pageserver_tenant_throttling_count",
|
||||
"Count of tenant throttlings, by kind of throttle.",
|
||||
&["kind", "tenant_id", "shard_id"]
|
||||
)
|
||||
.unwrap()
|
||||
});
|
||||
|
||||
const KIND: &str = "timeline_get";
|
||||
|
||||
impl TimelineGet {
|
||||
pub(crate) fn new(tenant_shard_id: &TenantShardId) -> Self {
|
||||
let per_tenant_label_values = &[
|
||||
KIND,
|
||||
&tenant_shard_id.tenant_id.to_string(),
|
||||
&tenant_shard_id.shard_slug().to_string(),
|
||||
];
|
||||
TimelineGet {
|
||||
count_accounted_start: {
|
||||
GlobalAndPerTenantIntCounter {
|
||||
global: COUNT_ACCOUNTED_START.with_label_values(&[KIND]),
|
||||
per_tenant: COUNT_ACCOUNTED_START_PER_TENANT
|
||||
.with_label_values(per_tenant_label_values),
|
||||
}
|
||||
},
|
||||
count_accounted_finish: {
|
||||
GlobalAndPerTenantIntCounter {
|
||||
global: COUNT_ACCOUNTED_FINISH.with_label_values(&[KIND]),
|
||||
per_tenant: COUNT_ACCOUNTED_FINISH_PER_TENANT
|
||||
.with_label_values(per_tenant_label_values),
|
||||
}
|
||||
},
|
||||
wait_time: {
|
||||
GlobalAndPerTenantIntCounter {
|
||||
global: WAIT_USECS.with_label_values(&[KIND]),
|
||||
per_tenant: WAIT_USECS_PER_TENANT
|
||||
.with_label_values(per_tenant_label_values),
|
||||
}
|
||||
},
|
||||
count_throttled: {
|
||||
GlobalAndPerTenantIntCounter {
|
||||
global: WAIT_COUNT.with_label_values(&[KIND]),
|
||||
per_tenant: WAIT_COUNT_PER_TENANT
|
||||
.with_label_values(per_tenant_label_values),
|
||||
}
|
||||
},
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
pub(crate) fn preinitialize_global_metrics() {
|
||||
Lazy::force(&COUNT_ACCOUNTED_START);
|
||||
Lazy::force(&COUNT_ACCOUNTED_FINISH);
|
||||
Lazy::force(&WAIT_USECS);
|
||||
Lazy::force(&WAIT_COUNT);
|
||||
}
|
||||
|
||||
pub(crate) fn remove_tenant_metrics(tenant_shard_id: &TenantShardId) {
|
||||
for m in &[
|
||||
&COUNT_ACCOUNTED_START_PER_TENANT,
|
||||
&COUNT_ACCOUNTED_FINISH_PER_TENANT,
|
||||
&WAIT_USECS_PER_TENANT,
|
||||
&WAIT_COUNT_PER_TENANT,
|
||||
] {
|
||||
let _ = m.remove_label_values(&[
|
||||
KIND,
|
||||
&tenant_shard_id.tenant_id.to_string(),
|
||||
&tenant_shard_id.shard_slug().to_string(),
|
||||
]);
|
||||
}
|
||||
}
|
||||
|
||||
impl Metric for TimelineGet {
|
||||
#[inline(always)]
|
||||
fn accounting_start(&self) {
|
||||
self.count_accounted_start.inc();
|
||||
}
|
||||
#[inline(always)]
|
||||
fn accounting_finish(&self) {
|
||||
self.count_accounted_finish.inc();
|
||||
}
|
||||
#[inline(always)]
|
||||
fn observe_throttling(
|
||||
&self,
|
||||
@@ -3097,7 +3284,7 @@ pub(crate) mod tenant_throttling {
|
||||
) {
|
||||
let val = u64::try_from(wait_time.as_micros()).unwrap();
|
||||
self.wait_time.inc_by(val);
|
||||
self.count.inc();
|
||||
self.count_throttled.inc();
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -3227,11 +3414,14 @@ pub fn preinitialize_metrics() {
|
||||
}
|
||||
|
||||
// countervecs
|
||||
[&BACKGROUND_LOOP_PERIOD_OVERRUN_COUNT]
|
||||
.into_iter()
|
||||
.for_each(|c| {
|
||||
Lazy::force(c);
|
||||
});
|
||||
[
|
||||
&BACKGROUND_LOOP_PERIOD_OVERRUN_COUNT,
|
||||
&SMGR_QUERY_STARTED_GLOBAL,
|
||||
]
|
||||
.into_iter()
|
||||
.for_each(|c| {
|
||||
Lazy::force(c);
|
||||
});
|
||||
|
||||
// gauges
|
||||
WALRECEIVER_ACTIVE_MANAGERS.get();
|
||||
@@ -3253,7 +3443,8 @@ pub fn preinitialize_metrics() {
|
||||
|
||||
// Custom
|
||||
Lazy::force(&RECONSTRUCT_TIME);
|
||||
Lazy::force(&tenant_throttling::TIMELINE_GET);
|
||||
Lazy::force(&BASEBACKUP_QUERY_TIME);
|
||||
Lazy::force(&COMPUTE_COMMANDS_COUNTERS);
|
||||
|
||||
tenant_throttling::preinitialize_global_metrics();
|
||||
}
|
||||
|
||||
@@ -840,6 +840,36 @@ impl Timeline {
|
||||
Ok(total_size * BLCKSZ as u64)
|
||||
}
|
||||
|
||||
/// Get a KeySpace that covers all the Keys that are in use at AND below the given LSN. This is only used
|
||||
/// for gc-compaction.
|
||||
///
|
||||
/// gc-compaction cannot use the same `collect_keyspace` function as the legacy compaction because it
|
||||
/// processes data at multiple LSNs and needs to be aware of the fact that some key ranges might need to
|
||||
/// be kept only for a specific range of LSN.
|
||||
///
|
||||
/// Consider the case that the user created branches at LSN 10 and 20, where the user created a table A at
|
||||
/// LSN 10 and dropped that table at LSN 20. `collect_keyspace` at LSN 10 will return the key range
|
||||
/// corresponding to that table, while LSN 20 won't. The keyspace info at a single LSN is not enough to
|
||||
/// determine which keys to retain/drop for gc-compaction.
|
||||
///
|
||||
/// For now, it only drops AUX-v1 keys. But in the future, the function will be extended to return the keyspace
|
||||
/// to be retained for each of the branch LSN.
|
||||
///
|
||||
/// The return value is (dense keyspace, sparse keyspace).
|
||||
pub(crate) async fn collect_gc_compaction_keyspace(
|
||||
&self,
|
||||
) -> Result<(KeySpace, SparseKeySpace), CollectKeySpaceError> {
|
||||
let metadata_key_begin = Key::metadata_key_range().start;
|
||||
let aux_v1_key = AUX_FILES_KEY;
|
||||
let dense_keyspace = KeySpace {
|
||||
ranges: vec![Key::MIN..aux_v1_key, aux_v1_key.next()..metadata_key_begin],
|
||||
};
|
||||
Ok((
|
||||
dense_keyspace,
|
||||
SparseKeySpace(KeySpace::single(Key::metadata_key_range())),
|
||||
))
|
||||
}
|
||||
|
||||
///
|
||||
/// Get a KeySpace that covers all the Keys that are in use at the given LSN.
|
||||
/// Anything that's not listed maybe removed from the underlying storage (from
|
||||
|
||||
@@ -18,7 +18,6 @@ use camino::Utf8Path;
|
||||
use camino::Utf8PathBuf;
|
||||
use enumset::EnumSet;
|
||||
use futures::stream::FuturesUnordered;
|
||||
use futures::FutureExt;
|
||||
use futures::StreamExt;
|
||||
use pageserver_api::models;
|
||||
use pageserver_api::models::AuxFilePolicy;
|
||||
@@ -34,6 +33,7 @@ use remote_storage::GenericRemoteStorage;
|
||||
use remote_storage::TimeoutOrCancel;
|
||||
use std::collections::BTreeMap;
|
||||
use std::fmt;
|
||||
use std::future::Future;
|
||||
use std::sync::Weak;
|
||||
use std::time::SystemTime;
|
||||
use storage_broker::BrokerClientChannel;
|
||||
@@ -140,6 +140,7 @@ pub mod metadata;
|
||||
pub mod remote_timeline_client;
|
||||
pub mod storage_layer;
|
||||
|
||||
pub mod checks;
|
||||
pub mod config;
|
||||
pub mod mgr;
|
||||
pub mod secondary;
|
||||
@@ -301,7 +302,7 @@ pub struct Tenant {
|
||||
/// Throttle applied at the top of [`Timeline::get`].
|
||||
/// All [`Tenant::timelines`] of a given [`Tenant`] instance share the same [`throttle::Throttle`] instance.
|
||||
pub(crate) timeline_get_throttle:
|
||||
Arc<throttle::Throttle<&'static crate::metrics::tenant_throttling::TimelineGet>>,
|
||||
Arc<throttle::Throttle<crate::metrics::tenant_throttling::TimelineGet>>,
|
||||
|
||||
/// An ongoing timeline detach concurrency limiter.
|
||||
///
|
||||
@@ -1030,13 +1031,9 @@ impl Tenant {
|
||||
}
|
||||
|
||||
Ok(TenantPreload {
|
||||
timelines: Self::load_timeline_metadata(
|
||||
self,
|
||||
remote_timeline_ids,
|
||||
remote_storage,
|
||||
cancel,
|
||||
)
|
||||
.await?,
|
||||
timelines: self
|
||||
.load_timelines_metadata(remote_timeline_ids, remote_storage, cancel)
|
||||
.await?,
|
||||
})
|
||||
}
|
||||
|
||||
@@ -1302,7 +1299,7 @@ impl Tenant {
|
||||
.await
|
||||
}
|
||||
|
||||
async fn load_timeline_metadata(
|
||||
async fn load_timelines_metadata(
|
||||
self: &Arc<Tenant>,
|
||||
timeline_ids: HashSet<TimelineId>,
|
||||
remote_storage: &GenericRemoteStorage,
|
||||
@@ -1310,33 +1307,10 @@ impl Tenant {
|
||||
) -> anyhow::Result<HashMap<TimelineId, TimelinePreload>> {
|
||||
let mut part_downloads = JoinSet::new();
|
||||
for timeline_id in timeline_ids {
|
||||
let client = RemoteTimelineClient::new(
|
||||
remote_storage.clone(),
|
||||
self.deletion_queue_client.clone(),
|
||||
self.conf,
|
||||
self.tenant_shard_id,
|
||||
timeline_id,
|
||||
self.generation,
|
||||
);
|
||||
let cancel_clone = cancel.clone();
|
||||
part_downloads.spawn(
|
||||
async move {
|
||||
debug!("starting index part download");
|
||||
|
||||
let index_part = client.download_index_file(&cancel_clone).await;
|
||||
|
||||
debug!("finished index part download");
|
||||
|
||||
Result::<_, anyhow::Error>::Ok(TimelinePreload {
|
||||
client,
|
||||
timeline_id,
|
||||
index_part,
|
||||
})
|
||||
}
|
||||
.map(move |res| {
|
||||
res.with_context(|| format!("download index part for timeline {timeline_id}"))
|
||||
})
|
||||
.instrument(info_span!("download_index_part", %timeline_id)),
|
||||
self.load_timeline_metadata(timeline_id, remote_storage.clone(), cancel_clone)
|
||||
.instrument(info_span!("download_index_part", %timeline_id)),
|
||||
);
|
||||
}
|
||||
|
||||
@@ -1347,8 +1321,7 @@ impl Tenant {
|
||||
next = part_downloads.join_next() => {
|
||||
match next {
|
||||
Some(result) => {
|
||||
let preload_result = result.context("join preload task")?;
|
||||
let preload = preload_result?;
|
||||
let preload = result.context("join preload task")?;
|
||||
timeline_preloads.insert(preload.timeline_id, preload);
|
||||
},
|
||||
None => {
|
||||
@@ -1365,6 +1338,36 @@ impl Tenant {
|
||||
Ok(timeline_preloads)
|
||||
}
|
||||
|
||||
fn load_timeline_metadata(
|
||||
self: &Arc<Tenant>,
|
||||
timeline_id: TimelineId,
|
||||
remote_storage: GenericRemoteStorage,
|
||||
cancel: CancellationToken,
|
||||
) -> impl Future<Output = TimelinePreload> {
|
||||
let client = RemoteTimelineClient::new(
|
||||
remote_storage.clone(),
|
||||
self.deletion_queue_client.clone(),
|
||||
self.conf,
|
||||
self.tenant_shard_id,
|
||||
timeline_id,
|
||||
self.generation,
|
||||
);
|
||||
async move {
|
||||
debug_assert_current_span_has_tenant_and_timeline_id();
|
||||
debug!("starting index part download");
|
||||
|
||||
let index_part = client.download_index_file(&cancel).await;
|
||||
|
||||
debug!("finished index part download");
|
||||
|
||||
TimelinePreload {
|
||||
client,
|
||||
timeline_id,
|
||||
index_part,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
pub(crate) async fn apply_timeline_archival_config(
|
||||
&self,
|
||||
timeline_id: TimelineId,
|
||||
@@ -1573,6 +1576,9 @@ impl Tenant {
|
||||
image_layer_desc: Vec<(Lsn, Vec<(pageserver_api::key::Key, bytes::Bytes)>)>,
|
||||
end_lsn: Lsn,
|
||||
) -> anyhow::Result<Arc<Timeline>> {
|
||||
use checks::check_valid_layermap;
|
||||
use itertools::Itertools;
|
||||
|
||||
let tline = self
|
||||
.create_test_timeline(new_timeline_id, initdb_lsn, pg_version, ctx)
|
||||
.await?;
|
||||
@@ -1587,6 +1593,18 @@ impl Tenant {
|
||||
.force_create_image_layer(lsn, images, Some(initdb_lsn), ctx)
|
||||
.await?;
|
||||
}
|
||||
let layer_names = tline
|
||||
.layers
|
||||
.read()
|
||||
.await
|
||||
.layer_map()
|
||||
.unwrap()
|
||||
.iter_historic_layers()
|
||||
.map(|layer| layer.layer_name())
|
||||
.collect_vec();
|
||||
if let Some(err) = check_valid_layermap(&layer_names) {
|
||||
bail!("invalid layermap: {err}");
|
||||
}
|
||||
Ok(tline)
|
||||
}
|
||||
|
||||
@@ -1950,9 +1968,6 @@ impl Tenant {
|
||||
TenantState::Activating(_) | TenantState::Active | TenantState::Broken { .. } | TenantState::Stopping { .. } => {
|
||||
panic!("caller is responsible for calling activate() only on Loading / Attaching tenants, got {state:?}", state = current_state);
|
||||
}
|
||||
TenantState::Loading => {
|
||||
*current_state = TenantState::Activating(ActivatingFrom::Loading);
|
||||
}
|
||||
TenantState::Attaching => {
|
||||
*current_state = TenantState::Activating(ActivatingFrom::Attaching);
|
||||
}
|
||||
@@ -2133,7 +2148,7 @@ impl Tenant {
|
||||
async fn set_stopping(
|
||||
&self,
|
||||
progress: completion::Barrier,
|
||||
allow_transition_from_loading: bool,
|
||||
_allow_transition_from_loading: bool,
|
||||
allow_transition_from_attaching: bool,
|
||||
) -> Result<(), SetStoppingError> {
|
||||
let mut rx = self.state.subscribe();
|
||||
@@ -2148,7 +2163,6 @@ impl Tenant {
|
||||
);
|
||||
false
|
||||
}
|
||||
TenantState::Loading => allow_transition_from_loading,
|
||||
TenantState::Active | TenantState::Broken { .. } | TenantState::Stopping { .. } => true,
|
||||
})
|
||||
.await
|
||||
@@ -2167,13 +2181,6 @@ impl Tenant {
|
||||
*current_state = TenantState::Stopping { progress };
|
||||
true
|
||||
}
|
||||
TenantState::Loading => {
|
||||
if !allow_transition_from_loading {
|
||||
unreachable!("3we ensured above that we're done with activation, and, there is no re-activation")
|
||||
};
|
||||
*current_state = TenantState::Stopping { progress };
|
||||
true
|
||||
}
|
||||
TenantState::Active => {
|
||||
// FIXME: due to time-of-check vs time-of-use issues, it can happen that new timelines
|
||||
// are created after the transition to Stopping. That's harmless, as the Timelines
|
||||
@@ -2229,7 +2236,7 @@ impl Tenant {
|
||||
// The load & attach routines own the tenant state until it has reached `Active`.
|
||||
// So, wait until it's done.
|
||||
rx.wait_for(|state| match state {
|
||||
TenantState::Activating(_) | TenantState::Loading | TenantState::Attaching => {
|
||||
TenantState::Activating(_) | TenantState::Attaching => {
|
||||
info!(
|
||||
"waiting for {} to turn Active|Broken|Stopping",
|
||||
<&'static str>::from(state)
|
||||
@@ -2249,7 +2256,7 @@ impl Tenant {
|
||||
let reason = reason.to_string();
|
||||
self.state.send_modify(|current_state| {
|
||||
match *current_state {
|
||||
TenantState::Activating(_) | TenantState::Loading | TenantState::Attaching => {
|
||||
TenantState::Activating(_) | TenantState::Attaching => {
|
||||
unreachable!("we ensured above that we're done with activation, and, there is no re-activation")
|
||||
}
|
||||
TenantState::Active => {
|
||||
@@ -2293,7 +2300,7 @@ impl Tenant {
|
||||
loop {
|
||||
let current_state = receiver.borrow_and_update().clone();
|
||||
match current_state {
|
||||
TenantState::Loading | TenantState::Attaching | TenantState::Activating(_) => {
|
||||
TenantState::Attaching | TenantState::Activating(_) => {
|
||||
// in these states, there's a chance that we can reach ::Active
|
||||
self.activate_now();
|
||||
match timeout_cancellable(timeout, &self.cancel, receiver.changed()).await {
|
||||
@@ -2815,7 +2822,7 @@ impl Tenant {
|
||||
gate: Gate::default(),
|
||||
timeline_get_throttle: Arc::new(throttle::Throttle::new(
|
||||
Tenant::get_timeline_get_throttle_config(conf, &attached_conf.tenant_conf),
|
||||
&crate::metrics::tenant_throttling::TIMELINE_GET,
|
||||
crate::metrics::tenant_throttling::TimelineGet::new(&tenant_shard_id),
|
||||
)),
|
||||
tenant_conf: Arc::new(ArcSwap::from_pointee(attached_conf)),
|
||||
ongoing_timeline_detach: std::sync::Mutex::default(),
|
||||
@@ -3197,6 +3204,9 @@ impl Tenant {
|
||||
image_layer_desc: Vec<(Lsn, Vec<(pageserver_api::key::Key, bytes::Bytes)>)>,
|
||||
end_lsn: Lsn,
|
||||
) -> anyhow::Result<Arc<Timeline>> {
|
||||
use checks::check_valid_layermap;
|
||||
use itertools::Itertools;
|
||||
|
||||
let tline = self
|
||||
.branch_timeline_test(src_timeline, dst_id, ancestor_lsn, ctx)
|
||||
.await?;
|
||||
@@ -3217,6 +3227,18 @@ impl Tenant {
|
||||
.force_create_image_layer(lsn, images, Some(ancestor_lsn), ctx)
|
||||
.await?;
|
||||
}
|
||||
let layer_names = tline
|
||||
.layers
|
||||
.read()
|
||||
.await
|
||||
.layer_map()
|
||||
.unwrap()
|
||||
.iter_historic_layers()
|
||||
.map(|layer| layer.layer_name())
|
||||
.collect_vec();
|
||||
if let Some(err) = check_valid_layermap(&layer_names) {
|
||||
bail!("invalid layermap: {err}");
|
||||
}
|
||||
Ok(tline)
|
||||
}
|
||||
|
||||
@@ -3594,7 +3616,7 @@ impl Tenant {
|
||||
start_lsn: Lsn,
|
||||
ancestor: Option<Arc<Timeline>>,
|
||||
last_aux_file_policy: Option<AuxFilePolicy>,
|
||||
) -> anyhow::Result<UninitializedTimeline> {
|
||||
) -> anyhow::Result<UninitializedTimeline<'a>> {
|
||||
let tenant_shard_id = self.tenant_shard_id;
|
||||
|
||||
let resources = self.build_timeline_resources(new_timeline_id);
|
||||
@@ -4111,7 +4133,7 @@ pub(crate) mod harness {
|
||||
let walredo_mgr = Arc::new(WalRedoManager::from(TestRedoManager));
|
||||
|
||||
let tenant = Arc::new(Tenant::new(
|
||||
TenantState::Loading,
|
||||
TenantState::Attaching,
|
||||
self.conf,
|
||||
AttachedTenantConf::try_from(LocationConf::attached_single(
|
||||
TenantConfOpt::from(self.tenant_conf.clone()),
|
||||
@@ -4164,9 +4186,18 @@ pub(crate) mod harness {
|
||||
let records_neon = records.iter().all(|r| apply_neon::can_apply_in_neon(&r.1));
|
||||
if records_neon {
|
||||
// For Neon wal records, we can decode without spawning postgres, so do so.
|
||||
let base_img = base_img.expect("Neon WAL redo requires base image").1;
|
||||
let mut page = BytesMut::new();
|
||||
page.extend_from_slice(&base_img);
|
||||
let mut page = match (base_img, records.first()) {
|
||||
(Some((_lsn, img)), _) => {
|
||||
let mut page = BytesMut::new();
|
||||
page.extend_from_slice(&img);
|
||||
page
|
||||
}
|
||||
(_, Some((_lsn, rec))) if rec.will_init() => BytesMut::new(),
|
||||
_ => {
|
||||
panic!("Neon WAL redo requires base image or will init record");
|
||||
}
|
||||
};
|
||||
|
||||
for (record_lsn, record) in records {
|
||||
apply_neon::apply_in_neon(&record, record_lsn, key, &mut page)?;
|
||||
}
|
||||
@@ -8492,6 +8523,7 @@ mod tests {
|
||||
let harness = TenantHarness::create("test_vectored_read_with_nested_image_layer").await?;
|
||||
let (tenant, ctx) = harness.load().await;
|
||||
|
||||
let will_init_keys = [2, 6];
|
||||
fn get_key(id: u32) -> Key {
|
||||
let mut key = Key::from_hex("110000000033333333444444445500000000").unwrap();
|
||||
key.field6 = id;
|
||||
@@ -8541,18 +8573,25 @@ mod tests {
|
||||
}
|
||||
};
|
||||
|
||||
let delta = format!("@{lsn}");
|
||||
delta_layer_spec.push((
|
||||
key,
|
||||
lsn,
|
||||
Value::WalRecord(NeonWalRecord::wal_append(&delta)),
|
||||
));
|
||||
delta_layer_end_lsn = std::cmp::max(delta_layer_start_lsn, lsn);
|
||||
let will_init = will_init_keys.contains(&i);
|
||||
if will_init {
|
||||
delta_layer_spec.push((key, lsn, Value::WalRecord(NeonWalRecord::wal_init())));
|
||||
|
||||
expected_key_values
|
||||
.get_mut(&key)
|
||||
.expect("An image exists for each key")
|
||||
.push_str(delta.as_str());
|
||||
expected_key_values.insert(key, "".to_string());
|
||||
} else {
|
||||
let delta = format!("@{lsn}");
|
||||
delta_layer_spec.push((
|
||||
key,
|
||||
lsn,
|
||||
Value::WalRecord(NeonWalRecord::wal_append(&delta)),
|
||||
));
|
||||
|
||||
expected_key_values
|
||||
.get_mut(&key)
|
||||
.expect("An image exists for each key")
|
||||
.push_str(delta.as_str());
|
||||
}
|
||||
delta_layer_end_lsn = std::cmp::max(delta_layer_start_lsn, lsn);
|
||||
}
|
||||
|
||||
delta_layer_end_lsn = Lsn(delta_layer_end_lsn.0 + 1);
|
||||
|
||||
55
pageserver/src/tenant/checks.rs
Normal file
55
pageserver/src/tenant/checks.rs
Normal file
@@ -0,0 +1,55 @@
|
||||
use std::collections::BTreeSet;
|
||||
|
||||
use itertools::Itertools;
|
||||
|
||||
use super::storage_layer::LayerName;
|
||||
|
||||
/// Checks whether a layer map is valid (i.e., is a valid result of the current compaction algorithm if nothing goes wrong).
|
||||
/// The function checks if we can split the LSN range of a delta layer only at the LSNs of the delta layers. For example,
|
||||
///
|
||||
/// ```plain
|
||||
/// | | | |
|
||||
/// | 1 | | 2 | | 3 |
|
||||
/// | | | | | |
|
||||
/// ```
|
||||
///
|
||||
/// This is not a valid layer map because the LSN range of layer 1 intersects with the LSN range of layer 2. 1 and 2 should have
|
||||
/// the same LSN range.
|
||||
///
|
||||
/// The exception is that when layer 2 only contains a single key, it could be split over the LSN range. For example,
|
||||
///
|
||||
/// ```plain
|
||||
/// | | | 2 | | |
|
||||
/// | 1 | |-------| | 3 |
|
||||
/// | | | 4 | | |
|
||||
///
|
||||
/// If layer 2 and 4 contain the same single key, this is also a valid layer map.
|
||||
pub fn check_valid_layermap(metadata: &[LayerName]) -> Option<String> {
|
||||
let mut lsn_split_point = BTreeSet::new(); // TODO: use a better data structure (range tree / range set?)
|
||||
let mut all_delta_layers = Vec::new();
|
||||
for name in metadata {
|
||||
if let LayerName::Delta(layer) = name {
|
||||
if layer.key_range.start.next() != layer.key_range.end {
|
||||
all_delta_layers.push(layer.clone());
|
||||
}
|
||||
}
|
||||
}
|
||||
for layer in &all_delta_layers {
|
||||
let lsn_range = &layer.lsn_range;
|
||||
lsn_split_point.insert(lsn_range.start);
|
||||
lsn_split_point.insert(lsn_range.end);
|
||||
}
|
||||
for layer in &all_delta_layers {
|
||||
let lsn_range = layer.lsn_range.clone();
|
||||
let intersects = lsn_split_point.range(lsn_range).collect_vec();
|
||||
if intersects.len() > 1 {
|
||||
let err = format!(
|
||||
"layer violates the layer map LSN split assumption: layer {} intersects with LSN [{}]",
|
||||
layer,
|
||||
intersects.into_iter().map(|lsn| lsn.to_string()).join(", ")
|
||||
);
|
||||
return Some(err);
|
||||
}
|
||||
}
|
||||
None
|
||||
}
|
||||
@@ -1,11 +1,29 @@
|
||||
use std::collections::HashMap;
|
||||
|
||||
use utils::id::TimelineId;
|
||||
use std::{collections::HashMap, time::Duration};
|
||||
|
||||
use super::remote_timeline_client::index::GcBlockingReason;
|
||||
use tokio::time::Instant;
|
||||
use utils::id::TimelineId;
|
||||
|
||||
type Storage = HashMap<TimelineId, enumset::EnumSet<GcBlockingReason>>;
|
||||
type TimelinesBlocked = HashMap<TimelineId, enumset::EnumSet<GcBlockingReason>>;
|
||||
|
||||
#[derive(Default)]
|
||||
struct Storage {
|
||||
timelines_blocked: TimelinesBlocked,
|
||||
/// The deadline before which we are blocked from GC so that
|
||||
/// leases have a chance to be renewed.
|
||||
lsn_lease_deadline: Option<Instant>,
|
||||
}
|
||||
|
||||
impl Storage {
|
||||
fn is_blocked_by_lsn_lease_deadline(&self) -> bool {
|
||||
self.lsn_lease_deadline
|
||||
.map(|d| Instant::now() < d)
|
||||
.unwrap_or(false)
|
||||
}
|
||||
}
|
||||
|
||||
/// GcBlock provides persistent (per-timeline) gc blocking and facilitates transient time based gc
|
||||
/// blocking.
|
||||
#[derive(Default)]
|
||||
pub(crate) struct GcBlock {
|
||||
/// The timelines which have current reasons to block gc.
|
||||
@@ -13,6 +31,12 @@ pub(crate) struct GcBlock {
|
||||
/// LOCK ORDER: this is held locked while scheduling the next index_part update. This is done
|
||||
/// to keep the this field up to date with RemoteTimelineClient `upload_queue.dirty`.
|
||||
reasons: std::sync::Mutex<Storage>,
|
||||
|
||||
/// GC background task or manually run `Tenant::gc_iteration` holds a lock on this.
|
||||
///
|
||||
/// Do not add any more features taking and forbidding taking this lock. It should be
|
||||
/// `tokio::sync::Notify`, but that is rarely used. On the other side, [`GcBlock::insert`]
|
||||
/// synchronizes with gc attempts by locking and unlocking this mutex.
|
||||
blocking: tokio::sync::Mutex<()>,
|
||||
}
|
||||
|
||||
@@ -42,6 +66,20 @@ impl GcBlock {
|
||||
}
|
||||
}
|
||||
|
||||
/// Sets a deadline before which we cannot proceed to GC due to lsn lease.
|
||||
///
|
||||
/// We do this as the leases mapping are not persisted to disk. By delaying GC by lease
|
||||
/// length, we guarantee that all the leases we granted before will have a chance to renew
|
||||
/// when we run GC for the first time after restart / transition from AttachedMulti to AttachedSingle.
|
||||
pub(super) fn set_lsn_lease_deadline(&self, lsn_lease_length: Duration) {
|
||||
let deadline = Instant::now() + lsn_lease_length;
|
||||
let mut g = self.reasons.lock().unwrap();
|
||||
g.lsn_lease_deadline = Some(deadline);
|
||||
}
|
||||
|
||||
/// Describe the current gc blocking reasons.
|
||||
///
|
||||
/// TODO: make this json serializable.
|
||||
pub(crate) fn summary(&self) -> Option<BlockingReasons> {
|
||||
let g = self.reasons.lock().unwrap();
|
||||
|
||||
@@ -64,7 +102,7 @@ impl GcBlock {
|
||||
) -> anyhow::Result<bool> {
|
||||
let (added, uploaded) = {
|
||||
let mut g = self.reasons.lock().unwrap();
|
||||
let set = g.entry(timeline.timeline_id).or_default();
|
||||
let set = g.timelines_blocked.entry(timeline.timeline_id).or_default();
|
||||
let added = set.insert(reason);
|
||||
|
||||
// LOCK ORDER: intentionally hold the lock, see self.reasons.
|
||||
@@ -95,7 +133,7 @@ impl GcBlock {
|
||||
|
||||
let (remaining_blocks, uploaded) = {
|
||||
let mut g = self.reasons.lock().unwrap();
|
||||
match g.entry(timeline.timeline_id) {
|
||||
match g.timelines_blocked.entry(timeline.timeline_id) {
|
||||
Entry::Occupied(mut oe) => {
|
||||
let set = oe.get_mut();
|
||||
set.remove(reason);
|
||||
@@ -109,7 +147,7 @@ impl GcBlock {
|
||||
}
|
||||
}
|
||||
|
||||
let remaining_blocks = g.len();
|
||||
let remaining_blocks = g.timelines_blocked.len();
|
||||
|
||||
// LOCK ORDER: intentionally hold the lock while scheduling; see self.reasons
|
||||
let uploaded = timeline
|
||||
@@ -134,11 +172,11 @@ impl GcBlock {
|
||||
pub(crate) fn before_delete(&self, timeline: &super::Timeline) {
|
||||
let unblocked = {
|
||||
let mut g = self.reasons.lock().unwrap();
|
||||
if g.is_empty() {
|
||||
if g.timelines_blocked.is_empty() {
|
||||
return;
|
||||
}
|
||||
|
||||
g.remove(&timeline.timeline_id);
|
||||
g.timelines_blocked.remove(&timeline.timeline_id);
|
||||
|
||||
BlockingReasons::clean_and_summarize(g).is_none()
|
||||
};
|
||||
@@ -149,10 +187,11 @@ impl GcBlock {
|
||||
}
|
||||
|
||||
/// Initialize with the non-deleted timelines of this tenant.
|
||||
pub(crate) fn set_scanned(&self, scanned: Storage) {
|
||||
pub(crate) fn set_scanned(&self, scanned: TimelinesBlocked) {
|
||||
let mut g = self.reasons.lock().unwrap();
|
||||
assert!(g.is_empty());
|
||||
g.extend(scanned.into_iter().filter(|(_, v)| !v.is_empty()));
|
||||
assert!(g.timelines_blocked.is_empty());
|
||||
g.timelines_blocked
|
||||
.extend(scanned.into_iter().filter(|(_, v)| !v.is_empty()));
|
||||
|
||||
if let Some(reasons) = BlockingReasons::clean_and_summarize(g) {
|
||||
tracing::info!(summary=?reasons, "initialized with gc blocked");
|
||||
@@ -166,6 +205,7 @@ pub(super) struct Guard<'a> {
|
||||
|
||||
#[derive(Debug)]
|
||||
pub(crate) struct BlockingReasons {
|
||||
tenant_blocked_by_lsn_lease_deadline: bool,
|
||||
timelines: usize,
|
||||
reasons: enumset::EnumSet<GcBlockingReason>,
|
||||
}
|
||||
@@ -174,8 +214,8 @@ impl std::fmt::Display for BlockingReasons {
|
||||
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||
write!(
|
||||
f,
|
||||
"{} timelines block for {:?}",
|
||||
self.timelines, self.reasons
|
||||
"tenant_blocked_by_lsn_lease_deadline: {}, {} timelines block for {:?}",
|
||||
self.tenant_blocked_by_lsn_lease_deadline, self.timelines, self.reasons
|
||||
)
|
||||
}
|
||||
}
|
||||
@@ -183,13 +223,15 @@ impl std::fmt::Display for BlockingReasons {
|
||||
impl BlockingReasons {
|
||||
fn clean_and_summarize(mut g: std::sync::MutexGuard<'_, Storage>) -> Option<Self> {
|
||||
let mut reasons = enumset::EnumSet::empty();
|
||||
g.retain(|_key, value| {
|
||||
g.timelines_blocked.retain(|_key, value| {
|
||||
reasons = reasons.union(*value);
|
||||
!value.is_empty()
|
||||
});
|
||||
if !g.is_empty() {
|
||||
let blocked_by_lsn_lease_deadline = g.is_blocked_by_lsn_lease_deadline();
|
||||
if !g.timelines_blocked.is_empty() || blocked_by_lsn_lease_deadline {
|
||||
Some(BlockingReasons {
|
||||
timelines: g.len(),
|
||||
tenant_blocked_by_lsn_lease_deadline: blocked_by_lsn_lease_deadline,
|
||||
timelines: g.timelines_blocked.len(),
|
||||
reasons,
|
||||
})
|
||||
} else {
|
||||
@@ -198,14 +240,17 @@ impl BlockingReasons {
|
||||
}
|
||||
|
||||
fn summarize(g: &std::sync::MutexGuard<'_, Storage>) -> Option<Self> {
|
||||
if g.is_empty() {
|
||||
let blocked_by_lsn_lease_deadline = g.is_blocked_by_lsn_lease_deadline();
|
||||
if g.timelines_blocked.is_empty() && !blocked_by_lsn_lease_deadline {
|
||||
None
|
||||
} else {
|
||||
let reasons = g
|
||||
.timelines_blocked
|
||||
.values()
|
||||
.fold(enumset::EnumSet::empty(), |acc, next| acc.union(*next));
|
||||
Some(BlockingReasons {
|
||||
timelines: g.len(),
|
||||
tenant_blocked_by_lsn_lease_deadline: blocked_by_lsn_lease_deadline,
|
||||
timelines: g.timelines_blocked.len(),
|
||||
reasons,
|
||||
})
|
||||
}
|
||||
|
||||
@@ -949,6 +949,12 @@ impl TenantManager {
|
||||
(LocationMode::Attached(attach_conf), Some(TenantSlot::Attached(tenant))) => {
|
||||
match attach_conf.generation.cmp(&tenant.generation) {
|
||||
Ordering::Equal => {
|
||||
if attach_conf.attach_mode == AttachmentMode::Single {
|
||||
tenant
|
||||
.gc_block
|
||||
.set_lsn_lease_deadline(tenant.get_lsn_lease_length());
|
||||
}
|
||||
|
||||
// A transition from Attached to Attached in the same generation, we may
|
||||
// take our fast path and just provide the updated configuration
|
||||
// to the tenant.
|
||||
|
||||
@@ -1,13 +1,13 @@
|
||||
//! Common traits and structs for layers
|
||||
|
||||
pub mod delta_layer;
|
||||
pub mod filter_iterator;
|
||||
pub mod image_layer;
|
||||
pub mod inmemory_layer;
|
||||
pub(crate) mod layer;
|
||||
mod layer_desc;
|
||||
mod layer_name;
|
||||
pub mod merge_iterator;
|
||||
|
||||
pub mod split_writer;
|
||||
|
||||
use crate::context::{AccessStatsBehavior, RequestContext};
|
||||
@@ -276,10 +276,20 @@ pub(crate) enum LayerId {
|
||||
InMemoryLayerId(InMemoryLayerFileId),
|
||||
}
|
||||
|
||||
/// Uniquely identify a layer visit by the layer
|
||||
/// and LSN floor (or start LSN) of the reads.
|
||||
/// The layer itself is not enough since we may
|
||||
/// have different LSN lower bounds for delta layer reads.
|
||||
#[derive(Debug, PartialEq, Eq, Clone, Hash)]
|
||||
struct LayerToVisitId {
|
||||
layer_id: LayerId,
|
||||
lsn_floor: Lsn,
|
||||
}
|
||||
|
||||
/// Layer wrapper for the read path. Note that it is valid
|
||||
/// to use these layers even after external operations have
|
||||
/// been performed on them (compaction, freeze, etc.).
|
||||
#[derive(Debug, Clone)]
|
||||
#[derive(Debug)]
|
||||
pub(crate) enum ReadableLayer {
|
||||
PersistentLayer(Layer),
|
||||
InMemoryLayer(Arc<InMemoryLayer>),
|
||||
@@ -287,13 +297,11 @@ pub(crate) enum ReadableLayer {
|
||||
|
||||
/// A partial description of a read to be done.
|
||||
#[derive(Debug, Clone)]
|
||||
struct ReadDesc {
|
||||
struct LayerVisit {
|
||||
/// An id used to resolve the readable layer within the fringe
|
||||
layer_id: LayerId,
|
||||
layer_to_visit_id: LayerToVisitId,
|
||||
/// Lsn range for the read, used for selecting the next read
|
||||
lsn_range: Range<Lsn>,
|
||||
/// This read's index in [`LayerKeyspace::reads`];
|
||||
read_id: LayerKeyspaceReadId,
|
||||
}
|
||||
|
||||
/// Data structure which maintains a fringe of layers for the
|
||||
@@ -305,52 +313,46 @@ struct ReadDesc {
|
||||
/// a two layer indexing scheme.
|
||||
#[derive(Debug)]
|
||||
pub(crate) struct LayerFringe {
|
||||
planned_reads_by_lsn: BinaryHeap<ReadDesc>,
|
||||
layers: HashMap<LayerId, LayerKeyspace>,
|
||||
planned_visits_by_lsn: BinaryHeap<LayerVisit>,
|
||||
visit_reads: HashMap<LayerToVisitId, LayerVisitReads>,
|
||||
}
|
||||
|
||||
#[derive(Debug)]
|
||||
struct LayerKeyspace {
|
||||
struct LayerVisitReads {
|
||||
layer: ReadableLayer,
|
||||
next_read_id: LayerKeyspaceReadId,
|
||||
reads: HashMap<LayerKeyspaceReadId, (Range<Lsn>, KeySpace)>,
|
||||
target_keyspace: KeySpaceRandomAccum,
|
||||
}
|
||||
|
||||
#[derive(PartialEq, Eq, Hash, Debug, Clone, Copy)]
|
||||
struct LayerKeyspaceReadId(usize);
|
||||
|
||||
impl LayerFringe {
|
||||
pub(crate) fn new() -> Self {
|
||||
LayerFringe {
|
||||
planned_reads_by_lsn: BinaryHeap::new(),
|
||||
layers: HashMap::new(),
|
||||
planned_visits_by_lsn: BinaryHeap::new(),
|
||||
visit_reads: HashMap::new(),
|
||||
}
|
||||
}
|
||||
|
||||
pub(crate) fn next_layer(&mut self) -> Option<(ReadableLayer, KeySpace, Range<Lsn>)> {
|
||||
let read_desc = match self.planned_reads_by_lsn.pop() {
|
||||
let read_desc = match self.planned_visits_by_lsn.pop() {
|
||||
Some(desc) => desc,
|
||||
None => return None,
|
||||
};
|
||||
|
||||
let mut entry = match self.layers.entry(read_desc.layer_id) {
|
||||
Entry::Occupied(o) => o,
|
||||
Entry::Vacant(_) => unreachable!("fringe internals are always consistent"),
|
||||
};
|
||||
let removed = self.visit_reads.remove_entry(&read_desc.layer_to_visit_id);
|
||||
|
||||
let (lsn_range, keyspace) = entry
|
||||
.get_mut()
|
||||
.reads
|
||||
.remove(&read_desc.read_id)
|
||||
.expect("fringe internals are always consistent");
|
||||
|
||||
let layer = entry.get().layer.clone();
|
||||
|
||||
if entry.get().reads.is_empty() {
|
||||
entry.remove();
|
||||
match removed {
|
||||
Some((
|
||||
_,
|
||||
LayerVisitReads {
|
||||
layer,
|
||||
mut target_keyspace,
|
||||
},
|
||||
)) => Some((
|
||||
layer,
|
||||
target_keyspace.consume_keyspace(),
|
||||
read_desc.lsn_range,
|
||||
)),
|
||||
None => unreachable!("fringe internals are always consistent"),
|
||||
}
|
||||
|
||||
Some((layer, keyspace, lsn_range))
|
||||
}
|
||||
|
||||
pub(crate) fn update(
|
||||
@@ -359,35 +361,26 @@ impl LayerFringe {
|
||||
keyspace: KeySpace,
|
||||
lsn_range: Range<Lsn>,
|
||||
) {
|
||||
let layer_id = layer.id();
|
||||
let entry = self.layers.entry(layer_id.clone());
|
||||
let layer_to_visit_id = LayerToVisitId {
|
||||
layer_id: layer.id(),
|
||||
lsn_floor: lsn_range.start,
|
||||
};
|
||||
|
||||
let entry = self.visit_reads.entry(layer_to_visit_id.clone());
|
||||
match entry {
|
||||
Entry::Occupied(mut entry) => {
|
||||
let read_id = {
|
||||
let r = &mut entry.get_mut().next_read_id;
|
||||
let read_id = *r;
|
||||
*r = LayerKeyspaceReadId(r.0 + 1);
|
||||
read_id
|
||||
};
|
||||
self.planned_reads_by_lsn.push(ReadDesc {
|
||||
lsn_range: lsn_range.clone(),
|
||||
layer_id: layer_id.clone(),
|
||||
read_id,
|
||||
});
|
||||
let replaced = entry.get_mut().reads.insert(read_id, (lsn_range, keyspace));
|
||||
assert!(replaced.is_none());
|
||||
entry.get_mut().target_keyspace.add_keyspace(keyspace);
|
||||
}
|
||||
Entry::Vacant(entry) => {
|
||||
let read_id = LayerKeyspaceReadId(0);
|
||||
self.planned_reads_by_lsn.push(ReadDesc {
|
||||
lsn_range: lsn_range.clone(),
|
||||
layer_id: layer_id.clone(),
|
||||
read_id,
|
||||
self.planned_visits_by_lsn.push(LayerVisit {
|
||||
lsn_range,
|
||||
layer_to_visit_id: layer_to_visit_id.clone(),
|
||||
});
|
||||
entry.insert(LayerKeyspace {
|
||||
let mut accum = KeySpaceRandomAccum::new();
|
||||
accum.add_keyspace(keyspace);
|
||||
entry.insert(LayerVisitReads {
|
||||
layer,
|
||||
next_read_id: LayerKeyspaceReadId(1),
|
||||
reads: [(read_id, (lsn_range, keyspace))].into(),
|
||||
target_keyspace: accum,
|
||||
});
|
||||
}
|
||||
}
|
||||
@@ -400,7 +393,7 @@ impl Default for LayerFringe {
|
||||
}
|
||||
}
|
||||
|
||||
impl Ord for ReadDesc {
|
||||
impl Ord for LayerVisit {
|
||||
fn cmp(&self, other: &Self) -> Ordering {
|
||||
let ord = self.lsn_range.end.cmp(&other.lsn_range.end);
|
||||
if ord == std::cmp::Ordering::Equal {
|
||||
@@ -411,19 +404,19 @@ impl Ord for ReadDesc {
|
||||
}
|
||||
}
|
||||
|
||||
impl PartialOrd for ReadDesc {
|
||||
impl PartialOrd for LayerVisit {
|
||||
fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
|
||||
Some(self.cmp(other))
|
||||
}
|
||||
}
|
||||
|
||||
impl PartialEq for ReadDesc {
|
||||
impl PartialEq for LayerVisit {
|
||||
fn eq(&self, other: &Self) -> bool {
|
||||
self.lsn_range == other.lsn_range
|
||||
}
|
||||
}
|
||||
|
||||
impl Eq for ReadDesc {}
|
||||
impl Eq for LayerVisit {}
|
||||
|
||||
impl ReadableLayer {
|
||||
pub(crate) fn id(&self) -> LayerId {
|
||||
|
||||
@@ -39,7 +39,7 @@ use crate::tenant::disk_btree::{
|
||||
use crate::tenant::storage_layer::layer::S3_UPLOAD_LIMIT;
|
||||
use crate::tenant::timeline::GetVectoredError;
|
||||
use crate::tenant::vectored_blob_io::{
|
||||
BlobFlag, StreamingVectoredReadPlanner, VectoredBlobReader, VectoredRead,
|
||||
BlobFlag, BufView, StreamingVectoredReadPlanner, VectoredBlobReader, VectoredRead,
|
||||
VectoredReadCoalesceMode, VectoredReadPlanner,
|
||||
};
|
||||
use crate::tenant::PageReconstructError;
|
||||
@@ -1021,13 +1021,30 @@ impl DeltaLayerInner {
|
||||
continue;
|
||||
}
|
||||
};
|
||||
|
||||
let view = BufView::new_slice(&blobs_buf.buf);
|
||||
for meta in blobs_buf.blobs.iter().rev() {
|
||||
if Some(meta.meta.key) == ignore_key_with_err {
|
||||
continue;
|
||||
}
|
||||
let blob_read = meta.read(&view).await;
|
||||
let blob_read = match blob_read {
|
||||
Ok(buf) => buf,
|
||||
Err(e) => {
|
||||
reconstruct_state.on_key_error(
|
||||
meta.meta.key,
|
||||
PageReconstructError::Other(anyhow!(e).context(format!(
|
||||
"Failed to decompress blob from virtual file {}",
|
||||
self.file.path,
|
||||
))),
|
||||
);
|
||||
|
||||
ignore_key_with_err = Some(meta.meta.key);
|
||||
continue;
|
||||
}
|
||||
};
|
||||
|
||||
let value = Value::des(&blob_read);
|
||||
|
||||
let value = Value::des(&blobs_buf.buf[meta.start..meta.end]);
|
||||
let value = match value {
|
||||
Ok(v) => v,
|
||||
Err(e) => {
|
||||
@@ -1243,21 +1260,21 @@ impl DeltaLayerInner {
|
||||
buf.reserve(read.size());
|
||||
let res = reader.read_blobs(&read, buf, ctx).await?;
|
||||
|
||||
let view = BufView::new_slice(&res.buf);
|
||||
|
||||
for blob in res.blobs {
|
||||
let key = blob.meta.key;
|
||||
let lsn = blob.meta.lsn;
|
||||
let data = &res.buf[blob.start..blob.end];
|
||||
|
||||
let data = blob.read(&view).await?;
|
||||
|
||||
#[cfg(debug_assertions)]
|
||||
Value::des(data)
|
||||
Value::des(&data)
|
||||
.with_context(|| {
|
||||
format!(
|
||||
"blob failed to deserialize for {}@{}, {}..{}: {:?}",
|
||||
blob.meta.key,
|
||||
blob.meta.lsn,
|
||||
blob.start,
|
||||
blob.end,
|
||||
utils::Hex(data)
|
||||
"blob failed to deserialize for {}: {:?}",
|
||||
blob,
|
||||
utils::Hex(&data)
|
||||
)
|
||||
})
|
||||
.unwrap();
|
||||
@@ -1265,15 +1282,15 @@ impl DeltaLayerInner {
|
||||
// is it an image or will_init walrecord?
|
||||
// FIXME: this could be handled by threading the BlobRef to the
|
||||
// VectoredReadBuilder
|
||||
let will_init = crate::repository::ValueBytes::will_init(data)
|
||||
let will_init = crate::repository::ValueBytes::will_init(&data)
|
||||
.inspect_err(|_e| {
|
||||
#[cfg(feature = "testing")]
|
||||
tracing::error!(data=?utils::Hex(data), err=?_e, %key, %lsn, "failed to parse will_init out of serialized value");
|
||||
tracing::error!(data=?utils::Hex(&data), err=?_e, %key, %lsn, "failed to parse will_init out of serialized value");
|
||||
})
|
||||
.unwrap_or(false);
|
||||
|
||||
per_blob_copy.clear();
|
||||
per_blob_copy.extend_from_slice(data);
|
||||
per_blob_copy.extend_from_slice(&data);
|
||||
|
||||
let (tmp, res) = writer
|
||||
.put_value_bytes(
|
||||
@@ -1538,8 +1555,11 @@ impl<'a> DeltaLayerIterator<'a> {
|
||||
.read_blobs(&plan, buf, self.ctx)
|
||||
.await?;
|
||||
let frozen_buf = blobs_buf.buf.freeze();
|
||||
let view = BufView::new_bytes(frozen_buf);
|
||||
for meta in blobs_buf.blobs.iter() {
|
||||
let value = Value::des(&frozen_buf[meta.start..meta.end])?;
|
||||
let blob_read = meta.read(&view).await?;
|
||||
let value = Value::des(&blob_read)?;
|
||||
|
||||
next_batch.push_back((meta.meta.key, meta.meta.lsn, value));
|
||||
}
|
||||
self.key_values_batch = next_batch;
|
||||
@@ -1916,9 +1936,13 @@ pub(crate) mod test {
|
||||
let blobs_buf = vectored_blob_reader
|
||||
.read_blobs(&read, buf.take().expect("Should have a buffer"), &ctx)
|
||||
.await?;
|
||||
let view = BufView::new_slice(&blobs_buf.buf);
|
||||
for meta in blobs_buf.blobs.iter() {
|
||||
let value = &blobs_buf.buf[meta.start..meta.end];
|
||||
assert_eq!(value, entries_meta.index[&(meta.meta.key, meta.meta.lsn)]);
|
||||
let value = meta.read(&view).await?;
|
||||
assert_eq!(
|
||||
&value[..],
|
||||
&entries_meta.index[&(meta.meta.key, meta.meta.lsn)]
|
||||
);
|
||||
}
|
||||
|
||||
buf = Some(blobs_buf.buf);
|
||||
|
||||
205
pageserver/src/tenant/storage_layer/filter_iterator.rs
Normal file
205
pageserver/src/tenant/storage_layer/filter_iterator.rs
Normal file
@@ -0,0 +1,205 @@
|
||||
use std::ops::Range;
|
||||
|
||||
use anyhow::bail;
|
||||
use pageserver_api::{
|
||||
key::Key,
|
||||
keyspace::{KeySpace, SparseKeySpace},
|
||||
};
|
||||
use utils::lsn::Lsn;
|
||||
|
||||
use crate::repository::Value;
|
||||
|
||||
use super::merge_iterator::MergeIterator;
|
||||
|
||||
/// A filter iterator over merge iterators (and can be easily extended to other types of iterators).
|
||||
///
|
||||
/// The iterator will skip any keys not included in the keyspace filter. In other words, the keyspace filter contains the keys
|
||||
/// to be retained.
|
||||
pub struct FilterIterator<'a> {
|
||||
inner: MergeIterator<'a>,
|
||||
retain_key_filters: Vec<Range<Key>>,
|
||||
current_filter_idx: usize,
|
||||
}
|
||||
|
||||
impl<'a> FilterIterator<'a> {
|
||||
pub fn create(
|
||||
inner: MergeIterator<'a>,
|
||||
dense_keyspace: KeySpace,
|
||||
sparse_keyspace: SparseKeySpace,
|
||||
) -> anyhow::Result<Self> {
|
||||
let mut retain_key_filters = Vec::new();
|
||||
retain_key_filters.extend(dense_keyspace.ranges);
|
||||
retain_key_filters.extend(sparse_keyspace.0.ranges);
|
||||
retain_key_filters.sort_by(|a, b| a.start.cmp(&b.start));
|
||||
// Verify key filters are non-overlapping and sorted
|
||||
for window in retain_key_filters.windows(2) {
|
||||
if window[0].end > window[1].start {
|
||||
bail!(
|
||||
"Key filters are overlapping: {:?} and {:?}",
|
||||
window[0],
|
||||
window[1]
|
||||
);
|
||||
}
|
||||
}
|
||||
Ok(Self {
|
||||
inner,
|
||||
retain_key_filters,
|
||||
current_filter_idx: 0,
|
||||
})
|
||||
}
|
||||
|
||||
pub async fn next(&mut self) -> anyhow::Result<Option<(Key, Lsn, Value)>> {
|
||||
while let Some(item) = self.inner.next().await? {
|
||||
while self.current_filter_idx < self.retain_key_filters.len()
|
||||
&& item.0 >= self.retain_key_filters[self.current_filter_idx].end
|
||||
{
|
||||
// [filter region] [filter region] [filter region]
|
||||
// ^ item
|
||||
// ^ current filter
|
||||
self.current_filter_idx += 1;
|
||||
// [filter region] [filter region] [filter region]
|
||||
// ^ item
|
||||
// ^ current filter
|
||||
}
|
||||
if self.current_filter_idx >= self.retain_key_filters.len() {
|
||||
// We already exhausted all filters, so we should return now
|
||||
// [filter region] [filter region] [filter region]
|
||||
// ^ item
|
||||
// ^ current filter (nothing)
|
||||
return Ok(None);
|
||||
}
|
||||
if self.retain_key_filters[self.current_filter_idx].contains(&item.0) {
|
||||
// [filter region] [filter region] [filter region]
|
||||
// ^ item
|
||||
// ^ current filter
|
||||
return Ok(Some(item));
|
||||
}
|
||||
// If the key is not contained in the key retaining filters, continue to the next item.
|
||||
// [filter region] [filter region] [filter region]
|
||||
// ^ item
|
||||
// ^ current filter
|
||||
}
|
||||
Ok(None)
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
|
||||
use itertools::Itertools;
|
||||
use pageserver_api::key::Key;
|
||||
use utils::lsn::Lsn;
|
||||
|
||||
use crate::{
|
||||
tenant::{
|
||||
harness::{TenantHarness, TIMELINE_ID},
|
||||
storage_layer::delta_layer::test::produce_delta_layer,
|
||||
},
|
||||
DEFAULT_PG_VERSION,
|
||||
};
|
||||
|
||||
async fn assert_filter_iter_equal(
|
||||
filter_iter: &mut FilterIterator<'_>,
|
||||
expect: &[(Key, Lsn, Value)],
|
||||
) {
|
||||
let mut expect_iter = expect.iter();
|
||||
loop {
|
||||
let o1 = filter_iter.next().await.unwrap();
|
||||
let o2 = expect_iter.next();
|
||||
assert_eq!(o1.is_some(), o2.is_some());
|
||||
if o1.is_none() && o2.is_none() {
|
||||
break;
|
||||
}
|
||||
let (k1, l1, v1) = o1.unwrap();
|
||||
let (k2, l2, v2) = o2.unwrap();
|
||||
assert_eq!(&k1, k2);
|
||||
assert_eq!(l1, *l2);
|
||||
assert_eq!(&v1, v2);
|
||||
}
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn filter_keyspace_iterator() {
|
||||
use crate::repository::Value;
|
||||
use bytes::Bytes;
|
||||
|
||||
let harness = TenantHarness::create("filter_iterator_filter_keyspace_iterator")
|
||||
.await
|
||||
.unwrap();
|
||||
let (tenant, ctx) = harness.load().await;
|
||||
|
||||
let tline = tenant
|
||||
.create_test_timeline(TIMELINE_ID, Lsn(0x10), DEFAULT_PG_VERSION, &ctx)
|
||||
.await
|
||||
.unwrap();
|
||||
|
||||
fn get_key(id: u32) -> Key {
|
||||
let mut key = Key::from_hex("000000000033333333444444445500000000").unwrap();
|
||||
key.field6 = id;
|
||||
key
|
||||
}
|
||||
const N: usize = 100;
|
||||
let test_deltas1 = (0..N)
|
||||
.map(|idx| {
|
||||
(
|
||||
get_key(idx as u32),
|
||||
Lsn(0x20 * ((idx as u64) % 10 + 1)),
|
||||
Value::Image(Bytes::from(format!("img{idx:05}"))),
|
||||
)
|
||||
})
|
||||
.collect_vec();
|
||||
let resident_layer_1 = produce_delta_layer(&tenant, &tline, test_deltas1.clone(), &ctx)
|
||||
.await
|
||||
.unwrap();
|
||||
|
||||
let merge_iter = MergeIterator::create(
|
||||
&[resident_layer_1.get_as_delta(&ctx).await.unwrap()],
|
||||
&[],
|
||||
&ctx,
|
||||
);
|
||||
|
||||
let mut filter_iter = FilterIterator::create(
|
||||
merge_iter,
|
||||
KeySpace {
|
||||
ranges: vec![
|
||||
get_key(5)..get_key(10),
|
||||
get_key(20)..get_key(30),
|
||||
get_key(90)..get_key(110),
|
||||
get_key(1000)..get_key(2000),
|
||||
],
|
||||
},
|
||||
SparseKeySpace(KeySpace::default()),
|
||||
)
|
||||
.unwrap();
|
||||
let mut result = Vec::new();
|
||||
result.extend(test_deltas1[5..10].iter().cloned());
|
||||
result.extend(test_deltas1[20..30].iter().cloned());
|
||||
result.extend(test_deltas1[90..100].iter().cloned());
|
||||
assert_filter_iter_equal(&mut filter_iter, &result).await;
|
||||
|
||||
let merge_iter = MergeIterator::create(
|
||||
&[resident_layer_1.get_as_delta(&ctx).await.unwrap()],
|
||||
&[],
|
||||
&ctx,
|
||||
);
|
||||
|
||||
let mut filter_iter = FilterIterator::create(
|
||||
merge_iter,
|
||||
KeySpace {
|
||||
ranges: vec![
|
||||
get_key(0)..get_key(10),
|
||||
get_key(20)..get_key(30),
|
||||
get_key(90)..get_key(95),
|
||||
],
|
||||
},
|
||||
SparseKeySpace(KeySpace::default()),
|
||||
)
|
||||
.unwrap();
|
||||
let mut result = Vec::new();
|
||||
result.extend(test_deltas1[0..10].iter().cloned());
|
||||
result.extend(test_deltas1[20..30].iter().cloned());
|
||||
result.extend(test_deltas1[90..95].iter().cloned());
|
||||
assert_filter_iter_equal(&mut filter_iter, &result).await;
|
||||
}
|
||||
}
|
||||
@@ -36,9 +36,10 @@ use crate::tenant::disk_btree::{
|
||||
};
|
||||
use crate::tenant::timeline::GetVectoredError;
|
||||
use crate::tenant::vectored_blob_io::{
|
||||
BlobFlag, StreamingVectoredReadPlanner, VectoredBlobReader, VectoredRead, VectoredReadPlanner,
|
||||
BlobFlag, BufView, StreamingVectoredReadPlanner, VectoredBlobReader, VectoredRead,
|
||||
VectoredReadPlanner,
|
||||
};
|
||||
use crate::tenant::{PageReconstructError, Timeline};
|
||||
use crate::tenant::PageReconstructError;
|
||||
use crate::virtual_file::owned_buffers_io::io_buf_ext::IoBufExt;
|
||||
use crate::virtual_file::{self, VirtualFile};
|
||||
use crate::{IMAGE_FILE_MAGIC, STORAGE_FORMAT_VERSION, TEMP_FILE_SUFFIX};
|
||||
@@ -58,7 +59,6 @@ use std::io::SeekFrom;
|
||||
use std::ops::Range;
|
||||
use std::os::unix::prelude::FileExt;
|
||||
use std::str::FromStr;
|
||||
use std::sync::Arc;
|
||||
use tokio::sync::OnceCell;
|
||||
use tokio_stream::StreamExt;
|
||||
use tracing::*;
|
||||
@@ -70,9 +70,7 @@ use utils::{
|
||||
};
|
||||
|
||||
use super::layer_name::ImageLayerName;
|
||||
use super::{
|
||||
AsLayerDesc, Layer, LayerName, PersistentLayerDesc, ResidentLayer, ValuesReconstructState,
|
||||
};
|
||||
use super::{AsLayerDesc, LayerName, PersistentLayerDesc, ValuesReconstructState};
|
||||
|
||||
///
|
||||
/// Header stored in the beginning of the file
|
||||
@@ -550,15 +548,15 @@ impl ImageLayerInner {
|
||||
|
||||
let buf = BytesMut::with_capacity(buf_size);
|
||||
let blobs_buf = vectored_blob_reader.read_blobs(&read, buf, ctx).await?;
|
||||
|
||||
let frozen_buf = blobs_buf.buf.freeze();
|
||||
let view = BufView::new_bytes(frozen_buf);
|
||||
|
||||
for meta in blobs_buf.blobs.iter() {
|
||||
let img_buf = frozen_buf.slice(meta.start..meta.end);
|
||||
let img_buf = meta.read(&view).await?;
|
||||
|
||||
key_count += 1;
|
||||
writer
|
||||
.put_image(meta.meta.key, img_buf, ctx)
|
||||
.put_image(meta.meta.key, img_buf.into_bytes(), ctx)
|
||||
.await
|
||||
.context(format!("Storing key {}", meta.meta.key))?;
|
||||
}
|
||||
@@ -605,13 +603,28 @@ impl ImageLayerInner {
|
||||
match res {
|
||||
Ok(blobs_buf) => {
|
||||
let frozen_buf = blobs_buf.buf.freeze();
|
||||
|
||||
let view = BufView::new_bytes(frozen_buf);
|
||||
for meta in blobs_buf.blobs.iter() {
|
||||
let img_buf = frozen_buf.slice(meta.start..meta.end);
|
||||
let img_buf = meta.read(&view).await;
|
||||
|
||||
let img_buf = match img_buf {
|
||||
Ok(img_buf) => img_buf,
|
||||
Err(e) => {
|
||||
reconstruct_state.on_key_error(
|
||||
meta.meta.key,
|
||||
PageReconstructError::Other(anyhow!(e).context(format!(
|
||||
"Failed to decompress blob from virtual file {}",
|
||||
self.file.path,
|
||||
))),
|
||||
);
|
||||
|
||||
continue;
|
||||
}
|
||||
};
|
||||
reconstruct_state.update_key(
|
||||
&meta.meta.key,
|
||||
self.lsn,
|
||||
Value::Image(img_buf),
|
||||
Value::Image(img_buf.into_bytes()),
|
||||
);
|
||||
}
|
||||
}
|
||||
@@ -800,10 +813,9 @@ impl ImageLayerWriterInner {
|
||||
///
|
||||
async fn finish(
|
||||
self,
|
||||
timeline: &Arc<Timeline>,
|
||||
ctx: &RequestContext,
|
||||
end_key: Option<Key>,
|
||||
) -> anyhow::Result<ResidentLayer> {
|
||||
) -> anyhow::Result<(PersistentLayerDesc, Utf8PathBuf)> {
|
||||
let index_start_blk =
|
||||
((self.blob_writer.size() + PAGE_SZ as u64 - 1) / PAGE_SZ as u64) as u32;
|
||||
|
||||
@@ -879,12 +891,9 @@ impl ImageLayerWriterInner {
|
||||
// fsync the file
|
||||
file.sync_all().await?;
|
||||
|
||||
// FIXME: why not carry the virtualfile here, it supports renaming?
|
||||
let layer = Layer::finish_creating(self.conf, timeline, desc, &self.path)?;
|
||||
trace!("created image layer {}", self.path);
|
||||
|
||||
info!("created image layer {}", layer.local_path());
|
||||
|
||||
Ok(layer)
|
||||
Ok((desc, self.path))
|
||||
}
|
||||
}
|
||||
|
||||
@@ -963,24 +972,18 @@ impl ImageLayerWriter {
|
||||
///
|
||||
pub(crate) async fn finish(
|
||||
mut self,
|
||||
timeline: &Arc<Timeline>,
|
||||
ctx: &RequestContext,
|
||||
) -> anyhow::Result<super::ResidentLayer> {
|
||||
self.inner.take().unwrap().finish(timeline, ctx, None).await
|
||||
) -> anyhow::Result<(PersistentLayerDesc, Utf8PathBuf)> {
|
||||
self.inner.take().unwrap().finish(ctx, None).await
|
||||
}
|
||||
|
||||
/// Finish writing the image layer with an end key, used in [`super::split_writer::SplitImageLayerWriter`]. The end key determines the end of the image layer's covered range and is exclusive.
|
||||
pub(super) async fn finish_with_end_key(
|
||||
mut self,
|
||||
timeline: &Arc<Timeline>,
|
||||
end_key: Key,
|
||||
ctx: &RequestContext,
|
||||
) -> anyhow::Result<super::ResidentLayer> {
|
||||
self.inner
|
||||
.take()
|
||||
.unwrap()
|
||||
.finish(timeline, ctx, Some(end_key))
|
||||
.await
|
||||
) -> anyhow::Result<(PersistentLayerDesc, Utf8PathBuf)> {
|
||||
self.inner.take().unwrap().finish(ctx, Some(end_key)).await
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1038,10 +1041,15 @@ impl<'a> ImageLayerIterator<'a> {
|
||||
let blobs_buf = vectored_blob_reader
|
||||
.read_blobs(&plan, buf, self.ctx)
|
||||
.await?;
|
||||
let frozen_buf: Bytes = blobs_buf.buf.freeze();
|
||||
let frozen_buf = blobs_buf.buf.freeze();
|
||||
let view = BufView::new_bytes(frozen_buf);
|
||||
for meta in blobs_buf.blobs.iter() {
|
||||
let img_buf = frozen_buf.slice(meta.start..meta.end);
|
||||
next_batch.push_back((meta.meta.key, self.image_layer.lsn, Value::Image(img_buf)));
|
||||
let img_buf = meta.read(&view).await?;
|
||||
next_batch.push_back((
|
||||
meta.meta.key,
|
||||
self.image_layer.lsn,
|
||||
Value::Image(img_buf.into_bytes()),
|
||||
));
|
||||
}
|
||||
self.key_values_batch = next_batch;
|
||||
Ok(())
|
||||
@@ -1084,7 +1092,7 @@ mod test {
|
||||
tenant::{
|
||||
config::TenantConf,
|
||||
harness::{TenantHarness, TIMELINE_ID},
|
||||
storage_layer::ResidentLayer,
|
||||
storage_layer::{Layer, ResidentLayer},
|
||||
vectored_blob_io::StreamingVectoredReadPlanner,
|
||||
Tenant, Timeline,
|
||||
},
|
||||
@@ -1155,7 +1163,8 @@ mod test {
|
||||
|
||||
key = key.next();
|
||||
}
|
||||
writer.finish(&timeline, &ctx).await.unwrap()
|
||||
let (desc, path) = writer.finish(&ctx).await.unwrap();
|
||||
Layer::finish_creating(tenant.conf, &timeline, desc, &path).unwrap()
|
||||
};
|
||||
let original_size = resident.metadata().file_size;
|
||||
|
||||
@@ -1217,7 +1226,9 @@ mod test {
|
||||
.await
|
||||
.unwrap();
|
||||
let replacement = if wrote_keys > 0 {
|
||||
Some(filtered_writer.finish(&timeline, &ctx).await.unwrap())
|
||||
let (desc, path) = filtered_writer.finish(&ctx).await.unwrap();
|
||||
let resident = Layer::finish_creating(tenant.conf, &timeline, desc, &path).unwrap();
|
||||
Some(resident)
|
||||
} else {
|
||||
None
|
||||
};
|
||||
@@ -1290,7 +1301,8 @@ mod test {
|
||||
for (key, img) in images {
|
||||
writer.put_image(key, img, ctx).await?;
|
||||
}
|
||||
let img_layer = writer.finish(tline, ctx).await?;
|
||||
let (desc, path) = writer.finish(ctx).await?;
|
||||
let img_layer = Layer::finish_creating(tenant.conf, tline, desc, &path)?;
|
||||
|
||||
Ok::<_, anyhow::Error>(img_layer)
|
||||
}
|
||||
|
||||
@@ -439,11 +439,30 @@ impl Layer {
|
||||
|
||||
fn record_access(&self, ctx: &RequestContext) {
|
||||
if self.0.access_stats.record_access(ctx) {
|
||||
// Visibility was modified to Visible
|
||||
tracing::info!(
|
||||
"Layer {} became visible as a result of access",
|
||||
self.0.desc.key()
|
||||
);
|
||||
// Visibility was modified to Visible: maybe log about this
|
||||
match ctx.task_kind() {
|
||||
TaskKind::CalculateSyntheticSize
|
||||
| TaskKind::GarbageCollector
|
||||
| TaskKind::MgmtRequest => {
|
||||
// This situation is expected in code paths do binary searches of the LSN space to resolve
|
||||
// an LSN to a timestamp, which happens during GC, during GC cutoff calculations in synthetic size,
|
||||
// and on-demand for certain HTTP API requests.
|
||||
}
|
||||
_ => {
|
||||
// In all other contexts, it is unusual to do I/O involving layers which are not visible at
|
||||
// some branch tip, so we log the fact that we are accessing something that the visibility
|
||||
// calculation thought should not be visible.
|
||||
//
|
||||
// This case is legal in brief time windows: for example an in-flight getpage request can hold on to a layer object
|
||||
// which was covered by a concurrent compaction.
|
||||
tracing::info!(
|
||||
"Layer {} became visible as a result of access",
|
||||
self.0.desc.key()
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
// Update the timeline's visible bytes count
|
||||
if let Some(tl) = self.0.timeline.upgrade() {
|
||||
tl.metrics
|
||||
.visible_physical_size_gauge
|
||||
|
||||
@@ -1025,6 +1025,15 @@ fn access_stats() {
|
||||
assert_eq!(access_stats.latest_activity(), lowres_time(atime));
|
||||
access_stats.set_visibility(LayerVisibilityHint::Visible);
|
||||
assert_eq!(access_stats.latest_activity(), lowres_time(atime));
|
||||
|
||||
// Recording access implicitly makes layer visible, if it wasn't already
|
||||
let atime = UNIX_EPOCH + Duration::from_secs(2200000000);
|
||||
access_stats.set_visibility(LayerVisibilityHint::Covered);
|
||||
assert_eq!(access_stats.visibility(), LayerVisibilityHint::Covered);
|
||||
assert!(access_stats.record_access_at(atime));
|
||||
access_stats.set_visibility(LayerVisibilityHint::Visible);
|
||||
assert!(!access_stats.record_access_at(atime));
|
||||
access_stats.set_visibility(LayerVisibilityHint::Visible);
|
||||
}
|
||||
|
||||
#[test]
|
||||
|
||||
@@ -121,11 +121,11 @@ impl SplitImageLayerWriter {
|
||||
self.generated_layers
|
||||
.push(SplitWriterResult::Discarded(layer_key));
|
||||
} else {
|
||||
self.generated_layers.push(SplitWriterResult::Produced(
|
||||
prev_image_writer
|
||||
.finish_with_end_key(tline, key, ctx)
|
||||
.await?,
|
||||
));
|
||||
let (desc, path) = prev_image_writer.finish_with_end_key(key, ctx).await?;
|
||||
|
||||
let layer = Layer::finish_creating(self.conf, tline, desc, &path)?;
|
||||
self.generated_layers
|
||||
.push(SplitWriterResult::Produced(layer));
|
||||
}
|
||||
}
|
||||
self.inner.put_image(key, img, ctx).await
|
||||
@@ -170,9 +170,9 @@ impl SplitImageLayerWriter {
|
||||
if discard(&layer_key).await {
|
||||
generated_layers.push(SplitWriterResult::Discarded(layer_key));
|
||||
} else {
|
||||
generated_layers.push(SplitWriterResult::Produced(
|
||||
inner.finish_with_end_key(tline, end_key, ctx).await?,
|
||||
));
|
||||
let (desc, path) = inner.finish_with_end_key(end_key, ctx).await?;
|
||||
let layer = Layer::finish_creating(self.conf, tline, desc, &path)?;
|
||||
generated_layers.push(SplitWriterResult::Produced(layer));
|
||||
}
|
||||
Ok(generated_layers)
|
||||
}
|
||||
|
||||
@@ -163,8 +163,6 @@ async fn compaction_loop(tenant: Arc<Tenant>, cancel: CancellationToken) {
|
||||
// How many errors we have seen consequtively
|
||||
let mut error_run_count = 0;
|
||||
|
||||
let mut last_throttle_flag_reset_at = Instant::now();
|
||||
|
||||
TENANT_TASK_EVENTS.with_label_values(&["start"]).inc();
|
||||
async {
|
||||
let ctx = RequestContext::todo_child(TaskKind::Compaction, DownloadBehavior::Download);
|
||||
@@ -191,8 +189,6 @@ async fn compaction_loop(tenant: Arc<Tenant>, cancel: CancellationToken) {
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
|
||||
let sleep_duration;
|
||||
if period == Duration::ZERO {
|
||||
#[cfg(not(feature = "testing"))]
|
||||
@@ -207,12 +203,18 @@ async fn compaction_loop(tenant: Arc<Tenant>, cancel: CancellationToken) {
|
||||
};
|
||||
|
||||
// Run compaction
|
||||
let IterationResult { output, elapsed } = iteration.run(tenant.compaction_iteration(&cancel, &ctx)).await;
|
||||
let IterationResult { output, elapsed } = iteration
|
||||
.run(tenant.compaction_iteration(&cancel, &ctx))
|
||||
.await;
|
||||
match output {
|
||||
Ok(has_pending_task) => {
|
||||
error_run_count = 0;
|
||||
// schedule the next compaction immediately in case there is a pending compaction task
|
||||
sleep_duration = if has_pending_task { Duration::ZERO } else { period };
|
||||
sleep_duration = if has_pending_task {
|
||||
Duration::ZERO
|
||||
} else {
|
||||
period
|
||||
};
|
||||
}
|
||||
Err(e) => {
|
||||
let wait_duration = backoff::exponential_backoff_duration_seconds(
|
||||
@@ -233,38 +235,20 @@ async fn compaction_loop(tenant: Arc<Tenant>, cancel: CancellationToken) {
|
||||
}
|
||||
|
||||
// the duration is recorded by performance tests by enabling debug in this function
|
||||
tracing::debug!(elapsed_ms=elapsed.as_millis(), "compaction iteration complete");
|
||||
tracing::debug!(
|
||||
elapsed_ms = elapsed.as_millis(),
|
||||
"compaction iteration complete"
|
||||
);
|
||||
};
|
||||
|
||||
|
||||
// Perhaps we did no work and the walredo process has been idle for some time:
|
||||
// give it a chance to shut down to avoid leaving walredo process running indefinitely.
|
||||
// TODO: move this to a separate task (housekeeping loop) that isn't affected by the back-off,
|
||||
// so we get some upper bound guarantee on when walredo quiesce / this throttling reporting here happens.
|
||||
if let Some(walredo_mgr) = &tenant.walredo_mgr {
|
||||
walredo_mgr.maybe_quiesce(period * 10);
|
||||
}
|
||||
|
||||
// TODO: move this (and walredo quiesce) to a separate task that isn't affected by the back-off,
|
||||
// so we get some upper bound guarantee on when walredo quiesce / this throttling reporting here happens.
|
||||
info_span!(parent: None, "timeline_get_throttle", tenant_id=%tenant.tenant_shard_id, shard_id=%tenant.tenant_shard_id.shard_slug()).in_scope(|| {
|
||||
let now = Instant::now();
|
||||
let prev = std::mem::replace(&mut last_throttle_flag_reset_at, now);
|
||||
let Stats { count_accounted, count_throttled, sum_throttled_usecs } = tenant.timeline_get_throttle.reset_stats();
|
||||
if count_throttled == 0 {
|
||||
return;
|
||||
}
|
||||
let allowed_rps = tenant.timeline_get_throttle.steady_rps();
|
||||
let delta = now - prev;
|
||||
info!(
|
||||
n_seconds=%format_args!("{:.3}",
|
||||
delta.as_secs_f64()),
|
||||
count_accounted,
|
||||
count_throttled,
|
||||
sum_throttled_usecs,
|
||||
allowed_rps=%format_args!("{allowed_rps:.0}"),
|
||||
"shard was throttled in the last n_seconds"
|
||||
);
|
||||
});
|
||||
|
||||
// Sleep
|
||||
if tokio::time::timeout(sleep_duration, cancel.cancelled())
|
||||
.await
|
||||
@@ -346,6 +330,7 @@ async fn gc_loop(tenant: Arc<Tenant>, cancel: CancellationToken) {
|
||||
RequestContext::todo_child(TaskKind::GarbageCollector, DownloadBehavior::Download);
|
||||
|
||||
let mut first = true;
|
||||
tenant.gc_block.set_lsn_lease_deadline(tenant.get_lsn_lease_length());
|
||||
loop {
|
||||
tokio::select! {
|
||||
_ = cancel.cancelled() => {
|
||||
@@ -363,7 +348,6 @@ async fn gc_loop(tenant: Arc<Tenant>, cancel: CancellationToken) {
|
||||
first = false;
|
||||
|
||||
let delays = async {
|
||||
delay_by_lease_length(tenant.get_lsn_lease_length(), &cancel).await?;
|
||||
random_init_delay(period, &cancel).await?;
|
||||
Ok::<_, Cancelled>(())
|
||||
};
|
||||
@@ -437,6 +421,7 @@ async fn gc_loop(tenant: Arc<Tenant>, cancel: CancellationToken) {
|
||||
async fn ingest_housekeeping_loop(tenant: Arc<Tenant>, cancel: CancellationToken) {
|
||||
TENANT_TASK_EVENTS.with_label_values(&["start"]).inc();
|
||||
async {
|
||||
let mut last_throttle_flag_reset_at = Instant::now();
|
||||
loop {
|
||||
tokio::select! {
|
||||
_ = cancel.cancelled() => {
|
||||
@@ -483,6 +468,28 @@ async fn ingest_housekeeping_loop(tenant: Arc<Tenant>, cancel: CancellationToken
|
||||
kind: BackgroundLoopKind::IngestHouseKeeping,
|
||||
};
|
||||
iteration.run(tenant.ingest_housekeeping()).await;
|
||||
|
||||
// TODO: rename the background loop kind to something more generic, like, tenant housekeeping.
|
||||
// Or just spawn another background loop for this throttle, it's not like it's super costly.
|
||||
info_span!(parent: None, "timeline_get_throttle", tenant_id=%tenant.tenant_shard_id, shard_id=%tenant.tenant_shard_id.shard_slug()).in_scope(|| {
|
||||
let now = Instant::now();
|
||||
let prev = std::mem::replace(&mut last_throttle_flag_reset_at, now);
|
||||
let Stats { count_accounted_start, count_accounted_finish, count_throttled, sum_throttled_usecs} = tenant.timeline_get_throttle.reset_stats();
|
||||
if count_throttled == 0 {
|
||||
return;
|
||||
}
|
||||
let allowed_rps = tenant.timeline_get_throttle.steady_rps();
|
||||
let delta = now - prev;
|
||||
info!(
|
||||
n_seconds=%format_args!("{:.3}", delta.as_secs_f64()),
|
||||
count_accounted = count_accounted_finish, // don't break existing log scraping
|
||||
count_throttled,
|
||||
sum_throttled_usecs,
|
||||
count_accounted_start, // log after pre-existing fields to not break existing log scraping
|
||||
allowed_rps=%format_args!("{allowed_rps:.0}"),
|
||||
"shard was throttled in the last n_seconds"
|
||||
);
|
||||
});
|
||||
}
|
||||
}
|
||||
.await;
|
||||
@@ -538,28 +545,12 @@ pub(crate) async fn random_init_delay(
|
||||
let mut rng = rand::thread_rng();
|
||||
rng.gen_range(Duration::ZERO..=period)
|
||||
};
|
||||
|
||||
match tokio::time::timeout(d, cancel.cancelled()).await {
|
||||
Ok(_) => Err(Cancelled),
|
||||
Err(_) => Ok(()),
|
||||
}
|
||||
}
|
||||
|
||||
/// Delays GC by defaul lease length at restart.
|
||||
///
|
||||
/// We do this as the leases mapping are not persisted to disk. By delaying GC by default
|
||||
/// length, we gurantees that all the leases we granted before the restart will expire
|
||||
/// when we run GC for the first time after the restart.
|
||||
pub(crate) async fn delay_by_lease_length(
|
||||
length: Duration,
|
||||
cancel: &CancellationToken,
|
||||
) -> Result<(), Cancelled> {
|
||||
match tokio::time::timeout(length, cancel.cancelled()).await {
|
||||
Ok(_) => Err(Cancelled),
|
||||
Err(_) => Ok(()),
|
||||
}
|
||||
}
|
||||
|
||||
struct Iteration {
|
||||
started_at: Instant,
|
||||
period: Duration,
|
||||
|
||||
@@ -24,8 +24,10 @@ use crate::{context::RequestContext, task_mgr::TaskKind};
|
||||
pub struct Throttle<M: Metric> {
|
||||
inner: ArcSwap<Inner>,
|
||||
metric: M,
|
||||
/// will be turned into [`Stats::count_accounted`]
|
||||
count_accounted: AtomicU64,
|
||||
/// will be turned into [`Stats::count_accounted_start`]
|
||||
count_accounted_start: AtomicU64,
|
||||
/// will be turned into [`Stats::count_accounted_finish`]
|
||||
count_accounted_finish: AtomicU64,
|
||||
/// will be turned into [`Stats::count_throttled`]
|
||||
count_throttled: AtomicU64,
|
||||
/// will be turned into [`Stats::sum_throttled_usecs`]
|
||||
@@ -43,17 +45,21 @@ pub struct Observation {
|
||||
pub wait_time: Duration,
|
||||
}
|
||||
pub trait Metric {
|
||||
fn accounting_start(&self);
|
||||
fn accounting_finish(&self);
|
||||
fn observe_throttling(&self, observation: &Observation);
|
||||
}
|
||||
|
||||
/// See [`Throttle::reset_stats`].
|
||||
pub struct Stats {
|
||||
// Number of requests that were subject to throttling, i.e., requests of the configured [`Config::task_kinds`].
|
||||
pub count_accounted: u64,
|
||||
// Subset of the `accounted` requests that were actually throttled.
|
||||
// Note that the numbers are stored as two independent atomics, so, there might be a slight drift.
|
||||
/// Number of requests that started [`Throttle::throttle`] calls.
|
||||
pub count_accounted_start: u64,
|
||||
/// Number of requests that finished [`Throttle::throttle`] calls.
|
||||
pub count_accounted_finish: u64,
|
||||
/// Subset of the `accounted` requests that were actually throttled.
|
||||
/// Note that the numbers are stored as two independent atomics, so, there might be a slight drift.
|
||||
pub count_throttled: u64,
|
||||
// Sum of microseconds that throttled requests spent waiting for throttling.
|
||||
/// Sum of microseconds that throttled requests spent waiting for throttling.
|
||||
pub sum_throttled_usecs: u64,
|
||||
}
|
||||
|
||||
@@ -65,7 +71,8 @@ where
|
||||
Self {
|
||||
inner: ArcSwap::new(Arc::new(Self::new_inner(config))),
|
||||
metric,
|
||||
count_accounted: AtomicU64::new(0),
|
||||
count_accounted_start: AtomicU64::new(0),
|
||||
count_accounted_finish: AtomicU64::new(0),
|
||||
count_throttled: AtomicU64::new(0),
|
||||
sum_throttled_usecs: AtomicU64::new(0),
|
||||
}
|
||||
@@ -117,11 +124,13 @@ where
|
||||
/// This method allows retrieving & resetting that flag.
|
||||
/// Useful for periodic reporting.
|
||||
pub fn reset_stats(&self) -> Stats {
|
||||
let count_accounted = self.count_accounted.swap(0, Ordering::Relaxed);
|
||||
let count_accounted_start = self.count_accounted_start.swap(0, Ordering::Relaxed);
|
||||
let count_accounted_finish = self.count_accounted_finish.swap(0, Ordering::Relaxed);
|
||||
let count_throttled = self.count_throttled.swap(0, Ordering::Relaxed);
|
||||
let sum_throttled_usecs = self.sum_throttled_usecs.swap(0, Ordering::Relaxed);
|
||||
Stats {
|
||||
count_accounted,
|
||||
count_accounted_start,
|
||||
count_accounted_finish,
|
||||
count_throttled,
|
||||
sum_throttled_usecs,
|
||||
}
|
||||
@@ -139,9 +148,12 @@ where
|
||||
};
|
||||
let start = std::time::Instant::now();
|
||||
|
||||
self.metric.accounting_start();
|
||||
self.count_accounted_start.fetch_add(1, Ordering::Relaxed);
|
||||
let did_throttle = inner.rate_limiter.acquire(key_count).await;
|
||||
self.count_accounted_finish.fetch_add(1, Ordering::Relaxed);
|
||||
self.metric.accounting_finish();
|
||||
|
||||
self.count_accounted.fetch_add(1, Ordering::Relaxed);
|
||||
if did_throttle {
|
||||
self.count_throttled.fetch_add(1, Ordering::Relaxed);
|
||||
let now = Instant::now();
|
||||
|
||||
@@ -196,9 +196,8 @@ fn drop_wlock<T>(rlock: tokio::sync::RwLockWriteGuard<'_, T>) {
|
||||
/// The outward-facing resources required to build a Timeline
|
||||
pub struct TimelineResources {
|
||||
pub remote_client: RemoteTimelineClient,
|
||||
pub timeline_get_throttle: Arc<
|
||||
crate::tenant::throttle::Throttle<&'static crate::metrics::tenant_throttling::TimelineGet>,
|
||||
>,
|
||||
pub timeline_get_throttle:
|
||||
Arc<crate::tenant::throttle::Throttle<crate::metrics::tenant_throttling::TimelineGet>>,
|
||||
pub l0_flush_global_state: l0_flush::L0FlushGlobalState,
|
||||
}
|
||||
|
||||
@@ -406,9 +405,8 @@ pub struct Timeline {
|
||||
gc_lock: tokio::sync::Mutex<()>,
|
||||
|
||||
/// Cloned from [`super::Tenant::timeline_get_throttle`] on construction.
|
||||
timeline_get_throttle: Arc<
|
||||
crate::tenant::throttle::Throttle<&'static crate::metrics::tenant_throttling::TimelineGet>,
|
||||
>,
|
||||
timeline_get_throttle:
|
||||
Arc<crate::tenant::throttle::Throttle<crate::metrics::tenant_throttling::TimelineGet>>,
|
||||
|
||||
/// Keep aux directory cache to avoid it's reconstruction on each update
|
||||
pub(crate) aux_files: tokio::sync::Mutex<AuxFilesState>,
|
||||
@@ -4013,7 +4011,9 @@ impl Timeline {
|
||||
if wrote_keys {
|
||||
// Normal path: we have written some data into the new image layer for this
|
||||
// partition, so flush it to disk.
|
||||
let image_layer = image_layer_writer.finish(self, ctx).await?;
|
||||
let (desc, path) = image_layer_writer.finish(ctx).await?;
|
||||
let image_layer = Layer::finish_creating(self.conf, self, desc, &path)?;
|
||||
info!("created image layer for rel {}", image_layer.local_path());
|
||||
Ok(ImageLayerCreationOutcome {
|
||||
image: Some(image_layer),
|
||||
next_start_key: img_range.end,
|
||||
@@ -4101,7 +4101,12 @@ impl Timeline {
|
||||
if wrote_any_image {
|
||||
// Normal path: we have written some data into the new image layer for this
|
||||
// partition, so flush it to disk.
|
||||
let image_layer = image_layer_writer.finish(self, ctx).await?;
|
||||
let (desc, path) = image_layer_writer.finish(ctx).await?;
|
||||
let image_layer = Layer::finish_creating(self.conf, self, desc, &path)?;
|
||||
info!(
|
||||
"created image layer for metadata {}",
|
||||
image_layer.local_path()
|
||||
);
|
||||
Ok(ImageLayerCreationOutcome {
|
||||
image: Some(image_layer),
|
||||
next_start_key: img_range.end,
|
||||
@@ -4309,7 +4314,9 @@ impl Timeline {
|
||||
timer.stop_and_record();
|
||||
|
||||
// Creating image layers may have caused some previously visible layers to be covered
|
||||
self.update_layer_visibility().await?;
|
||||
if !image_layers.is_empty() {
|
||||
self.update_layer_visibility().await?;
|
||||
}
|
||||
|
||||
Ok(image_layers)
|
||||
}
|
||||
@@ -5371,7 +5378,8 @@ impl Timeline {
|
||||
/// Force create an image layer and place it into the layer map.
|
||||
///
|
||||
/// DO NOT use this function directly. Use [`Tenant::branch_timeline_test_with_layers`]
|
||||
/// or [`Tenant::create_test_timeline_with_layers`] to ensure all these layers are placed into the layer map in one run.
|
||||
/// or [`Tenant::create_test_timeline_with_layers`] to ensure all these layers are
|
||||
/// placed into the layer map in one run AND be validated.
|
||||
#[cfg(test)]
|
||||
pub(super) async fn force_create_image_layer(
|
||||
self: &Arc<Timeline>,
|
||||
@@ -5403,8 +5411,9 @@ impl Timeline {
|
||||
for (key, img) in images {
|
||||
image_layer_writer.put_image(key, img, ctx).await?;
|
||||
}
|
||||
let image_layer = image_layer_writer.finish(self, ctx).await?;
|
||||
|
||||
let (desc, path) = image_layer_writer.finish(ctx).await?;
|
||||
let image_layer = Layer::finish_creating(self.conf, self, desc, &path)?;
|
||||
info!("force created image layer {}", image_layer.local_path());
|
||||
{
|
||||
let mut guard = self.layers.write().await;
|
||||
guard.open_mut().unwrap().force_insert_layer(image_layer);
|
||||
@@ -5416,7 +5425,8 @@ impl Timeline {
|
||||
/// Force create a delta layer and place it into the layer map.
|
||||
///
|
||||
/// DO NOT use this function directly. Use [`Tenant::branch_timeline_test_with_layers`]
|
||||
/// or [`Tenant::create_test_timeline_with_layers`] to ensure all these layers are placed into the layer map in one run.
|
||||
/// or [`Tenant::create_test_timeline_with_layers`] to ensure all these layers are
|
||||
/// placed into the layer map in one run AND be validated.
|
||||
#[cfg(test)]
|
||||
pub(super) async fn force_create_delta_layer(
|
||||
self: &Arc<Timeline>,
|
||||
@@ -5442,33 +5452,6 @@ impl Timeline {
|
||||
if let Some(check_start_lsn) = check_start_lsn {
|
||||
assert!(deltas.lsn_range.start >= check_start_lsn);
|
||||
}
|
||||
// check if the delta layer does not violate the LSN invariant, the legacy compaction should always produce a batch of
|
||||
// layers of the same start/end LSN, and so should the force inserted layer
|
||||
{
|
||||
/// Checks if a overlaps with b, assume a/b = [start, end).
|
||||
pub fn overlaps_with<T: Ord>(a: &Range<T>, b: &Range<T>) -> bool {
|
||||
!(a.end <= b.start || b.end <= a.start)
|
||||
}
|
||||
|
||||
if deltas.key_range.start.next() != deltas.key_range.end {
|
||||
let guard = self.layers.read().await;
|
||||
let mut invalid_layers =
|
||||
guard.layer_map()?.iter_historic_layers().filter(|layer| {
|
||||
layer.is_delta()
|
||||
&& overlaps_with(&layer.lsn_range, &deltas.lsn_range)
|
||||
&& layer.lsn_range != deltas.lsn_range
|
||||
// skip single-key layer files
|
||||
&& layer.key_range.start.next() != layer.key_range.end
|
||||
});
|
||||
if let Some(layer) = invalid_layers.next() {
|
||||
// If a delta layer overlaps with another delta layer AND their LSN range is not the same, panic
|
||||
panic!(
|
||||
"inserted layer violates delta layer LSN invariant: current_lsn_range={}..{}, conflict_lsn_range={}..{}",
|
||||
deltas.lsn_range.start, deltas.lsn_range.end, layer.lsn_range.start, layer.lsn_range.end
|
||||
);
|
||||
}
|
||||
}
|
||||
}
|
||||
let mut delta_layer_writer = DeltaLayerWriter::new(
|
||||
self.conf,
|
||||
self.timeline_id,
|
||||
@@ -5483,7 +5466,7 @@ impl Timeline {
|
||||
}
|
||||
let (desc, path) = delta_layer_writer.finish(deltas.key_range.end, ctx).await?;
|
||||
let delta_layer = Layer::finish_creating(self.conf, self, desc, &path)?;
|
||||
|
||||
info!("force created delta layer {}", delta_layer.local_path());
|
||||
{
|
||||
let mut guard = self.layers.write().await;
|
||||
guard.open_mut().unwrap().force_insert_layer(delta_layer);
|
||||
|
||||
@@ -29,7 +29,9 @@ use utils::id::TimelineId;
|
||||
|
||||
use crate::context::{AccessStatsBehavior, RequestContext, RequestContextBuilder};
|
||||
use crate::page_cache;
|
||||
use crate::tenant::checks::check_valid_layermap;
|
||||
use crate::tenant::remote_timeline_client::WaitCompletionError;
|
||||
use crate::tenant::storage_layer::filter_iterator::FilterIterator;
|
||||
use crate::tenant::storage_layer::merge_iterator::MergeIterator;
|
||||
use crate::tenant::storage_layer::split_writer::{
|
||||
SplitDeltaLayerWriter, SplitImageLayerWriter, SplitWriterResult,
|
||||
@@ -563,10 +565,12 @@ impl Timeline {
|
||||
.await?;
|
||||
|
||||
if keys_written > 0 {
|
||||
let new_layer = image_layer_writer
|
||||
.finish(self, ctx)
|
||||
let (desc, path) = image_layer_writer
|
||||
.finish(ctx)
|
||||
.await
|
||||
.map_err(CompactionError::Other)?;
|
||||
let new_layer = Layer::finish_creating(self.conf, self, desc, &path)
|
||||
.map_err(CompactionError::Other)?;
|
||||
tracing::info!(layer=%new_layer, "Rewrote layer, {} -> {} bytes",
|
||||
layer.metadata().file_size,
|
||||
new_layer.metadata().file_size);
|
||||
@@ -1769,6 +1773,7 @@ impl Timeline {
|
||||
gc_cutoff,
|
||||
lowest_retain_lsn
|
||||
);
|
||||
|
||||
// Step 1: (In the future) construct a k-merge iterator over all layers. For now, simply collect all keys + LSNs.
|
||||
// Also, verify if the layer map can be split by drawing a horizontal line at every LSN start/end split point.
|
||||
let mut lsn_split_point = BTreeSet::new(); // TODO: use a better data structure (range tree / range set?)
|
||||
@@ -1786,20 +1791,12 @@ impl Timeline {
|
||||
stat.visit_image_layer(desc.file_size());
|
||||
}
|
||||
}
|
||||
for layer in &layer_selection {
|
||||
let desc = layer.layer_desc();
|
||||
let key_range = &desc.key_range;
|
||||
if desc.is_delta() && key_range.start.next() != key_range.end {
|
||||
let lsn_range = desc.lsn_range.clone();
|
||||
let intersects = lsn_split_point.range(lsn_range).collect_vec();
|
||||
if intersects.len() > 1 {
|
||||
bail!(
|
||||
"cannot run gc-compaction because it violates the layer map LSN split assumption: layer {} intersects with LSN [{}]",
|
||||
desc.key(),
|
||||
intersects.into_iter().map(|lsn| lsn.to_string()).join(", ")
|
||||
);
|
||||
}
|
||||
}
|
||||
let layer_names: Vec<crate::tenant::storage_layer::LayerName> = layer_selection
|
||||
.iter()
|
||||
.map(|layer| layer.layer_desc().layer_name())
|
||||
.collect_vec();
|
||||
if let Some(err) = check_valid_layermap(&layer_names) {
|
||||
bail!("cannot run gc-compaction because {}", err);
|
||||
}
|
||||
// The maximum LSN we are processing in this compaction loop
|
||||
let end_lsn = layer_selection
|
||||
@@ -1825,7 +1822,12 @@ impl Timeline {
|
||||
image_layers.push(layer);
|
||||
}
|
||||
}
|
||||
let mut merge_iter = MergeIterator::create(&delta_layers, &image_layers, ctx);
|
||||
let (dense_ks, sparse_ks) = self.collect_gc_compaction_keyspace().await?;
|
||||
let mut merge_iter = FilterIterator::create(
|
||||
MergeIterator::create(&delta_layers, &image_layers, ctx),
|
||||
dense_ks,
|
||||
sparse_ks,
|
||||
)?;
|
||||
// Step 2: Produce images+deltas. TODO: ensure newly-produced delta does not overlap with other deltas.
|
||||
// Data of the same key.
|
||||
let mut accumulated_values = Vec::new();
|
||||
|
||||
@@ -135,25 +135,6 @@ async fn delete_remote_layers_and_index(timeline: &Timeline) -> anyhow::Result<(
|
||||
.context("delete_all")
|
||||
}
|
||||
|
||||
// This function removs remaining traces of a timeline on disk.
|
||||
// Namely: metadata file, timeline directory, delete mark.
|
||||
// Note: io::ErrorKind::NotFound are ignored for metadata and timeline dir.
|
||||
// delete mark should be present because it is the last step during deletion.
|
||||
// (nothing can fail after its deletion)
|
||||
async fn cleanup_remaining_timeline_fs_traces(
|
||||
conf: &PageServerConf,
|
||||
tenant_shard_id: TenantShardId,
|
||||
timeline_id: TimelineId,
|
||||
) -> anyhow::Result<()> {
|
||||
// Remove delete mark
|
||||
// TODO: once we are confident that no more exist in the field, remove this
|
||||
// line. It cleans up a legacy marker file that might in rare cases be present.
|
||||
tokio::fs::remove_file(conf.timeline_delete_mark_file_path(tenant_shard_id, timeline_id))
|
||||
.await
|
||||
.or_else(fs_ext::ignore_not_found)
|
||||
.context("remove delete mark")
|
||||
}
|
||||
|
||||
/// It is important that this gets called when DeletionGuard is being held.
|
||||
/// For more context see comments in [`DeleteTimelineFlow::prepare`]
|
||||
async fn remove_timeline_from_tenant(
|
||||
@@ -194,12 +175,10 @@ async fn remove_timeline_from_tenant(
|
||||
/// 7. Delete mark file
|
||||
///
|
||||
/// It is resumable from any step in case a crash/restart occurs.
|
||||
/// There are three entrypoints to the process:
|
||||
/// There are two entrypoints to the process:
|
||||
/// 1. [`DeleteTimelineFlow::run`] this is the main one called by a management api handler.
|
||||
/// 2. [`DeleteTimelineFlow::resume_deletion`] is called during restarts when local metadata is still present
|
||||
/// and we possibly neeed to continue deletion of remote files.
|
||||
/// 3. [`DeleteTimelineFlow::cleanup_remaining_timeline_fs_traces`] is used when we deleted remote
|
||||
/// index but still have local metadata, timeline directory and delete mark.
|
||||
///
|
||||
/// Note the only other place that messes around timeline delete mark is the logic that scans directory with timelines during tenant load.
|
||||
#[derive(Default)]
|
||||
@@ -311,18 +290,6 @@ impl DeleteTimelineFlow {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[instrument(skip_all, fields(%timeline_id))]
|
||||
pub async fn cleanup_remaining_timeline_fs_traces(
|
||||
tenant: &Tenant,
|
||||
timeline_id: TimelineId,
|
||||
) -> anyhow::Result<()> {
|
||||
let r =
|
||||
cleanup_remaining_timeline_fs_traces(tenant.conf, tenant.tenant_shard_id, timeline_id)
|
||||
.await;
|
||||
info!("Done");
|
||||
r
|
||||
}
|
||||
|
||||
fn prepare(
|
||||
tenant: &Tenant,
|
||||
timeline_id: TimelineId,
|
||||
|
||||
@@ -30,8 +30,8 @@ use crate::{
|
||||
pgdatadir_mapping::CollectKeySpaceError,
|
||||
task_mgr::{self, TaskKind, BACKGROUND_RUNTIME},
|
||||
tenant::{
|
||||
storage_layer::LayerVisibilityHint, tasks::BackgroundLoopKind, timeline::EvictionError,
|
||||
LogicalSizeCalculationCause, Tenant,
|
||||
size::CalculateSyntheticSizeError, storage_layer::LayerVisibilityHint,
|
||||
tasks::BackgroundLoopKind, timeline::EvictionError, LogicalSizeCalculationCause, Tenant,
|
||||
},
|
||||
};
|
||||
|
||||
@@ -557,6 +557,8 @@ impl Timeline {
|
||||
gather_result = gather => {
|
||||
match gather_result {
|
||||
Ok(_) => {},
|
||||
// It can happen sometimes that we hit this instead of the cancellation token firing above
|
||||
Err(CalculateSyntheticSizeError::Cancelled) => {}
|
||||
Err(e) => {
|
||||
// We don't care about the result, but, if it failed, we should log it,
|
||||
// since consumption metric might be hitting the cached value and
|
||||
|
||||
@@ -16,8 +16,9 @@
|
||||
//! Note that the vectored blob api does *not* go through the page cache.
|
||||
|
||||
use std::collections::BTreeMap;
|
||||
use std::ops::Deref;
|
||||
|
||||
use bytes::BytesMut;
|
||||
use bytes::{Bytes, BytesMut};
|
||||
use pageserver_api::key::Key;
|
||||
use tokio::io::AsyncWriteExt;
|
||||
use tokio_epoll_uring::BoundedBuf;
|
||||
@@ -35,11 +36,123 @@ pub struct BlobMeta {
|
||||
pub lsn: Lsn,
|
||||
}
|
||||
|
||||
/// Blob offsets into [`VectoredBlobsBuf::buf`]
|
||||
/// A view into the vectored blobs read buffer.
|
||||
#[derive(Clone, Debug)]
|
||||
pub(crate) enum BufView<'a> {
|
||||
Slice(&'a [u8]),
|
||||
Bytes(bytes::Bytes),
|
||||
}
|
||||
|
||||
impl<'a> BufView<'a> {
|
||||
/// Creates a new slice-based view on the blob.
|
||||
pub fn new_slice(slice: &'a [u8]) -> Self {
|
||||
Self::Slice(slice)
|
||||
}
|
||||
|
||||
/// Creates a new [`bytes::Bytes`]-based view on the blob.
|
||||
pub fn new_bytes(bytes: bytes::Bytes) -> Self {
|
||||
Self::Bytes(bytes)
|
||||
}
|
||||
|
||||
/// Convert the view into `Bytes`.
|
||||
///
|
||||
/// If using slice as the underlying storage, the copy will be an O(n) operation.
|
||||
pub fn into_bytes(self) -> Bytes {
|
||||
match self {
|
||||
BufView::Slice(slice) => Bytes::copy_from_slice(slice),
|
||||
BufView::Bytes(bytes) => bytes,
|
||||
}
|
||||
}
|
||||
|
||||
/// Creates a sub-view of the blob based on the range.
|
||||
fn view(&self, range: std::ops::Range<usize>) -> Self {
|
||||
match self {
|
||||
BufView::Slice(slice) => BufView::Slice(&slice[range]),
|
||||
BufView::Bytes(bytes) => BufView::Bytes(bytes.slice(range)),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl<'a> Deref for BufView<'a> {
|
||||
type Target = [u8];
|
||||
|
||||
fn deref(&self) -> &Self::Target {
|
||||
match self {
|
||||
BufView::Slice(slice) => slice,
|
||||
BufView::Bytes(bytes) => bytes,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl<'a> AsRef<[u8]> for BufView<'a> {
|
||||
fn as_ref(&self) -> &[u8] {
|
||||
match self {
|
||||
BufView::Slice(slice) => slice,
|
||||
BufView::Bytes(bytes) => bytes.as_ref(),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl<'a> From<&'a [u8]> for BufView<'a> {
|
||||
fn from(value: &'a [u8]) -> Self {
|
||||
Self::new_slice(value)
|
||||
}
|
||||
}
|
||||
|
||||
impl From<Bytes> for BufView<'_> {
|
||||
fn from(value: Bytes) -> Self {
|
||||
Self::new_bytes(value)
|
||||
}
|
||||
}
|
||||
|
||||
/// Blob offsets into [`VectoredBlobsBuf::buf`]. The byte ranges is potentially compressed,
|
||||
/// subject to [`VectoredBlob::compression_bits`].
|
||||
pub struct VectoredBlob {
|
||||
pub start: usize,
|
||||
pub end: usize,
|
||||
/// Blob metadata.
|
||||
pub meta: BlobMeta,
|
||||
/// Start offset.
|
||||
start: usize,
|
||||
/// End offset.
|
||||
end: usize,
|
||||
/// Compression used on the the blob.
|
||||
compression_bits: u8,
|
||||
}
|
||||
|
||||
impl VectoredBlob {
|
||||
/// Reads a decompressed view of the blob.
|
||||
pub(crate) async fn read<'a>(&self, buf: &BufView<'a>) -> Result<BufView<'a>, std::io::Error> {
|
||||
let view = buf.view(self.start..self.end);
|
||||
|
||||
match self.compression_bits {
|
||||
BYTE_UNCOMPRESSED => Ok(view),
|
||||
BYTE_ZSTD => {
|
||||
let mut decompressed_vec = Vec::new();
|
||||
let mut decoder =
|
||||
async_compression::tokio::write::ZstdDecoder::new(&mut decompressed_vec);
|
||||
decoder.write_all(&view).await?;
|
||||
decoder.flush().await?;
|
||||
// Zero-copy conversion from `Vec` to `Bytes`
|
||||
Ok(BufView::new_bytes(Bytes::from(decompressed_vec)))
|
||||
}
|
||||
bits => {
|
||||
let error = std::io::Error::new(
|
||||
std::io::ErrorKind::InvalidData,
|
||||
format!("Failed to decompress blob for {}@{}, {}..{}: invalid compression byte {bits:x}", self.meta.key, self.meta.lsn, self.start, self.end),
|
||||
);
|
||||
Err(error)
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl std::fmt::Display for VectoredBlob {
|
||||
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||
write!(
|
||||
f,
|
||||
"{}@{}, {}..{}",
|
||||
self.meta.key, self.meta.lsn, self.start, self.end
|
||||
)
|
||||
}
|
||||
}
|
||||
|
||||
/// Return type of [`VectoredBlobReader::read_blobs`]
|
||||
@@ -514,7 +627,7 @@ impl<'a> VectoredBlobReader<'a> {
|
||||
);
|
||||
}
|
||||
|
||||
let mut buf = self
|
||||
let buf = self
|
||||
.file
|
||||
.read_exact_at(buf.slice(0..read.size()), read.start, ctx)
|
||||
.await?
|
||||
@@ -529,9 +642,6 @@ impl<'a> VectoredBlobReader<'a> {
|
||||
// of a blob is implicit: the start of the next blob if one exists
|
||||
// or the end of the read.
|
||||
|
||||
// Some scratch space, put here for reusing the allocation
|
||||
let mut decompressed_vec = Vec::new();
|
||||
|
||||
for (blob_start, meta) in blobs_at {
|
||||
let blob_start_in_buf = blob_start - start_offset;
|
||||
let first_len_byte = buf[blob_start_in_buf as usize];
|
||||
@@ -557,35 +667,14 @@ impl<'a> VectoredBlobReader<'a> {
|
||||
)
|
||||
};
|
||||
|
||||
let start_raw = blob_start_in_buf + size_length;
|
||||
let end_raw = start_raw + blob_size;
|
||||
let (start, end);
|
||||
if compression_bits == BYTE_UNCOMPRESSED {
|
||||
start = start_raw as usize;
|
||||
end = end_raw as usize;
|
||||
} else if compression_bits == BYTE_ZSTD {
|
||||
let mut decoder =
|
||||
async_compression::tokio::write::ZstdDecoder::new(&mut decompressed_vec);
|
||||
decoder
|
||||
.write_all(&buf[start_raw as usize..end_raw as usize])
|
||||
.await?;
|
||||
decoder.flush().await?;
|
||||
start = buf.len();
|
||||
buf.extend_from_slice(&decompressed_vec);
|
||||
end = buf.len();
|
||||
decompressed_vec.clear();
|
||||
} else {
|
||||
let error = std::io::Error::new(
|
||||
std::io::ErrorKind::InvalidData,
|
||||
format!("invalid compression byte {compression_bits:x}"),
|
||||
);
|
||||
return Err(error);
|
||||
}
|
||||
let start = (blob_start_in_buf + size_length) as usize;
|
||||
let end = start + blob_size as usize;
|
||||
|
||||
metas.push(VectoredBlob {
|
||||
start,
|
||||
end,
|
||||
meta: *meta,
|
||||
compression_bits,
|
||||
});
|
||||
}
|
||||
|
||||
@@ -1020,8 +1109,13 @@ mod tests {
|
||||
let result = vectored_blob_reader.read_blobs(&read, buf, &ctx).await?;
|
||||
assert_eq!(result.blobs.len(), 1);
|
||||
let read_blob = &result.blobs[0];
|
||||
let read_buf = &result.buf[read_blob.start..read_blob.end];
|
||||
assert_eq!(blob, read_buf, "mismatch for idx={idx} at offset={offset}");
|
||||
let view = BufView::new_slice(&result.buf);
|
||||
let read_buf = read_blob.read(&view).await?;
|
||||
assert_eq!(
|
||||
&blob[..],
|
||||
&read_buf[..],
|
||||
"mismatch for idx={idx} at offset={offset}"
|
||||
);
|
||||
buf = result.buf;
|
||||
}
|
||||
Ok(())
|
||||
|
||||
@@ -35,6 +35,7 @@ use anyhow::Context;
|
||||
use bytes::{Bytes, BytesMut};
|
||||
use pageserver_api::models::{WalRedoManagerProcessStatus, WalRedoManagerStatus};
|
||||
use pageserver_api::shard::TenantShardId;
|
||||
use std::future::Future;
|
||||
use std::sync::Arc;
|
||||
use std::time::Duration;
|
||||
use std::time::Instant;
|
||||
@@ -204,6 +205,22 @@ impl PostgresRedoManager {
|
||||
}
|
||||
}
|
||||
|
||||
/// Do a ping request-response roundtrip.
|
||||
///
|
||||
/// Not used in production, but by Rust benchmarks.
|
||||
///
|
||||
/// # Cancel-Safety
|
||||
///
|
||||
/// This method is cancellation-safe.
|
||||
pub async fn ping(&self, pg_version: u32) -> Result<(), Error> {
|
||||
self.do_with_walredo_process(pg_version, |proc| async move {
|
||||
proc.ping(Duration::from_secs(1))
|
||||
.await
|
||||
.map_err(Error::Other)
|
||||
})
|
||||
.await
|
||||
}
|
||||
|
||||
pub fn status(&self) -> WalRedoManagerStatus {
|
||||
WalRedoManagerStatus {
|
||||
last_redo_at: {
|
||||
@@ -296,6 +313,100 @@ impl PostgresRedoManager {
|
||||
}
|
||||
}
|
||||
|
||||
/// # Cancel-Safety
|
||||
///
|
||||
/// This method is cancel-safe iff `closure` is cancel-safe.
|
||||
async fn do_with_walredo_process<
|
||||
F: FnOnce(Arc<Process>) -> Fut,
|
||||
Fut: Future<Output = Result<O, Error>>,
|
||||
O,
|
||||
>(
|
||||
&self,
|
||||
pg_version: u32,
|
||||
closure: F,
|
||||
) -> Result<O, Error> {
|
||||
let proc: Arc<Process> = match self.redo_process.get_or_init_detached().await {
|
||||
Ok(guard) => match &*guard {
|
||||
ProcessOnceCell::Spawned(proc) => Arc::clone(proc),
|
||||
ProcessOnceCell::ManagerShutDown => {
|
||||
return Err(Error::Cancelled);
|
||||
}
|
||||
},
|
||||
Err(permit) => {
|
||||
let start = Instant::now();
|
||||
// acquire guard before spawning process, so that we don't spawn new processes
|
||||
// if the gate is already closed.
|
||||
let _launched_processes_guard = match self.launched_processes.enter() {
|
||||
Ok(guard) => guard,
|
||||
Err(GateError::GateClosed) => unreachable!(
|
||||
"shutdown sets the once cell to `ManagerShutDown` state before closing the gate"
|
||||
),
|
||||
};
|
||||
let proc = Arc::new(Process {
|
||||
process: process::WalRedoProcess::launch(
|
||||
self.conf,
|
||||
self.tenant_shard_id,
|
||||
pg_version,
|
||||
)
|
||||
.context("launch walredo process")?,
|
||||
_launched_processes_guard,
|
||||
});
|
||||
let duration = start.elapsed();
|
||||
WAL_REDO_PROCESS_LAUNCH_DURATION_HISTOGRAM.observe(duration.as_secs_f64());
|
||||
info!(
|
||||
elapsed_ms = duration.as_millis(),
|
||||
pid = proc.id(),
|
||||
"launched walredo process"
|
||||
);
|
||||
self.redo_process
|
||||
.set(ProcessOnceCell::Spawned(Arc::clone(&proc)), permit);
|
||||
proc
|
||||
}
|
||||
};
|
||||
|
||||
// async closures are unstable, would support &Process
|
||||
let result = closure(proc.clone()).await;
|
||||
|
||||
if result.is_err() {
|
||||
// Avoid concurrent callers hitting the same issue by taking `proc` out of the rotation.
|
||||
// Note that there may be other tasks concurrent with us that also hold `proc`.
|
||||
// We have to deal with that here.
|
||||
// Also read the doc comment on field `self.redo_process`.
|
||||
//
|
||||
// NB: there may still be other concurrent threads using `proc`.
|
||||
// The last one will send SIGKILL when the underlying Arc reaches refcount 0.
|
||||
//
|
||||
// NB: the drop impl blocks the dropping thread with a wait() system call for
|
||||
// the child process. In some ways the blocking is actually good: if we
|
||||
// deferred the waiting into the background / to tokio if we used `tokio::process`,
|
||||
// it could happen that if walredo always fails immediately, we spawn processes faster
|
||||
// than we can SIGKILL & `wait` for them to exit. By doing it the way we do here,
|
||||
// we limit this risk of run-away to at most $num_runtimes * $num_executor_threads.
|
||||
// This probably needs revisiting at some later point.
|
||||
match self.redo_process.get() {
|
||||
None => (),
|
||||
Some(guard) => {
|
||||
match &*guard {
|
||||
ProcessOnceCell::ManagerShutDown => {}
|
||||
ProcessOnceCell::Spawned(guard_proc) => {
|
||||
if Arc::ptr_eq(&proc, guard_proc) {
|
||||
// We're the first to observe an error from `proc`, it's our job to take it out of rotation.
|
||||
guard.take_and_deinit();
|
||||
} else {
|
||||
// Another task already spawned another redo process (further up in this method)
|
||||
// and put it into `redo_process`. Do nothing, our view of the world is behind.
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
// The last task that does this `drop()` of `proc` will do a blocking `wait()` syscall.
|
||||
drop(proc);
|
||||
}
|
||||
|
||||
result
|
||||
}
|
||||
|
||||
///
|
||||
/// Process one request for WAL redo using wal-redo postgres
|
||||
///
|
||||
@@ -319,130 +430,63 @@ impl PostgresRedoManager {
|
||||
const MAX_RETRY_ATTEMPTS: u32 = 1;
|
||||
let mut n_attempts = 0u32;
|
||||
loop {
|
||||
let proc: Arc<Process> = match self.redo_process.get_or_init_detached().await {
|
||||
Ok(guard) => match &*guard {
|
||||
ProcessOnceCell::Spawned(proc) => Arc::clone(proc),
|
||||
ProcessOnceCell::ManagerShutDown => {
|
||||
return Err(Error::Cancelled);
|
||||
}
|
||||
},
|
||||
Err(permit) => {
|
||||
let start = Instant::now();
|
||||
// acquire guard before spawning process, so that we don't spawn new processes
|
||||
// if the gate is already closed.
|
||||
let _launched_processes_guard = match self.launched_processes.enter() {
|
||||
Ok(guard) => guard,
|
||||
Err(GateError::GateClosed) => unreachable!(
|
||||
"shutdown sets the once cell to `ManagerShutDown` state before closing the gate"
|
||||
),
|
||||
};
|
||||
let proc = Arc::new(Process {
|
||||
process: process::WalRedoProcess::launch(
|
||||
self.conf,
|
||||
self.tenant_shard_id,
|
||||
pg_version,
|
||||
)
|
||||
.context("launch walredo process")?,
|
||||
_launched_processes_guard,
|
||||
});
|
||||
let duration = start.elapsed();
|
||||
WAL_REDO_PROCESS_LAUNCH_DURATION_HISTOGRAM.observe(duration.as_secs_f64());
|
||||
info!(
|
||||
duration_ms = duration.as_millis(),
|
||||
pid = proc.id(),
|
||||
"launched walredo process"
|
||||
);
|
||||
self.redo_process
|
||||
.set(ProcessOnceCell::Spawned(Arc::clone(&proc)), permit);
|
||||
proc
|
||||
}
|
||||
};
|
||||
let base_img = &base_img;
|
||||
let closure = |proc: Arc<Process>| async move {
|
||||
let started_at = std::time::Instant::now();
|
||||
|
||||
let started_at = std::time::Instant::now();
|
||||
// Relational WAL records are applied using wal-redo-postgres
|
||||
let result = proc
|
||||
.apply_wal_records(rel, blknum, base_img, records, wal_redo_timeout)
|
||||
.await
|
||||
.context("apply_wal_records");
|
||||
|
||||
// Relational WAL records are applied using wal-redo-postgres
|
||||
let result = proc
|
||||
.apply_wal_records(rel, blknum, &base_img, records, wal_redo_timeout)
|
||||
.await
|
||||
.context("apply_wal_records");
|
||||
let duration = started_at.elapsed();
|
||||
|
||||
let duration = started_at.elapsed();
|
||||
|
||||
let len = records.len();
|
||||
let nbytes = records.iter().fold(0, |acumulator, record| {
|
||||
acumulator
|
||||
+ match &record.1 {
|
||||
NeonWalRecord::Postgres { rec, .. } => rec.len(),
|
||||
_ => unreachable!("Only PostgreSQL records are accepted in this batch"),
|
||||
}
|
||||
});
|
||||
|
||||
WAL_REDO_TIME.observe(duration.as_secs_f64());
|
||||
WAL_REDO_RECORDS_HISTOGRAM.observe(len as f64);
|
||||
WAL_REDO_BYTES_HISTOGRAM.observe(nbytes as f64);
|
||||
|
||||
debug!(
|
||||
"postgres applied {} WAL records ({} bytes) in {} us to reconstruct page image at LSN {}",
|
||||
len,
|
||||
nbytes,
|
||||
duration.as_micros(),
|
||||
lsn
|
||||
);
|
||||
|
||||
// If something went wrong, don't try to reuse the process. Kill it, and
|
||||
// next request will launch a new one.
|
||||
if let Err(e) = result.as_ref() {
|
||||
error!(
|
||||
"error applying {} WAL records {}..{} ({} bytes) to key {key}, from base image with LSN {} to reconstruct page image at LSN {} n_attempts={}: {:?}",
|
||||
records.len(),
|
||||
records.first().map(|p| p.0).unwrap_or(Lsn(0)),
|
||||
records.last().map(|p| p.0).unwrap_or(Lsn(0)),
|
||||
nbytes,
|
||||
base_img_lsn,
|
||||
lsn,
|
||||
n_attempts,
|
||||
e,
|
||||
);
|
||||
// Avoid concurrent callers hitting the same issue by taking `proc` out of the rotation.
|
||||
// Note that there may be other tasks concurrent with us that also hold `proc`.
|
||||
// We have to deal with that here.
|
||||
// Also read the doc comment on field `self.redo_process`.
|
||||
//
|
||||
// NB: there may still be other concurrent threads using `proc`.
|
||||
// The last one will send SIGKILL when the underlying Arc reaches refcount 0.
|
||||
//
|
||||
// NB: the drop impl blocks the dropping thread with a wait() system call for
|
||||
// the child process. In some ways the blocking is actually good: if we
|
||||
// deferred the waiting into the background / to tokio if we used `tokio::process`,
|
||||
// it could happen that if walredo always fails immediately, we spawn processes faster
|
||||
// than we can SIGKILL & `wait` for them to exit. By doing it the way we do here,
|
||||
// we limit this risk of run-away to at most $num_runtimes * $num_executor_threads.
|
||||
// This probably needs revisiting at some later point.
|
||||
match self.redo_process.get() {
|
||||
None => (),
|
||||
Some(guard) => {
|
||||
match &*guard {
|
||||
ProcessOnceCell::ManagerShutDown => {}
|
||||
ProcessOnceCell::Spawned(guard_proc) => {
|
||||
if Arc::ptr_eq(&proc, guard_proc) {
|
||||
// We're the first to observe an error from `proc`, it's our job to take it out of rotation.
|
||||
guard.take_and_deinit();
|
||||
} else {
|
||||
// Another task already spawned another redo process (further up in this method)
|
||||
// and put it into `redo_process`. Do nothing, our view of the world is behind.
|
||||
}
|
||||
}
|
||||
let len = records.len();
|
||||
let nbytes = records.iter().fold(0, |acumulator, record| {
|
||||
acumulator
|
||||
+ match &record.1 {
|
||||
NeonWalRecord::Postgres { rec, .. } => rec.len(),
|
||||
_ => unreachable!("Only PostgreSQL records are accepted in this batch"),
|
||||
}
|
||||
}
|
||||
});
|
||||
|
||||
WAL_REDO_TIME.observe(duration.as_secs_f64());
|
||||
WAL_REDO_RECORDS_HISTOGRAM.observe(len as f64);
|
||||
WAL_REDO_BYTES_HISTOGRAM.observe(nbytes as f64);
|
||||
|
||||
debug!(
|
||||
"postgres applied {} WAL records ({} bytes) in {} us to reconstruct page image at LSN {}",
|
||||
len,
|
||||
nbytes,
|
||||
duration.as_micros(),
|
||||
lsn
|
||||
);
|
||||
|
||||
if let Err(e) = result.as_ref() {
|
||||
error!(
|
||||
"error applying {} WAL records {}..{} ({} bytes) to key {key}, from base image with LSN {} to reconstruct page image at LSN {} n_attempts={}: {:?}",
|
||||
records.len(),
|
||||
records.first().map(|p| p.0).unwrap_or(Lsn(0)),
|
||||
records.last().map(|p| p.0).unwrap_or(Lsn(0)),
|
||||
nbytes,
|
||||
base_img_lsn,
|
||||
lsn,
|
||||
n_attempts,
|
||||
e,
|
||||
);
|
||||
}
|
||||
// The last task that does this `drop()` of `proc` will do a blocking `wait()` syscall.
|
||||
drop(proc);
|
||||
} else if n_attempts != 0 {
|
||||
|
||||
result.map_err(Error::Other)
|
||||
};
|
||||
let result = self.do_with_walredo_process(pg_version, closure).await;
|
||||
|
||||
if result.is_ok() && n_attempts != 0 {
|
||||
info!(n_attempts, "retried walredo succeeded");
|
||||
}
|
||||
n_attempts += 1;
|
||||
if n_attempts > MAX_RETRY_ATTEMPTS || result.is_ok() {
|
||||
return result.map_err(Error::Other);
|
||||
return result;
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -512,6 +556,17 @@ mod tests {
|
||||
use tracing::Instrument;
|
||||
use utils::{id::TenantId, lsn::Lsn};
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_ping() {
|
||||
let h = RedoHarness::new().unwrap();
|
||||
|
||||
h.manager
|
||||
.ping(14)
|
||||
.instrument(h.span())
|
||||
.await
|
||||
.expect("ping should work");
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn short_v14_redo() {
|
||||
let expected = std::fs::read("test_data/short_v14_redo.page").unwrap();
|
||||
|
||||
@@ -6,6 +6,7 @@ use self::no_leak_child::NoLeakChild;
|
||||
use crate::{
|
||||
config::PageServerConf,
|
||||
metrics::{WalRedoKillCause, WAL_REDO_PROCESS_COUNTERS, WAL_REDO_RECORD_COUNTER},
|
||||
page_cache::PAGE_SZ,
|
||||
span::debug_assert_current_span_has_tenant_id,
|
||||
walrecord::NeonWalRecord,
|
||||
};
|
||||
@@ -237,6 +238,26 @@ impl WalRedoProcess {
|
||||
res
|
||||
}
|
||||
|
||||
/// Do a ping request-response roundtrip.
|
||||
///
|
||||
/// Not used in production, but by Rust benchmarks.
|
||||
pub(crate) async fn ping(&self, timeout: Duration) -> anyhow::Result<()> {
|
||||
let mut writebuf: Vec<u8> = Vec::with_capacity(4);
|
||||
protocol::build_ping_msg(&mut writebuf);
|
||||
let Ok(res) = tokio::time::timeout(timeout, self.apply_wal_records0(&writebuf)).await
|
||||
else {
|
||||
anyhow::bail!("WAL redo ping timed out");
|
||||
};
|
||||
let response = res?;
|
||||
if response.len() != PAGE_SZ {
|
||||
anyhow::bail!(
|
||||
"WAL redo ping response should respond with page-sized response: {}",
|
||||
response.len()
|
||||
);
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// # Cancel-Safety
|
||||
///
|
||||
/// When not polled to completion (e.g. because in `tokio::select!` another
|
||||
|
||||
@@ -55,3 +55,8 @@ pub(crate) fn build_get_page_msg(tag: BufferTag, buf: &mut Vec<u8>) {
|
||||
tag.ser_into(buf)
|
||||
.expect("serialize BufferTag should always succeed");
|
||||
}
|
||||
|
||||
pub(crate) fn build_ping_msg(buf: &mut Vec<u8>) {
|
||||
buf.put_u8(b'H');
|
||||
buf.put_u32(4);
|
||||
}
|
||||
|
||||
@@ -9,6 +9,8 @@ OBJS = \
|
||||
hll.o \
|
||||
libpagestore.o \
|
||||
neon.o \
|
||||
neon_pgversioncompat.o \
|
||||
neon_perf_counters.o \
|
||||
neon_utils.o \
|
||||
neon_walreader.o \
|
||||
pagestore_smgr.o \
|
||||
@@ -23,7 +25,7 @@ SHLIB_LINK_INTERNAL = $(libpq)
|
||||
SHLIB_LINK = -lcurl
|
||||
|
||||
EXTENSION = neon
|
||||
DATA = neon--1.0.sql neon--1.0--1.1.sql neon--1.1--1.2.sql neon--1.2--1.3.sql neon--1.3--1.2.sql neon--1.2--1.1.sql neon--1.1--1.0.sql neon--1.3--1.4.sql neon--1.4--1.3.sql
|
||||
DATA = neon--1.0.sql neon--1.0--1.1.sql neon--1.1--1.2.sql neon--1.2--1.3.sql neon--1.3--1.2.sql neon--1.2--1.1.sql neon--1.1--1.0.sql neon--1.3--1.4.sql neon--1.4--1.3.sql neon--1.4--1.5.sql neon--1.5--1.4.sql
|
||||
PGFILEDESC = "neon - cloud storage for PostgreSQL"
|
||||
|
||||
EXTRA_CLEAN = \
|
||||
|
||||
@@ -109,6 +109,7 @@ typedef struct FileCacheControl
|
||||
* reenabling */
|
||||
uint32 size; /* size of cache file in chunks */
|
||||
uint32 used; /* number of used chunks */
|
||||
uint32 used_pages; /* number of used pages */
|
||||
uint32 limit; /* shared copy of lfc_size_limit */
|
||||
uint64 hits;
|
||||
uint64 misses;
|
||||
@@ -905,6 +906,10 @@ lfc_writev(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno,
|
||||
/* Cache overflow: evict least recently used chunk */
|
||||
FileCacheEntry *victim = dlist_container(FileCacheEntry, list_node, dlist_pop_head_node(&lfc_ctl->lru));
|
||||
|
||||
for (int i = 0; i < BLOCKS_PER_CHUNK; i++)
|
||||
{
|
||||
lfc_ctl->used_pages -= (victim->bitmap[i >> 5] >> (i & 31)) & 1;
|
||||
}
|
||||
CriticalAssert(victim->access_count == 0);
|
||||
entry->offset = victim->offset; /* grab victim's chunk */
|
||||
hash_search_with_hash_value(lfc_hash, &victim->key, victim->hash, HASH_REMOVE, NULL);
|
||||
@@ -959,6 +964,7 @@ lfc_writev(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno,
|
||||
|
||||
for (int i = 0; i < blocks_in_chunk; i++)
|
||||
{
|
||||
lfc_ctl->used_pages += 1 - ((entry->bitmap[(chunk_offs + i) >> 5] >> ((chunk_offs + i) & 31)) & 1);
|
||||
entry->bitmap[(chunk_offs + i) >> 5] |=
|
||||
(1 << ((chunk_offs + i) & 31));
|
||||
}
|
||||
@@ -1051,6 +1057,11 @@ neon_get_lfc_stats(PG_FUNCTION_ARGS)
|
||||
if (lfc_ctl)
|
||||
value = lfc_ctl->size;
|
||||
break;
|
||||
case 5:
|
||||
key = "file_cache_used_pages";
|
||||
if (lfc_ctl)
|
||||
value = lfc_ctl->used_pages;
|
||||
break;
|
||||
default:
|
||||
SRF_RETURN_DONE(funcctx);
|
||||
}
|
||||
|
||||
@@ -30,6 +30,7 @@
|
||||
#include "utils/guc.h"
|
||||
|
||||
#include "neon.h"
|
||||
#include "neon_perf_counters.h"
|
||||
#include "neon_utils.h"
|
||||
#include "pagestore_client.h"
|
||||
#include "walproposer.h"
|
||||
@@ -331,6 +332,7 @@ CLEANUP_AND_DISCONNECT(PageServer *shard)
|
||||
}
|
||||
if (shard->conn)
|
||||
{
|
||||
MyNeonCounters->pageserver_disconnects_total++;
|
||||
PQfinish(shard->conn);
|
||||
shard->conn = NULL;
|
||||
}
|
||||
@@ -737,6 +739,8 @@ pageserver_send(shardno_t shard_no, NeonRequest *request)
|
||||
PageServer *shard = &page_servers[shard_no];
|
||||
PGconn *pageserver_conn;
|
||||
|
||||
MyNeonCounters->pageserver_requests_sent_total++;
|
||||
|
||||
/* If the connection was lost for some reason, reconnect */
|
||||
if (shard->state == PS_Connected && PQstatus(shard->conn) == CONNECTION_BAD)
|
||||
{
|
||||
@@ -889,6 +893,7 @@ pageserver_flush(shardno_t shard_no)
|
||||
}
|
||||
else
|
||||
{
|
||||
MyNeonCounters->pageserver_send_flushes_total++;
|
||||
if (PQflush(pageserver_conn))
|
||||
{
|
||||
char *msg = pchomp(PQerrorMessage(pageserver_conn));
|
||||
@@ -922,7 +927,7 @@ check_neon_id(char **newval, void **extra, GucSource source)
|
||||
static Size
|
||||
PagestoreShmemSize(void)
|
||||
{
|
||||
return sizeof(PagestoreShmemState);
|
||||
return add_size(sizeof(PagestoreShmemState), NeonPerfCountersShmemSize());
|
||||
}
|
||||
|
||||
static bool
|
||||
@@ -941,6 +946,9 @@ PagestoreShmemInit(void)
|
||||
memset(&pagestore_shared->shard_map, 0, sizeof(ShardMap));
|
||||
AssignPageserverConnstring(page_server_connstring, NULL);
|
||||
}
|
||||
|
||||
NeonPerfCountersShmemInit();
|
||||
|
||||
LWLockRelease(AddinShmemInitLock);
|
||||
return found;
|
||||
}
|
||||
|
||||
39
pgxn/neon/neon--1.4--1.5.sql
Normal file
39
pgxn/neon/neon--1.4--1.5.sql
Normal file
@@ -0,0 +1,39 @@
|
||||
\echo Use "ALTER EXTENSION neon UPDATE TO '1.5'" to load this file. \quit
|
||||
|
||||
|
||||
CREATE FUNCTION get_backend_perf_counters()
|
||||
RETURNS SETOF RECORD
|
||||
AS 'MODULE_PATHNAME', 'neon_get_backend_perf_counters'
|
||||
LANGUAGE C PARALLEL SAFE;
|
||||
|
||||
CREATE FUNCTION get_perf_counters()
|
||||
RETURNS SETOF RECORD
|
||||
AS 'MODULE_PATHNAME', 'neon_get_perf_counters'
|
||||
LANGUAGE C PARALLEL SAFE;
|
||||
|
||||
-- Show various metrics, for each backend. Note that the values are not reset
|
||||
-- when a backend exits. When a new backend starts with the backend ID, it will
|
||||
-- continue accumulating the values from where the old backend left. If you are
|
||||
-- only interested in the changes from your own session, store the values at the
|
||||
-- beginning of the session somewhere, and subtract them on subsequent calls.
|
||||
--
|
||||
-- For histograms, 'bucket_le' is the upper bound of the histogram bucket.
|
||||
CREATE VIEW neon_backend_perf_counters AS
|
||||
SELECT P.procno, P.pid, P.metric, P.bucket_le, P.value
|
||||
FROM get_backend_perf_counters() AS P (
|
||||
procno integer,
|
||||
pid integer,
|
||||
metric text,
|
||||
bucket_le float8,
|
||||
value float8
|
||||
);
|
||||
|
||||
-- Summary across all backends. (This could also be implemented with
|
||||
-- an aggregate query over neon_backend_perf_counters view.)
|
||||
CREATE VIEW neon_perf_counters AS
|
||||
SELECT P.metric, P.bucket_le, P.value
|
||||
FROM get_perf_counters() AS P (
|
||||
metric text,
|
||||
bucket_le float8,
|
||||
value float8
|
||||
);
|
||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user