mirror of
https://github.com/neondatabase/neon.git
synced 2026-07-04 04:30:38 +00:00
Compare commits
61 Commits
use_keepal
...
heikki/deb
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
8b9b395be4 | ||
|
|
4d2328ebe3 | ||
|
|
9f81828429 | ||
|
|
9ab13d6e2c | ||
|
|
983e18e63e | ||
|
|
b735df6ff0 | ||
|
|
68cf0ba439 | ||
|
|
d04d924649 | ||
|
|
f5fdaa6dc6 | ||
|
|
c54cd9e76a | ||
|
|
1010b8add4 | ||
|
|
ae4b2af299 | ||
|
|
15fecb8474 | ||
|
|
47677ba578 | ||
|
|
83b6bfa229 | ||
|
|
ed942b05f7 | ||
|
|
62a717a2ca | ||
|
|
c8fbbb9b65 | ||
|
|
d73f4a6470 | ||
|
|
5477d7db93 | ||
|
|
eb9832d846 | ||
|
|
3d36dfe533 | ||
|
|
ebf44210ba | ||
|
|
aabf455dfb | ||
|
|
aec92bfc34 | ||
|
|
b0b4b7dd8f | ||
|
|
4dd4096f11 | ||
|
|
be718ed121 | ||
|
|
9f1408fdf3 | ||
|
|
7000aaaf75 | ||
|
|
ef2a2555b1 | ||
|
|
d8ab6ddb0f | ||
|
|
dcc437da1d | ||
|
|
c286fea018 | ||
|
|
de8276488d | ||
|
|
ddb9ae1214 | ||
|
|
9e55d79803 | ||
|
|
8d47a60de2 | ||
|
|
6166482589 | ||
|
|
ca6d72ba2a | ||
|
|
b6c0f66619 | ||
|
|
3702ec889f | ||
|
|
8e8df1b453 | ||
|
|
92d95b08cf | ||
|
|
0af40b5494 | ||
|
|
c60b91369a | ||
|
|
f1473dd438 | ||
|
|
c283aaaf8d | ||
|
|
414ed82c1f | ||
|
|
881e351f69 | ||
|
|
b31ce14083 | ||
|
|
b4d87b9dfe | ||
|
|
2b49d6ee05 | ||
|
|
14e1f89053 | ||
|
|
8a8c656c06 | ||
|
|
a75e11cc00 | ||
|
|
7d4bfcdc47 | ||
|
|
737888e5c9 | ||
|
|
19bf7b78a0 | ||
|
|
7e4a39ea53 | ||
|
|
624a507544 |
1
.github/actionlint.yml
vendored
1
.github/actionlint.yml
vendored
@@ -4,6 +4,7 @@ self-hosted-runner:
|
||||
- large
|
||||
- large-arm64
|
||||
- small
|
||||
- small-metal
|
||||
- small-arm64
|
||||
- us-east-2
|
||||
config-variables:
|
||||
|
||||
48
.github/actions/neon-project-create/action.yml
vendored
48
.github/actions/neon-project-create/action.yml
vendored
@@ -17,6 +17,31 @@ inputs:
|
||||
compute_units:
|
||||
description: '[Min, Max] compute units'
|
||||
default: '[1, 1]'
|
||||
# settings below only needed if you want the project to be sharded from the beginning
|
||||
shard_split_project:
|
||||
description: 'by default new projects are not shard-split, specify true to shard-split'
|
||||
required: false
|
||||
default: 'false'
|
||||
admin_api_key:
|
||||
description: 'Admin API Key needed for shard-splitting. Must be specified if shard_split_project is true'
|
||||
required: false
|
||||
shard_count:
|
||||
description: 'Number of shards to split the project into, only applies if shard_split_project is true'
|
||||
required: false
|
||||
default: '8'
|
||||
stripe_size:
|
||||
description: 'Stripe size, optional, in 8kiB pages. e.g. set 2048 for 16MB stripes. Default is 128 MiB, only applies if shard_split_project is true'
|
||||
required: false
|
||||
default: '32768'
|
||||
psql_path:
|
||||
description: 'Path to psql binary - it is caller responsibility to provision the psql binary'
|
||||
required: false
|
||||
default: '/tmp/neon/pg_install/v16/bin/psql'
|
||||
libpq_lib_path:
|
||||
description: 'Path to directory containing libpq library - it is caller responsibility to provision the libpq library'
|
||||
required: false
|
||||
default: '/tmp/neon/pg_install/v16/lib'
|
||||
|
||||
|
||||
outputs:
|
||||
dsn:
|
||||
@@ -63,6 +88,23 @@ runs:
|
||||
echo "project_id=${project_id}" >> $GITHUB_OUTPUT
|
||||
|
||||
echo "Project ${project_id} has been created"
|
||||
|
||||
if [ "${SHARD_SPLIT_PROJECT}" = "true" ]; then
|
||||
# determine tenant ID
|
||||
TENANT_ID=`${PSQL} ${dsn} -t -A -c "SHOW neon.tenant_id"`
|
||||
|
||||
echo "Splitting project ${project_id} with tenant_id ${TENANT_ID} into $((SHARD_COUNT)) shards with stripe size $((STRIPE_SIZE))"
|
||||
|
||||
echo "Sending PUT request to https://${API_HOST}/regions/${REGION_ID}/api/v1/admin/storage/proxy/control/v1/tenant/${TENANT_ID}/shard_split"
|
||||
echo "with body {\"new_shard_count\": $((SHARD_COUNT)), \"new_stripe_size\": $((STRIPE_SIZE))}"
|
||||
|
||||
# we need an ADMIN API KEY to invoke storage controller API for shard splitting (bash -u above checks that the variable is set)
|
||||
curl -X PUT \
|
||||
"https://${API_HOST}/regions/${REGION_ID}/api/v1/admin/storage/proxy/control/v1/tenant/${TENANT_ID}/shard_split" \
|
||||
-H "Accept: application/json" -H "Content-Type: application/json" -H "Authorization: Bearer ${ADMIN_API_KEY}" \
|
||||
-d "{\"new_shard_count\": $SHARD_COUNT, \"new_stripe_size\": $STRIPE_SIZE}"
|
||||
fi
|
||||
|
||||
env:
|
||||
API_HOST: ${{ inputs.api_host }}
|
||||
API_KEY: ${{ inputs.api_key }}
|
||||
@@ -70,3 +112,9 @@ runs:
|
||||
POSTGRES_VERSION: ${{ inputs.postgres_version }}
|
||||
MIN_CU: ${{ fromJSON(inputs.compute_units)[0] }}
|
||||
MAX_CU: ${{ fromJSON(inputs.compute_units)[1] }}
|
||||
SHARD_SPLIT_PROJECT: ${{ inputs.shard_split_project }}
|
||||
ADMIN_API_KEY: ${{ inputs.admin_api_key }}
|
||||
SHARD_COUNT: ${{ inputs.shard_count }}
|
||||
STRIPE_SIZE: ${{ inputs.stripe_size }}
|
||||
PSQL: ${{ inputs.psql_path }}
|
||||
LD_LIBRARY_PATH: ${{ inputs.libpq_lib_path }}
|
||||
|
||||
14
.github/workflows/_build-and-test-locally.yml
vendored
14
.github/workflows/_build-and-test-locally.yml
vendored
@@ -158,8 +158,6 @@ jobs:
|
||||
|
||||
- name: Run cargo build
|
||||
run: |
|
||||
PQ_LIB_DIR=$(pwd)/pg_install/v16/lib
|
||||
export PQ_LIB_DIR
|
||||
${cov_prefix} mold -run cargo build $CARGO_FLAGS $CARGO_FEATURES --bins --tests
|
||||
|
||||
# Do install *before* running rust tests because they might recompile the
|
||||
@@ -217,8 +215,6 @@ jobs:
|
||||
env:
|
||||
NEXTEST_RETRIES: 3
|
||||
run: |
|
||||
PQ_LIB_DIR=$(pwd)/pg_install/v16/lib
|
||||
export PQ_LIB_DIR
|
||||
LD_LIBRARY_PATH=$(pwd)/pg_install/v17/lib
|
||||
export LD_LIBRARY_PATH
|
||||
|
||||
@@ -229,8 +225,13 @@ jobs:
|
||||
${cov_prefix} cargo nextest run $CARGO_FLAGS $CARGO_FEATURES -E '!package(pageserver)'
|
||||
|
||||
# run pageserver tests with different settings
|
||||
for io_engine in std-fs tokio-epoll-uring ; do
|
||||
NEON_PAGESERVER_UNIT_TEST_VIRTUAL_FILE_IOENGINE=$io_engine ${cov_prefix} cargo nextest run $CARGO_FLAGS $CARGO_FEATURES -E 'package(pageserver)'
|
||||
for get_vectored_concurrent_io in sequential sidecar-task; do
|
||||
for io_engine in std-fs tokio-epoll-uring ; do
|
||||
NEON_PAGESERVER_UNIT_TEST_GET_VECTORED_CONCURRENT_IO=$get_vectored_concurrent_io \
|
||||
NEON_PAGESERVER_UNIT_TEST_VIRTUAL_FILE_IOENGINE=$io_engine \
|
||||
${cov_prefix} \
|
||||
cargo nextest run $CARGO_FLAGS $CARGO_FEATURES -E 'package(pageserver)'
|
||||
done
|
||||
done
|
||||
|
||||
# Run separate tests for real S3
|
||||
@@ -314,6 +315,7 @@ jobs:
|
||||
CHECK_ONDISK_DATA_COMPATIBILITY: nonempty
|
||||
BUILD_TAG: ${{ inputs.build-tag }}
|
||||
PAGESERVER_VIRTUAL_FILE_IO_ENGINE: tokio-epoll-uring
|
||||
PAGESERVER_GET_VECTORED_CONCURRENT_IO: sidecar-task
|
||||
USE_LFC: ${{ matrix.lfc_state == 'with-lfc' && 'true' || 'false' }}
|
||||
|
||||
# Temporary disable this step until we figure out why it's so flaky
|
||||
|
||||
2
.github/workflows/build-macos.yml
vendored
2
.github/workflows/build-macos.yml
vendored
@@ -235,7 +235,7 @@ jobs:
|
||||
echo 'CPPFLAGS=-I/usr/local/opt/openssl@3/include' >> $GITHUB_ENV
|
||||
|
||||
- name: Run cargo build (only for v17)
|
||||
run: PQ_LIB_DIR=$(pwd)/pg_install/v17/lib cargo build --all --release -j$(sysctl -n hw.ncpu)
|
||||
run: cargo build --all --release -j$(sysctl -n hw.ncpu)
|
||||
|
||||
- name: Check that no warnings are produced (only for v17)
|
||||
run: ./run_clippy.sh
|
||||
|
||||
79
.github/workflows/build_and_test.yml
vendored
79
.github/workflows/build_and_test.yml
vendored
@@ -242,7 +242,7 @@ jobs:
|
||||
statuses: write
|
||||
contents: write
|
||||
pull-requests: write
|
||||
runs-on: [ self-hosted, small ]
|
||||
runs-on: [ self-hosted, small-metal ]
|
||||
container:
|
||||
image: ${{ needs.build-build-tools-image.outputs.image }}-bookworm
|
||||
credentials:
|
||||
@@ -786,6 +786,17 @@ jobs:
|
||||
username: ${{ secrets.NEON_DOCKERHUB_USERNAME }}
|
||||
password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }}
|
||||
|
||||
- name: Get the last compute release tag
|
||||
id: get-last-compute-release-tag
|
||||
env:
|
||||
GH_TOKEN: ${{ secrets.CI_ACCESS_TOKEN }}
|
||||
run: |
|
||||
tag=$(gh api -q '[.[].tag_name | select(startswith("release-compute"))][0]'\
|
||||
-H "Accept: application/vnd.github+json" \
|
||||
-H "X-GitHub-Api-Version: 2022-11-28" \
|
||||
"/repos/${{ github.repository }}/releases")
|
||||
echo tag=${tag} >> ${GITHUB_OUTPUT}
|
||||
|
||||
# `neondatabase/neon` contains multiple binaries, all of them use the same input for the version into the same version formatting library.
|
||||
# Pick pageserver as currently the only binary with extra "version" features printed in the string to verify.
|
||||
# Regular pageserver version string looks like
|
||||
@@ -817,11 +828,25 @@ jobs:
|
||||
TEST_VERSION_ONLY: ${{ matrix.pg_version }}
|
||||
run: ./docker-compose/docker_compose_test.sh
|
||||
|
||||
- name: Print logs and clean up docker-compose test
|
||||
if: always()
|
||||
run: |
|
||||
docker compose --profile test-extensions -f ./docker-compose/docker-compose.yml logs || true
|
||||
docker compose --profile test-extensions -f ./docker-compose/docker-compose.yml down
|
||||
|
||||
- name: Test extension upgrade
|
||||
timeout-minutes: 20
|
||||
if: ${{ needs.tag.outputs.build-tag == github.run_id }}
|
||||
env:
|
||||
NEWTAG: ${{ needs.tag.outputs.build-tag }}
|
||||
OLDTAG: ${{ steps.get-last-compute-release-tag.outputs.tag }}
|
||||
run: ./docker-compose/test_extensions_upgrade.sh
|
||||
|
||||
- name: Print logs and clean up
|
||||
if: always()
|
||||
run: |
|
||||
docker compose -f ./docker-compose/docker-compose.yml logs || 0
|
||||
docker compose -f ./docker-compose/docker-compose.yml down
|
||||
docker compose --profile test-extensions -f ./docker-compose/docker-compose.yml logs || true
|
||||
docker compose --profile test-extensions -f ./docker-compose/docker-compose.yml down
|
||||
|
||||
promote-images-dev:
|
||||
needs: [ check-permissions, tag, vm-compute-node-image, neon-image ]
|
||||
@@ -859,7 +884,7 @@ jobs:
|
||||
done
|
||||
|
||||
promote-images-prod:
|
||||
needs: [ check-permissions, tag, test-images, vm-compute-node-image ]
|
||||
needs: [ check-permissions, tag, test-images, promote-images-dev ]
|
||||
runs-on: ubuntu-22.04
|
||||
if: github.ref_name == 'main' || github.ref_name == 'release' || github.ref_name == 'release-proxy' || github.ref_name == 'release-compute'
|
||||
|
||||
@@ -1050,6 +1075,7 @@ jobs:
|
||||
retries: 5
|
||||
script: |
|
||||
const tag = "${{ needs.tag.outputs.build-tag }}";
|
||||
const branch = "${{ github.ref_name }}";
|
||||
|
||||
try {
|
||||
const existingRef = await github.rest.git.getRef({
|
||||
@@ -1078,12 +1104,6 @@ jobs:
|
||||
console.log(`Tag ${tag} created successfully.`);
|
||||
}
|
||||
|
||||
// TODO: check how GitHub releases looks for proxy/compute releases and enable them if they're ok
|
||||
if (context.ref !== 'refs/heads/release') {
|
||||
console.log(`GitHub release skipped for ${context.ref}.`);
|
||||
return;
|
||||
}
|
||||
|
||||
try {
|
||||
const existingRelease = await github.rest.repos.getReleaseByTag({
|
||||
owner: context.repo.owner,
|
||||
@@ -1098,11 +1118,48 @@ jobs:
|
||||
}
|
||||
|
||||
console.log(`Release for tag ${tag} does not exist. Creating it...`);
|
||||
|
||||
// Find the PR number using the commit SHA
|
||||
const pullRequests = await github.rest.pulls.list({
|
||||
owner: context.repo.owner,
|
||||
repo: context.repo.repo,
|
||||
state: 'closed',
|
||||
base: branch,
|
||||
});
|
||||
|
||||
const pr = pullRequests.data.find(pr => pr.merge_commit_sha === context.sha);
|
||||
const prNumber = pr ? pr.number : null;
|
||||
|
||||
// Find the previous release on the branch
|
||||
const releases = await github.rest.repos.listReleases({
|
||||
owner: context.repo.owner,
|
||||
repo: context.repo.repo,
|
||||
per_page: 100,
|
||||
});
|
||||
|
||||
const branchReleases = releases.data
|
||||
.filter((release) => {
|
||||
const regex = new RegExp(`^${branch}-\\d+$`);
|
||||
return regex.test(release.tag_name) && !release.draft && !release.prerelease;
|
||||
})
|
||||
.sort((a, b) => new Date(b.created_at) - new Date(a.created_at));
|
||||
|
||||
const previousTag = branchReleases.length > 0 ? branchReleases[0].tag_name : null;
|
||||
|
||||
const releaseNotes = [
|
||||
prNumber
|
||||
? `Release PR https://github.com/${context.repo.owner}/${context.repo.repo}/pull/${prNumber}.`
|
||||
: 'Release PR not found.',
|
||||
previousTag
|
||||
? `Diff with the previous release https://github.com/${context.repo.owner}/${context.repo.repo}/compare/${previousTag}...${tag}.`
|
||||
: `No previous release found on branch ${branch}.`,
|
||||
].join('\n\n');
|
||||
|
||||
await github.rest.repos.createRelease({
|
||||
owner: context.repo.owner,
|
||||
repo: context.repo.repo,
|
||||
tag_name: tag,
|
||||
generate_release_notes: true,
|
||||
body: releaseNotes,
|
||||
});
|
||||
console.log(`Release for tag ${tag} created successfully.`);
|
||||
}
|
||||
|
||||
33
.github/workflows/ingest_benchmark.yml
vendored
33
.github/workflows/ingest_benchmark.yml
vendored
@@ -28,7 +28,24 @@ jobs:
|
||||
strategy:
|
||||
fail-fast: false # allow other variants to continue even if one fails
|
||||
matrix:
|
||||
target_project: [new_empty_project, large_existing_project]
|
||||
include:
|
||||
- target_project: new_empty_project_stripe_size_2048
|
||||
stripe_size: 2048 # 16 MiB
|
||||
postgres_version: 16
|
||||
- target_project: new_empty_project_stripe_size_32768
|
||||
stripe_size: 32768 # 256 MiB # note that this is different from null because using null will shard_split the project only if it reaches the threshold
|
||||
# while here it is sharded from the beginning with a shard size of 256 MiB
|
||||
postgres_version: 16
|
||||
- target_project: new_empty_project
|
||||
stripe_size: null # run with neon defaults which will shard split only when reaching the threshold
|
||||
postgres_version: 16
|
||||
- target_project: new_empty_project
|
||||
stripe_size: null # run with neon defaults which will shard split only when reaching the threshold
|
||||
postgres_version: 17
|
||||
- target_project: large_existing_project
|
||||
stripe_size: null # cannot re-shared or choose different stripe size for existing, already sharded project
|
||||
postgres_version: 16
|
||||
max-parallel: 1 # we want to run each stripe size sequentially to be able to compare the results
|
||||
permissions:
|
||||
contents: write
|
||||
statuses: write
|
||||
@@ -67,17 +84,21 @@ jobs:
|
||||
aws-oicd-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }}
|
||||
|
||||
- name: Create Neon Project
|
||||
if: ${{ matrix.target_project == 'new_empty_project' }}
|
||||
if: ${{ startsWith(matrix.target_project, 'new_empty_project') }}
|
||||
id: create-neon-project-ingest-target
|
||||
uses: ./.github/actions/neon-project-create
|
||||
with:
|
||||
region_id: aws-us-east-2
|
||||
postgres_version: 16
|
||||
postgres_version: ${{ matrix.postgres_version }}
|
||||
compute_units: '[7, 7]' # we want to test large compute here to avoid compute-side bottleneck
|
||||
api_key: ${{ secrets.NEON_STAGING_API_KEY }}
|
||||
shard_split_project: ${{ matrix.stripe_size != null && 'true' || 'false' }}
|
||||
admin_api_key: ${{ secrets.NEON_STAGING_ADMIN_API_KEY }}
|
||||
shard_count: 8
|
||||
stripe_size: ${{ matrix.stripe_size }}
|
||||
|
||||
- name: Initialize Neon project
|
||||
if: ${{ matrix.target_project == 'new_empty_project' }}
|
||||
if: ${{ startsWith(matrix.target_project, 'new_empty_project') }}
|
||||
env:
|
||||
BENCHMARK_INGEST_TARGET_CONNSTR: ${{ steps.create-neon-project-ingest-target.outputs.dsn }}
|
||||
NEW_PROJECT_ID: ${{ steps.create-neon-project-ingest-target.outputs.project_id }}
|
||||
@@ -130,7 +151,7 @@ jobs:
|
||||
test_selection: performance/test_perf_ingest_using_pgcopydb.py
|
||||
run_in_parallel: false
|
||||
extra_params: -s -m remote_cluster --timeout 86400 -k test_ingest_performance_using_pgcopydb
|
||||
pg_version: v16
|
||||
pg_version: v${{ matrix.postgres_version }}
|
||||
save_perf_report: true
|
||||
aws-oicd-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }}
|
||||
env:
|
||||
@@ -146,7 +167,7 @@ jobs:
|
||||
${PSQL} "${BENCHMARK_INGEST_TARGET_CONNSTR}" -c "\dt+"
|
||||
|
||||
- name: Delete Neon Project
|
||||
if: ${{ always() && matrix.target_project == 'new_empty_project' }}
|
||||
if: ${{ always() && startsWith(matrix.target_project, 'new_empty_project') }}
|
||||
uses: ./.github/actions/neon-project-delete
|
||||
with:
|
||||
project_id: ${{ steps.create-neon-project-ingest-target.outputs.project_id }}
|
||||
|
||||
2
.github/workflows/neon_extra_builds.yml
vendored
2
.github/workflows/neon_extra_builds.yml
vendored
@@ -114,7 +114,7 @@ jobs:
|
||||
run: make walproposer-lib -j$(nproc)
|
||||
|
||||
- name: Produce the build stats
|
||||
run: PQ_LIB_DIR=$(pwd)/pg_install/v17/lib cargo build --all --release --timings -j$(nproc)
|
||||
run: cargo build --all --release --timings -j$(nproc)
|
||||
|
||||
- name: Configure AWS credentials
|
||||
uses: aws-actions/configure-aws-credentials@v4
|
||||
|
||||
5
.github/workflows/release.yml
vendored
5
.github/workflows/release.yml
vendored
@@ -3,8 +3,9 @@ name: Create Release Branch
|
||||
on:
|
||||
schedule:
|
||||
# It should be kept in sync with if-condition in jobs
|
||||
- cron: '0 6 * * FRI' # Storage release
|
||||
- cron: '0 6 * * THU' # Proxy release
|
||||
- cron: '0 6 * * FRI' # Storage release
|
||||
- cron: '0 7 * * FRI' # Compute release
|
||||
workflow_dispatch:
|
||||
inputs:
|
||||
create-storage-release-branch:
|
||||
@@ -55,7 +56,7 @@ jobs:
|
||||
ci-access-token: ${{ secrets.CI_ACCESS_TOKEN }}
|
||||
|
||||
create-compute-release-branch:
|
||||
if: inputs.create-compute-release-branch
|
||||
if: ${{ github.event.schedule == '0 7 * * FRI' || inputs.create-compute-release-branch }}
|
||||
|
||||
permissions:
|
||||
contents: write
|
||||
|
||||
296
Cargo.lock
generated
296
Cargo.lock
generated
@@ -179,7 +179,7 @@ dependencies = [
|
||||
"nom",
|
||||
"num-traits",
|
||||
"rusticata-macros",
|
||||
"thiserror",
|
||||
"thiserror 1.0.69",
|
||||
"time",
|
||||
]
|
||||
|
||||
@@ -718,14 +718,14 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "axum"
|
||||
version = "0.7.9"
|
||||
version = "0.8.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "edca88bc138befd0323b20752846e6587272d3b03b0343c8ea28a6f819e6e71f"
|
||||
checksum = "6d6fd624c75e18b3b4c6b9caf42b1afe24437daaee904069137d8bab077be8b8"
|
||||
dependencies = [
|
||||
"async-trait",
|
||||
"axum-core",
|
||||
"base64 0.22.1",
|
||||
"bytes",
|
||||
"form_urlencoded",
|
||||
"futures-util",
|
||||
"http 1.1.0",
|
||||
"http-body 1.0.0",
|
||||
@@ -733,7 +733,7 @@ dependencies = [
|
||||
"hyper 1.4.1",
|
||||
"hyper-util",
|
||||
"itoa",
|
||||
"matchit 0.7.0",
|
||||
"matchit",
|
||||
"memchr",
|
||||
"mime",
|
||||
"percent-encoding",
|
||||
@@ -746,7 +746,7 @@ dependencies = [
|
||||
"sha1",
|
||||
"sync_wrapper 1.0.1",
|
||||
"tokio",
|
||||
"tokio-tungstenite 0.24.0",
|
||||
"tokio-tungstenite 0.26.1",
|
||||
"tower 0.5.2",
|
||||
"tower-layer",
|
||||
"tower-service",
|
||||
@@ -755,11 +755,10 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "axum-core"
|
||||
version = "0.4.5"
|
||||
version = "0.5.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "09f2bd6146b97ae3359fa0cc6d6b376d9539582c7b4220f041a33ec24c226199"
|
||||
checksum = "df1362f362fd16024ae199c1970ce98f9661bf5ef94b9808fee734bc3698b733"
|
||||
dependencies = [
|
||||
"async-trait",
|
||||
"bytes",
|
||||
"futures-util",
|
||||
"http 1.1.0",
|
||||
@@ -942,6 +941,18 @@ version = "1.6.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "8c3c1a368f70d6cf7302d78f8f7093da241fb8e8807c05cc9e51a125895a6d5b"
|
||||
|
||||
[[package]]
|
||||
name = "bb8"
|
||||
version = "0.8.6"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "d89aabfae550a5c44b43ab941844ffcd2e993cb6900b342debf59e9ea74acdb8"
|
||||
dependencies = [
|
||||
"async-trait",
|
||||
"futures-util",
|
||||
"parking_lot 0.12.1",
|
||||
"tokio",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "bcder"
|
||||
version = "0.7.4"
|
||||
@@ -1118,7 +1129,7 @@ dependencies = [
|
||||
"log",
|
||||
"nix 0.25.1",
|
||||
"regex",
|
||||
"thiserror",
|
||||
"thiserror 1.0.69",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
@@ -1299,9 +1310,9 @@ dependencies = [
|
||||
"serde_with",
|
||||
"signal-hook",
|
||||
"tar",
|
||||
"thiserror",
|
||||
"thiserror 1.0.69",
|
||||
"tokio",
|
||||
"tokio-postgres",
|
||||
"tokio-postgres 0.7.7",
|
||||
"tokio-stream",
|
||||
"tokio-util",
|
||||
"tower 0.5.2",
|
||||
@@ -1408,9 +1419,9 @@ dependencies = [
|
||||
"serde",
|
||||
"serde_json",
|
||||
"storage_broker",
|
||||
"thiserror",
|
||||
"thiserror 1.0.69",
|
||||
"tokio",
|
||||
"tokio-postgres",
|
||||
"tokio-postgres 0.7.7",
|
||||
"tokio-util",
|
||||
"toml",
|
||||
"toml_edit",
|
||||
@@ -1786,11 +1797,24 @@ dependencies = [
|
||||
"chrono",
|
||||
"diesel_derives",
|
||||
"itoa",
|
||||
"pq-sys",
|
||||
"r2d2",
|
||||
"serde_json",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "diesel-async"
|
||||
version = "0.5.2"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "51a307ac00f7c23f526a04a77761a0519b9f0eb2838ebf5b905a58580095bdcb"
|
||||
dependencies = [
|
||||
"async-trait",
|
||||
"bb8",
|
||||
"diesel",
|
||||
"futures-util",
|
||||
"scoped-futures",
|
||||
"tokio",
|
||||
"tokio-postgres 0.7.12",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "diesel_derives"
|
||||
version = "2.2.1"
|
||||
@@ -2239,7 +2263,7 @@ dependencies = [
|
||||
"pin-project",
|
||||
"rand 0.8.5",
|
||||
"sha1",
|
||||
"thiserror",
|
||||
"thiserror 1.0.69",
|
||||
"tokio",
|
||||
"tokio-util",
|
||||
]
|
||||
@@ -3365,12 +3389,6 @@ dependencies = [
|
||||
"regex-automata 0.1.10",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "matchit"
|
||||
version = "0.7.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "b87248edafb776e59e6ee64a79086f65890d3510f2c656c000bf2a7e8a0aea40"
|
||||
|
||||
[[package]]
|
||||
name = "matchit"
|
||||
version = "0.8.4"
|
||||
@@ -3761,7 +3779,7 @@ dependencies = [
|
||||
"serde_json",
|
||||
"serde_path_to_error",
|
||||
"sha2",
|
||||
"thiserror",
|
||||
"thiserror 1.0.69",
|
||||
"url",
|
||||
]
|
||||
|
||||
@@ -3811,7 +3829,7 @@ dependencies = [
|
||||
"futures-sink",
|
||||
"js-sys",
|
||||
"pin-project-lite",
|
||||
"thiserror",
|
||||
"thiserror 1.0.69",
|
||||
"tracing",
|
||||
]
|
||||
|
||||
@@ -3843,7 +3861,7 @@ dependencies = [
|
||||
"opentelemetry_sdk",
|
||||
"prost",
|
||||
"reqwest",
|
||||
"thiserror",
|
||||
"thiserror 1.0.69",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
@@ -3879,7 +3897,7 @@ dependencies = [
|
||||
"percent-encoding",
|
||||
"rand 0.8.5",
|
||||
"serde_json",
|
||||
"thiserror",
|
||||
"thiserror 1.0.69",
|
||||
"tokio",
|
||||
"tokio-stream",
|
||||
"tracing",
|
||||
@@ -3993,7 +4011,7 @@ dependencies = [
|
||||
"remote_storage",
|
||||
"serde_json",
|
||||
"svg_fmt",
|
||||
"thiserror",
|
||||
"thiserror 1.0.69",
|
||||
"tokio",
|
||||
"tokio-util",
|
||||
"utils",
|
||||
@@ -4042,8 +4060,8 @@ dependencies = [
|
||||
"pageserver_compaction",
|
||||
"pin-project-lite",
|
||||
"postgres",
|
||||
"postgres-protocol",
|
||||
"postgres-types",
|
||||
"postgres-protocol 0.6.4",
|
||||
"postgres-types 0.2.4",
|
||||
"postgres_backend",
|
||||
"postgres_connection",
|
||||
"postgres_ffi",
|
||||
@@ -4069,12 +4087,12 @@ dependencies = [
|
||||
"strum_macros",
|
||||
"sysinfo",
|
||||
"tenant_size_model",
|
||||
"thiserror",
|
||||
"thiserror 1.0.69",
|
||||
"tikv-jemallocator",
|
||||
"tokio",
|
||||
"tokio-epoll-uring",
|
||||
"tokio-io-timeout",
|
||||
"tokio-postgres",
|
||||
"tokio-postgres 0.7.7",
|
||||
"tokio-stream",
|
||||
"tokio-tar",
|
||||
"tokio-util",
|
||||
@@ -4115,7 +4133,7 @@ dependencies = [
|
||||
"storage_broker",
|
||||
"strum",
|
||||
"strum_macros",
|
||||
"thiserror",
|
||||
"thiserror 1.0.69",
|
||||
"utils",
|
||||
]
|
||||
|
||||
@@ -4130,9 +4148,9 @@ dependencies = [
|
||||
"postgres",
|
||||
"reqwest",
|
||||
"serde",
|
||||
"thiserror",
|
||||
"thiserror 1.0.69",
|
||||
"tokio",
|
||||
"tokio-postgres",
|
||||
"tokio-postgres 0.7.7",
|
||||
"tokio-stream",
|
||||
"tokio-util",
|
||||
"utils",
|
||||
@@ -4438,7 +4456,7 @@ dependencies = [
|
||||
"futures-util",
|
||||
"log",
|
||||
"tokio",
|
||||
"tokio-postgres",
|
||||
"tokio-postgres 0.7.7",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
@@ -4459,6 +4477,24 @@ dependencies = [
|
||||
"stringprep",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "postgres-protocol"
|
||||
version = "0.6.7"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "acda0ebdebc28befa84bee35e651e4c5f09073d668c7aed4cf7e23c3cda84b23"
|
||||
dependencies = [
|
||||
"base64 0.22.1",
|
||||
"byteorder",
|
||||
"bytes",
|
||||
"fallible-iterator",
|
||||
"hmac",
|
||||
"md-5",
|
||||
"memchr",
|
||||
"rand 0.8.5",
|
||||
"sha2",
|
||||
"stringprep",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "postgres-protocol2"
|
||||
version = "0.1.0"
|
||||
@@ -4482,7 +4518,18 @@ source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#511f
|
||||
dependencies = [
|
||||
"bytes",
|
||||
"fallible-iterator",
|
||||
"postgres-protocol",
|
||||
"postgres-protocol 0.6.4",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "postgres-types"
|
||||
version = "0.2.8"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "f66ea23a2d0e5734297357705193335e0a957696f34bed2f2faefacb2fec336f"
|
||||
dependencies = [
|
||||
"bytes",
|
||||
"fallible-iterator",
|
||||
"postgres-protocol 0.6.7",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
@@ -4505,9 +4552,9 @@ dependencies = [
|
||||
"rustls 0.23.18",
|
||||
"rustls-pemfile 2.1.1",
|
||||
"serde",
|
||||
"thiserror",
|
||||
"thiserror 1.0.69",
|
||||
"tokio",
|
||||
"tokio-postgres",
|
||||
"tokio-postgres 0.7.7",
|
||||
"tokio-postgres-rustls",
|
||||
"tokio-rustls 0.26.0",
|
||||
"tokio-util",
|
||||
@@ -4522,7 +4569,7 @@ dependencies = [
|
||||
"itertools 0.10.5",
|
||||
"once_cell",
|
||||
"postgres",
|
||||
"tokio-postgres",
|
||||
"tokio-postgres 0.7.7",
|
||||
"url",
|
||||
]
|
||||
|
||||
@@ -4543,7 +4590,7 @@ dependencies = [
|
||||
"pprof",
|
||||
"regex",
|
||||
"serde",
|
||||
"thiserror",
|
||||
"thiserror 1.0.69",
|
||||
"tracing",
|
||||
"utils",
|
||||
]
|
||||
@@ -4554,7 +4601,7 @@ version = "0.1.0"
|
||||
dependencies = [
|
||||
"anyhow",
|
||||
"camino",
|
||||
"thiserror",
|
||||
"thiserror 1.0.69",
|
||||
"tokio",
|
||||
"workspace_hack",
|
||||
]
|
||||
@@ -4587,7 +4634,7 @@ dependencies = [
|
||||
"smallvec",
|
||||
"symbolic-demangle",
|
||||
"tempfile",
|
||||
"thiserror",
|
||||
"thiserror 1.0.69",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
@@ -4609,15 +4656,6 @@ version = "0.2.17"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "5b40af805b3121feab8a3c29f04d8ad262fa8e0561883e7653e024ae4479e6de"
|
||||
|
||||
[[package]]
|
||||
name = "pq-sys"
|
||||
version = "0.6.3"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "f6cc05d7ea95200187117196eee9edd0644424911821aeb28a18ce60ea0b8793"
|
||||
dependencies = [
|
||||
"vcpkg",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "pq_proto"
|
||||
version = "0.1.0"
|
||||
@@ -4625,10 +4663,10 @@ dependencies = [
|
||||
"byteorder",
|
||||
"bytes",
|
||||
"itertools 0.10.5",
|
||||
"postgres-protocol",
|
||||
"postgres-protocol 0.6.4",
|
||||
"rand 0.8.5",
|
||||
"serde",
|
||||
"thiserror",
|
||||
"thiserror 1.0.69",
|
||||
"tokio",
|
||||
]
|
||||
|
||||
@@ -4699,7 +4737,7 @@ dependencies = [
|
||||
"memchr",
|
||||
"parking_lot 0.12.1",
|
||||
"procfs",
|
||||
"thiserror",
|
||||
"thiserror 1.0.69",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
@@ -4869,11 +4907,11 @@ dependencies = [
|
||||
"strum",
|
||||
"strum_macros",
|
||||
"subtle",
|
||||
"thiserror",
|
||||
"thiserror 1.0.69",
|
||||
"tikv-jemalloc-ctl",
|
||||
"tikv-jemallocator",
|
||||
"tokio",
|
||||
"tokio-postgres",
|
||||
"tokio-postgres 0.7.7",
|
||||
"tokio-postgres2",
|
||||
"tokio-rustls 0.26.0",
|
||||
"tokio-tungstenite 0.21.0",
|
||||
@@ -4930,17 +4968,6 @@ dependencies = [
|
||||
"proc-macro2",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "r2d2"
|
||||
version = "0.8.10"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "51de85fb3fb6524929c8a2eb85e6b6d363de4e8c48f9e2c2eac4944abc181c93"
|
||||
dependencies = [
|
||||
"log",
|
||||
"parking_lot 0.12.1",
|
||||
"scheduled-thread-pool",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "rand"
|
||||
version = "0.7.3"
|
||||
@@ -5277,7 +5304,7 @@ dependencies = [
|
||||
"http 1.1.0",
|
||||
"reqwest",
|
||||
"serde",
|
||||
"thiserror",
|
||||
"thiserror 1.0.69",
|
||||
"tower-service",
|
||||
]
|
||||
|
||||
@@ -5297,7 +5324,7 @@ dependencies = [
|
||||
"reqwest",
|
||||
"reqwest-middleware",
|
||||
"retry-policies",
|
||||
"thiserror",
|
||||
"thiserror 1.0.69",
|
||||
"tokio",
|
||||
"tracing",
|
||||
"wasm-timer",
|
||||
@@ -5313,7 +5340,7 @@ dependencies = [
|
||||
"async-trait",
|
||||
"getrandom 0.2.11",
|
||||
"http 1.1.0",
|
||||
"matchit 0.8.4",
|
||||
"matchit",
|
||||
"opentelemetry",
|
||||
"reqwest",
|
||||
"reqwest-middleware",
|
||||
@@ -5672,7 +5699,7 @@ dependencies = [
|
||||
"pageserver_api",
|
||||
"parking_lot 0.12.1",
|
||||
"postgres",
|
||||
"postgres-protocol",
|
||||
"postgres-protocol 0.6.4",
|
||||
"postgres_backend",
|
||||
"postgres_ffi",
|
||||
"pprof",
|
||||
@@ -5692,11 +5719,11 @@ dependencies = [
|
||||
"storage_broker",
|
||||
"strum",
|
||||
"strum_macros",
|
||||
"thiserror",
|
||||
"thiserror 1.0.69",
|
||||
"tikv-jemallocator",
|
||||
"tokio",
|
||||
"tokio-io-timeout",
|
||||
"tokio-postgres",
|
||||
"tokio-postgres 0.7.7",
|
||||
"tokio-stream",
|
||||
"tokio-tar",
|
||||
"tokio-util",
|
||||
@@ -5731,7 +5758,7 @@ dependencies = [
|
||||
"reqwest",
|
||||
"safekeeper_api",
|
||||
"serde",
|
||||
"thiserror",
|
||||
"thiserror 1.0.69",
|
||||
"utils",
|
||||
"workspace_hack",
|
||||
]
|
||||
@@ -5755,12 +5782,12 @@ dependencies = [
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "scheduled-thread-pool"
|
||||
version = "0.2.7"
|
||||
name = "scoped-futures"
|
||||
version = "0.1.4"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "3cbc66816425a074528352f5789333ecff06ca41b36b0b0efdfbb29edc391a19"
|
||||
checksum = "1b24aae2d0636530f359e9d5ef0c04669d11c5e756699b27a6a6d845d8329091"
|
||||
dependencies = [
|
||||
"parking_lot 0.12.1",
|
||||
"pin-project-lite",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
@@ -5940,7 +5967,7 @@ dependencies = [
|
||||
"rand 0.8.5",
|
||||
"serde",
|
||||
"serde_json",
|
||||
"thiserror",
|
||||
"thiserror 1.0.69",
|
||||
"time",
|
||||
"url",
|
||||
"uuid",
|
||||
@@ -6012,7 +6039,7 @@ checksum = "c7715380eec75f029a4ef7de39a9200e0a63823176b759d055b613f5a87df6a6"
|
||||
dependencies = [
|
||||
"percent-encoding",
|
||||
"serde",
|
||||
"thiserror",
|
||||
"thiserror 1.0.69",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
@@ -6174,7 +6201,7 @@ checksum = "adc4e5204eb1910f40f9cfa375f6f05b68c3abac4b6fd879c8ff5e7ae8a0a085"
|
||||
dependencies = [
|
||||
"num-bigint",
|
||||
"num-traits",
|
||||
"thiserror",
|
||||
"thiserror 1.0.69",
|
||||
"time",
|
||||
]
|
||||
|
||||
@@ -6295,6 +6322,7 @@ dependencies = [
|
||||
"clap",
|
||||
"control_plane",
|
||||
"diesel",
|
||||
"diesel-async",
|
||||
"diesel_migrations",
|
||||
"fail",
|
||||
"futures",
|
||||
@@ -6309,16 +6337,16 @@ dependencies = [
|
||||
"pageserver_api",
|
||||
"pageserver_client",
|
||||
"postgres_connection",
|
||||
"r2d2",
|
||||
"rand 0.8.5",
|
||||
"reqwest",
|
||||
"routerify",
|
||||
"scoped-futures",
|
||||
"scopeguard",
|
||||
"serde",
|
||||
"serde_json",
|
||||
"strum",
|
||||
"strum_macros",
|
||||
"thiserror",
|
||||
"thiserror 1.0.69",
|
||||
"tokio",
|
||||
"tokio-util",
|
||||
"tracing",
|
||||
@@ -6365,7 +6393,7 @@ dependencies = [
|
||||
"serde_json",
|
||||
"storage_controller_client",
|
||||
"tokio",
|
||||
"tokio-postgres",
|
||||
"tokio-postgres 0.7.7",
|
||||
"tokio-postgres-rustls",
|
||||
"tokio-stream",
|
||||
"tokio-util",
|
||||
@@ -6610,7 +6638,16 @@ version = "1.0.69"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "b6aaf5339b578ea85b50e080feb250a3e8ae8cfcdff9a461c9ec2904bc923f52"
|
||||
dependencies = [
|
||||
"thiserror-impl",
|
||||
"thiserror-impl 1.0.69",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "thiserror"
|
||||
version = "2.0.11"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "d452f284b73e6d76dd36758a0c8684b1d5be31f92b89d07fd5822175732206fc"
|
||||
dependencies = [
|
||||
"thiserror-impl 2.0.11",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
@@ -6624,6 +6661,17 @@ dependencies = [
|
||||
"syn 2.0.90",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "thiserror-impl"
|
||||
version = "2.0.11"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "26afc1baea8a989337eeb52b6e72a039780ce45c3edfcc9c5b9d112feeb173c2"
|
||||
dependencies = [
|
||||
"proc-macro2",
|
||||
"quote",
|
||||
"syn 2.0.90",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "thread_local"
|
||||
version = "1.1.7"
|
||||
@@ -6774,13 +6822,13 @@ dependencies = [
|
||||
[[package]]
|
||||
name = "tokio-epoll-uring"
|
||||
version = "0.1.0"
|
||||
source = "git+https://github.com/neondatabase/tokio-epoll-uring.git?branch=main#33e00106a268644d02ba0461bbd64476073b0ee1"
|
||||
source = "git+https://github.com/neondatabase/tokio-epoll-uring.git?branch=main#781989bb540a1408b0b93daa1e9d1fa452195497"
|
||||
dependencies = [
|
||||
"futures",
|
||||
"nix 0.26.4",
|
||||
"once_cell",
|
||||
"scopeguard",
|
||||
"thiserror",
|
||||
"thiserror 1.0.69",
|
||||
"tokio",
|
||||
"tokio-util",
|
||||
"tracing",
|
||||
@@ -6824,13 +6872,39 @@ dependencies = [
|
||||
"percent-encoding",
|
||||
"phf",
|
||||
"pin-project-lite",
|
||||
"postgres-protocol",
|
||||
"postgres-types",
|
||||
"postgres-protocol 0.6.4",
|
||||
"postgres-types 0.2.4",
|
||||
"socket2",
|
||||
"tokio",
|
||||
"tokio-util",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "tokio-postgres"
|
||||
version = "0.7.12"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "3b5d3742945bc7d7f210693b0c58ae542c6fd47b17adbbda0885f3dcb34a6bdb"
|
||||
dependencies = [
|
||||
"async-trait",
|
||||
"byteorder",
|
||||
"bytes",
|
||||
"fallible-iterator",
|
||||
"futures-channel",
|
||||
"futures-util",
|
||||
"log",
|
||||
"parking_lot 0.12.1",
|
||||
"percent-encoding",
|
||||
"phf",
|
||||
"pin-project-lite",
|
||||
"postgres-protocol 0.6.7",
|
||||
"postgres-types 0.2.8",
|
||||
"rand 0.8.5",
|
||||
"socket2",
|
||||
"tokio",
|
||||
"tokio-util",
|
||||
"whoami",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "tokio-postgres-rustls"
|
||||
version = "0.12.0"
|
||||
@@ -6840,7 +6914,7 @@ dependencies = [
|
||||
"ring",
|
||||
"rustls 0.23.18",
|
||||
"tokio",
|
||||
"tokio-postgres",
|
||||
"tokio-postgres 0.7.7",
|
||||
"tokio-rustls 0.26.0",
|
||||
"x509-certificate",
|
||||
]
|
||||
@@ -6937,14 +7011,14 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "tokio-tungstenite"
|
||||
version = "0.24.0"
|
||||
version = "0.26.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "edc5f74e248dc973e0dbb7b74c7e0d6fcc301c694ff50049504004ef4d0cdcd9"
|
||||
checksum = "be4bf6fecd69fcdede0ec680aaf474cdab988f9de6bc73d3758f0160e3b7025a"
|
||||
dependencies = [
|
||||
"futures-util",
|
||||
"log",
|
||||
"tokio",
|
||||
"tungstenite 0.24.0",
|
||||
"tungstenite 0.26.1",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
@@ -7004,12 +7078,9 @@ version = "0.12.3"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "877c5b330756d856ffcc4553ab34a5684481ade925ecc54bcd1bf02b1d0d4d52"
|
||||
dependencies = [
|
||||
"async-stream",
|
||||
"async-trait",
|
||||
"axum",
|
||||
"base64 0.22.1",
|
||||
"bytes",
|
||||
"h2 0.4.4",
|
||||
"http 1.1.0",
|
||||
"http-body 1.0.0",
|
||||
"http-body-util",
|
||||
@@ -7021,7 +7092,6 @@ dependencies = [
|
||||
"prost",
|
||||
"rustls-native-certs 0.8.0",
|
||||
"rustls-pemfile 2.1.1",
|
||||
"socket2",
|
||||
"tokio",
|
||||
"tokio-rustls 0.26.0",
|
||||
"tokio-stream",
|
||||
@@ -7258,16 +7328,16 @@ dependencies = [
|
||||
"log",
|
||||
"rand 0.8.5",
|
||||
"sha1",
|
||||
"thiserror",
|
||||
"thiserror 1.0.69",
|
||||
"url",
|
||||
"utf-8",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "tungstenite"
|
||||
version = "0.24.0"
|
||||
version = "0.26.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "18e5b8366ee7a95b16d32197d0b2604b43a0be89dc5fac9f8e96ccafbaedda8a"
|
||||
checksum = "413083a99c579593656008130e29255e54dcaae495be556cc26888f211648c24"
|
||||
dependencies = [
|
||||
"byteorder",
|
||||
"bytes",
|
||||
@@ -7277,7 +7347,7 @@ dependencies = [
|
||||
"log",
|
||||
"rand 0.8.5",
|
||||
"sha1",
|
||||
"thiserror",
|
||||
"thiserror 2.0.11",
|
||||
"utf-8",
|
||||
]
|
||||
|
||||
@@ -7373,7 +7443,7 @@ dependencies = [
|
||||
[[package]]
|
||||
name = "uring-common"
|
||||
version = "0.1.0"
|
||||
source = "git+https://github.com/neondatabase/tokio-epoll-uring.git?branch=main#33e00106a268644d02ba0461bbd64476073b0ee1"
|
||||
source = "git+https://github.com/neondatabase/tokio-epoll-uring.git?branch=main#781989bb540a1408b0b93daa1e9d1fa452195497"
|
||||
dependencies = [
|
||||
"bytes",
|
||||
"io-uring",
|
||||
@@ -7472,7 +7542,7 @@ dependencies = [
|
||||
"signal-hook",
|
||||
"strum",
|
||||
"strum_macros",
|
||||
"thiserror",
|
||||
"thiserror 1.0.69",
|
||||
"tokio",
|
||||
"tokio-stream",
|
||||
"tokio-tar",
|
||||
@@ -7502,12 +7572,6 @@ version = "0.1.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "830b7e5d4d90034032940e4ace0d9a9a057e7a45cd94e6c007832e39edb82f6d"
|
||||
|
||||
[[package]]
|
||||
name = "vcpkg"
|
||||
version = "0.2.15"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "accd4ea62f7bb7a82fe23066fb0957d48ef677f6eeb8215f372f52e48bb32426"
|
||||
|
||||
[[package]]
|
||||
name = "version_check"
|
||||
version = "0.9.4"
|
||||
@@ -7527,7 +7591,7 @@ dependencies = [
|
||||
"serde_json",
|
||||
"sysinfo",
|
||||
"tokio",
|
||||
"tokio-postgres",
|
||||
"tokio-postgres 0.7.7",
|
||||
"tokio-util",
|
||||
"tracing",
|
||||
"tracing-subscriber",
|
||||
@@ -7578,11 +7642,10 @@ dependencies = [
|
||||
"remote_storage",
|
||||
"serde",
|
||||
"serde_json",
|
||||
"thiserror",
|
||||
"thiserror 1.0.69",
|
||||
"tikv-jemallocator",
|
||||
"tokio",
|
||||
"tokio-util",
|
||||
"tonic",
|
||||
"tonic-build",
|
||||
"tracing",
|
||||
"utils",
|
||||
@@ -7991,8 +8054,6 @@ version = "0.1.0"
|
||||
dependencies = [
|
||||
"ahash",
|
||||
"anyhow",
|
||||
"axum",
|
||||
"axum-core",
|
||||
"base64 0.13.1",
|
||||
"base64 0.21.1",
|
||||
"base64ct",
|
||||
@@ -8073,7 +8134,6 @@ dependencies = [
|
||||
"toml_edit",
|
||||
"tonic",
|
||||
"tower 0.4.13",
|
||||
"tower 0.5.2",
|
||||
"tracing",
|
||||
"tracing-core",
|
||||
"url",
|
||||
@@ -8111,7 +8171,7 @@ dependencies = [
|
||||
"ring",
|
||||
"signature 2.2.0",
|
||||
"spki 0.7.3",
|
||||
"thiserror",
|
||||
"thiserror 1.0.69",
|
||||
"zeroize",
|
||||
]
|
||||
|
||||
@@ -8128,7 +8188,7 @@ dependencies = [
|
||||
"nom",
|
||||
"oid-registry",
|
||||
"rusticata-macros",
|
||||
"thiserror",
|
||||
"thiserror 1.0.69",
|
||||
"time",
|
||||
]
|
||||
|
||||
|
||||
@@ -65,7 +65,7 @@ aws-smithy-types = "1.2"
|
||||
aws-credential-types = "1.2.0"
|
||||
aws-sigv4 = { version = "1.2", features = ["sign-http"] }
|
||||
aws-types = "1.3"
|
||||
axum = { version = "0.7.9", features = ["ws"] }
|
||||
axum = { version = "0.8.1", features = ["ws"] }
|
||||
base64 = "0.13.0"
|
||||
bincode = "1.3"
|
||||
bindgen = "0.70"
|
||||
@@ -187,7 +187,7 @@ tokio-tar = "0.3"
|
||||
tokio-util = { version = "0.7.10", features = ["io", "rt"] }
|
||||
toml = "0.8"
|
||||
toml_edit = "0.22"
|
||||
tonic = {version = "0.12.3", features = ["tls", "tls-roots"]}
|
||||
tonic = {version = "0.12.3", default-features = false, features = ["channel", "tls", "tls-roots"]}
|
||||
tower = { version = "0.5.2", default-features = false }
|
||||
tower-http = { version = "0.6.2", features = ["request-id", "trace"] }
|
||||
tower-service = "0.3.3"
|
||||
|
||||
@@ -45,7 +45,7 @@ COPY --chown=nonroot . .
|
||||
|
||||
ARG ADDITIONAL_RUSTFLAGS
|
||||
RUN set -e \
|
||||
&& PQ_LIB_DIR=$(pwd)/pg_install/v${STABLE_PG_VERSION}/lib RUSTFLAGS="-Clinker=clang -Clink-arg=-fuse-ld=mold -Clink-arg=-Wl,--no-rosegment -Cforce-frame-pointers=yes ${ADDITIONAL_RUSTFLAGS}" cargo build \
|
||||
&& RUSTFLAGS="-Clinker=clang -Clink-arg=-fuse-ld=mold -Clink-arg=-Wl,--no-rosegment -Cforce-frame-pointers=yes ${ADDITIONAL_RUSTFLAGS}" cargo build \
|
||||
--bin pg_sni_router \
|
||||
--bin pageserver \
|
||||
--bin pagectl \
|
||||
|
||||
2
Makefile
2
Makefile
@@ -64,8 +64,6 @@ CARGO_BUILD_FLAGS += $(filter -j1,$(MAKEFLAGS))
|
||||
CARGO_CMD_PREFIX += $(if $(filter n,$(MAKEFLAGS)),,+)
|
||||
# Force cargo not to print progress bar
|
||||
CARGO_CMD_PREFIX += CARGO_TERM_PROGRESS_WHEN=never CI=1
|
||||
# Set PQ_LIB_DIR to make sure `storage_controller` get linked with bundled libpq (through diesel)
|
||||
CARGO_CMD_PREFIX += PQ_LIB_DIR=$(POSTGRES_INSTALL_DIR)/v16/lib
|
||||
|
||||
CACHEDIR_TAG_CONTENTS := "Signature: 8a477f597d28d172789f06886806bc55"
|
||||
|
||||
|
||||
@@ -21,8 +21,10 @@ The Neon storage engine consists of two major components:
|
||||
|
||||
See developer documentation in [SUMMARY.md](/docs/SUMMARY.md) for more information.
|
||||
|
||||
## Running local installation
|
||||
## Running a local development environment
|
||||
|
||||
Neon can be run on a workstation for small experiments and to test code changes, by
|
||||
following these instructions.
|
||||
|
||||
#### Installing dependencies on Linux
|
||||
1. Install build dependencies and other applicable packages
|
||||
@@ -238,7 +240,7 @@ postgres=# select * from t;
|
||||
> cargo neon stop
|
||||
```
|
||||
|
||||
More advanced usages can be found at [Control Plane and Neon Local](./control_plane/README.md).
|
||||
More advanced usages can be found at [Local Development Control Plane (`neon_local`))](./control_plane/README.md).
|
||||
|
||||
#### Handling build failures
|
||||
|
||||
|
||||
@@ -67,6 +67,9 @@ RUN cd postgres && \
|
||||
# Enable some of contrib extensions
|
||||
echo 'trusted = true' >> /usr/local/pgsql/share/extension/autoinc.control && \
|
||||
echo 'trusted = true' >> /usr/local/pgsql/share/extension/dblink.control && \
|
||||
echo 'trusted = true' >> /usr/local/pgsql/share/extension/postgres_fdw.control && \
|
||||
file=/usr/local/pgsql/share/extension/postgres_fdw--1.0.sql && [ -e $file ] && \
|
||||
echo 'GRANT USAGE ON FOREIGN DATA WRAPPER postgres_fdw TO neon_superuser;' >> $file && \
|
||||
echo 'trusted = true' >> /usr/local/pgsql/share/extension/bloom.control && \
|
||||
echo 'trusted = true' >> /usr/local/pgsql/share/extension/earthdistance.control && \
|
||||
echo 'trusted = true' >> /usr/local/pgsql/share/extension/insert_username.control && \
|
||||
@@ -360,6 +363,8 @@ COPY compute/patches/pgvector.patch /pgvector.patch
|
||||
RUN wget https://github.com/pgvector/pgvector/archive/refs/tags/v0.8.0.tar.gz -O pgvector.tar.gz && \
|
||||
echo "867a2c328d4928a5a9d6f052cd3bc78c7d60228a9b914ad32aa3db88e9de27b0 pgvector.tar.gz" | sha256sum --check && \
|
||||
mkdir pgvector-src && cd pgvector-src && tar xzf ../pgvector.tar.gz --strip-components=1 -C . && \
|
||||
wget https://github.com/pgvector/pgvector/raw/refs/tags/v0.7.4/sql/vector.sql -O ./sql/vector--0.7.4.sql && \
|
||||
echo "10218d05dc02299562252a9484775178b14a1d8edb92a2d1672ef488530f7778 ./sql/vector--0.7.4.sql" | sha256sum --check && \
|
||||
patch -p1 < /pgvector.patch && \
|
||||
make -j $(getconf _NPROCESSORS_ONLN) OPTFLAGS="" && \
|
||||
make -j $(getconf _NPROCESSORS_ONLN) OPTFLAGS="" install && \
|
||||
@@ -1261,11 +1266,12 @@ RUN set -e \
|
||||
|
||||
#########################################################################################
|
||||
#
|
||||
# Layers "postgres-exporter" and "sql-exporter"
|
||||
# Layers "postgres-exporter", "pgbouncer-exporter", and "sql-exporter"
|
||||
#
|
||||
#########################################################################################
|
||||
|
||||
FROM quay.io/prometheuscommunity/postgres-exporter:v0.16.0 AS postgres-exporter
|
||||
FROM quay.io/prometheuscommunity/pgbouncer-exporter:v0.10.2 AS pgbouncer-exporter
|
||||
|
||||
# Keep the version the same as in build-tools.Dockerfile and
|
||||
# test_runner/regress/test_compute_metrics.py.
|
||||
@@ -1347,9 +1353,6 @@ COPY --from=pg-roaringbitmap-pg-build /pg_roaringbitmap.tar.gz /ext-src
|
||||
COPY --from=pg-semver-pg-build /pg_semver.tar.gz /ext-src
|
||||
#COPY --from=pg-embedding-pg-build /home/nonroot/pg_embedding-src/ /ext-src
|
||||
#COPY --from=wal2json-pg-build /wal2json_2_5.tar.gz /ext-src
|
||||
#pg_anon is not supported yet for pg v17 so, don't fail if nothing found
|
||||
COPY --from=pg-anon-pg-build /pg_anon.tar.g? /ext-src
|
||||
COPY compute/patches/pg_anon.patch /ext-src
|
||||
COPY --from=pg-ivm-build /pg_ivm.tar.gz /ext-src
|
||||
COPY --from=pg-partman-build /pg_partman.tar.gz /ext-src
|
||||
RUN cd /ext-src/ && for f in *.tar.gz; \
|
||||
@@ -1360,9 +1363,6 @@ RUN cd /ext-src/rum-src && patch -p1 <../rum.patch
|
||||
RUN cd /ext-src/pgvector-src && patch -p1 <../pgvector.patch
|
||||
RUN cd /ext-src/pg_hint_plan-src && patch -p1 < /ext-src/pg_hint_plan_${PG_VERSION}.patch
|
||||
COPY --chmod=755 docker-compose/run-tests.sh /run-tests.sh
|
||||
RUN case "${PG_VERSION}" in "v17") \
|
||||
echo "postgresql_anonymizer does not yet support PG17" && exit 0;; \
|
||||
esac && patch -p1 </ext-src/pg_anon.patch
|
||||
RUN patch -p1 </ext-src/pg_cron.patch
|
||||
ENV PATH=/usr/local/pgsql/bin:$PATH
|
||||
ENV PGHOST=compute
|
||||
@@ -1403,6 +1403,7 @@ RUN mkdir -p /etc/local_proxy && chown postgres:postgres /etc/local_proxy
|
||||
|
||||
# Metrics exporter binaries and configuration files
|
||||
COPY --from=postgres-exporter /bin/postgres_exporter /bin/postgres_exporter
|
||||
COPY --from=pgbouncer-exporter /bin/pgbouncer_exporter /bin/pgbouncer_exporter
|
||||
COPY --from=sql-exporter /bin/sql_exporter /bin/sql_exporter
|
||||
|
||||
COPY --chown=postgres compute/etc/postgres_exporter.yml /etc/postgres_exporter.yml
|
||||
|
||||
@@ -19,6 +19,8 @@ max_prepared_statements=0
|
||||
admin_users=postgres
|
||||
unix_socket_dir=/tmp/
|
||||
unix_socket_mode=0777
|
||||
; required for pgbouncer_exporter
|
||||
ignore_startup_parameters=extra_float_digits
|
||||
|
||||
;; Disable connection logging. It produces a lot of logs that no one looks at,
|
||||
;; and we can get similar log entries from the proxy too. We had incidents in
|
||||
|
||||
@@ -1,8 +1,24 @@
|
||||
diff --git a/Makefile b/Makefile
|
||||
index 7a4b88c..56678af 100644
|
||||
--- a/Makefile
|
||||
+++ b/Makefile
|
||||
@@ -3,7 +3,10 @@ EXTVERSION = 0.8.0
|
||||
|
||||
MODULE_big = vector
|
||||
DATA = $(wildcard sql/*--*--*.sql)
|
||||
-DATA_built = sql/$(EXTENSION)--$(EXTVERSION).sql
|
||||
+# This change is needed to install different per-version SQL files
|
||||
+# like pgvector--0.8.0.sql and pgvector--0.7.4.sql
|
||||
+# The corresponding file is downloaded during the Docker image build process
|
||||
+DATA_built = sql/$(EXTENSION)--$(EXTVERSION).sql sql/vector--0.7.4.sql
|
||||
OBJS = src/bitutils.o src/bitvec.o src/halfutils.o src/halfvec.o src/hnsw.o src/hnswbuild.o src/hnswinsert.o src/hnswscan.o src/hnswutils.o src/hnswvacuum.o src/ivfbuild.o src/ivfflat.o src/ivfinsert.o src/ivfkmeans.o src/ivfscan.o src/ivfutils.o src/ivfvacuum.o src/sparsevec.o src/vector.o
|
||||
HEADERS = src/halfvec.h src/sparsevec.h src/vector.h
|
||||
|
||||
diff --git a/src/hnswbuild.c b/src/hnswbuild.c
|
||||
index dcfb2bd..d5189ee 100644
|
||||
index b667478..fc1897c 100644
|
||||
--- a/src/hnswbuild.c
|
||||
+++ b/src/hnswbuild.c
|
||||
@@ -860,9 +860,17 @@ HnswParallelBuildMain(dsm_segment *seg, shm_toc *toc)
|
||||
@@ -843,9 +843,17 @@ HnswParallelBuildMain(dsm_segment *seg, shm_toc *toc)
|
||||
|
||||
hnswarea = shm_toc_lookup(toc, PARALLEL_KEY_HNSW_AREA, false);
|
||||
|
||||
@@ -20,7 +36,7 @@ index dcfb2bd..d5189ee 100644
|
||||
/* Close relations within worker */
|
||||
index_close(indexRel, indexLockmode);
|
||||
table_close(heapRel, heapLockmode);
|
||||
@@ -1117,12 +1125,38 @@ BuildIndex(Relation heap, Relation index, IndexInfo *indexInfo,
|
||||
@@ -1100,12 +1108,38 @@ BuildIndex(Relation heap, Relation index, IndexInfo *indexInfo,
|
||||
SeedRandom(42);
|
||||
#endif
|
||||
|
||||
|
||||
@@ -27,6 +27,10 @@ commands:
|
||||
user: nobody
|
||||
sysvInitAction: respawn
|
||||
shell: 'DATA_SOURCE_NAME="user=cloud_admin sslmode=disable dbname=postgres application_name=postgres-exporter" /bin/postgres_exporter --config.file=/etc/postgres_exporter.yml'
|
||||
- name: pgbouncer-exporter
|
||||
user: postgres
|
||||
sysvInitAction: respawn
|
||||
shell: '/bin/pgbouncer_exporter --pgBouncer.connectionString="postgres:///pgbouncer?host=/tmp&port=6432&dbname=pgbouncer&user=pgbouncer"'
|
||||
- name: sql-exporter
|
||||
user: nobody
|
||||
sysvInitAction: respawn
|
||||
|
||||
@@ -27,6 +27,10 @@ commands:
|
||||
user: nobody
|
||||
sysvInitAction: respawn
|
||||
shell: 'DATA_SOURCE_NAME="user=cloud_admin sslmode=disable dbname=postgres application_name=postgres-exporter" /bin/postgres_exporter --config.file=/etc/postgres_exporter.yml'
|
||||
- name: pgbouncer-exporter
|
||||
user: postgres
|
||||
sysvInitAction: respawn
|
||||
shell: '/bin/pgbouncer_exporter --pgBouncer.connectionString="postgres:///pgbouncer?host=/tmp&port=6432&dbname=pgbouncer&user=pgbouncer"'
|
||||
- name: sql-exporter
|
||||
user: nobody
|
||||
sysvInitAction: respawn
|
||||
|
||||
@@ -58,6 +58,8 @@ struct Args {
|
||||
pg_bin_dir: Utf8PathBuf,
|
||||
#[clap(long)]
|
||||
pg_lib_dir: Utf8PathBuf,
|
||||
#[clap(long)]
|
||||
pg_port: Option<u16>, // port to run postgres on, 5432 is default
|
||||
}
|
||||
|
||||
#[serde_with::serde_as]
|
||||
@@ -74,6 +76,13 @@ enum EncryptionSecret {
|
||||
KMS { key_id: String },
|
||||
}
|
||||
|
||||
// copied from pageserver_api::config::defaults::DEFAULT_LOCALE to avoid dependency just for a constant
|
||||
const DEFAULT_LOCALE: &str = if cfg!(target_os = "macos") {
|
||||
"C"
|
||||
} else {
|
||||
"C.UTF-8"
|
||||
};
|
||||
|
||||
#[tokio::main]
|
||||
pub(crate) async fn main() -> anyhow::Result<()> {
|
||||
utils::logging::init(
|
||||
@@ -97,6 +106,10 @@ pub(crate) async fn main() -> anyhow::Result<()> {
|
||||
let working_directory = args.working_directory;
|
||||
let pg_bin_dir = args.pg_bin_dir;
|
||||
let pg_lib_dir = args.pg_lib_dir;
|
||||
let pg_port = args.pg_port.unwrap_or_else(|| {
|
||||
info!("pg_port not specified, using default 5432");
|
||||
5432
|
||||
});
|
||||
|
||||
// Initialize AWS clients only if s3_prefix is specified
|
||||
let (aws_config, kms_client) = if args.s3_prefix.is_some() {
|
||||
@@ -180,7 +193,7 @@ pub(crate) async fn main() -> anyhow::Result<()> {
|
||||
let superuser = "cloud_admin"; // XXX: this shouldn't be hard-coded
|
||||
postgres_initdb::do_run_initdb(postgres_initdb::RunInitdbArgs {
|
||||
superuser,
|
||||
locale: "en_US.UTF-8", // XXX: this shouldn't be hard-coded,
|
||||
locale: DEFAULT_LOCALE, // XXX: this shouldn't be hard-coded,
|
||||
pg_version,
|
||||
initdb_bin: pg_bin_dir.join("initdb").as_ref(),
|
||||
library_search_path: &pg_lib_dir, // TODO: is this right? Prob works in compute image, not sure about neon_local.
|
||||
@@ -197,6 +210,7 @@ pub(crate) async fn main() -> anyhow::Result<()> {
|
||||
let mut postgres_proc = tokio::process::Command::new(pgbin)
|
||||
.arg("-D")
|
||||
.arg(&pgdata_dir)
|
||||
.args(["-p", &format!("{pg_port}")])
|
||||
.args(["-c", "wal_level=minimal"])
|
||||
.args(["-c", "shared_buffers=10GB"])
|
||||
.args(["-c", "max_wal_senders=0"])
|
||||
@@ -216,6 +230,7 @@ pub(crate) async fn main() -> anyhow::Result<()> {
|
||||
),
|
||||
])
|
||||
.env_clear()
|
||||
.env("LD_LIBRARY_PATH", &pg_lib_dir)
|
||||
.stdout(std::process::Stdio::piped())
|
||||
.stderr(std::process::Stdio::piped())
|
||||
.spawn()
|
||||
@@ -232,7 +247,7 @@ pub(crate) async fn main() -> anyhow::Result<()> {
|
||||
|
||||
// Create neondb database in the running postgres
|
||||
let restore_pg_connstring =
|
||||
format!("host=localhost port=5432 user={superuser} dbname=postgres");
|
||||
format!("host=localhost port={pg_port} user={superuser} dbname=postgres");
|
||||
|
||||
let start_time = std::time::Instant::now();
|
||||
|
||||
@@ -314,6 +329,7 @@ pub(crate) async fn main() -> anyhow::Result<()> {
|
||||
.arg(&source_connection_string)
|
||||
// how we run it
|
||||
.env_clear()
|
||||
.env("LD_LIBRARY_PATH", &pg_lib_dir)
|
||||
.kill_on_drop(true)
|
||||
.stdout(std::process::Stdio::piped())
|
||||
.stderr(std::process::Stdio::piped())
|
||||
@@ -347,6 +363,7 @@ pub(crate) async fn main() -> anyhow::Result<()> {
|
||||
.arg(&dumpdir)
|
||||
// how we run it
|
||||
.env_clear()
|
||||
.env("LD_LIBRARY_PATH", &pg_lib_dir)
|
||||
.kill_on_drop(true)
|
||||
.stdout(std::process::Stdio::piped())
|
||||
.stderr(std::process::Stdio::piped())
|
||||
|
||||
@@ -41,14 +41,14 @@ use crate::local_proxy;
|
||||
use crate::pg_helpers::*;
|
||||
use crate::spec::*;
|
||||
use crate::spec_apply::ApplySpecPhase::{
|
||||
CreateAndAlterDatabases, CreateAndAlterRoles, CreateAvailabilityCheck, CreateSuperUser,
|
||||
DropInvalidDatabases, DropRoles, HandleNeonExtension, HandleOtherExtensions,
|
||||
RenameAndDeleteDatabases, RenameRoles, RunInEachDatabase,
|
||||
CreateAndAlterDatabases, CreateAndAlterRoles, CreateAvailabilityCheck, CreateSchemaNeon,
|
||||
CreateSuperUser, DropInvalidDatabases, DropRoles, FinalizeDropLogicalSubscriptions,
|
||||
HandleNeonExtension, HandleOtherExtensions, RenameAndDeleteDatabases, RenameRoles,
|
||||
RunInEachDatabase,
|
||||
};
|
||||
use crate::spec_apply::PerDatabasePhase;
|
||||
use crate::spec_apply::PerDatabasePhase::{
|
||||
ChangeSchemaPerms, DeleteDBRoleReferences, DropSubscriptionsForDeletedDatabases,
|
||||
HandleAnonExtension,
|
||||
ChangeSchemaPerms, DeleteDBRoleReferences, DropLogicalSubscriptions, HandleAnonExtension,
|
||||
};
|
||||
use crate::spec_apply::{apply_operations, MutableApplyContext, DB};
|
||||
use crate::sync_sk::{check_if_synced, ping_safekeeper};
|
||||
@@ -340,6 +340,15 @@ impl ComputeNode {
|
||||
self.state.lock().unwrap().status
|
||||
}
|
||||
|
||||
pub fn get_timeline_id(&self) -> Option<TimelineId> {
|
||||
self.state
|
||||
.lock()
|
||||
.unwrap()
|
||||
.pspec
|
||||
.as_ref()
|
||||
.map(|s| s.timeline_id)
|
||||
}
|
||||
|
||||
// Remove `pgdata` directory and create it again with right permissions.
|
||||
fn create_pgdata(&self) -> Result<()> {
|
||||
// Ignore removal error, likely it is a 'No such file or directory (os error 2)'.
|
||||
@@ -929,6 +938,48 @@ impl ComputeNode {
|
||||
.map(|role| (role.name.clone(), role))
|
||||
.collect::<HashMap<String, Role>>();
|
||||
|
||||
// Check if we need to drop subscriptions before starting the endpoint.
|
||||
//
|
||||
// It is important to do this operation exactly once when endpoint starts on a new branch.
|
||||
// Otherwise, we may drop not inherited, but newly created subscriptions.
|
||||
//
|
||||
// We cannot rely only on spec.drop_subscriptions_before_start flag,
|
||||
// because if for some reason compute restarts inside VM,
|
||||
// it will start again with the same spec and flag value.
|
||||
//
|
||||
// To handle this, we save the fact of the operation in the database
|
||||
// in the neon.drop_subscriptions_done table.
|
||||
// If the table does not exist, we assume that the operation was never performed, so we must do it.
|
||||
// If table exists, we check if the operation was performed on the current timelilne.
|
||||
//
|
||||
let mut drop_subscriptions_done = false;
|
||||
|
||||
if spec.drop_subscriptions_before_start {
|
||||
let timeline_id = self.get_timeline_id().context("timeline_id must be set")?;
|
||||
let query = format!("select 1 from neon.drop_subscriptions_done where timeline_id = '{}'", timeline_id);
|
||||
|
||||
info!("Checking if drop subscription operation was already performed for timeline_id: {}", timeline_id);
|
||||
|
||||
drop_subscriptions_done = match
|
||||
client.simple_query(&query).await {
|
||||
Ok(result) => {
|
||||
matches!(&result[0], postgres::SimpleQueryMessage::Row(_))
|
||||
},
|
||||
Err(e) =>
|
||||
{
|
||||
match e.code() {
|
||||
Some(&SqlState::UNDEFINED_TABLE) => false,
|
||||
_ => {
|
||||
// We don't expect any other error here, except for the schema/table not existing
|
||||
error!("Error checking if drop subscription operation was already performed: {}", e);
|
||||
return Err(e.into());
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
|
||||
let jwks_roles = Arc::new(
|
||||
spec.as_ref()
|
||||
.local_proxy_config
|
||||
@@ -996,7 +1047,7 @@ impl ComputeNode {
|
||||
jwks_roles.clone(),
|
||||
concurrency_token.clone(),
|
||||
db,
|
||||
[DropSubscriptionsForDeletedDatabases].to_vec(),
|
||||
[DropLogicalSubscriptions].to_vec(),
|
||||
);
|
||||
|
||||
Ok(spawn(fut))
|
||||
@@ -1024,6 +1075,7 @@ impl ComputeNode {
|
||||
CreateAndAlterRoles,
|
||||
RenameAndDeleteDatabases,
|
||||
CreateAndAlterDatabases,
|
||||
CreateSchemaNeon,
|
||||
] {
|
||||
info!("Applying phase {:?}", &phase);
|
||||
apply_operations(
|
||||
@@ -1064,6 +1116,17 @@ impl ComputeNode {
|
||||
}
|
||||
|
||||
let conf = Arc::new(conf);
|
||||
let mut phases = vec![
|
||||
DeleteDBRoleReferences,
|
||||
ChangeSchemaPerms,
|
||||
HandleAnonExtension,
|
||||
];
|
||||
|
||||
if spec.drop_subscriptions_before_start && !drop_subscriptions_done {
|
||||
info!("Adding DropLogicalSubscriptions phase because drop_subscriptions_before_start is set");
|
||||
phases.push(DropLogicalSubscriptions);
|
||||
}
|
||||
|
||||
let fut = Self::apply_spec_sql_db(
|
||||
spec.clone(),
|
||||
conf,
|
||||
@@ -1071,12 +1134,7 @@ impl ComputeNode {
|
||||
jwks_roles.clone(),
|
||||
concurrency_token.clone(),
|
||||
db,
|
||||
[
|
||||
DeleteDBRoleReferences,
|
||||
ChangeSchemaPerms,
|
||||
HandleAnonExtension,
|
||||
]
|
||||
.to_vec(),
|
||||
phases,
|
||||
);
|
||||
|
||||
Ok(spawn(fut))
|
||||
@@ -1088,12 +1146,20 @@ impl ComputeNode {
|
||||
handle.await??;
|
||||
}
|
||||
|
||||
for phase in vec![
|
||||
let mut phases = vec![
|
||||
HandleOtherExtensions,
|
||||
HandleNeonExtension,
|
||||
HandleNeonExtension, // This step depends on CreateSchemaNeon
|
||||
CreateAvailabilityCheck,
|
||||
DropRoles,
|
||||
] {
|
||||
];
|
||||
|
||||
// This step depends on CreateSchemaNeon
|
||||
if spec.drop_subscriptions_before_start && !drop_subscriptions_done {
|
||||
info!("Adding FinalizeDropLogicalSubscriptions phase because drop_subscriptions_before_start is set");
|
||||
phases.push(FinalizeDropLogicalSubscriptions);
|
||||
}
|
||||
|
||||
for phase in phases {
|
||||
debug!("Applying phase {:?}", &phase);
|
||||
apply_operations(
|
||||
spec.clone(),
|
||||
@@ -1463,6 +1529,14 @@ impl ComputeNode {
|
||||
Ok(())
|
||||
},
|
||||
)?;
|
||||
|
||||
let postgresql_conf_path = pgdata_path.join("postgresql.conf");
|
||||
if config::line_in_file(
|
||||
&postgresql_conf_path,
|
||||
"neon.disable_logical_replication_subscribers=false",
|
||||
)? {
|
||||
info!("updated postgresql.conf to set neon.disable_logical_replication_subscribers=false");
|
||||
}
|
||||
self.pg_reload_conf()?;
|
||||
}
|
||||
self.post_apply_config()?;
|
||||
|
||||
@@ -129,6 +129,13 @@ pub fn write_postgres_conf(
|
||||
|
||||
writeln!(file, "neon.extension_server_port={}", extension_server_port)?;
|
||||
|
||||
if spec.drop_subscriptions_before_start {
|
||||
writeln!(file, "neon.disable_logical_replication_subscribers=true")?;
|
||||
} else {
|
||||
// be explicit about the default value
|
||||
writeln!(file, "neon.disable_logical_replication_subscribers=false")?;
|
||||
}
|
||||
|
||||
// This is essential to keep this line at the end of the file,
|
||||
// because it is intended to override any settings above.
|
||||
writeln!(file, "include_if_exists = 'compute_ctl_temp_override.conf'")?;
|
||||
|
||||
@@ -85,6 +85,8 @@ use tracing::info;
|
||||
use tracing::log::warn;
|
||||
use zstd::stream::read::Decoder;
|
||||
|
||||
use crate::metrics::{REMOTE_EXT_REQUESTS_FAILED, REMOTE_EXT_REQUESTS_TOTAL, UNKNOWN_HTTP_STATUS};
|
||||
|
||||
fn get_pg_config(argument: &str, pgbin: &str) -> String {
|
||||
// gives the result of `pg_config [argument]`
|
||||
// where argument is a flag like `--version` or `--sharedir`
|
||||
@@ -258,21 +260,60 @@ async fn download_extension_tar(ext_remote_storage: &str, ext_path: &str) -> Res
|
||||
|
||||
info!("Download extension {:?} from uri {:?}", ext_path, uri);
|
||||
|
||||
let resp = reqwest::get(uri).await?;
|
||||
REMOTE_EXT_REQUESTS_TOTAL.with_label_values(&[]).inc();
|
||||
|
||||
match resp.status() {
|
||||
match do_extension_server_request(&uri).await {
|
||||
Ok(resp) => {
|
||||
info!(
|
||||
"Successfully downloaded remote extension data {:?}",
|
||||
ext_path
|
||||
);
|
||||
Ok(resp)
|
||||
}
|
||||
Err((msg, status)) => {
|
||||
let status_str = status
|
||||
.map(|s| s.to_string())
|
||||
.unwrap_or(UNKNOWN_HTTP_STATUS.to_string());
|
||||
REMOTE_EXT_REQUESTS_FAILED
|
||||
.with_label_values(&[&status_str])
|
||||
.inc();
|
||||
bail!(msg);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Do a single remote extensions server request.
|
||||
// Return result or (error message + status code) in case of any failures.
|
||||
async fn do_extension_server_request(uri: &str) -> Result<Bytes, (String, Option<StatusCode>)> {
|
||||
let resp = reqwest::get(uri).await.map_err(|e| {
|
||||
(
|
||||
format!("could not perform remote extensions server request: {}", e),
|
||||
None,
|
||||
)
|
||||
})?;
|
||||
let status = resp.status();
|
||||
|
||||
match status {
|
||||
StatusCode::OK => match resp.bytes().await {
|
||||
Ok(resp) => {
|
||||
info!("Download extension {:?} completed successfully", ext_path);
|
||||
Ok(resp)
|
||||
}
|
||||
Err(e) => bail!("could not deserialize remote extension response: {}", e),
|
||||
Ok(resp) => Ok(resp),
|
||||
Err(e) => Err((
|
||||
format!("could not read remote extensions server response: {}", e),
|
||||
// It's fine to return and report error with status as 200 OK,
|
||||
// because we still failed to read the response.
|
||||
Some(status),
|
||||
)),
|
||||
},
|
||||
StatusCode::SERVICE_UNAVAILABLE => bail!("remote extension is temporarily unavailable"),
|
||||
_ => bail!(
|
||||
"unexpected remote extension response status code: {}",
|
||||
resp.status()
|
||||
),
|
||||
StatusCode::SERVICE_UNAVAILABLE => Err((
|
||||
"remote extensions server is temporarily unavailable".to_string(),
|
||||
Some(status),
|
||||
)),
|
||||
_ => Err((
|
||||
format!(
|
||||
"unexpected remote extensions server response status code: {}",
|
||||
status
|
||||
),
|
||||
Some(status),
|
||||
)),
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -1,9 +1,6 @@
|
||||
use std::ops::{Deref, DerefMut};
|
||||
|
||||
use axum::{
|
||||
async_trait,
|
||||
extract::{rejection::JsonRejection, FromRequest, Request},
|
||||
};
|
||||
use axum::extract::{rejection::JsonRejection, FromRequest, Request};
|
||||
use compute_api::responses::GenericAPIError;
|
||||
use http::StatusCode;
|
||||
|
||||
@@ -12,7 +9,6 @@ use http::StatusCode;
|
||||
#[derive(Debug, Clone, Copy, Default)]
|
||||
pub(crate) struct Json<T>(pub T);
|
||||
|
||||
#[async_trait]
|
||||
impl<S, T> FromRequest<S> for Json<T>
|
||||
where
|
||||
axum::Json<T>: FromRequest<S, Rejection = JsonRejection>,
|
||||
|
||||
@@ -1,9 +1,6 @@
|
||||
use std::ops::{Deref, DerefMut};
|
||||
|
||||
use axum::{
|
||||
async_trait,
|
||||
extract::{rejection::PathRejection, FromRequestParts},
|
||||
};
|
||||
use axum::extract::{rejection::PathRejection, FromRequestParts};
|
||||
use compute_api::responses::GenericAPIError;
|
||||
use http::{request::Parts, StatusCode};
|
||||
|
||||
@@ -12,7 +9,6 @@ use http::{request::Parts, StatusCode};
|
||||
#[derive(Debug, Clone, Copy, Default)]
|
||||
pub(crate) struct Path<T>(pub T);
|
||||
|
||||
#[async_trait]
|
||||
impl<S, T> FromRequestParts<S> for Path<T>
|
||||
where
|
||||
axum::extract::Path<T>: FromRequestParts<S, Rejection = PathRejection>,
|
||||
|
||||
@@ -1,9 +1,6 @@
|
||||
use std::ops::{Deref, DerefMut};
|
||||
|
||||
use axum::{
|
||||
async_trait,
|
||||
extract::{rejection::QueryRejection, FromRequestParts},
|
||||
};
|
||||
use axum::extract::{rejection::QueryRejection, FromRequestParts};
|
||||
use compute_api::responses::GenericAPIError;
|
||||
use http::{request::Parts, StatusCode};
|
||||
|
||||
@@ -12,7 +9,6 @@ use http::{request::Parts, StatusCode};
|
||||
#[derive(Debug, Clone, Copy, Default)]
|
||||
pub(crate) struct Query<T>(pub T);
|
||||
|
||||
#[async_trait]
|
||||
impl<S, T> FromRequestParts<S> for Query<T>
|
||||
where
|
||||
axum::extract::Query<T>: FromRequestParts<S, Rejection = QueryRejection>,
|
||||
|
||||
@@ -2,17 +2,16 @@ use axum::{body::Body, response::Response};
|
||||
use http::header::CONTENT_TYPE;
|
||||
use http::StatusCode;
|
||||
use metrics::proto::MetricFamily;
|
||||
use metrics::Encoder;
|
||||
use metrics::TextEncoder;
|
||||
use metrics::{Encoder, TextEncoder};
|
||||
|
||||
use crate::{http::JsonResponse, installed_extensions};
|
||||
use crate::{http::JsonResponse, metrics::collect};
|
||||
|
||||
/// Expose Prometheus metrics.
|
||||
pub(in crate::http) async fn get_metrics() -> Response {
|
||||
// When we call TextEncoder::encode() below, it will immediately return an
|
||||
// error if a metric family has no metrics, so we need to preemptively
|
||||
// filter out metric families with no metrics.
|
||||
let metrics = installed_extensions::collect()
|
||||
let metrics = collect()
|
||||
.into_iter()
|
||||
.filter(|m| !m.get_metric().is_empty())
|
||||
.collect::<Vec<MetricFamily>>();
|
||||
|
||||
@@ -55,7 +55,7 @@ async fn serve(port: u16, compute: Arc<ComputeNode>) {
|
||||
.route("/database_schema", get(database_schema::get_schema_dump))
|
||||
.route("/dbs_and_roles", get(dbs_and_roles::get_catalog_objects))
|
||||
.route(
|
||||
"/extension_server/*filename",
|
||||
"/extension_server/{*filename}",
|
||||
post(extension_server::download_extension),
|
||||
)
|
||||
.route("/extensions", post(extensions::install_extension))
|
||||
|
||||
@@ -1,13 +1,10 @@
|
||||
use compute_api::responses::{InstalledExtension, InstalledExtensions};
|
||||
use metrics::proto::MetricFamily;
|
||||
use std::collections::HashMap;
|
||||
|
||||
use anyhow::Result;
|
||||
use postgres::{Client, NoTls};
|
||||
|
||||
use metrics::core::Collector;
|
||||
use metrics::{register_uint_gauge_vec, UIntGaugeVec};
|
||||
use once_cell::sync::Lazy;
|
||||
use crate::metrics::INSTALLED_EXTENSIONS;
|
||||
|
||||
/// We don't reuse get_existing_dbs() just for code clarity
|
||||
/// and to make database listing query here more explicit.
|
||||
@@ -102,16 +99,3 @@ pub fn get_installed_extensions(mut conf: postgres::config::Config) -> Result<In
|
||||
extensions: extensions_map.into_values().collect(),
|
||||
})
|
||||
}
|
||||
|
||||
static INSTALLED_EXTENSIONS: Lazy<UIntGaugeVec> = Lazy::new(|| {
|
||||
register_uint_gauge_vec!(
|
||||
"compute_installed_extensions",
|
||||
"Number of databases where the version of extension is installed",
|
||||
&["extension_name", "version", "owned_by_superuser"]
|
||||
)
|
||||
.expect("failed to define a metric")
|
||||
});
|
||||
|
||||
pub fn collect() -> Vec<MetricFamily> {
|
||||
INSTALLED_EXTENSIONS.collect()
|
||||
}
|
||||
|
||||
@@ -16,6 +16,7 @@ pub mod extension_server;
|
||||
pub mod installed_extensions;
|
||||
pub mod local_proxy;
|
||||
pub mod lsn_lease;
|
||||
pub mod metrics;
|
||||
mod migration;
|
||||
pub mod monitor;
|
||||
pub mod params;
|
||||
|
||||
90
compute_tools/src/metrics.rs
Normal file
90
compute_tools/src/metrics.rs
Normal file
@@ -0,0 +1,90 @@
|
||||
use metrics::core::Collector;
|
||||
use metrics::proto::MetricFamily;
|
||||
use metrics::{register_int_counter_vec, register_uint_gauge_vec, IntCounterVec, UIntGaugeVec};
|
||||
use once_cell::sync::Lazy;
|
||||
|
||||
pub(crate) static INSTALLED_EXTENSIONS: Lazy<UIntGaugeVec> = Lazy::new(|| {
|
||||
register_uint_gauge_vec!(
|
||||
"compute_installed_extensions",
|
||||
"Number of databases where the version of extension is installed",
|
||||
&["extension_name", "version", "owned_by_superuser"]
|
||||
)
|
||||
.expect("failed to define a metric")
|
||||
});
|
||||
|
||||
// Normally, any HTTP API request is described by METHOD (e.g. GET, POST, etc.) + PATH,
|
||||
// but for all our APIs we defined a 'slug'/method/operationId in the OpenAPI spec.
|
||||
// And it's fair to call it a 'RPC' (Remote Procedure Call).
|
||||
pub enum CPlaneRequestRPC {
|
||||
GetSpec,
|
||||
}
|
||||
|
||||
impl CPlaneRequestRPC {
|
||||
pub fn as_str(&self) -> &str {
|
||||
match self {
|
||||
CPlaneRequestRPC::GetSpec => "GetSpec",
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
pub const UNKNOWN_HTTP_STATUS: &str = "unknown";
|
||||
|
||||
pub(crate) static CPLANE_REQUESTS_TOTAL: Lazy<IntCounterVec> = Lazy::new(|| {
|
||||
register_int_counter_vec!(
|
||||
"compute_ctl_cplane_requests_total",
|
||||
"Total number of control plane requests made by compute_ctl",
|
||||
&["rpc"]
|
||||
)
|
||||
.expect("failed to define a metric")
|
||||
});
|
||||
|
||||
pub(crate) static CPLANE_REQUESTS_FAILED: Lazy<IntCounterVec> = Lazy::new(|| {
|
||||
register_int_counter_vec!(
|
||||
"compute_ctl_cplane_requests_failed_total",
|
||||
"Total number of failed control plane requests made by compute_ctl",
|
||||
&["rpc", "http_status"]
|
||||
)
|
||||
.expect("failed to define a metric")
|
||||
});
|
||||
|
||||
/// Total number of failed database migrations. Per-compute, this is actually a boolean metric,
|
||||
/// either empty or with a single value (1, migration_id) because we stop at the first failure.
|
||||
/// Yet, the sum over the fleet will provide the total number of failures.
|
||||
pub(crate) static DB_MIGRATION_FAILED: Lazy<IntCounterVec> = Lazy::new(|| {
|
||||
register_int_counter_vec!(
|
||||
"compute_ctl_db_migration_failed_total",
|
||||
"Total number of failed database migrations",
|
||||
&["migration_id"]
|
||||
)
|
||||
.expect("failed to define a metric")
|
||||
});
|
||||
|
||||
pub(crate) static REMOTE_EXT_REQUESTS_TOTAL: Lazy<IntCounterVec> = Lazy::new(|| {
|
||||
register_int_counter_vec!(
|
||||
"compute_ctl_remote_ext_requests_total",
|
||||
"Total number of requests made by compute_ctl to download extensions from S3 proxy",
|
||||
// Do not use any labels like extension name yet.
|
||||
// We can add them later if needed.
|
||||
&[]
|
||||
)
|
||||
.expect("failed to define a metric")
|
||||
});
|
||||
|
||||
pub(crate) static REMOTE_EXT_REQUESTS_FAILED: Lazy<IntCounterVec> = Lazy::new(|| {
|
||||
register_int_counter_vec!(
|
||||
"compute_ctl_remote_ext_requests_failed_total",
|
||||
"Total number of failed requests to S3 proxy",
|
||||
&["http_status"]
|
||||
)
|
||||
.expect("failed to define a metric")
|
||||
});
|
||||
|
||||
pub fn collect() -> Vec<MetricFamily> {
|
||||
let mut metrics = INSTALLED_EXTENSIONS.collect();
|
||||
metrics.extend(CPLANE_REQUESTS_TOTAL.collect());
|
||||
metrics.extend(CPLANE_REQUESTS_FAILED.collect());
|
||||
metrics.extend(DB_MIGRATION_FAILED.collect());
|
||||
metrics.extend(REMOTE_EXT_REQUESTS_TOTAL.collect());
|
||||
metrics.extend(REMOTE_EXT_REQUESTS_FAILED.collect());
|
||||
metrics
|
||||
}
|
||||
@@ -1,7 +1,9 @@
|
||||
use anyhow::{Context, Result};
|
||||
use fail::fail_point;
|
||||
use postgres::{Client, Transaction};
|
||||
use tracing::info;
|
||||
use tracing::{error, info};
|
||||
|
||||
use crate::metrics::DB_MIGRATION_FAILED;
|
||||
|
||||
/// Runs a series of migrations on a target database
|
||||
pub(crate) struct MigrationRunner<'m> {
|
||||
@@ -78,24 +80,31 @@ impl<'m> MigrationRunner<'m> {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Run an individual migration
|
||||
fn run_migration(txn: &mut Transaction, migration_id: i64, migration: &str) -> Result<()> {
|
||||
/// Run an individual migration in a separate transaction block.
|
||||
fn run_migration(client: &mut Client, migration_id: i64, migration: &str) -> Result<()> {
|
||||
let mut txn = client
|
||||
.transaction()
|
||||
.with_context(|| format!("begin transaction for migration {migration_id}"))?;
|
||||
|
||||
if migration.starts_with("-- SKIP") {
|
||||
info!("Skipping migration id={}", migration_id);
|
||||
|
||||
// Even though we are skipping the migration, updating the
|
||||
// migration ID should help keep logic easy to understand when
|
||||
// trying to understand the state of a cluster.
|
||||
Self::update_migration_id(txn, migration_id)?;
|
||||
Self::update_migration_id(&mut txn, migration_id)?;
|
||||
} else {
|
||||
info!("Running migration id={}:\n{}\n", migration_id, migration);
|
||||
|
||||
txn.simple_query(migration)
|
||||
.with_context(|| format!("apply migration {migration_id}"))?;
|
||||
|
||||
Self::update_migration_id(txn, migration_id)?;
|
||||
Self::update_migration_id(&mut txn, migration_id)?;
|
||||
}
|
||||
|
||||
txn.commit()
|
||||
.with_context(|| format!("commit transaction for migration {migration_id}"))?;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
@@ -109,19 +118,20 @@ impl<'m> MigrationRunner<'m> {
|
||||
// The index lags the migration ID by 1, so the current migration
|
||||
// ID is also the next index
|
||||
let migration_id = (current_migration + 1) as i64;
|
||||
let migration = self.migrations[current_migration];
|
||||
|
||||
let mut txn = self
|
||||
.client
|
||||
.transaction()
|
||||
.with_context(|| format!("begin transaction for migration {migration_id}"))?;
|
||||
|
||||
Self::run_migration(&mut txn, migration_id, self.migrations[current_migration])
|
||||
.with_context(|| format!("running migration {migration_id}"))?;
|
||||
|
||||
txn.commit()
|
||||
.with_context(|| format!("commit transaction for migration {migration_id}"))?;
|
||||
|
||||
info!("Finished migration id={}", migration_id);
|
||||
match Self::run_migration(self.client, migration_id, migration) {
|
||||
Ok(_) => {
|
||||
info!("Finished migration id={}", migration_id);
|
||||
}
|
||||
Err(e) => {
|
||||
error!("Failed to run migration id={}: {}", migration_id, e);
|
||||
DB_MIGRATION_FAILED
|
||||
.with_label_values(&[migration_id.to_string().as_str()])
|
||||
.inc();
|
||||
return Err(e);
|
||||
}
|
||||
}
|
||||
|
||||
current_migration += 1;
|
||||
}
|
||||
|
||||
@@ -6,6 +6,9 @@ use std::path::Path;
|
||||
use tracing::{error, info, instrument, warn};
|
||||
|
||||
use crate::config;
|
||||
use crate::metrics::{
|
||||
CPlaneRequestRPC, CPLANE_REQUESTS_FAILED, CPLANE_REQUESTS_TOTAL, UNKNOWN_HTTP_STATUS,
|
||||
};
|
||||
use crate::migration::MigrationRunner;
|
||||
use crate::params::PG_HBA_ALL_MD5;
|
||||
use crate::pg_helpers::*;
|
||||
@@ -19,7 +22,7 @@ use compute_api::spec::ComputeSpec;
|
||||
fn do_control_plane_request(
|
||||
uri: &str,
|
||||
jwt: &str,
|
||||
) -> Result<ControlPlaneSpecResponse, (bool, String)> {
|
||||
) -> Result<ControlPlaneSpecResponse, (bool, String, Option<StatusCode>)> {
|
||||
let resp = reqwest::blocking::Client::new()
|
||||
.get(uri)
|
||||
.header("Authorization", format!("Bearer {}", jwt))
|
||||
@@ -28,34 +31,41 @@ fn do_control_plane_request(
|
||||
(
|
||||
true,
|
||||
format!("could not perform spec request to control plane: {}", e),
|
||||
None,
|
||||
)
|
||||
})?;
|
||||
|
||||
match resp.status() {
|
||||
let status = resp.status();
|
||||
match status {
|
||||
StatusCode::OK => match resp.json::<ControlPlaneSpecResponse>() {
|
||||
Ok(spec_resp) => Ok(spec_resp),
|
||||
Err(e) => Err((
|
||||
true,
|
||||
format!("could not deserialize control plane response: {}", e),
|
||||
Some(status),
|
||||
)),
|
||||
},
|
||||
StatusCode::SERVICE_UNAVAILABLE => {
|
||||
Err((true, "control plane is temporarily unavailable".to_string()))
|
||||
}
|
||||
StatusCode::SERVICE_UNAVAILABLE => Err((
|
||||
true,
|
||||
"control plane is temporarily unavailable".to_string(),
|
||||
Some(status),
|
||||
)),
|
||||
StatusCode::BAD_GATEWAY => {
|
||||
// We have a problem with intermittent 502 errors now
|
||||
// https://github.com/neondatabase/cloud/issues/2353
|
||||
// It's fine to retry GET request in this case.
|
||||
Err((true, "control plane request failed with 502".to_string()))
|
||||
Err((
|
||||
true,
|
||||
"control plane request failed with 502".to_string(),
|
||||
Some(status),
|
||||
))
|
||||
}
|
||||
// Another code, likely 500 or 404, means that compute is unknown to the control plane
|
||||
// or some internal failure happened. Doesn't make much sense to retry in this case.
|
||||
_ => Err((
|
||||
false,
|
||||
format!(
|
||||
"unexpected control plane response status code: {}",
|
||||
resp.status()
|
||||
),
|
||||
format!("unexpected control plane response status code: {}", status),
|
||||
Some(status),
|
||||
)),
|
||||
}
|
||||
}
|
||||
@@ -82,6 +92,9 @@ pub fn get_spec_from_control_plane(
|
||||
// - no spec for compute yet (Empty state) -> return Ok(None)
|
||||
// - got spec -> return Ok(Some(spec))
|
||||
while attempt < 4 {
|
||||
CPLANE_REQUESTS_TOTAL
|
||||
.with_label_values(&[CPlaneRequestRPC::GetSpec.as_str()])
|
||||
.inc();
|
||||
spec = match do_control_plane_request(&cp_uri, &jwt) {
|
||||
Ok(spec_resp) => match spec_resp.status {
|
||||
ControlPlaneComputeStatus::Empty => Ok(None),
|
||||
@@ -93,7 +106,13 @@ pub fn get_spec_from_control_plane(
|
||||
}
|
||||
}
|
||||
},
|
||||
Err((retry, msg)) => {
|
||||
Err((retry, msg, status)) => {
|
||||
let status_str = status
|
||||
.map(|s| s.to_string())
|
||||
.unwrap_or(UNKNOWN_HTTP_STATUS.to_string());
|
||||
CPLANE_REQUESTS_FAILED
|
||||
.with_label_values(&[CPlaneRequestRPC::GetSpec.as_str(), &status_str])
|
||||
.inc();
|
||||
if retry {
|
||||
Err(anyhow!(msg))
|
||||
} else {
|
||||
|
||||
@@ -47,7 +47,7 @@ pub enum PerDatabasePhase {
|
||||
DeleteDBRoleReferences,
|
||||
ChangeSchemaPerms,
|
||||
HandleAnonExtension,
|
||||
DropSubscriptionsForDeletedDatabases,
|
||||
DropLogicalSubscriptions,
|
||||
}
|
||||
|
||||
#[derive(Clone, Debug)]
|
||||
@@ -58,11 +58,13 @@ pub enum ApplySpecPhase {
|
||||
CreateAndAlterRoles,
|
||||
RenameAndDeleteDatabases,
|
||||
CreateAndAlterDatabases,
|
||||
CreateSchemaNeon,
|
||||
RunInEachDatabase { db: DB, subphase: PerDatabasePhase },
|
||||
HandleOtherExtensions,
|
||||
HandleNeonExtension,
|
||||
CreateAvailabilityCheck,
|
||||
DropRoles,
|
||||
FinalizeDropLogicalSubscriptions,
|
||||
}
|
||||
|
||||
pub struct Operation {
|
||||
@@ -331,7 +333,7 @@ async fn get_operations<'a>(
|
||||
// NB: there could be other db states, which prevent us from dropping
|
||||
// the database. For example, if db is used by any active subscription
|
||||
// or replication slot.
|
||||
// Such cases are handled in the DropSubscriptionsForDeletedDatabases
|
||||
// Such cases are handled in the DropLogicalSubscriptions
|
||||
// phase. We do all the cleanup before actually dropping the database.
|
||||
let drop_db_query: String = format!(
|
||||
"DROP DATABASE IF EXISTS {} WITH (FORCE)",
|
||||
@@ -442,13 +444,19 @@ async fn get_operations<'a>(
|
||||
|
||||
Ok(Box::new(operations))
|
||||
}
|
||||
ApplySpecPhase::CreateSchemaNeon => Ok(Box::new(once(Operation {
|
||||
query: String::from("CREATE SCHEMA IF NOT EXISTS neon"),
|
||||
comment: Some(String::from(
|
||||
"create schema for neon extension and utils tables",
|
||||
)),
|
||||
}))),
|
||||
ApplySpecPhase::RunInEachDatabase { db, subphase } => {
|
||||
match subphase {
|
||||
PerDatabasePhase::DropSubscriptionsForDeletedDatabases => {
|
||||
PerDatabasePhase::DropLogicalSubscriptions => {
|
||||
match &db {
|
||||
DB::UserDB(db) => {
|
||||
let drop_subscription_query: String = format!(
|
||||
include_str!("sql/drop_subscription_for_drop_dbs.sql"),
|
||||
include_str!("sql/drop_subscriptions.sql"),
|
||||
datname_str = escape_literal(&db.name),
|
||||
);
|
||||
|
||||
@@ -666,10 +674,6 @@ async fn get_operations<'a>(
|
||||
}
|
||||
ApplySpecPhase::HandleNeonExtension => {
|
||||
let operations = vec![
|
||||
Operation {
|
||||
query: String::from("CREATE SCHEMA IF NOT EXISTS neon"),
|
||||
comment: Some(String::from("init: add schema for extension")),
|
||||
},
|
||||
Operation {
|
||||
query: String::from("CREATE EXTENSION IF NOT EXISTS neon WITH SCHEMA neon"),
|
||||
comment: Some(String::from(
|
||||
@@ -712,5 +716,9 @@ async fn get_operations<'a>(
|
||||
|
||||
Ok(Box::new(operations))
|
||||
}
|
||||
ApplySpecPhase::FinalizeDropLogicalSubscriptions => Ok(Box::new(once(Operation {
|
||||
query: String::from(include_str!("sql/finalize_drop_subscriptions.sql")),
|
||||
comment: None,
|
||||
}))),
|
||||
}
|
||||
}
|
||||
|
||||
21
compute_tools/src/sql/finalize_drop_subscriptions.sql
Normal file
21
compute_tools/src/sql/finalize_drop_subscriptions.sql
Normal file
@@ -0,0 +1,21 @@
|
||||
DO $$
|
||||
BEGIN
|
||||
IF NOT EXISTS(
|
||||
SELECT 1
|
||||
FROM pg_catalog.pg_tables
|
||||
WHERE tablename = 'drop_subscriptions_done'
|
||||
AND schemaname = 'neon'
|
||||
)
|
||||
THEN
|
||||
CREATE TABLE neon.drop_subscriptions_done
|
||||
(id serial primary key, timeline_id text);
|
||||
END IF;
|
||||
|
||||
-- preserve the timeline_id of the last drop_subscriptions run
|
||||
-- to ensure that the cleanup of a timeline is executed only once.
|
||||
-- use upsert to avoid the table bloat in case of cascade branching (branch of a branch)
|
||||
INSERT INTO neon.drop_subscriptions_done VALUES (1, current_setting('neon.timeline_id'))
|
||||
ON CONFLICT (id) DO UPDATE
|
||||
SET timeline_id = current_setting('neon.timeline_id');
|
||||
END
|
||||
$$
|
||||
@@ -1,6 +1,10 @@
|
||||
# Control Plane and Neon Local
|
||||
# Local Development Control Plane (`neon_local`)
|
||||
|
||||
This crate contains tools to start a Neon development environment locally. This utility can be used with the `cargo neon` command.
|
||||
This crate contains tools to start a Neon development environment locally. This utility can be used with the `cargo neon` command. This is a convenience to invoke
|
||||
the `neon_local` binary.
|
||||
|
||||
**Note**: this is a dev/test tool -- a minimal control plane suitable for testing
|
||||
code changes locally, but not suitable for running production systems.
|
||||
|
||||
## Example: Start with Postgres 16
|
||||
|
||||
|
||||
@@ -1357,6 +1357,7 @@ async fn handle_endpoint(subcmd: &EndpointCmd, env: &local_env::LocalEnv) -> Res
|
||||
args.pg_version,
|
||||
mode,
|
||||
!args.update_catalog,
|
||||
false,
|
||||
)?;
|
||||
}
|
||||
EndpointCmd::Start(args) => {
|
||||
|
||||
@@ -76,6 +76,7 @@ pub struct EndpointConf {
|
||||
http_port: u16,
|
||||
pg_version: u32,
|
||||
skip_pg_catalog_updates: bool,
|
||||
drop_subscriptions_before_start: bool,
|
||||
features: Vec<ComputeFeature>,
|
||||
}
|
||||
|
||||
@@ -143,6 +144,7 @@ impl ComputeControlPlane {
|
||||
pg_version: u32,
|
||||
mode: ComputeMode,
|
||||
skip_pg_catalog_updates: bool,
|
||||
drop_subscriptions_before_start: bool,
|
||||
) -> Result<Arc<Endpoint>> {
|
||||
let pg_port = pg_port.unwrap_or_else(|| self.get_port());
|
||||
let http_port = http_port.unwrap_or_else(|| self.get_port() + 1);
|
||||
@@ -162,6 +164,7 @@ impl ComputeControlPlane {
|
||||
// with this we basically test a case of waking up an idle compute, where
|
||||
// we also skip catalog updates in the cloud.
|
||||
skip_pg_catalog_updates,
|
||||
drop_subscriptions_before_start,
|
||||
features: vec![],
|
||||
});
|
||||
|
||||
@@ -177,6 +180,7 @@ impl ComputeControlPlane {
|
||||
pg_port,
|
||||
pg_version,
|
||||
skip_pg_catalog_updates,
|
||||
drop_subscriptions_before_start,
|
||||
features: vec![],
|
||||
})?,
|
||||
)?;
|
||||
@@ -240,6 +244,7 @@ pub struct Endpoint {
|
||||
// Optimizations
|
||||
skip_pg_catalog_updates: bool,
|
||||
|
||||
drop_subscriptions_before_start: bool,
|
||||
// Feature flags
|
||||
features: Vec<ComputeFeature>,
|
||||
}
|
||||
@@ -291,6 +296,7 @@ impl Endpoint {
|
||||
tenant_id: conf.tenant_id,
|
||||
pg_version: conf.pg_version,
|
||||
skip_pg_catalog_updates: conf.skip_pg_catalog_updates,
|
||||
drop_subscriptions_before_start: conf.drop_subscriptions_before_start,
|
||||
features: conf.features,
|
||||
})
|
||||
}
|
||||
@@ -625,6 +631,7 @@ impl Endpoint {
|
||||
shard_stripe_size: Some(shard_stripe_size),
|
||||
local_proxy_config: None,
|
||||
reconfigure_concurrency: 1,
|
||||
drop_subscriptions_before_start: self.drop_subscriptions_before_start,
|
||||
};
|
||||
let spec_path = self.endpoint_path().join("spec.json");
|
||||
std::fs::write(spec_path, serde_json::to_string_pretty(&spec)?)?;
|
||||
|
||||
@@ -347,11 +347,31 @@ impl PageServerNode {
|
||||
.map(|x| x.parse::<usize>())
|
||||
.transpose()
|
||||
.context("Failed to parse 'compaction_threshold' as an integer")?,
|
||||
compaction_upper_limit: settings
|
||||
.remove("compaction_upper_limit")
|
||||
.map(|x| x.parse::<usize>())
|
||||
.transpose()
|
||||
.context("Failed to parse 'compaction_upper_limit' as an integer")?,
|
||||
compaction_algorithm: settings
|
||||
.remove("compaction_algorithm")
|
||||
.map(serde_json::from_str)
|
||||
.transpose()
|
||||
.context("Failed to parse 'compaction_algorithm' json")?,
|
||||
l0_flush_delay_threshold: settings
|
||||
.remove("l0_flush_delay_threshold")
|
||||
.map(|x| x.parse::<usize>())
|
||||
.transpose()
|
||||
.context("Failed to parse 'l0_flush_delay_threshold' as an integer")?,
|
||||
l0_flush_wait_upload: settings
|
||||
.remove("l0_flush_wait_upload")
|
||||
.map(|x| x.parse::<bool>())
|
||||
.transpose()
|
||||
.context("Failed to parse 'l0_flush_wait_upload' as a boolean")?,
|
||||
l0_flush_stall_threshold: settings
|
||||
.remove("l0_flush_stall_threshold")
|
||||
.map(|x| x.parse::<usize>())
|
||||
.transpose()
|
||||
.context("Failed to parse 'l0_flush_stall_threshold' as an integer")?,
|
||||
gc_horizon: settings
|
||||
.remove("gc_horizon")
|
||||
.map(|x| x.parse::<u64>())
|
||||
@@ -423,6 +443,21 @@ impl PageServerNode {
|
||||
.map(|x| x.parse::<bool>())
|
||||
.transpose()
|
||||
.context("Failed to parse 'rel_size_v2_enabled' as bool")?,
|
||||
gc_compaction_enabled: settings
|
||||
.remove("gc_compaction_enabled")
|
||||
.map(|x| x.parse::<bool>())
|
||||
.transpose()
|
||||
.context("Failed to parse 'gc_compaction_enabled' as bool")?,
|
||||
gc_compaction_initial_threshold_kb: settings
|
||||
.remove("gc_compaction_initial_threshold_kb")
|
||||
.map(|x| x.parse::<u64>())
|
||||
.transpose()
|
||||
.context("Failed to parse 'gc_compaction_initial_threshold_kb' as integer")?,
|
||||
gc_compaction_ratio_percent: settings
|
||||
.remove("gc_compaction_ratio_percent")
|
||||
.map(|x| x.parse::<u64>())
|
||||
.transpose()
|
||||
.context("Failed to parse 'gc_compaction_ratio_percent' as integer")?,
|
||||
};
|
||||
if !settings.is_empty() {
|
||||
bail!("Unrecognized tenant settings: {settings:?}")
|
||||
|
||||
@@ -41,8 +41,8 @@ allow = [
|
||||
"MIT",
|
||||
"MPL-2.0",
|
||||
"OpenSSL",
|
||||
"Unicode-DFS-2016",
|
||||
"Unicode-3.0",
|
||||
"Zlib",
|
||||
]
|
||||
confidence-threshold = 0.8
|
||||
exceptions = [
|
||||
|
||||
@@ -10,10 +10,7 @@ USER root
|
||||
RUN apt-get update && \
|
||||
apt-get install -y curl \
|
||||
jq \
|
||||
python3-pip \
|
||||
netcat-openbsd
|
||||
#Faker is required for the pg_anon test
|
||||
RUN case $COMPUTE_IMAGE in compute-node-v17) OPT="--break-system-packages";; *) OPT= ;; esac && pip3 install $OPT Faker
|
||||
#This is required for the pg_hintplan test
|
||||
RUN mkdir -p /ext-src/pg_hint_plan-src && chown postgres /ext-src/pg_hint_plan-src
|
||||
|
||||
|
||||
@@ -20,30 +20,55 @@ while ! nc -z pageserver 6400; do
|
||||
done
|
||||
echo "Page server is ready."
|
||||
|
||||
echo "Create a tenant and timeline"
|
||||
generate_id tenant_id
|
||||
PARAMS=(
|
||||
-X PUT
|
||||
-H "Content-Type: application/json"
|
||||
-d "{\"mode\": \"AttachedSingle\", \"generation\": 1, \"tenant_conf\": {}}"
|
||||
"http://pageserver:9898/v1/tenant/${tenant_id}/location_config"
|
||||
)
|
||||
result=$(curl "${PARAMS[@]}")
|
||||
echo $result | jq .
|
||||
cp ${SPEC_FILE_ORG} ${SPEC_FILE}
|
||||
|
||||
generate_id timeline_id
|
||||
PARAMS=(
|
||||
-sbf
|
||||
-X POST
|
||||
-H "Content-Type: application/json"
|
||||
-d "{\"new_timeline_id\": \"${timeline_id}\", \"pg_version\": ${PG_VERSION}}"
|
||||
"http://pageserver:9898/v1/tenant/${tenant_id}/timeline/"
|
||||
)
|
||||
result=$(curl "${PARAMS[@]}")
|
||||
echo $result | jq .
|
||||
if [ -n "${TENANT_ID:-}" ] && [ -n "${TIMELINE_ID:-}" ]; then
|
||||
tenant_id=${TENANT_ID}
|
||||
timeline_id=${TIMELINE_ID}
|
||||
else
|
||||
echo "Check if a tenant present"
|
||||
PARAMS=(
|
||||
-X GET
|
||||
-H "Content-Type: application/json"
|
||||
"http://pageserver:9898/v1/tenant"
|
||||
)
|
||||
tenant_id=$(curl "${PARAMS[@]}" | jq -r .[0].id)
|
||||
if [ -z "${tenant_id}" ] || [ "${tenant_id}" = null ]; then
|
||||
echo "Create a tenant"
|
||||
generate_id tenant_id
|
||||
PARAMS=(
|
||||
-X PUT
|
||||
-H "Content-Type: application/json"
|
||||
-d "{\"mode\": \"AttachedSingle\", \"generation\": 1, \"tenant_conf\": {}}"
|
||||
"http://pageserver:9898/v1/tenant/${tenant_id}/location_config"
|
||||
)
|
||||
result=$(curl "${PARAMS[@]}")
|
||||
echo $result | jq .
|
||||
fi
|
||||
|
||||
echo "Check if a timeline present"
|
||||
PARAMS=(
|
||||
-X GET
|
||||
-H "Content-Type: application/json"
|
||||
"http://pageserver:9898/v1/tenant/${tenant_id}/timeline"
|
||||
)
|
||||
timeline_id=$(curl "${PARAMS[@]}" | jq -r .[0].timeline_id)
|
||||
if [ -z "${timeline_id}" ] || [ "${timeline_id}" = null ]; then
|
||||
generate_id timeline_id
|
||||
PARAMS=(
|
||||
-sbf
|
||||
-X POST
|
||||
-H "Content-Type: application/json"
|
||||
-d "{\"new_timeline_id\": \"${timeline_id}\", \"pg_version\": ${PG_VERSION}}"
|
||||
"http://pageserver:9898/v1/tenant/${tenant_id}/timeline/"
|
||||
)
|
||||
result=$(curl "${PARAMS[@]}")
|
||||
echo $result | jq .
|
||||
fi
|
||||
fi
|
||||
|
||||
echo "Overwrite tenant id and timeline id in spec file"
|
||||
sed "s/TENANT_ID/${tenant_id}/" ${SPEC_FILE_ORG} > ${SPEC_FILE}
|
||||
sed -i "s/TENANT_ID/${tenant_id}/" ${SPEC_FILE}
|
||||
sed -i "s/TIMELINE_ID/${timeline_id}/" ${SPEC_FILE}
|
||||
|
||||
cat ${SPEC_FILE}
|
||||
|
||||
@@ -149,11 +149,13 @@ services:
|
||||
args:
|
||||
- REPOSITORY=${REPOSITORY:-neondatabase}
|
||||
- COMPUTE_IMAGE=compute-node-v${PG_VERSION:-16}
|
||||
- TAG=${TAG:-latest}
|
||||
- http_proxy=$http_proxy
|
||||
- https_proxy=$https_proxy
|
||||
- TAG=${COMPUTE_TAG:-${TAG:-latest}}
|
||||
- http_proxy=${http_proxy:-}
|
||||
- https_proxy=${https_proxy:-}
|
||||
environment:
|
||||
- PG_VERSION=${PG_VERSION:-16}
|
||||
- TENANT_ID=${TENANT_ID:-}
|
||||
- TIMELINE_ID=${TIMELINE_ID:-}
|
||||
#- RUST_BACKTRACE=1
|
||||
# Mount the test files directly, for faster editing cycle.
|
||||
volumes:
|
||||
@@ -185,6 +187,8 @@ services:
|
||||
neon-test-extensions:
|
||||
profiles: ["test-extensions"]
|
||||
image: ${REPOSITORY:-neondatabase}/neon-test-extensions-v${PG_TEST_VERSION:-16}:${TAG:-latest}
|
||||
environment:
|
||||
- PGPASSWORD=cloud_admin
|
||||
entrypoint:
|
||||
- "/bin/bash"
|
||||
- "-c"
|
||||
|
||||
@@ -18,14 +18,10 @@ cd $(dirname $0)
|
||||
COMPUTE_CONTAINER_NAME=docker-compose-compute-1
|
||||
TEST_CONTAINER_NAME=docker-compose-neon-test-extensions-1
|
||||
PSQL_OPTION="-h localhost -U cloud_admin -p 55433 -d postgres"
|
||||
: ${http_proxy:=}
|
||||
: ${https_proxy:=}
|
||||
export http_proxy https_proxy
|
||||
|
||||
cleanup() {
|
||||
echo "show container information"
|
||||
docker ps
|
||||
docker compose --profile test-extensions -f $COMPOSE_FILE logs
|
||||
echo "stop containers..."
|
||||
docker compose --profile test-extensions -f $COMPOSE_FILE down
|
||||
}
|
||||
@@ -35,12 +31,6 @@ for pg_version in ${TEST_VERSION_ONLY-14 15 16 17}; do
|
||||
echo "clean up containers if exists"
|
||||
cleanup
|
||||
PG_TEST_VERSION=$((pg_version < 16 ? 16 : pg_version))
|
||||
# The support of pg_anon not yet added to PG17, so we have to add the corresponding option for other PG versions
|
||||
if [ "${pg_version}" -ne 17 ]; then
|
||||
SPEC_PATH="compute_wrapper/var/db/postgres/specs"
|
||||
mv $SPEC_PATH/spec.json $SPEC_PATH/spec.bak
|
||||
jq '.cluster.settings += [{"name": "session_preload_libraries","value": "anon","vartype": "string"}]' "${SPEC_PATH}/spec.bak" > "${SPEC_PATH}/spec.json"
|
||||
fi
|
||||
PG_VERSION=$pg_version PG_TEST_VERSION=$PG_TEST_VERSION docker compose --profile test-extensions -f $COMPOSE_FILE up --build -d
|
||||
|
||||
echo "wait until the compute is ready. timeout after 60s. "
|
||||
@@ -50,7 +40,6 @@ for pg_version in ${TEST_VERSION_ONLY-14 15 16 17}; do
|
||||
cnt=`expr $cnt + 3`
|
||||
if [ $cnt -gt 60 ]; then
|
||||
echo "timeout before the compute is ready."
|
||||
cleanup
|
||||
exit 1
|
||||
fi
|
||||
if docker compose --profile test-extensions -f $COMPOSE_FILE logs "compute_is_ready" | grep -q "accepting connections"; then
|
||||
@@ -62,36 +51,19 @@ for pg_version in ${TEST_VERSION_ONLY-14 15 16 17}; do
|
||||
done
|
||||
|
||||
if [ $pg_version -ge 16 ]; then
|
||||
echo Enabling trust connection
|
||||
docker exec $COMPUTE_CONTAINER_NAME bash -c "sed -i '\$d' /var/db/postgres/compute/pg_hba.conf && echo -e 'host\t all\t all\t all\t trust' >> /var/db/postgres/compute/pg_hba.conf && psql $PSQL_OPTION -c 'select pg_reload_conf()' "
|
||||
echo Adding postgres role
|
||||
docker exec $COMPUTE_CONTAINER_NAME psql $PSQL_OPTION -c "CREATE ROLE postgres SUPERUSER LOGIN"
|
||||
# This is required for the pg_hint_plan test, to prevent flaky log message causing the test to fail
|
||||
# It cannot be moved to Dockerfile now because the database directory is created after the start of the container
|
||||
echo Adding dummy config
|
||||
docker exec $COMPUTE_CONTAINER_NAME touch /var/db/postgres/compute/compute_ctl_temp_override.conf
|
||||
# This block is required for the pg_anon extension test.
|
||||
# The test assumes that it is running on the same host with the postgres engine.
|
||||
# In our case it's not true, that's why we are copying files to the compute node
|
||||
# The following block copies the files for the pg_hintplan test to the compute node for the extension test in an isolated docker-compose environment
|
||||
TMPDIR=$(mktemp -d)
|
||||
# Add support for pg_anon for pg_v16
|
||||
if [ $pg_version -ne 17 ]; then
|
||||
docker cp $TEST_CONTAINER_NAME:/ext-src/pg_anon-src/data $TMPDIR/data
|
||||
echo -e '1\t too \t many \t tabs' > $TMPDIR/data/bad.csv
|
||||
docker cp $TMPDIR/data $COMPUTE_CONTAINER_NAME:/tmp/tmp_anon_alternate_data
|
||||
rm -rf $TMPDIR
|
||||
fi
|
||||
TMPDIR=$(mktemp -d)
|
||||
# The following block does the same for the pg_hintplan test
|
||||
docker cp $TEST_CONTAINER_NAME:/ext-src/pg_hint_plan-src/data $TMPDIR/data
|
||||
docker cp $TMPDIR/data $COMPUTE_CONTAINER_NAME:/ext-src/pg_hint_plan-src/
|
||||
rm -rf $TMPDIR
|
||||
# We are running tests now
|
||||
if docker exec -e SKIP=timescaledb-src,rdkit-src,postgis-src,pgx_ulid-src,pgtap-src,pg_tiktoken-src,pg_jsonschema-src,pg_graphql-src,kq_imcx-src,wal2json_2_5-src \
|
||||
if ! docker exec -e SKIP=timescaledb-src,rdkit-src,postgis-src,pgx_ulid-src,pgtap-src,pg_tiktoken-src,pg_jsonschema-src,pg_graphql-src,kq_imcx-src,wal2json_2_5-src \
|
||||
$TEST_CONTAINER_NAME /run-tests.sh | tee testout.txt
|
||||
then
|
||||
cleanup
|
||||
else
|
||||
FAILED=$(tail -1 testout.txt)
|
||||
for d in $FAILED
|
||||
do
|
||||
@@ -101,13 +73,7 @@ for pg_version in ${TEST_VERSION_ONLY-14 15 16 17}; do
|
||||
cat $d/regression.out $d/regression.diffs || true
|
||||
done
|
||||
rm -rf $FAILED
|
||||
cleanup
|
||||
exit 1
|
||||
fi
|
||||
fi
|
||||
cleanup
|
||||
# Restore the original spec.json
|
||||
if [ "$pg_version" -ne 17 ]; then
|
||||
mv "$SPEC_PATH/spec.bak" "$SPEC_PATH/spec.json"
|
||||
fi
|
||||
done
|
||||
|
||||
5
docker-compose/ext-src/hll-src/test-upgrade.sh
Executable file
5
docker-compose/ext-src/hll-src/test-upgrade.sh
Executable file
@@ -0,0 +1,5 @@
|
||||
#!/bin/sh
|
||||
set -ex
|
||||
cd "$(dirname ${0})"
|
||||
PG_REGRESS=$(dirname "$(pg_config --pgxs)")/../test/regress/pg_regress
|
||||
${PG_REGRESS} --use-existing --inputdir=./ --bindir='/usr/local/pgsql/bin' --dbname=contrib_regression add_agg agg_oob auto_sparse card_op cast_shape copy_binary cumulative_add_cardinality_correction cumulative_add_comprehensive_promotion cumulative_add_sparse_edge cumulative_add_sparse_random cumulative_add_sparse_step cumulative_union_comprehensive cumulative_union_explicit_explicit cumulative_union_explicit_promotion cumulative_union_probabilistic_probabilistic cumulative_union_sparse_full_representation cumulative_union_sparse_promotion cumulative_union_sparse_sparse disable_hashagg equal explicit_thresh hash hash_any meta_func murmur_bigint murmur_bytea nosparse notequal scalar_oob storedproc transaction typmod typmod_insert union_op
|
||||
27
docker-compose/ext-src/hypopg-src/test-upgrade.patch
Normal file
27
docker-compose/ext-src/hypopg-src/test-upgrade.patch
Normal file
@@ -0,0 +1,27 @@
|
||||
diff --git a/expected/hypopg.out b/expected/hypopg.out
|
||||
index 90121d0..859260b 100644
|
||||
--- a/expected/hypopg.out
|
||||
+++ b/expected/hypopg.out
|
||||
@@ -11,7 +11,8 @@ BEGIN
|
||||
END;
|
||||
$_$
|
||||
LANGUAGE plpgsql;
|
||||
-CREATE EXTENSION hypopg;
|
||||
+CREATE EXTENSION IF NOT EXISTS hypopg;
|
||||
+NOTICE: extension "hypopg" already exists, skipping
|
||||
CREATE TABLE hypo (id integer, val text, "Id2" bigint);
|
||||
INSERT INTO hypo SELECT i, 'line ' || i
|
||||
FROM generate_series(1,100000) f(i);
|
||||
diff --git a/test/sql/hypopg.sql b/test/sql/hypopg.sql
|
||||
index 99722b0..8d6bacb 100644
|
||||
--- a/test/sql/hypopg.sql
|
||||
+++ b/test/sql/hypopg.sql
|
||||
@@ -12,7 +12,7 @@ END;
|
||||
$_$
|
||||
LANGUAGE plpgsql;
|
||||
|
||||
-CREATE EXTENSION hypopg;
|
||||
+CREATE EXTENSION IF NOT EXISTS hypopg;
|
||||
|
||||
CREATE TABLE hypo (id integer, val text, "Id2" bigint);
|
||||
|
||||
6
docker-compose/ext-src/hypopg-src/test-upgrade.sh
Executable file
6
docker-compose/ext-src/hypopg-src/test-upgrade.sh
Executable file
@@ -0,0 +1,6 @@
|
||||
#!/bin/sh
|
||||
set -ex
|
||||
cd "$(dirname ${0})"
|
||||
patch -p1 <test-upgrade.patch
|
||||
PG_REGRESS=$(dirname "$(pg_config --pgxs)")/../test/regress/pg_regress
|
||||
${PG_REGRESS} --inputdir=./ --bindir='/usr/local/pgsql/bin' --use-existing --inputdir=test --dbname=contrib_regression hypopg hypo_brin hypo_index_part hypo_include hypo_hash hypo_hide_index
|
||||
23
docker-compose/ext-src/ip4r-src/test-upgrade.patch
Normal file
23
docker-compose/ext-src/ip4r-src/test-upgrade.patch
Normal file
@@ -0,0 +1,23 @@
|
||||
diff --git a/expected/ip4r.out b/expected/ip4r.out
|
||||
index 7527af3..b38ed29 100644
|
||||
--- a/expected/ip4r.out
|
||||
+++ b/expected/ip4r.out
|
||||
@@ -1,6 +1,5 @@
|
||||
--
|
||||
/*CUT-HERE*/
|
||||
-CREATE EXTENSION ip4r;
|
||||
-- Check whether any of our opclasses fail amvalidate
|
||||
DO $d$
|
||||
DECLARE
|
||||
diff --git a/sql/ip4r.sql b/sql/ip4r.sql
|
||||
index 65c49ec..24ade09 100644
|
||||
--- a/sql/ip4r.sql
|
||||
+++ b/sql/ip4r.sql
|
||||
@@ -1,7 +1,6 @@
|
||||
--
|
||||
|
||||
/*CUT-HERE*/
|
||||
-CREATE EXTENSION ip4r;
|
||||
|
||||
-- Check whether any of our opclasses fail amvalidate
|
||||
|
||||
6
docker-compose/ext-src/ip4r-src/test-upgrade.sh
Executable file
6
docker-compose/ext-src/ip4r-src/test-upgrade.sh
Executable file
@@ -0,0 +1,6 @@
|
||||
#!/bin/sh
|
||||
set -ex
|
||||
cd "$(dirname ${0})"
|
||||
patch -p1 <test-upgrade.patch
|
||||
PG_REGRESS=$(dirname "$(pg_config --pgxs)")/../test/regress/pg_regress
|
||||
${PG_REGRESS} --use-existing --inputdir=./ --bindir='/usr/local/pgsql/bin' --dbname=contrib_regression ip4r ip4r-softerr ip4r-v11
|
||||
75
docker-compose/ext-src/pg_cron-src/test-upgrade.patch
Normal file
75
docker-compose/ext-src/pg_cron-src/test-upgrade.patch
Normal file
@@ -0,0 +1,75 @@
|
||||
diff --git a/expected/pg_cron-test.out b/expected/pg_cron-test.out
|
||||
index d79d542..1663886 100644
|
||||
--- a/expected/pg_cron-test.out
|
||||
+++ b/expected/pg_cron-test.out
|
||||
@@ -1,30 +1,3 @@
|
||||
-CREATE EXTENSION pg_cron VERSION '1.0';
|
||||
-SELECT extversion FROM pg_extension WHERE extname='pg_cron';
|
||||
- extversion
|
||||
-------------
|
||||
- 1.0
|
||||
-(1 row)
|
||||
-
|
||||
--- Test binary compatibility with v1.4 function signature.
|
||||
-ALTER EXTENSION pg_cron UPDATE TO '1.4';
|
||||
-SELECT cron.unschedule(job_name := 'no_such_job');
|
||||
-ERROR: could not find valid entry for job 'no_such_job'
|
||||
-SELECT cron.schedule('testjob', '* * * * *', 'SELECT 1');
|
||||
- schedule
|
||||
-----------
|
||||
- 1
|
||||
-(1 row)
|
||||
-
|
||||
-SELECT cron.unschedule('testjob');
|
||||
- unschedule
|
||||
-------------
|
||||
- t
|
||||
-(1 row)
|
||||
-
|
||||
--- Test cache invalidation
|
||||
-DROP EXTENSION pg_cron;
|
||||
-CREATE EXTENSION pg_cron VERSION '1.4';
|
||||
-ALTER EXTENSION pg_cron UPDATE;
|
||||
-- Vacuum every day at 10:00am (GMT)
|
||||
SELECT cron.schedule('0 10 * * *', 'VACUUM');
|
||||
schedule
|
||||
@@ -300,8 +273,3 @@ SELECT jobid, jobname, schedule, command FROM cron.job ORDER BY jobid;
|
||||
SELECT cron.schedule('bad-last-dom-job1', '0 11 $foo * *', 'VACUUM FULL');
|
||||
ERROR: invalid schedule: 0 11 $foo * *
|
||||
HINT: Use cron format (e.g. 5 4 * * *), or interval format '[1-59] seconds'
|
||||
--- cleaning
|
||||
-DROP EXTENSION pg_cron;
|
||||
-drop user pgcron_cront;
|
||||
-drop database pgcron_dbno;
|
||||
-drop database pgcron_dbyes;
|
||||
diff --git a/sql/pg_cron-test.sql b/sql/pg_cron-test.sql
|
||||
index 45f94d9..241cf73 100644
|
||||
--- a/sql/pg_cron-test.sql
|
||||
+++ b/sql/pg_cron-test.sql
|
||||
@@ -1,17 +1,3 @@
|
||||
-CREATE EXTENSION pg_cron VERSION '1.0';
|
||||
-SELECT extversion FROM pg_extension WHERE extname='pg_cron';
|
||||
--- Test binary compatibility with v1.4 function signature.
|
||||
-ALTER EXTENSION pg_cron UPDATE TO '1.4';
|
||||
-SELECT cron.unschedule(job_name := 'no_such_job');
|
||||
-SELECT cron.schedule('testjob', '* * * * *', 'SELECT 1');
|
||||
-SELECT cron.unschedule('testjob');
|
||||
-
|
||||
--- Test cache invalidation
|
||||
-DROP EXTENSION pg_cron;
|
||||
-CREATE EXTENSION pg_cron VERSION '1.4';
|
||||
-
|
||||
-ALTER EXTENSION pg_cron UPDATE;
|
||||
-
|
||||
-- Vacuum every day at 10:00am (GMT)
|
||||
SELECT cron.schedule('0 10 * * *', 'VACUUM');
|
||||
|
||||
@@ -156,8 +142,3 @@ SELECT jobid, jobname, schedule, command FROM cron.job ORDER BY jobid;
|
||||
-- invalid last of day job
|
||||
SELECT cron.schedule('bad-last-dom-job1', '0 11 $foo * *', 'VACUUM FULL');
|
||||
|
||||
--- cleaning
|
||||
-DROP EXTENSION pg_cron;
|
||||
-drop user pgcron_cront;
|
||||
-drop database pgcron_dbno;
|
||||
-drop database pgcron_dbyes;
|
||||
6
docker-compose/ext-src/pg_cron-src/test-upgrade.sh
Executable file
6
docker-compose/ext-src/pg_cron-src/test-upgrade.sh
Executable file
@@ -0,0 +1,6 @@
|
||||
#!/bin/sh
|
||||
set -ex
|
||||
cd "$(dirname ${0})"
|
||||
patch -p1 <test-upgrade.patch
|
||||
PG_REGRESS=$(dirname "$(pg_config --pgxs)")/../test/regress/pg_regress
|
||||
${PG_REGRESS} --use-existing --inputdir=./ --bindir='/usr/local/pgsql/bin' --dbname=contrib_regression pg_cron-test
|
||||
18
docker-compose/ext-src/pg_ivm-src/test-upgrade.patch
Normal file
18
docker-compose/ext-src/pg_ivm-src/test-upgrade.patch
Normal file
@@ -0,0 +1,18 @@
|
||||
diff --git a/expected/pg_ivm.out b/expected/pg_ivm.out
|
||||
index e8798ee..cca58d0 100644
|
||||
--- a/expected/pg_ivm.out
|
||||
+++ b/expected/pg_ivm.out
|
||||
@@ -1,4 +1,3 @@
|
||||
-CREATE EXTENSION pg_ivm;
|
||||
GRANT ALL ON SCHEMA public TO public;
|
||||
-- create a table to use as a basis for views and materialized views in various combinations
|
||||
CREATE TABLE mv_base_a (i int, j int);
|
||||
diff --git a/sql/pg_ivm.sql b/sql/pg_ivm.sql
|
||||
index d3c1a01..9382d7f 100644
|
||||
--- a/sql/pg_ivm.sql
|
||||
+++ b/sql/pg_ivm.sql
|
||||
@@ -1,4 +1,3 @@
|
||||
-CREATE EXTENSION pg_ivm;
|
||||
GRANT ALL ON SCHEMA public TO public;
|
||||
|
||||
-- create a table to use as a basis for views and materialized views in various combinations
|
||||
6
docker-compose/ext-src/pg_ivm-src/test-upgrade.sh
Executable file
6
docker-compose/ext-src/pg_ivm-src/test-upgrade.sh
Executable file
@@ -0,0 +1,6 @@
|
||||
#!/bin/sh
|
||||
set -ex
|
||||
cd "$(dirname ${0})"
|
||||
patch -p1 <test-upgrade.patch
|
||||
PG_REGRESS=$(dirname "$(pg_config --pgxs)")/../test/regress/pg_regress
|
||||
${PG_REGRESS} --use-existing --inputdir=./ --bindir='/usr/local/pgsql/bin' --dbname=contrib_regression pg_ivm create_immv refresh_immv
|
||||
@@ -0,0 +1,25 @@
|
||||
diff --git a/expected/roaringbitmap.out b/expected/roaringbitmap.out
|
||||
index de70531..a5f7c15 100644
|
||||
--- a/expected/roaringbitmap.out
|
||||
+++ b/expected/roaringbitmap.out
|
||||
@@ -1,7 +1,6 @@
|
||||
--
|
||||
-- Test roaringbitmap extension
|
||||
--
|
||||
-CREATE EXTENSION if not exists roaringbitmap;
|
||||
-- Test input and output
|
||||
set roaringbitmap.output_format='array';
|
||||
set extra_float_digits = 0;
|
||||
diff --git a/sql/roaringbitmap.sql b/sql/roaringbitmap.sql
|
||||
index a0e9c74..84bc966 100644
|
||||
--- a/sql/roaringbitmap.sql
|
||||
+++ b/sql/roaringbitmap.sql
|
||||
@@ -2,8 +2,6 @@
|
||||
-- Test roaringbitmap extension
|
||||
--
|
||||
|
||||
-CREATE EXTENSION if not exists roaringbitmap;
|
||||
-
|
||||
-- Test input and output
|
||||
|
||||
set roaringbitmap.output_format='array';
|
||||
6
docker-compose/ext-src/pg_roaringbitmap-src/test-upgrade.sh
Executable file
6
docker-compose/ext-src/pg_roaringbitmap-src/test-upgrade.sh
Executable file
@@ -0,0 +1,6 @@
|
||||
#!/bin/sh
|
||||
set -ex
|
||||
cd "$(dirname ${0})"
|
||||
patch -p1 <test-upgrade.patch
|
||||
PG_REGRESS=$(dirname "$(pg_config --pgxs)")/../test/regress/pg_regress
|
||||
${PG_REGRESS} --use-existing --inputdir=./ --bindir='/usr/local/pgsql/bin' --dbname=contrib_regression roaringbitmap
|
||||
24
docker-compose/ext-src/pg_semver-src/test-upgrade.patch
Normal file
24
docker-compose/ext-src/pg_semver-src/test-upgrade.patch
Normal file
@@ -0,0 +1,24 @@
|
||||
diff --git a/test/sql/base.sql b/test/sql/base.sql
|
||||
index af599d8..2eed91b 100644
|
||||
--- a/test/sql/base.sql
|
||||
+++ b/test/sql/base.sql
|
||||
@@ -2,7 +2,6 @@
|
||||
BEGIN;
|
||||
|
||||
\i test/pgtap-core.sql
|
||||
-\i sql/semver.sql
|
||||
|
||||
SELECT plan(334);
|
||||
--SELECT * FROM no_plan();
|
||||
diff --git a/test/sql/corpus.sql b/test/sql/corpus.sql
|
||||
index 1f5f637..a519905 100644
|
||||
--- a/test/sql/corpus.sql
|
||||
+++ b/test/sql/corpus.sql
|
||||
@@ -4,7 +4,6 @@ BEGIN;
|
||||
-- Test the SemVer corpus from https://regex101.com/r/Ly7O1x/3/.
|
||||
|
||||
\i test/pgtap-core.sql
|
||||
-\i sql/semver.sql
|
||||
|
||||
SELECT plan(71);
|
||||
--SELECT * FROM no_plan();
|
||||
6
docker-compose/ext-src/pg_semver-src/test-upgrade.sh
Executable file
6
docker-compose/ext-src/pg_semver-src/test-upgrade.sh
Executable file
@@ -0,0 +1,6 @@
|
||||
#!/bin/sh
|
||||
set -ex
|
||||
cd "$(dirname ${0})"
|
||||
patch -p1 <test-upgrade.patch
|
||||
PG_REGRESS=$(dirname "$(pg_config --pgxs)")/../test/regress/pg_regress
|
||||
${PG_REGRESS} --use-existing --inputdir=./ --bindir='/usr/local/pgsql/bin' --inputdir=test --dbname=contrib_regression base corpus
|
||||
5
docker-compose/ext-src/pg_uuidv7-src/test-upgrade.sh
Executable file
5
docker-compose/ext-src/pg_uuidv7-src/test-upgrade.sh
Executable file
@@ -0,0 +1,5 @@
|
||||
#!/bin/sh
|
||||
set -ex
|
||||
cd "$(dirname ${0})"
|
||||
PG_REGRESS=$(dirname "$(pg_config --pgxs)")/../test/regress/pg_regress
|
||||
${PG_REGRESS} --use-existing --inputdir=./ --bindir='/usr/local/pgsql/bin' --inputdir=test --dbname=contrib_regression 002_uuid_generate_v7 003_uuid_v7_to_timestamptz 004_uuid_timestamptz_to_v7 005_uuid_v7_to_timestamp 006_uuid_timestamp_to_v7
|
||||
5
docker-compose/ext-src/pgvector-src/test-upgrade.sh
Executable file
5
docker-compose/ext-src/pgvector-src/test-upgrade.sh
Executable file
@@ -0,0 +1,5 @@
|
||||
#!/bin/sh
|
||||
set -ex
|
||||
cd "$(dirname ${0})"
|
||||
PG_REGRESS=$(dirname "$(pg_config --pgxs)")/../test/regress/pg_regress
|
||||
${PG_REGRESS} --inputdir=./ --bindir='/usr/local/pgsql/bin' --inputdir=test --use-existing --dbname=contrib_regression bit btree cast copy halfvec hnsw_bit hnsw_halfvec hnsw_sparsevec hnsw_vector ivfflat_bit ivfflat_halfvec ivfflat_vector sparsevec vector_type
|
||||
5
docker-compose/ext-src/plv8-src/test-upgrade.sh
Executable file
5
docker-compose/ext-src/plv8-src/test-upgrade.sh
Executable file
@@ -0,0 +1,5 @@
|
||||
#!/bin/sh
|
||||
set -ex
|
||||
cd "$(dirname ${0})"
|
||||
PG_REGRESS=$(dirname "$(pg_config --pgxs)")/../test/regress/pg_regress
|
||||
${PG_REGRESS} --inputdir=./ --bindir='/usr/local/pgsql/bin' --use-existing --dbname=contrib_regression plv8 plv8-errors scalar_args inline json startup_pre startup varparam json_conv jsonb_conv window guc es6 arraybuffer composites currentresource startup_perms bytea find_function_perms memory_limits reset show array_spread regression dialect bigint procedure
|
||||
5
docker-compose/ext-src/postgresql-unit-src/test-upgrade.sh
Executable file
5
docker-compose/ext-src/postgresql-unit-src/test-upgrade.sh
Executable file
@@ -0,0 +1,5 @@
|
||||
#!/bin/sh
|
||||
set -ex
|
||||
cd "$(dirname ${0})"
|
||||
PG_REGRESS=$(dirname "$(pg_config --pgxs)")/../test/regress/pg_regress
|
||||
${PG_REGRESS} --inputdir=./ --bindir='/usr/local/pgsql/bin' --use-existing --dbname=contrib_regression extension tables unit binary unicode prefix units time temperature functions language_functions round derived compare aggregate iec custom crosstab convert
|
||||
5
docker-compose/ext-src/prefix-src/test-upgrade.sh
Executable file
5
docker-compose/ext-src/prefix-src/test-upgrade.sh
Executable file
@@ -0,0 +1,5 @@
|
||||
#!/bin/sh
|
||||
set -ex
|
||||
cd "$(dirname ${0})"
|
||||
PG_REGRESS=$(dirname "$(pg_config --pgxs)")/../test/regress/pg_regress
|
||||
${PG_REGRESS} --use-existing --inputdir=./ --bindir='/usr/local/pgsql/bin' --dbname=contrib_regression prefix falcon explain queries
|
||||
19
docker-compose/ext-src/rum-src/test-upgrade.patch
Normal file
19
docker-compose/ext-src/rum-src/test-upgrade.patch
Normal file
@@ -0,0 +1,19 @@
|
||||
diff --git a/expected/rum.out b/expected/rum.out
|
||||
index 5966d19..8860b79 100644
|
||||
--- a/expected/rum.out
|
||||
+++ b/expected/rum.out
|
||||
@@ -1,4 +1,3 @@
|
||||
-CREATE EXTENSION rum;
|
||||
CREATE TABLE test_rum( t text, a tsvector );
|
||||
CREATE TRIGGER tsvectorupdate
|
||||
BEFORE UPDATE OR INSERT ON test_rum
|
||||
diff --git a/sql/rum.sql b/sql/rum.sql
|
||||
index 8414bb9..898e6ab 100644
|
||||
--- a/sql/rum.sql
|
||||
+++ b/sql/rum.sql
|
||||
@@ -1,5 +1,3 @@
|
||||
-CREATE EXTENSION rum;
|
||||
-
|
||||
CREATE TABLE test_rum( t text, a tsvector );
|
||||
|
||||
CREATE TRIGGER tsvectorupdate
|
||||
6
docker-compose/ext-src/rum-src/test-upgrade.sh
Executable file
6
docker-compose/ext-src/rum-src/test-upgrade.sh
Executable file
@@ -0,0 +1,6 @@
|
||||
#!/bin/sh
|
||||
set -ex
|
||||
cd "$(dirname ${0})"
|
||||
patch -p1 <test-upgrade.patch
|
||||
PG_REGRESS=$(dirname "$(pg_config --pgxs)")/../test/regress/pg_regress
|
||||
${PG_REGRESS} --inputdir=./ --bindir='/usr/local/pgsql/bin' --use-existing --dbname=contrib_regression rum rum_validate rum_hash ruminv timestamp orderby orderby_hash altorder altorder_hash limits int2 int4 int8 float4 float8 money oid time timetz date interval macaddr inet cidr text varchar char bytea bit varbit numeric rum_weight expr array
|
||||
@@ -7,7 +7,10 @@ LIST=$( (echo -e "${SKIP//","/"\n"}"; ls -d -- *-src) | sort | uniq -u)
|
||||
for d in ${LIST}
|
||||
do
|
||||
[ -d "${d}" ] || continue
|
||||
psql -c "select 1" >/dev/null || break
|
||||
if ! psql -w -c "select 1" >/dev/null; then
|
||||
FAILED="${d} ${FAILED}"
|
||||
break
|
||||
fi
|
||||
USE_PGXS=1 make -C "${d}" installcheck || FAILED="${d} ${FAILED}"
|
||||
done
|
||||
[ -z "${FAILED}" ] && exit 0
|
||||
|
||||
93
docker-compose/test_extensions_upgrade.sh
Executable file
93
docker-compose/test_extensions_upgrade.sh
Executable file
@@ -0,0 +1,93 @@
|
||||
#!/bin/bash
|
||||
set -eux -o pipefail
|
||||
cd "$(dirname "${0}")"
|
||||
# Takes a variable name as argument. The result is stored in that variable.
|
||||
generate_id() {
|
||||
local -n resvar=$1
|
||||
printf -v resvar '%08x%08x%08x%08x' $SRANDOM $SRANDOM $SRANDOM $SRANDOM
|
||||
}
|
||||
if [ -z ${OLDTAG+x} ] || [ -z ${NEWTAG+x} ] || [ -z "${OLDTAG}" ] || [ -z "${NEWTAG}" ]; then
|
||||
echo OLDTAG and NEWTAG must be defined
|
||||
exit 1
|
||||
fi
|
||||
export PG_VERSION=${PG_VERSION:-16}
|
||||
function wait_for_ready {
|
||||
TIME=0
|
||||
while ! docker compose logs compute_is_ready | grep -q "accepting connections" && [ ${TIME} -le 300 ] ; do
|
||||
((TIME += 1 ))
|
||||
sleep 1
|
||||
done
|
||||
if [ ${TIME} -gt 300 ]; then
|
||||
echo Time is out.
|
||||
exit 2
|
||||
fi
|
||||
}
|
||||
function create_extensions() {
|
||||
for ext in ${1}; do
|
||||
docker compose exec neon-test-extensions psql -X -v ON_ERROR_STOP=1 -d contrib_regression -c "CREATE EXTENSION IF NOT EXISTS ${ext}"
|
||||
done
|
||||
}
|
||||
EXTENSIONS='[
|
||||
{"extname": "plv8", "extdir": "plv8-src"},
|
||||
{"extname": "vector", "extdir": "pgvector-src"},
|
||||
{"extname": "unit", "extdir": "postgresql-unit-src"},
|
||||
{"extname": "hypopg", "extdir": "hypopg-src"},
|
||||
{"extname": "rum", "extdir": "rum-src"},
|
||||
{"extname": "ip4r", "extdir": "ip4r-src"},
|
||||
{"extname": "prefix", "extdir": "prefix-src"},
|
||||
{"extname": "hll", "extdir": "hll-src"},
|
||||
{"extname": "pg_cron", "extdir": "pg_cron-src"},
|
||||
{"extname": "pg_uuidv7", "extdir": "pg_uuidv7-src"},
|
||||
{"extname": "roaringbitmap", "extdir": "pg_roaringbitmap-src"},
|
||||
{"extname": "semver", "extdir": "pg_semver-src"},
|
||||
{"extname": "pg_ivm", "extdir": "pg_ivm-src"}
|
||||
]'
|
||||
EXTNAMES=$(echo ${EXTENSIONS} | jq -r '.[].extname' | paste -sd ' ' -)
|
||||
TAG=${NEWTAG} docker compose --profile test-extensions up --quiet-pull --build -d
|
||||
wait_for_ready
|
||||
docker compose exec neon-test-extensions psql -c "DROP DATABASE IF EXISTS contrib_regression"
|
||||
docker compose exec neon-test-extensions psql -c "CREATE DATABASE contrib_regression"
|
||||
create_extensions "${EXTNAMES}"
|
||||
query="select json_object_agg(extname,extversion) from pg_extension where extname in ('${EXTNAMES// /\',\'}')"
|
||||
new_vers=$(docker compose exec neon-test-extensions psql -Aqt -d contrib_regression -c "$query")
|
||||
docker compose --profile test-extensions down
|
||||
TAG=${OLDTAG} docker compose --profile test-extensions up --quiet-pull --build -d --force-recreate
|
||||
wait_for_ready
|
||||
docker compose cp ext-src neon-test-extensions:/
|
||||
docker compose exec neon-test-extensions psql -c "DROP DATABASE IF EXISTS contrib_regression"
|
||||
docker compose exec neon-test-extensions psql -c "CREATE DATABASE contrib_regression"
|
||||
create_extensions "${EXTNAMES}"
|
||||
query="select pge.extname from pg_extension pge join (select key as extname, value as extversion from json_each_text('${new_vers}')) x on pge.extname=x.extname and pge.extversion <> x.extversion"
|
||||
exts=$(docker compose exec neon-test-extensions psql -Aqt -d contrib_regression -c "$query")
|
||||
if [ -z "${exts}" ]; then
|
||||
echo "No extensions were upgraded"
|
||||
else
|
||||
tenant_id=$(docker compose exec neon-test-extensions psql -Aqt -c "SHOW neon.tenant_id")
|
||||
timeline_id=$(docker compose exec neon-test-extensions psql -Aqt -c "SHOW neon.timeline_id")
|
||||
for ext in ${exts}; do
|
||||
echo Testing ${ext}...
|
||||
EXTDIR=$(echo ${EXTENSIONS} | jq -r '.[] | select(.extname=="'${ext}'") | .extdir')
|
||||
generate_id new_timeline_id
|
||||
PARAMS=(
|
||||
-sbf
|
||||
-X POST
|
||||
-H "Content-Type: application/json"
|
||||
-d "{\"new_timeline_id\": \"${new_timeline_id}\", \"pg_version\": ${PG_VERSION}, \"ancestor_timeline_id\": \"${timeline_id}\"}"
|
||||
"http://127.0.0.1:9898/v1/tenant/${tenant_id}/timeline/"
|
||||
)
|
||||
result=$(curl "${PARAMS[@]}")
|
||||
echo $result | jq .
|
||||
TENANT_ID=${tenant_id} TIMELINE_ID=${new_timeline_id} TAG=${OLDTAG} docker compose down compute compute_is_ready
|
||||
COMPUTE_TAG=${NEWTAG} TAG=${OLDTAG} TENANT_ID=${tenant_id} TIMELINE_ID=${new_timeline_id} docker compose up --quiet-pull -d --build compute compute_is_ready
|
||||
wait_for_ready
|
||||
TID=$(docker compose exec neon-test-extensions psql -Aqt -c "SHOW neon.timeline_id")
|
||||
if [ ${TID} != ${new_timeline_id} ]; then
|
||||
echo Timeline mismatch
|
||||
exit 1
|
||||
fi
|
||||
docker compose exec neon-test-extensions psql -d contrib_regression -c "\dx ${ext}"
|
||||
docker compose exec neon-test-extensions sh -c /ext-src/${EXTDIR}/test-upgrade.sh
|
||||
docker compose exec neon-test-extensions psql -d contrib_regression -c "alter extension ${ext} update"
|
||||
docker compose exec neon-test-extensions psql -d contrib_regression -c "\dx ${ext}"
|
||||
done
|
||||
fi
|
||||
@@ -138,6 +138,13 @@ pub struct ComputeSpec {
|
||||
/// enough spare connections for reconfiguration process to succeed.
|
||||
#[serde(default = "default_reconfigure_concurrency")]
|
||||
pub reconfigure_concurrency: usize,
|
||||
|
||||
/// If set to true, the compute_ctl will drop all subscriptions before starting the
|
||||
/// compute. This is needed when we start an endpoint on a branch, so that child
|
||||
/// would not compete with parent branch subscriptions
|
||||
/// over the same replication content from publisher.
|
||||
#[serde(default)] // Default false
|
||||
pub drop_subscriptions_before_start: bool,
|
||||
}
|
||||
|
||||
/// Feature flag to signal `compute_ctl` to enable certain experimental functionality.
|
||||
|
||||
@@ -120,6 +120,7 @@ pub struct ConfigToml {
|
||||
pub no_sync: Option<bool>,
|
||||
pub wal_receiver_protocol: PostgresClientProtocol,
|
||||
pub page_service_pipelining: PageServicePipeliningConfig,
|
||||
pub get_vectored_concurrent_io: GetVectoredConcurrentIo,
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, PartialEq, Eq, serde::Serialize, serde::Deserialize)]
|
||||
@@ -158,6 +159,25 @@ pub enum PageServiceProtocolPipelinedExecutionStrategy {
|
||||
Tasks,
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, PartialEq, Eq, serde::Serialize, serde::Deserialize)]
|
||||
#[serde(tag = "mode", rename_all = "kebab-case")]
|
||||
#[serde(deny_unknown_fields)]
|
||||
pub enum GetVectoredConcurrentIo {
|
||||
/// The read path is fully sequential: layers are visited
|
||||
/// one after the other and IOs are issued and waited upon
|
||||
/// from the same task that traverses the layers.
|
||||
Sequential,
|
||||
/// The read path still traverses layers sequentially, and
|
||||
/// index blocks will be read into the PS PageCache from
|
||||
/// that task, with waiting.
|
||||
/// But data IOs are dispatched and waited upon from a sidecar
|
||||
/// task so that the traversing task can continue to traverse
|
||||
/// layers while the IOs are in flight.
|
||||
/// If the PS PageCache miss rate is low, this improves
|
||||
/// throughput dramatically.
|
||||
SidecarTask,
|
||||
}
|
||||
|
||||
pub mod statvfs {
|
||||
pub mod mock {
|
||||
#[derive(Debug, Clone, PartialEq, Eq, serde::Serialize, serde::Deserialize)]
|
||||
@@ -234,9 +254,26 @@ pub struct TenantConfigToml {
|
||||
// Duration::ZERO means automatic compaction is disabled.
|
||||
#[serde(with = "humantime_serde")]
|
||||
pub compaction_period: Duration,
|
||||
// Level0 delta layer threshold for compaction.
|
||||
/// Level0 delta layer threshold for compaction.
|
||||
pub compaction_threshold: usize,
|
||||
/// Controls the amount of L0 included in a single compaction iteration.
|
||||
/// The unit is `checkpoint_distance`, i.e., a size.
|
||||
/// We add L0s to the set of layers to compact until their cumulative
|
||||
/// size exceeds `compaction_upper_limit * checkpoint_distance`.
|
||||
pub compaction_upper_limit: usize,
|
||||
pub compaction_algorithm: crate::models::CompactionAlgorithmSettings,
|
||||
/// Level0 delta layer threshold at which to delay layer flushes for compaction backpressure,
|
||||
/// such that they take 2x as long, and start waiting for layer flushes during ephemeral layer
|
||||
/// rolls. This helps compaction keep up with WAL ingestion, and avoids read amplification
|
||||
/// blowing up. Should be >compaction_threshold. 0 to disable. Disabled by default.
|
||||
pub l0_flush_delay_threshold: Option<usize>,
|
||||
/// Level0 delta layer threshold at which to stall layer flushes. Must be >compaction_threshold
|
||||
/// to avoid deadlock. 0 to disable. Disabled by default.
|
||||
pub l0_flush_stall_threshold: Option<usize>,
|
||||
/// If true, Level0 delta layer flushes will wait for S3 upload before flushing the next
|
||||
/// layer. This is a temporary backpressure mechanism which should be removed once
|
||||
/// l0_flush_{delay,stall}_threshold is fully enabled.
|
||||
pub l0_flush_wait_upload: bool,
|
||||
// Determines how much history is retained, to allow
|
||||
// branching and read replicas at an older point in time.
|
||||
// The unit is #of bytes of WAL.
|
||||
@@ -305,6 +342,16 @@ pub struct TenantConfigToml {
|
||||
/// Enable rel_size_v2 for this tenant. Once enabled, the tenant will persist this information into
|
||||
/// `index_part.json`, and it cannot be reversed.
|
||||
pub rel_size_v2_enabled: Option<bool>,
|
||||
|
||||
// gc-compaction related configs
|
||||
/// Enable automatic gc-compaction trigger on this tenant.
|
||||
pub gc_compaction_enabled: bool,
|
||||
/// The initial threshold for gc-compaction in KB. Once the total size of layers below the gc-horizon is above this threshold,
|
||||
/// gc-compaction will be triggered.
|
||||
pub gc_compaction_initial_threshold_kb: u64,
|
||||
/// The ratio that triggers the auto gc-compaction. If (the total size of layers between L2 LSN and gc-horizon) / (size below the L2 LSN)
|
||||
/// is above this ratio, gc-compaction will be triggered.
|
||||
pub gc_compaction_ratio_percent: u64,
|
||||
}
|
||||
|
||||
pub mod defaults {
|
||||
@@ -454,6 +501,11 @@ impl Default for ConfigToml {
|
||||
execution: PageServiceProtocolPipelinedExecutionStrategy::ConcurrentFutures,
|
||||
})
|
||||
},
|
||||
get_vectored_concurrent_io: if !cfg!(test) {
|
||||
GetVectoredConcurrentIo::Sequential
|
||||
} else {
|
||||
GetVectoredConcurrentIo::SidecarTask
|
||||
},
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -476,9 +528,17 @@ pub mod tenant_conf_defaults {
|
||||
|
||||
pub const DEFAULT_COMPACTION_PERIOD: &str = "20 s";
|
||||
pub const DEFAULT_COMPACTION_THRESHOLD: usize = 10;
|
||||
|
||||
// This value needs to be tuned to avoid OOM. We have 3/4 of the total CPU threads to do background works, that's 16*3/4=9 on
|
||||
// most of our pageservers. Compaction ~50 layers requires about 2GB memory (could be reduced later by optimizing L0 hole
|
||||
// calculation to avoid loading all keys into the memory). So with this config, we can get a maximum peak compaction usage of 18GB.
|
||||
pub const DEFAULT_COMPACTION_UPPER_LIMIT: usize = 50;
|
||||
|
||||
pub const DEFAULT_COMPACTION_ALGORITHM: crate::models::CompactionAlgorithm =
|
||||
crate::models::CompactionAlgorithm::Legacy;
|
||||
|
||||
pub const DEFAULT_L0_FLUSH_WAIT_UPLOAD: bool = true;
|
||||
|
||||
pub const DEFAULT_GC_HORIZON: u64 = 64 * 1024 * 1024;
|
||||
|
||||
// Large DEFAULT_GC_PERIOD is fine as long as PITR_INTERVAL is larger.
|
||||
@@ -498,6 +558,9 @@ pub mod tenant_conf_defaults {
|
||||
// By default ingest enough WAL for two new L0 layers before checking if new image
|
||||
// image layers should be created.
|
||||
pub const DEFAULT_IMAGE_LAYER_CREATION_CHECK_THRESHOLD: u8 = 2;
|
||||
pub const DEFAULT_GC_COMPACTION_ENABLED: bool = false;
|
||||
pub const DEFAULT_GC_COMPACTION_INITIAL_THRESHOLD_KB: u64 = 10240000;
|
||||
pub const DEFAULT_GC_COMPACTION_RATIO_PERCENT: u64 = 100;
|
||||
}
|
||||
|
||||
impl Default for TenantConfigToml {
|
||||
@@ -511,9 +574,13 @@ impl Default for TenantConfigToml {
|
||||
compaction_period: humantime::parse_duration(DEFAULT_COMPACTION_PERIOD)
|
||||
.expect("cannot parse default compaction period"),
|
||||
compaction_threshold: DEFAULT_COMPACTION_THRESHOLD,
|
||||
compaction_upper_limit: DEFAULT_COMPACTION_UPPER_LIMIT,
|
||||
compaction_algorithm: crate::models::CompactionAlgorithmSettings {
|
||||
kind: DEFAULT_COMPACTION_ALGORITHM,
|
||||
},
|
||||
l0_flush_delay_threshold: None,
|
||||
l0_flush_stall_threshold: None,
|
||||
l0_flush_wait_upload: DEFAULT_L0_FLUSH_WAIT_UPLOAD,
|
||||
gc_horizon: DEFAULT_GC_HORIZON,
|
||||
gc_period: humantime::parse_duration(DEFAULT_GC_PERIOD)
|
||||
.expect("cannot parse default gc period"),
|
||||
@@ -543,6 +610,9 @@ impl Default for TenantConfigToml {
|
||||
timeline_offloading: false,
|
||||
wal_receiver_protocol_override: None,
|
||||
rel_size_v2_enabled: None,
|
||||
gc_compaction_enabled: DEFAULT_GC_COMPACTION_ENABLED,
|
||||
gc_compaction_initial_threshold_kb: DEFAULT_GC_COMPACTION_INITIAL_THRESHOLD_KB,
|
||||
gc_compaction_ratio_percent: DEFAULT_GC_COMPACTION_RATIO_PERCENT,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -282,7 +282,7 @@ pub struct TimelineCreateRequest {
|
||||
pub mode: TimelineCreateRequestMode,
|
||||
}
|
||||
|
||||
#[derive(Serialize, Deserialize, Clone)]
|
||||
#[derive(Serialize, Deserialize, Clone, Debug)]
|
||||
#[serde(untagged)]
|
||||
pub enum TimelineCreateRequestMode {
|
||||
Branch {
|
||||
@@ -307,7 +307,7 @@ pub enum TimelineCreateRequestMode {
|
||||
},
|
||||
}
|
||||
|
||||
#[derive(Serialize, Deserialize, Clone)]
|
||||
#[derive(Serialize, Deserialize, Clone, Debug)]
|
||||
pub struct TimelineCreateRequestModeImportPgdata {
|
||||
pub location: ImportPgdataLocation,
|
||||
pub idempotency_key: ImportPgdataIdempotencyKey,
|
||||
@@ -326,7 +326,7 @@ pub enum ImportPgdataLocation {
|
||||
},
|
||||
}
|
||||
|
||||
#[derive(Serialize, Deserialize, Clone)]
|
||||
#[derive(Serialize, Deserialize, Clone, Debug)]
|
||||
#[serde(transparent)]
|
||||
pub struct ImportPgdataIdempotencyKey(pub String);
|
||||
|
||||
@@ -458,10 +458,18 @@ pub struct TenantConfigPatch {
|
||||
pub compaction_period: FieldPatch<String>,
|
||||
#[serde(skip_serializing_if = "FieldPatch::is_noop")]
|
||||
pub compaction_threshold: FieldPatch<usize>,
|
||||
#[serde(skip_serializing_if = "FieldPatch::is_noop")]
|
||||
pub compaction_upper_limit: FieldPatch<usize>,
|
||||
// defer parsing compaction_algorithm, like eviction_policy
|
||||
#[serde(skip_serializing_if = "FieldPatch::is_noop")]
|
||||
pub compaction_algorithm: FieldPatch<CompactionAlgorithmSettings>,
|
||||
#[serde(skip_serializing_if = "FieldPatch::is_noop")]
|
||||
pub l0_flush_delay_threshold: FieldPatch<usize>,
|
||||
#[serde(skip_serializing_if = "FieldPatch::is_noop")]
|
||||
pub l0_flush_stall_threshold: FieldPatch<usize>,
|
||||
#[serde(skip_serializing_if = "FieldPatch::is_noop")]
|
||||
pub l0_flush_wait_upload: FieldPatch<bool>,
|
||||
#[serde(skip_serializing_if = "FieldPatch::is_noop")]
|
||||
pub gc_horizon: FieldPatch<u64>,
|
||||
#[serde(skip_serializing_if = "FieldPatch::is_noop")]
|
||||
pub gc_period: FieldPatch<String>,
|
||||
@@ -499,6 +507,12 @@ pub struct TenantConfigPatch {
|
||||
pub wal_receiver_protocol_override: FieldPatch<PostgresClientProtocol>,
|
||||
#[serde(skip_serializing_if = "FieldPatch::is_noop")]
|
||||
pub rel_size_v2_enabled: FieldPatch<bool>,
|
||||
#[serde(skip_serializing_if = "FieldPatch::is_noop")]
|
||||
pub gc_compaction_enabled: FieldPatch<bool>,
|
||||
#[serde(skip_serializing_if = "FieldPatch::is_noop")]
|
||||
pub gc_compaction_initial_threshold_kb: FieldPatch<u64>,
|
||||
#[serde(skip_serializing_if = "FieldPatch::is_noop")]
|
||||
pub gc_compaction_ratio_percent: FieldPatch<u64>,
|
||||
}
|
||||
|
||||
/// An alternative representation of `pageserver::tenant::TenantConf` with
|
||||
@@ -510,8 +524,12 @@ pub struct TenantConfig {
|
||||
pub compaction_target_size: Option<u64>,
|
||||
pub compaction_period: Option<String>,
|
||||
pub compaction_threshold: Option<usize>,
|
||||
pub compaction_upper_limit: Option<usize>,
|
||||
// defer parsing compaction_algorithm, like eviction_policy
|
||||
pub compaction_algorithm: Option<CompactionAlgorithmSettings>,
|
||||
pub l0_flush_delay_threshold: Option<usize>,
|
||||
pub l0_flush_stall_threshold: Option<usize>,
|
||||
pub l0_flush_wait_upload: Option<bool>,
|
||||
pub gc_horizon: Option<u64>,
|
||||
pub gc_period: Option<String>,
|
||||
pub image_creation_threshold: Option<usize>,
|
||||
@@ -531,6 +549,9 @@ pub struct TenantConfig {
|
||||
pub timeline_offloading: Option<bool>,
|
||||
pub wal_receiver_protocol_override: Option<PostgresClientProtocol>,
|
||||
pub rel_size_v2_enabled: Option<bool>,
|
||||
pub gc_compaction_enabled: Option<bool>,
|
||||
pub gc_compaction_initial_threshold_kb: Option<u64>,
|
||||
pub gc_compaction_ratio_percent: Option<u64>,
|
||||
}
|
||||
|
||||
impl TenantConfig {
|
||||
@@ -541,7 +562,11 @@ impl TenantConfig {
|
||||
mut compaction_target_size,
|
||||
mut compaction_period,
|
||||
mut compaction_threshold,
|
||||
mut compaction_upper_limit,
|
||||
mut compaction_algorithm,
|
||||
mut l0_flush_delay_threshold,
|
||||
mut l0_flush_stall_threshold,
|
||||
mut l0_flush_wait_upload,
|
||||
mut gc_horizon,
|
||||
mut gc_period,
|
||||
mut image_creation_threshold,
|
||||
@@ -561,6 +586,9 @@ impl TenantConfig {
|
||||
mut timeline_offloading,
|
||||
mut wal_receiver_protocol_override,
|
||||
mut rel_size_v2_enabled,
|
||||
mut gc_compaction_enabled,
|
||||
mut gc_compaction_initial_threshold_kb,
|
||||
mut gc_compaction_ratio_percent,
|
||||
} = self;
|
||||
|
||||
patch.checkpoint_distance.apply(&mut checkpoint_distance);
|
||||
@@ -570,7 +598,17 @@ impl TenantConfig {
|
||||
.apply(&mut compaction_target_size);
|
||||
patch.compaction_period.apply(&mut compaction_period);
|
||||
patch.compaction_threshold.apply(&mut compaction_threshold);
|
||||
patch
|
||||
.compaction_upper_limit
|
||||
.apply(&mut compaction_upper_limit);
|
||||
patch.compaction_algorithm.apply(&mut compaction_algorithm);
|
||||
patch
|
||||
.l0_flush_delay_threshold
|
||||
.apply(&mut l0_flush_delay_threshold);
|
||||
patch
|
||||
.l0_flush_stall_threshold
|
||||
.apply(&mut l0_flush_stall_threshold);
|
||||
patch.l0_flush_wait_upload.apply(&mut l0_flush_wait_upload);
|
||||
patch.gc_horizon.apply(&mut gc_horizon);
|
||||
patch.gc_period.apply(&mut gc_period);
|
||||
patch
|
||||
@@ -606,6 +644,15 @@ impl TenantConfig {
|
||||
.wal_receiver_protocol_override
|
||||
.apply(&mut wal_receiver_protocol_override);
|
||||
patch.rel_size_v2_enabled.apply(&mut rel_size_v2_enabled);
|
||||
patch
|
||||
.gc_compaction_enabled
|
||||
.apply(&mut gc_compaction_enabled);
|
||||
patch
|
||||
.gc_compaction_initial_threshold_kb
|
||||
.apply(&mut gc_compaction_initial_threshold_kb);
|
||||
patch
|
||||
.gc_compaction_ratio_percent
|
||||
.apply(&mut gc_compaction_ratio_percent);
|
||||
|
||||
Self {
|
||||
checkpoint_distance,
|
||||
@@ -613,7 +660,11 @@ impl TenantConfig {
|
||||
compaction_target_size,
|
||||
compaction_period,
|
||||
compaction_threshold,
|
||||
compaction_upper_limit,
|
||||
compaction_algorithm,
|
||||
l0_flush_delay_threshold,
|
||||
l0_flush_stall_threshold,
|
||||
l0_flush_wait_upload,
|
||||
gc_horizon,
|
||||
gc_period,
|
||||
image_creation_threshold,
|
||||
@@ -633,6 +684,9 @@ impl TenantConfig {
|
||||
timeline_offloading,
|
||||
wal_receiver_protocol_override,
|
||||
rel_size_v2_enabled,
|
||||
gc_compaction_enabled,
|
||||
gc_compaction_initial_threshold_kb,
|
||||
gc_compaction_ratio_percent,
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -975,6 +1029,13 @@ pub struct TenantConfigPatchRequest {
|
||||
pub config: TenantConfigPatch, // as we have a flattened field, we should reject all unknown fields in it
|
||||
}
|
||||
|
||||
#[derive(Serialize, Deserialize, Debug)]
|
||||
pub struct TenantWaitLsnRequest {
|
||||
#[serde(flatten)]
|
||||
pub timelines: HashMap<TimelineId, Lsn>,
|
||||
pub timeout: Duration,
|
||||
}
|
||||
|
||||
/// See [`TenantState::attachment_status`] and the OpenAPI docs for context.
|
||||
#[derive(Serialize, Deserialize, Clone)]
|
||||
#[serde(tag = "slug", content = "data", rename_all = "snake_case")]
|
||||
|
||||
@@ -377,7 +377,8 @@ impl RemoteStorage for AzureBlobStorage {
|
||||
|
||||
let next_item = next_item?;
|
||||
|
||||
if timeout_try_cnt >= 2 {
|
||||
// Log a warning if we saw two timeouts in a row before a successful request
|
||||
if timeout_try_cnt > 2 {
|
||||
tracing::warn!("Azure Blob Storage list timed out and succeeded after {} tries", timeout_try_cnt);
|
||||
}
|
||||
timeout_try_cnt = 1;
|
||||
|
||||
@@ -2,6 +2,7 @@
|
||||
|
||||
use std::{fmt::Display, str::FromStr};
|
||||
|
||||
/// For types `V` that implement [`FromStr`].
|
||||
pub fn var<V, E>(varname: &str) -> Option<V>
|
||||
where
|
||||
V: FromStr<Err = E>,
|
||||
@@ -10,7 +11,9 @@ where
|
||||
match std::env::var(varname) {
|
||||
Ok(s) => Some(
|
||||
s.parse()
|
||||
.map_err(|e| format!("failed to parse env var {varname}: {e:#}"))
|
||||
.map_err(|e| {
|
||||
format!("failed to parse env var {varname} using FromStr::parse: {e:#}")
|
||||
})
|
||||
.unwrap(),
|
||||
),
|
||||
Err(std::env::VarError::NotPresent) => None,
|
||||
@@ -19,3 +22,24 @@ where
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// For types `V` that implement [`serde::de::DeserializeOwned`].
|
||||
pub fn var_serde_json_string<V>(varname: &str) -> Option<V>
|
||||
where
|
||||
V: serde::de::DeserializeOwned,
|
||||
{
|
||||
match std::env::var(varname) {
|
||||
Ok(s) => Some({
|
||||
let value = serde_json::Value::String(s);
|
||||
serde_json::from_value(value)
|
||||
.map_err(|e| {
|
||||
format!("failed to parse env var {varname} as a serde_json json string: {e:#}")
|
||||
})
|
||||
.unwrap()
|
||||
}),
|
||||
Err(std::env::VarError::NotPresent) => None,
|
||||
Err(std::env::VarError::NotUnicode(_)) => {
|
||||
panic!("env var {varname} is not unicode")
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -11,31 +11,55 @@ use tracing::*;
|
||||
|
||||
/// Declare a failpoint that can use to `pause` failpoint action.
|
||||
/// We don't want to block the executor thread, hence, spawn_blocking + await.
|
||||
///
|
||||
/// Optionally pass a cancellation token, and this failpoint will drop out of
|
||||
/// its pause when the cancellation token fires. This is useful for testing
|
||||
/// cases where we would like to block something, but test its clean shutdown behavior.
|
||||
/// The macro evaluates to a Result in that case, where Ok(()) is the case
|
||||
/// where the failpoint was not paused, and Err() is the case where cancellation
|
||||
/// token fired while evaluating the failpoint.
|
||||
///
|
||||
/// Remember to unpause the failpoint in the test; until that happens, one of the
|
||||
/// limited number of spawn_blocking thread pool threads is leaked.
|
||||
#[macro_export]
|
||||
macro_rules! pausable_failpoint {
|
||||
($name:literal) => {
|
||||
($name:literal) => {{
|
||||
if cfg!(feature = "testing") {
|
||||
tokio::task::spawn_blocking({
|
||||
let current = tracing::Span::current();
|
||||
let cancel = ::tokio_util::sync::CancellationToken::new();
|
||||
let _ = $crate::pausable_failpoint!($name, &cancel);
|
||||
}
|
||||
}};
|
||||
($name:literal, $cancel:expr) => {{
|
||||
if cfg!(feature = "testing") {
|
||||
let failpoint_fut = ::tokio::task::spawn_blocking({
|
||||
let current = ::tracing::Span::current();
|
||||
move || {
|
||||
let _entered = current.entered();
|
||||
tracing::info!("at failpoint {}", $name);
|
||||
fail::fail_point!($name);
|
||||
::tracing::info!("at failpoint {}", $name);
|
||||
::fail::fail_point!($name);
|
||||
}
|
||||
});
|
||||
let cancel_fut = async move {
|
||||
$cancel.cancelled().await;
|
||||
};
|
||||
::tokio::select! {
|
||||
res = failpoint_fut => {
|
||||
res.expect("spawn_blocking");
|
||||
// continue with execution
|
||||
Ok(())
|
||||
},
|
||||
_ = cancel_fut => {
|
||||
Err(())
|
||||
}
|
||||
})
|
||||
.await
|
||||
.expect("spawn_blocking");
|
||||
}
|
||||
};
|
||||
($name:literal, $cond:expr) => {
|
||||
if cfg!(feature = "testing") {
|
||||
if $cond {
|
||||
pausable_failpoint!($name)
|
||||
}
|
||||
} else {
|
||||
Ok(())
|
||||
}
|
||||
};
|
||||
}};
|
||||
}
|
||||
|
||||
pub use pausable_failpoint;
|
||||
|
||||
/// use with fail::cfg("$name", "return(2000)")
|
||||
///
|
||||
/// The effect is similar to a "sleep(2000)" action, i.e. we sleep for the
|
||||
|
||||
@@ -64,6 +64,12 @@ pub struct GateGuard {
|
||||
gate: Arc<GateInner>,
|
||||
}
|
||||
|
||||
impl GateGuard {
|
||||
pub fn try_clone(&self) -> Result<Self, GateError> {
|
||||
Gate::enter_impl(self.gate.clone())
|
||||
}
|
||||
}
|
||||
|
||||
impl Drop for GateGuard {
|
||||
fn drop(&mut self) {
|
||||
if self.gate.closing.load(Ordering::Relaxed) {
|
||||
@@ -107,11 +113,11 @@ impl Gate {
|
||||
/// to avoid blocking close() indefinitely: typically types that contain a Gate will
|
||||
/// also contain a CancellationToken.
|
||||
pub fn enter(&self) -> Result<GateGuard, GateError> {
|
||||
let permit = self
|
||||
.inner
|
||||
.sem
|
||||
.try_acquire()
|
||||
.map_err(|_| GateError::GateClosed)?;
|
||||
Self::enter_impl(self.inner.clone())
|
||||
}
|
||||
|
||||
fn enter_impl(gate: Arc<GateInner>) -> Result<GateGuard, GateError> {
|
||||
let permit = gate.sem.try_acquire().map_err(|_| GateError::GateClosed)?;
|
||||
|
||||
// we now have the permit, let's disable the normal raii functionality and leave
|
||||
// "returning" the permit to our GateGuard::drop.
|
||||
@@ -122,7 +128,7 @@ impl Gate {
|
||||
|
||||
Ok(GateGuard {
|
||||
span_at_enter: tracing::Span::current(),
|
||||
gate: self.inner.clone(),
|
||||
gate,
|
||||
})
|
||||
}
|
||||
|
||||
@@ -252,4 +258,39 @@ mod tests {
|
||||
// Attempting to enter() is still forbidden
|
||||
gate.enter().expect_err("enter should fail finishing close");
|
||||
}
|
||||
|
||||
#[tokio::test(start_paused = true)]
|
||||
async fn clone_gate_guard() {
|
||||
let gate = Gate::default();
|
||||
let forever = Duration::from_secs(24 * 7 * 365);
|
||||
|
||||
let guard1 = gate.enter().expect("gate isn't closed");
|
||||
|
||||
let guard2 = guard1.try_clone().expect("gate isn't clsoed");
|
||||
|
||||
let mut close_fut = std::pin::pin!(gate.close());
|
||||
|
||||
tokio::time::timeout(forever, &mut close_fut)
|
||||
.await
|
||||
.unwrap_err();
|
||||
|
||||
// we polled close_fut once, that should prevent all later enters and clones
|
||||
gate.enter().unwrap_err();
|
||||
guard1.try_clone().unwrap_err();
|
||||
guard2.try_clone().unwrap_err();
|
||||
|
||||
// guard2 keeps gate open even if guard1 is closed
|
||||
drop(guard1);
|
||||
tokio::time::timeout(forever, &mut close_fut)
|
||||
.await
|
||||
.unwrap_err();
|
||||
|
||||
drop(guard2);
|
||||
|
||||
// now that the last guard is dropped, closing should complete
|
||||
close_fut.await;
|
||||
|
||||
// entering is still forbidden
|
||||
gate.enter().expect_err("enter should stilll fail");
|
||||
}
|
||||
}
|
||||
|
||||
@@ -7,7 +7,7 @@
|
||||
//! (notifying it of upscale).
|
||||
|
||||
use anyhow::{bail, Context};
|
||||
use axum::extract::ws::{Message, WebSocket};
|
||||
use axum::extract::ws::{Message, Utf8Bytes, WebSocket};
|
||||
use futures::{
|
||||
stream::{SplitSink, SplitStream},
|
||||
SinkExt, StreamExt,
|
||||
@@ -82,21 +82,21 @@ impl Dispatcher {
|
||||
|
||||
let highest_shared_version = match monitor_range.highest_shared_version(&agent_range) {
|
||||
Ok(version) => {
|
||||
sink.send(Message::Text(
|
||||
sink.send(Message::Text(Utf8Bytes::from(
|
||||
serde_json::to_string(&ProtocolResponse::Version(version)).unwrap(),
|
||||
))
|
||||
)))
|
||||
.await
|
||||
.context("failed to notify agent of negotiated protocol version")?;
|
||||
version
|
||||
}
|
||||
Err(e) => {
|
||||
sink.send(Message::Text(
|
||||
sink.send(Message::Text(Utf8Bytes::from(
|
||||
serde_json::to_string(&ProtocolResponse::Error(format!(
|
||||
"Received protocol version range {} which does not overlap with {}",
|
||||
agent_range, monitor_range
|
||||
)))
|
||||
.unwrap(),
|
||||
))
|
||||
)))
|
||||
.await
|
||||
.context("failed to notify agent of no overlap between protocol version ranges")?;
|
||||
Err(e).context("error determining suitable protocol version range")?
|
||||
@@ -126,7 +126,7 @@ impl Dispatcher {
|
||||
|
||||
let json = serde_json::to_string(&message).context("failed to serialize message")?;
|
||||
self.sink
|
||||
.send(Message::Text(json))
|
||||
.send(Message::Text(Utf8Bytes::from(json)))
|
||||
.await
|
||||
.context("stream error sending message")
|
||||
}
|
||||
|
||||
@@ -17,7 +17,6 @@ postgres_ffi.workspace = true
|
||||
serde.workspace = true
|
||||
thiserror.workspace = true
|
||||
tokio = { workspace = true, features = ["io-util"] }
|
||||
tonic.workspace = true
|
||||
tracing.workspace = true
|
||||
utils.workspace = true
|
||||
workspace_hack = { version = "0.1", path = "../../workspace_hack" }
|
||||
|
||||
@@ -45,7 +45,7 @@ pub mod proto {
|
||||
#![allow(clippy::derive_partial_eq_without_eq)]
|
||||
// The generated ValueMeta has a `len` method generate for its `len` field.
|
||||
#![allow(clippy::len_without_is_empty)]
|
||||
tonic::include_proto!("interpreted_wal");
|
||||
include!(concat!(env!("OUT_DIR"), concat!("/interpreted_wal.rs")));
|
||||
}
|
||||
|
||||
#[derive(Copy, Clone, Serialize, Deserialize)]
|
||||
|
||||
@@ -763,4 +763,19 @@ impl Client {
|
||||
.await
|
||||
.map_err(Error::ReceiveBody)
|
||||
}
|
||||
|
||||
pub async fn wait_lsn(
|
||||
&self,
|
||||
tenant_shard_id: TenantShardId,
|
||||
request: TenantWaitLsnRequest,
|
||||
) -> Result<StatusCode> {
|
||||
let uri = format!(
|
||||
"{}/v1/tenant/{tenant_shard_id}/wait_lsn",
|
||||
self.mgmt_api_endpoint,
|
||||
);
|
||||
|
||||
self.request_noerror(Method::POST, uri, request)
|
||||
.await
|
||||
.map(|resp| resp.status())
|
||||
}
|
||||
}
|
||||
|
||||
@@ -160,9 +160,12 @@ pub fn draw_history<W: std::io::Write>(history: &[LayerTraceEvent], mut output:
|
||||
|
||||
// Fill in and thicken rectangle if it's an
|
||||
// image layer so that we can see it.
|
||||
let mut style = Style::default();
|
||||
style.fill = Fill::Color(rgb(0x80, 0x80, 0x80));
|
||||
style.stroke = Stroke::Color(rgb(0, 0, 0), 0.5);
|
||||
let mut style = Style {
|
||||
fill: Fill::Color(rgb(0x80, 0x80, 0x80)),
|
||||
stroke: Stroke::Color(rgb(0, 0, 0), 0.5),
|
||||
opacity: 1.0,
|
||||
stroke_opacity: 1.0,
|
||||
};
|
||||
|
||||
let y_start = lsn_max - lsn_start;
|
||||
let y_end = lsn_max - lsn_end;
|
||||
@@ -214,10 +217,6 @@ pub fn draw_history<W: std::io::Write>(history: &[LayerTraceEvent], mut output:
|
||||
files_seen.insert(f);
|
||||
}
|
||||
|
||||
let mut record_style = Style::default();
|
||||
record_style.fill = Fill::Color(rgb(0x80, 0x80, 0x80));
|
||||
record_style.stroke = Stroke::None;
|
||||
|
||||
writeln!(svg, "{}", EndSvg)?;
|
||||
|
||||
let mut layer_events_str = String::new();
|
||||
|
||||
@@ -13,7 +13,7 @@ use rand::prelude::*;
|
||||
use tokio::task::JoinSet;
|
||||
use tracing::info;
|
||||
|
||||
use std::collections::HashSet;
|
||||
use std::collections::{HashSet, VecDeque};
|
||||
use std::future::Future;
|
||||
use std::num::NonZeroUsize;
|
||||
use std::pin::Pin;
|
||||
@@ -63,6 +63,10 @@ pub(crate) struct Args {
|
||||
#[clap(long)]
|
||||
set_io_mode: Option<pageserver_api::models::virtual_file::IoMode>,
|
||||
|
||||
/// Queue depth generated in each client.
|
||||
#[clap(long, default_value = "1")]
|
||||
queue_depth: NonZeroUsize,
|
||||
|
||||
targets: Option<Vec<TenantTimelineId>>,
|
||||
}
|
||||
|
||||
@@ -298,6 +302,7 @@ async fn main_impl(
|
||||
start_work_barrier.wait().await;
|
||||
let client_start = Instant::now();
|
||||
let mut ticks_processed = 0;
|
||||
let mut inflight = VecDeque::new();
|
||||
while !cancel.is_cancelled() {
|
||||
// Detect if a request took longer than the RPS rate
|
||||
if let Some(period) = &rps_period {
|
||||
@@ -311,31 +316,37 @@ async fn main_impl(
|
||||
ticks_processed = periods_passed_until_now;
|
||||
}
|
||||
|
||||
let start = Instant::now();
|
||||
let req = {
|
||||
let mut rng = rand::thread_rng();
|
||||
let r = &ranges[weights.sample(&mut rng)];
|
||||
let key: i128 = rng.gen_range(r.start..r.end);
|
||||
let key = Key::from_i128(key);
|
||||
assert!(key.is_rel_block_key());
|
||||
let (rel_tag, block_no) = key
|
||||
.to_rel_block()
|
||||
.expect("we filter non-rel-block keys out above");
|
||||
PagestreamGetPageRequest {
|
||||
hdr: PagestreamRequest {
|
||||
reqid: 0,
|
||||
request_lsn: if rng.gen_bool(args.req_latest_probability) {
|
||||
Lsn::MAX
|
||||
} else {
|
||||
r.timeline_lsn
|
||||
while inflight.len() < args.queue_depth.get() {
|
||||
let start = Instant::now();
|
||||
let req = {
|
||||
let mut rng = rand::thread_rng();
|
||||
let r = &ranges[weights.sample(&mut rng)];
|
||||
let key: i128 = rng.gen_range(r.start..r.end);
|
||||
let key = Key::from_i128(key);
|
||||
assert!(key.is_rel_block_key());
|
||||
let (rel_tag, block_no) = key
|
||||
.to_rel_block()
|
||||
.expect("we filter non-rel-block keys out above");
|
||||
PagestreamGetPageRequest {
|
||||
hdr: PagestreamRequest {
|
||||
reqid: 0,
|
||||
request_lsn: if rng.gen_bool(args.req_latest_probability) {
|
||||
Lsn::MAX
|
||||
} else {
|
||||
r.timeline_lsn
|
||||
},
|
||||
not_modified_since: r.timeline_lsn,
|
||||
},
|
||||
not_modified_since: r.timeline_lsn,
|
||||
},
|
||||
rel: rel_tag,
|
||||
blkno: block_no,
|
||||
}
|
||||
};
|
||||
client.getpage(req).await.unwrap();
|
||||
rel: rel_tag,
|
||||
blkno: block_no,
|
||||
}
|
||||
};
|
||||
client.getpage_send(req).await.unwrap();
|
||||
inflight.push_back(start);
|
||||
}
|
||||
|
||||
let start = inflight.pop_front().unwrap();
|
||||
client.getpage_recv().await.unwrap();
|
||||
let end = Instant::now();
|
||||
live_stats.request_done();
|
||||
ticks_processed += 1;
|
||||
|
||||
@@ -25,6 +25,7 @@ use tokio_tar::{Builder, EntryType, Header};
|
||||
|
||||
use crate::context::RequestContext;
|
||||
use crate::pgdatadir_mapping::Version;
|
||||
use crate::tenant::storage_layer::IoConcurrency;
|
||||
use crate::tenant::Timeline;
|
||||
use pageserver_api::reltag::{RelTag, SlruKind};
|
||||
|
||||
@@ -123,6 +124,13 @@ where
|
||||
full_backup,
|
||||
replica,
|
||||
ctx,
|
||||
io_concurrency: IoConcurrency::spawn_from_conf(
|
||||
timeline.conf,
|
||||
timeline
|
||||
.gate
|
||||
.enter()
|
||||
.map_err(|e| BasebackupError::Server(e.into()))?,
|
||||
),
|
||||
};
|
||||
basebackup
|
||||
.send_tarball()
|
||||
@@ -144,6 +152,7 @@ where
|
||||
full_backup: bool,
|
||||
replica: bool,
|
||||
ctx: &'a RequestContext,
|
||||
io_concurrency: IoConcurrency,
|
||||
}
|
||||
|
||||
/// A sink that accepts SLRU blocks ordered by key and forwards
|
||||
@@ -303,7 +312,7 @@ where
|
||||
for part in slru_partitions.parts {
|
||||
let blocks = self
|
||||
.timeline
|
||||
.get_vectored(part, self.lsn, self.ctx)
|
||||
.get_vectored(part, self.lsn, self.io_concurrency.clone(), self.ctx)
|
||||
.await
|
||||
.map_err(|e| BasebackupError::Server(e.into()))?;
|
||||
|
||||
@@ -358,7 +367,7 @@ where
|
||||
let start_time = Instant::now();
|
||||
let aux_files = self
|
||||
.timeline
|
||||
.list_aux_files(self.lsn, self.ctx)
|
||||
.list_aux_files(self.lsn, self.ctx, self.io_concurrency.clone())
|
||||
.await
|
||||
.map_err(|e| BasebackupError::Server(e.into()))?;
|
||||
let aux_scan_time = start_time.elapsed();
|
||||
@@ -422,7 +431,7 @@ where
|
||||
}
|
||||
let repl_origins = self
|
||||
.timeline
|
||||
.get_replorigins(self.lsn, self.ctx)
|
||||
.get_replorigins(self.lsn, self.ctx, self.io_concurrency.clone())
|
||||
.await
|
||||
.map_err(|e| BasebackupError::Server(e.into()))?;
|
||||
let n_origins = repl_origins.len();
|
||||
@@ -489,7 +498,13 @@ where
|
||||
for blknum in startblk..endblk {
|
||||
let img = self
|
||||
.timeline
|
||||
.get_rel_page_at_lsn(src, blknum, Version::Lsn(self.lsn), self.ctx)
|
||||
.get_rel_page_at_lsn(
|
||||
src,
|
||||
blknum,
|
||||
Version::Lsn(self.lsn),
|
||||
self.ctx,
|
||||
self.io_concurrency.clone(),
|
||||
)
|
||||
.await
|
||||
.map_err(|e| BasebackupError::Server(e.into()))?;
|
||||
segment_data.extend_from_slice(&img[..]);
|
||||
|
||||
@@ -135,6 +135,7 @@ fn main() -> anyhow::Result<()> {
|
||||
info!(?conf.virtual_file_io_mode, "starting with virtual_file IO mode");
|
||||
info!(?conf.wal_receiver_protocol, "starting with WAL receiver protocol");
|
||||
info!(?conf.page_service_pipelining, "starting with page service pipelining config");
|
||||
info!(?conf.get_vectored_concurrent_io, "starting with get_vectored IO concurrency config");
|
||||
|
||||
// The tenants directory contains all the pageserver local disk state.
|
||||
// Create if not exists and make sure all the contents are durable before proceeding.
|
||||
|
||||
@@ -191,6 +191,8 @@ pub struct PageServerConf {
|
||||
pub wal_receiver_protocol: PostgresClientProtocol,
|
||||
|
||||
pub page_service_pipelining: pageserver_api::config::PageServicePipeliningConfig,
|
||||
|
||||
pub get_vectored_concurrent_io: pageserver_api::config::GetVectoredConcurrentIo,
|
||||
}
|
||||
|
||||
/// Token for authentication to safekeepers
|
||||
@@ -352,6 +354,7 @@ impl PageServerConf {
|
||||
no_sync,
|
||||
wal_receiver_protocol,
|
||||
page_service_pipelining,
|
||||
get_vectored_concurrent_io,
|
||||
} = config_toml;
|
||||
|
||||
let mut conf = PageServerConf {
|
||||
@@ -396,6 +399,7 @@ impl PageServerConf {
|
||||
import_pgdata_aws_endpoint_url,
|
||||
wal_receiver_protocol,
|
||||
page_service_pipelining,
|
||||
get_vectored_concurrent_io,
|
||||
|
||||
// ------------------------------------------------------------
|
||||
// fields that require additional validation or custom handling
|
||||
|
||||
@@ -984,6 +984,8 @@ components:
|
||||
type: string
|
||||
compaction_threshold:
|
||||
type: string
|
||||
compaction_upper_limit:
|
||||
type: string
|
||||
image_creation_threshold:
|
||||
type: integer
|
||||
walreceiver_connect_timeout:
|
||||
|
||||
@@ -10,6 +10,7 @@ use std::time::Duration;
|
||||
|
||||
use anyhow::{anyhow, Context, Result};
|
||||
use enumset::EnumSet;
|
||||
use futures::future::join_all;
|
||||
use futures::StreamExt;
|
||||
use futures::TryFutureExt;
|
||||
use humantime::format_rfc3339;
|
||||
@@ -40,6 +41,7 @@ use pageserver_api::models::TenantShardSplitRequest;
|
||||
use pageserver_api::models::TenantShardSplitResponse;
|
||||
use pageserver_api::models::TenantSorting;
|
||||
use pageserver_api::models::TenantState;
|
||||
use pageserver_api::models::TenantWaitLsnRequest;
|
||||
use pageserver_api::models::TimelineArchivalConfigRequest;
|
||||
use pageserver_api::models::TimelineCreateRequestMode;
|
||||
use pageserver_api::models::TimelineCreateRequestModeImportPgdata;
|
||||
@@ -84,6 +86,7 @@ use crate::tenant::remote_timeline_client::list_remote_tenant_shards;
|
||||
use crate::tenant::remote_timeline_client::list_remote_timelines;
|
||||
use crate::tenant::secondary::SecondaryController;
|
||||
use crate::tenant::size::ModelInputs;
|
||||
use crate::tenant::storage_layer::IoConcurrency;
|
||||
use crate::tenant::storage_layer::LayerAccessStatsReset;
|
||||
use crate::tenant::storage_layer::LayerName;
|
||||
use crate::tenant::timeline::import_pgdata;
|
||||
@@ -94,6 +97,8 @@ use crate::tenant::timeline::CompactOptions;
|
||||
use crate::tenant::timeline::CompactRequest;
|
||||
use crate::tenant::timeline::CompactionError;
|
||||
use crate::tenant::timeline::Timeline;
|
||||
use crate::tenant::timeline::WaitLsnTimeout;
|
||||
use crate::tenant::timeline::WaitLsnWaiter;
|
||||
use crate::tenant::GetTimelineError;
|
||||
use crate::tenant::OffloadedTimeline;
|
||||
use crate::tenant::{LogicalSizeCalculationCause, PageReconstructError};
|
||||
@@ -2789,6 +2794,63 @@ async fn secondary_download_handler(
|
||||
json_response(status, progress)
|
||||
}
|
||||
|
||||
async fn wait_lsn_handler(
|
||||
mut request: Request<Body>,
|
||||
cancel: CancellationToken,
|
||||
) -> Result<Response<Body>, ApiError> {
|
||||
let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?;
|
||||
let wait_lsn_request: TenantWaitLsnRequest = json_request(&mut request).await?;
|
||||
|
||||
let state = get_state(&request);
|
||||
let tenant = state
|
||||
.tenant_manager
|
||||
.get_attached_tenant_shard(tenant_shard_id)?;
|
||||
|
||||
let mut wait_futures = Vec::default();
|
||||
for timeline in tenant.list_timelines() {
|
||||
let Some(lsn) = wait_lsn_request.timelines.get(&timeline.timeline_id) else {
|
||||
continue;
|
||||
};
|
||||
|
||||
let fut = {
|
||||
let timeline = timeline.clone();
|
||||
let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Error);
|
||||
async move {
|
||||
timeline
|
||||
.wait_lsn(
|
||||
*lsn,
|
||||
WaitLsnWaiter::HttpEndpoint,
|
||||
WaitLsnTimeout::Custom(wait_lsn_request.timeout),
|
||||
&ctx,
|
||||
)
|
||||
.await
|
||||
}
|
||||
};
|
||||
wait_futures.push(fut);
|
||||
}
|
||||
|
||||
if wait_futures.is_empty() {
|
||||
return json_response(StatusCode::NOT_FOUND, ());
|
||||
}
|
||||
|
||||
let all_done = tokio::select! {
|
||||
results = join_all(wait_futures) => {
|
||||
results.iter().all(|res| res.is_ok())
|
||||
},
|
||||
_ = cancel.cancelled() => {
|
||||
return Err(ApiError::Cancelled);
|
||||
}
|
||||
};
|
||||
|
||||
let status = if all_done {
|
||||
StatusCode::OK
|
||||
} else {
|
||||
StatusCode::ACCEPTED
|
||||
};
|
||||
|
||||
json_response(status, ())
|
||||
}
|
||||
|
||||
async fn secondary_status_handler(
|
||||
request: Request<Body>,
|
||||
_cancel: CancellationToken,
|
||||
@@ -2938,8 +3000,15 @@ async fn list_aux_files(
|
||||
active_timeline_of_active_tenant(&state.tenant_manager, tenant_shard_id, timeline_id)
|
||||
.await?;
|
||||
|
||||
let io_concurrency = IoConcurrency::spawn_from_conf(
|
||||
state.conf,
|
||||
timeline.gate.enter().map_err(|_| ApiError::Cancelled)?,
|
||||
);
|
||||
|
||||
let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download);
|
||||
let files = timeline.list_aux_files(body.lsn, &ctx).await?;
|
||||
let files = timeline
|
||||
.list_aux_files(body.lsn, &ctx, io_concurrency)
|
||||
.await?;
|
||||
json_response(StatusCode::OK, files)
|
||||
}
|
||||
|
||||
@@ -3569,6 +3638,9 @@ pub fn make_router(
|
||||
.post("/v1/tenant/:tenant_shard_id/secondary/download", |r| {
|
||||
api_handler(r, secondary_download_handler)
|
||||
})
|
||||
.post("/v1/tenant/:tenant_shard_id/wait_lsn", |r| {
|
||||
api_handler(r, wait_lsn_handler)
|
||||
})
|
||||
.put("/v1/tenant/:tenant_shard_id/break", |r| {
|
||||
testing_api_handler("set tenant state to broken", r, handle_tenant_break)
|
||||
})
|
||||
|
||||
@@ -38,6 +38,9 @@ pub(crate) enum StorageTimeOperation {
|
||||
#[strum(serialize = "layer flush")]
|
||||
LayerFlush,
|
||||
|
||||
#[strum(serialize = "layer flush delay")]
|
||||
LayerFlushDelay,
|
||||
|
||||
#[strum(serialize = "compact")]
|
||||
Compact,
|
||||
|
||||
@@ -126,73 +129,6 @@ pub(crate) static INITDB_RUN_TIME: Lazy<Histogram> = Lazy::new(|| {
|
||||
.expect("failed to define metric")
|
||||
});
|
||||
|
||||
// Metrics collected on operations on the storage repository.
|
||||
#[derive(
|
||||
Clone, Copy, enum_map::Enum, strum_macros::EnumString, strum_macros::Display, IntoStaticStr,
|
||||
)]
|
||||
pub(crate) enum GetKind {
|
||||
Singular,
|
||||
Vectored,
|
||||
}
|
||||
|
||||
pub(crate) struct ReconstructTimeMetrics {
|
||||
singular: Histogram,
|
||||
vectored: Histogram,
|
||||
}
|
||||
|
||||
pub(crate) static RECONSTRUCT_TIME: Lazy<ReconstructTimeMetrics> = Lazy::new(|| {
|
||||
let inner = register_histogram_vec!(
|
||||
"pageserver_getpage_reconstruct_seconds",
|
||||
"Time spent in reconstruct_value (reconstruct a page from deltas)",
|
||||
&["get_kind"],
|
||||
CRITICAL_OP_BUCKETS.into(),
|
||||
)
|
||||
.expect("failed to define a metric");
|
||||
|
||||
ReconstructTimeMetrics {
|
||||
singular: inner.with_label_values(&[GetKind::Singular.into()]),
|
||||
vectored: inner.with_label_values(&[GetKind::Vectored.into()]),
|
||||
}
|
||||
});
|
||||
|
||||
impl ReconstructTimeMetrics {
|
||||
pub(crate) fn for_get_kind(&self, get_kind: GetKind) -> &Histogram {
|
||||
match get_kind {
|
||||
GetKind::Singular => &self.singular,
|
||||
GetKind::Vectored => &self.vectored,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
pub(crate) struct ReconstructDataTimeMetrics {
|
||||
singular: Histogram,
|
||||
vectored: Histogram,
|
||||
}
|
||||
|
||||
impl ReconstructDataTimeMetrics {
|
||||
pub(crate) fn for_get_kind(&self, get_kind: GetKind) -> &Histogram {
|
||||
match get_kind {
|
||||
GetKind::Singular => &self.singular,
|
||||
GetKind::Vectored => &self.vectored,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
pub(crate) static GET_RECONSTRUCT_DATA_TIME: Lazy<ReconstructDataTimeMetrics> = Lazy::new(|| {
|
||||
let inner = register_histogram_vec!(
|
||||
"pageserver_getpage_get_reconstruct_data_seconds",
|
||||
"Time spent in get_reconstruct_value_data",
|
||||
&["get_kind"],
|
||||
CRITICAL_OP_BUCKETS.into(),
|
||||
)
|
||||
.expect("failed to define a metric");
|
||||
|
||||
ReconstructDataTimeMetrics {
|
||||
singular: inner.with_label_values(&[GetKind::Singular.into()]),
|
||||
vectored: inner.with_label_values(&[GetKind::Vectored.into()]),
|
||||
}
|
||||
});
|
||||
|
||||
pub(crate) struct GetVectoredLatency {
|
||||
map: EnumMap<TaskKind, Option<Histogram>>,
|
||||
}
|
||||
@@ -2550,12 +2486,19 @@ impl StorageTimeMetricsTimer {
|
||||
}
|
||||
}
|
||||
|
||||
/// Record the time from creation to now.
|
||||
pub fn stop_and_record(self) {
|
||||
let duration = self.start.elapsed().as_secs_f64();
|
||||
self.metrics.timeline_sum.inc_by(duration);
|
||||
/// Returns the elapsed duration of the timer.
|
||||
pub fn elapsed(&self) -> Duration {
|
||||
self.start.elapsed()
|
||||
}
|
||||
|
||||
/// Record the time from creation to now and return it.
|
||||
pub fn stop_and_record(self) -> Duration {
|
||||
let duration = self.elapsed();
|
||||
let seconds = duration.as_secs_f64();
|
||||
self.metrics.timeline_sum.inc_by(seconds);
|
||||
self.metrics.timeline_count.inc();
|
||||
self.metrics.global_histogram.observe(duration);
|
||||
self.metrics.global_histogram.observe(seconds);
|
||||
duration
|
||||
}
|
||||
|
||||
/// Turns this timer into a timer, which will always record -- usually this means recording
|
||||
@@ -2575,6 +2518,13 @@ impl Drop for AlwaysRecordingStorageTimeMetricsTimer {
|
||||
}
|
||||
}
|
||||
|
||||
impl AlwaysRecordingStorageTimeMetricsTimer {
|
||||
/// Returns the elapsed duration of the timer.
|
||||
pub fn elapsed(&self) -> Duration {
|
||||
self.0.as_ref().expect("not dropped yet").elapsed()
|
||||
}
|
||||
}
|
||||
|
||||
/// Timing facilities for an globally histogrammed metric, which is supported by per tenant and
|
||||
/// timeline total sum and count.
|
||||
#[derive(Clone, Debug)]
|
||||
@@ -2627,6 +2577,7 @@ pub(crate) struct TimelineMetrics {
|
||||
shard_id: String,
|
||||
timeline_id: String,
|
||||
pub flush_time_histo: StorageTimeMetrics,
|
||||
pub flush_delay_histo: StorageTimeMetrics,
|
||||
pub flush_wait_upload_time_gauge: Gauge,
|
||||
pub compact_time_histo: StorageTimeMetrics,
|
||||
pub create_images_time_histo: StorageTimeMetrics,
|
||||
@@ -2673,6 +2624,12 @@ impl TimelineMetrics {
|
||||
&shard_id,
|
||||
&timeline_id,
|
||||
);
|
||||
let flush_delay_histo = StorageTimeMetrics::new(
|
||||
StorageTimeOperation::LayerFlushDelay,
|
||||
&tenant_id,
|
||||
&shard_id,
|
||||
&timeline_id,
|
||||
);
|
||||
let flush_wait_upload_time_gauge = FLUSH_WAIT_UPLOAD_TIME
|
||||
.get_metric_with_label_values(&[&tenant_id, &shard_id, &timeline_id])
|
||||
.unwrap();
|
||||
@@ -2821,6 +2778,7 @@ impl TimelineMetrics {
|
||||
shard_id,
|
||||
timeline_id,
|
||||
flush_time_histo,
|
||||
flush_delay_histo,
|
||||
flush_wait_upload_time_gauge,
|
||||
compact_time_histo,
|
||||
create_images_time_histo,
|
||||
@@ -3919,7 +3877,6 @@ pub fn preinitialize_metrics(conf: &'static PageServerConf) {
|
||||
});
|
||||
|
||||
// Custom
|
||||
Lazy::force(&RECONSTRUCT_TIME);
|
||||
Lazy::force(&BASEBACKUP_QUERY_TIME);
|
||||
Lazy::force(&COMPUTE_COMMANDS_COUNTERS);
|
||||
Lazy::force(&tokio_epoll_uring::THREAD_LOCAL_METRICS_STORAGE);
|
||||
|
||||
@@ -39,6 +39,7 @@ use tokio::io::{AsyncWriteExt, BufWriter};
|
||||
use tokio::task::JoinHandle;
|
||||
use tokio_util::sync::CancellationToken;
|
||||
use tracing::*;
|
||||
use utils::sync::gate::{Gate, GateGuard};
|
||||
use utils::sync::spsc_fold;
|
||||
use utils::{
|
||||
auth::{Claims, Scope, SwappableJwtAuth},
|
||||
@@ -61,6 +62,7 @@ use crate::task_mgr::{self, COMPUTE_REQUEST_RUNTIME};
|
||||
use crate::tenant::mgr::ShardSelector;
|
||||
use crate::tenant::mgr::TenantManager;
|
||||
use crate::tenant::mgr::{GetActiveTenantError, GetTenantError, ShardResolveResult};
|
||||
use crate::tenant::storage_layer::IoConcurrency;
|
||||
use crate::tenant::timeline::{self, WaitLsnError};
|
||||
use crate::tenant::GetTimelineError;
|
||||
use crate::tenant::PageReconstructError;
|
||||
@@ -90,6 +92,7 @@ pub struct Listener {
|
||||
pub struct Connections {
|
||||
cancel: CancellationToken,
|
||||
tasks: tokio::task::JoinSet<ConnectionHandlerResult>,
|
||||
gate: Gate,
|
||||
}
|
||||
|
||||
pub fn spawn(
|
||||
@@ -110,6 +113,7 @@ pub fn spawn(
|
||||
let task = COMPUTE_REQUEST_RUNTIME.spawn(task_mgr::exit_on_panic_or_error(
|
||||
"libpq listener",
|
||||
libpq_listener_main(
|
||||
conf,
|
||||
tenant_manager,
|
||||
pg_auth,
|
||||
tcp_listener,
|
||||
@@ -134,11 +138,16 @@ impl Listener {
|
||||
}
|
||||
impl Connections {
|
||||
pub(crate) async fn shutdown(self) {
|
||||
let Self { cancel, mut tasks } = self;
|
||||
let Self {
|
||||
cancel,
|
||||
mut tasks,
|
||||
gate,
|
||||
} = self;
|
||||
cancel.cancel();
|
||||
while let Some(res) = tasks.join_next().await {
|
||||
Self::handle_connection_completion(res);
|
||||
}
|
||||
gate.close().await;
|
||||
}
|
||||
|
||||
fn handle_connection_completion(res: Result<anyhow::Result<()>, tokio::task::JoinError>) {
|
||||
@@ -158,7 +167,9 @@ impl Connections {
|
||||
/// Returns Ok(()) upon cancellation via `cancel`, returning the set of
|
||||
/// open connections.
|
||||
///
|
||||
#[allow(clippy::too_many_arguments)]
|
||||
pub async fn libpq_listener_main(
|
||||
conf: &'static PageServerConf,
|
||||
tenant_manager: Arc<TenantManager>,
|
||||
auth: Option<Arc<SwappableJwtAuth>>,
|
||||
listener: tokio::net::TcpListener,
|
||||
@@ -168,9 +179,15 @@ pub async fn libpq_listener_main(
|
||||
listener_cancel: CancellationToken,
|
||||
) -> Connections {
|
||||
let connections_cancel = CancellationToken::new();
|
||||
let connections_gate = Gate::default();
|
||||
let mut connection_handler_tasks = tokio::task::JoinSet::default();
|
||||
|
||||
loop {
|
||||
let gate_guard = match connections_gate.enter() {
|
||||
Ok(guard) => guard,
|
||||
Err(_) => break,
|
||||
};
|
||||
|
||||
let accepted = tokio::select! {
|
||||
biased;
|
||||
_ = listener_cancel.cancelled() => break,
|
||||
@@ -190,6 +207,7 @@ pub async fn libpq_listener_main(
|
||||
let connection_ctx = listener_ctx
|
||||
.detached_child(TaskKind::PageRequestHandler, DownloadBehavior::Download);
|
||||
connection_handler_tasks.spawn(page_service_conn_main(
|
||||
conf,
|
||||
tenant_manager.clone(),
|
||||
local_auth,
|
||||
socket,
|
||||
@@ -197,6 +215,7 @@ pub async fn libpq_listener_main(
|
||||
pipelining_config.clone(),
|
||||
connection_ctx,
|
||||
connections_cancel.child_token(),
|
||||
gate_guard,
|
||||
));
|
||||
}
|
||||
Err(err) => {
|
||||
@@ -211,13 +230,16 @@ pub async fn libpq_listener_main(
|
||||
Connections {
|
||||
cancel: connections_cancel,
|
||||
tasks: connection_handler_tasks,
|
||||
gate: connections_gate,
|
||||
}
|
||||
}
|
||||
|
||||
type ConnectionHandlerResult = anyhow::Result<()>;
|
||||
|
||||
#[instrument(skip_all, fields(peer_addr))]
|
||||
#[allow(clippy::too_many_arguments)]
|
||||
async fn page_service_conn_main(
|
||||
conf: &'static PageServerConf,
|
||||
tenant_manager: Arc<TenantManager>,
|
||||
auth: Option<Arc<SwappableJwtAuth>>,
|
||||
socket: tokio::net::TcpStream,
|
||||
@@ -225,6 +247,7 @@ async fn page_service_conn_main(
|
||||
pipelining_config: PageServicePipeliningConfig,
|
||||
connection_ctx: RequestContext,
|
||||
cancel: CancellationToken,
|
||||
gate_guard: GateGuard,
|
||||
) -> ConnectionHandlerResult {
|
||||
let _guard = LIVE_CONNECTIONS
|
||||
.with_label_values(&["page_service"])
|
||||
@@ -274,11 +297,13 @@ async fn page_service_conn_main(
|
||||
// But it's in a shared crate, so, we store connection_ctx inside PageServerHandler
|
||||
// and create the per-query context in process_query ourselves.
|
||||
let mut conn_handler = PageServerHandler::new(
|
||||
conf,
|
||||
tenant_manager,
|
||||
auth,
|
||||
pipelining_config,
|
||||
connection_ctx,
|
||||
cancel.clone(),
|
||||
gate_guard,
|
||||
);
|
||||
let pgbackend = PostgresBackend::new_from_io(socket, peer_addr, auth_type, None)?;
|
||||
|
||||
@@ -310,6 +335,7 @@ async fn page_service_conn_main(
|
||||
}
|
||||
|
||||
struct PageServerHandler {
|
||||
conf: &'static PageServerConf,
|
||||
auth: Option<Arc<SwappableJwtAuth>>,
|
||||
claims: Option<Claims>,
|
||||
|
||||
@@ -325,6 +351,8 @@ struct PageServerHandler {
|
||||
timeline_handles: Option<TimelineHandles>,
|
||||
|
||||
pipelining_config: PageServicePipeliningConfig,
|
||||
|
||||
gate_guard: GateGuard,
|
||||
}
|
||||
|
||||
struct TimelineHandles {
|
||||
@@ -634,19 +662,23 @@ impl BatchedFeMessage {
|
||||
|
||||
impl PageServerHandler {
|
||||
pub fn new(
|
||||
conf: &'static PageServerConf,
|
||||
tenant_manager: Arc<TenantManager>,
|
||||
auth: Option<Arc<SwappableJwtAuth>>,
|
||||
pipelining_config: PageServicePipeliningConfig,
|
||||
connection_ctx: RequestContext,
|
||||
cancel: CancellationToken,
|
||||
gate_guard: GateGuard,
|
||||
) -> Self {
|
||||
PageServerHandler {
|
||||
conf,
|
||||
auth,
|
||||
claims: None,
|
||||
connection_ctx,
|
||||
timeline_handles: Some(TimelineHandles::new(tenant_manager)),
|
||||
cancel,
|
||||
pipelining_config,
|
||||
gate_guard,
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1015,6 +1047,7 @@ impl PageServerHandler {
|
||||
&mut self,
|
||||
pgb_writer: &mut PostgresBackend<IO>,
|
||||
batch: BatchedFeMessage,
|
||||
io_concurrency: IoConcurrency,
|
||||
cancel: &CancellationToken,
|
||||
protocol_version: PagestreamProtocolVersion,
|
||||
ctx: &RequestContext,
|
||||
@@ -1084,6 +1117,7 @@ impl PageServerHandler {
|
||||
&*shard.upgrade()?,
|
||||
effective_request_lsn,
|
||||
pages,
|
||||
io_concurrency,
|
||||
ctx,
|
||||
)
|
||||
.instrument(span.clone())
|
||||
@@ -1288,6 +1322,17 @@ impl PageServerHandler {
|
||||
}
|
||||
}
|
||||
|
||||
let io_concurrency = IoConcurrency::spawn_from_conf(
|
||||
self.conf,
|
||||
match self.gate_guard.try_clone() {
|
||||
Ok(guard) => guard,
|
||||
Err(_) => {
|
||||
info!("shutdown request received in page handler");
|
||||
return Err(QueryError::Shutdown);
|
||||
}
|
||||
},
|
||||
);
|
||||
|
||||
let pgb_reader = pgb
|
||||
.split()
|
||||
.context("implementation error: split pgb into reader and writer")?;
|
||||
@@ -1309,6 +1354,7 @@ impl PageServerHandler {
|
||||
request_span,
|
||||
pipelining_config,
|
||||
protocol_version,
|
||||
io_concurrency,
|
||||
&ctx,
|
||||
)
|
||||
.await
|
||||
@@ -1322,6 +1368,7 @@ impl PageServerHandler {
|
||||
timeline_handles,
|
||||
request_span,
|
||||
protocol_version,
|
||||
io_concurrency,
|
||||
&ctx,
|
||||
)
|
||||
.await
|
||||
@@ -1349,6 +1396,7 @@ impl PageServerHandler {
|
||||
mut timeline_handles: TimelineHandles,
|
||||
request_span: Span,
|
||||
protocol_version: PagestreamProtocolVersion,
|
||||
io_concurrency: IoConcurrency,
|
||||
ctx: &RequestContext,
|
||||
) -> (
|
||||
(PostgresBackendReader<IO>, TimelineHandles),
|
||||
@@ -1383,7 +1431,14 @@ impl PageServerHandler {
|
||||
};
|
||||
|
||||
let err = self
|
||||
.pagesteam_handle_batched_message(pgb_writer, msg, &cancel, protocol_version, ctx)
|
||||
.pagesteam_handle_batched_message(
|
||||
pgb_writer,
|
||||
msg,
|
||||
io_concurrency.clone(),
|
||||
&cancel,
|
||||
protocol_version,
|
||||
ctx,
|
||||
)
|
||||
.await;
|
||||
match err {
|
||||
Ok(()) => {}
|
||||
@@ -1407,6 +1462,7 @@ impl PageServerHandler {
|
||||
request_span: Span,
|
||||
pipelining_config: PageServicePipeliningConfigPipelined,
|
||||
protocol_version: PagestreamProtocolVersion,
|
||||
io_concurrency: IoConcurrency,
|
||||
ctx: &RequestContext,
|
||||
) -> (
|
||||
(PostgresBackendReader<IO>, TimelineHandles),
|
||||
@@ -1550,6 +1606,7 @@ impl PageServerHandler {
|
||||
self.pagesteam_handle_batched_message(
|
||||
pgb_writer,
|
||||
batch,
|
||||
io_concurrency.clone(),
|
||||
&cancel,
|
||||
protocol_version,
|
||||
&ctx,
|
||||
@@ -1651,6 +1708,7 @@ impl PageServerHandler {
|
||||
.wait_lsn(
|
||||
not_modified_since,
|
||||
crate::tenant::timeline::WaitLsnWaiter::PageService,
|
||||
timeline::WaitLsnTimeout::Default,
|
||||
ctx,
|
||||
)
|
||||
.await?;
|
||||
@@ -1806,6 +1864,7 @@ impl PageServerHandler {
|
||||
timeline: &Timeline,
|
||||
effective_lsn: Lsn,
|
||||
requests: smallvec::SmallVec<[BatchedGetPageRequest; 1]>,
|
||||
io_concurrency: IoConcurrency,
|
||||
ctx: &RequestContext,
|
||||
) -> Vec<Result<(PagestreamBeMessage, SmgrOpTimer), BatchedPageStreamError>> {
|
||||
debug_assert_current_span_has_tenant_and_timeline_id();
|
||||
@@ -1832,6 +1891,7 @@ impl PageServerHandler {
|
||||
.get_rel_page_at_lsn_batched(
|
||||
requests.iter().map(|p| (&p.req.rel, &p.req.blkno)),
|
||||
effective_lsn,
|
||||
io_concurrency,
|
||||
ctx,
|
||||
)
|
||||
.await;
|
||||
@@ -1985,6 +2045,7 @@ impl PageServerHandler {
|
||||
.wait_lsn(
|
||||
lsn,
|
||||
crate::tenant::timeline::WaitLsnWaiter::PageService,
|
||||
crate::tenant::timeline::WaitLsnTimeout::Default,
|
||||
ctx,
|
||||
)
|
||||
.await?;
|
||||
|
||||
@@ -17,6 +17,7 @@ use crate::span::{
|
||||
debug_assert_current_span_has_tenant_and_timeline_id,
|
||||
debug_assert_current_span_has_tenant_and_timeline_id_no_shard_id,
|
||||
};
|
||||
use crate::tenant::storage_layer::IoConcurrency;
|
||||
use crate::tenant::timeline::GetVectoredError;
|
||||
use anyhow::{ensure, Context};
|
||||
use bytes::{Buf, Bytes, BytesMut};
|
||||
@@ -200,6 +201,7 @@ impl Timeline {
|
||||
blknum: BlockNumber,
|
||||
version: Version<'_>,
|
||||
ctx: &RequestContext,
|
||||
io_concurrency: IoConcurrency,
|
||||
) -> Result<Bytes, PageReconstructError> {
|
||||
match version {
|
||||
Version::Lsn(effective_lsn) => {
|
||||
@@ -208,6 +210,7 @@ impl Timeline {
|
||||
.get_rel_page_at_lsn_batched(
|
||||
pages.iter().map(|(tag, blknum)| (tag, blknum)),
|
||||
effective_lsn,
|
||||
io_concurrency.clone(),
|
||||
ctx,
|
||||
)
|
||||
.await;
|
||||
@@ -246,6 +249,7 @@ impl Timeline {
|
||||
&self,
|
||||
pages: impl ExactSizeIterator<Item = (&RelTag, &BlockNumber)>,
|
||||
effective_lsn: Lsn,
|
||||
io_concurrency: IoConcurrency,
|
||||
ctx: &RequestContext,
|
||||
) -> Vec<Result<Bytes, PageReconstructError>> {
|
||||
debug_assert_current_span_has_tenant_and_timeline_id();
|
||||
@@ -309,7 +313,10 @@ impl Timeline {
|
||||
acc.to_keyspace()
|
||||
};
|
||||
|
||||
match self.get_vectored(keyspace, effective_lsn, ctx).await {
|
||||
match self
|
||||
.get_vectored(keyspace, effective_lsn, io_concurrency, ctx)
|
||||
.await
|
||||
{
|
||||
Ok(results) => {
|
||||
for (key, res) in results {
|
||||
let mut key_slots = keys_slots.remove(&key).unwrap().into_iter();
|
||||
@@ -889,9 +896,15 @@ impl Timeline {
|
||||
&self,
|
||||
lsn: Lsn,
|
||||
ctx: &RequestContext,
|
||||
io_concurrency: IoConcurrency,
|
||||
) -> Result<HashMap<String, Bytes>, PageReconstructError> {
|
||||
let kv = self
|
||||
.scan(KeySpace::single(Key::metadata_aux_key_range()), lsn, ctx)
|
||||
.scan(
|
||||
KeySpace::single(Key::metadata_aux_key_range()),
|
||||
lsn,
|
||||
ctx,
|
||||
io_concurrency,
|
||||
)
|
||||
.await?;
|
||||
let mut result = HashMap::new();
|
||||
let mut sz = 0;
|
||||
@@ -914,8 +927,9 @@ impl Timeline {
|
||||
&self,
|
||||
lsn: Lsn,
|
||||
ctx: &RequestContext,
|
||||
io_concurrency: IoConcurrency,
|
||||
) -> Result<(), PageReconstructError> {
|
||||
self.list_aux_files_v2(lsn, ctx).await?;
|
||||
self.list_aux_files_v2(lsn, ctx, io_concurrency).await?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
@@ -923,17 +937,24 @@ impl Timeline {
|
||||
&self,
|
||||
lsn: Lsn,
|
||||
ctx: &RequestContext,
|
||||
io_concurrency: IoConcurrency,
|
||||
) -> Result<HashMap<String, Bytes>, PageReconstructError> {
|
||||
self.list_aux_files_v2(lsn, ctx).await
|
||||
self.list_aux_files_v2(lsn, ctx, io_concurrency).await
|
||||
}
|
||||
|
||||
pub(crate) async fn get_replorigins(
|
||||
&self,
|
||||
lsn: Lsn,
|
||||
ctx: &RequestContext,
|
||||
io_concurrency: IoConcurrency,
|
||||
) -> Result<HashMap<RepOriginId, Lsn>, PageReconstructError> {
|
||||
let kv = self
|
||||
.scan(KeySpace::single(repl_origin_key_range()), lsn, ctx)
|
||||
.scan(
|
||||
KeySpace::single(repl_origin_key_range()),
|
||||
lsn,
|
||||
ctx,
|
||||
io_concurrency,
|
||||
)
|
||||
.await?;
|
||||
let mut result = HashMap::new();
|
||||
for (k, v) in kv {
|
||||
@@ -2432,7 +2453,11 @@ mod tests {
|
||||
("foo/bar2".to_string(), Bytes::from_static(b"content2")),
|
||||
]);
|
||||
|
||||
let readback = tline.list_aux_files(Lsn(0x1008), &ctx).await?;
|
||||
let io_concurrency = IoConcurrency::spawn_for_test();
|
||||
|
||||
let readback = tline
|
||||
.list_aux_files(Lsn(0x1008), &ctx, io_concurrency.clone())
|
||||
.await?;
|
||||
assert_eq!(readback, expect_1008);
|
||||
|
||||
// Second modification: update one key, remove the other
|
||||
@@ -2444,11 +2469,15 @@ mod tests {
|
||||
let expect_2008 =
|
||||
HashMap::from([("foo/bar1".to_string(), Bytes::from_static(b"content3"))]);
|
||||
|
||||
let readback = tline.list_aux_files(Lsn(0x2008), &ctx).await?;
|
||||
let readback = tline
|
||||
.list_aux_files(Lsn(0x2008), &ctx, io_concurrency.clone())
|
||||
.await?;
|
||||
assert_eq!(readback, expect_2008);
|
||||
|
||||
// Reading back in time works
|
||||
let readback = tline.list_aux_files(Lsn(0x1008), &ctx).await?;
|
||||
let readback = tline
|
||||
.list_aux_files(Lsn(0x1008), &ctx, io_concurrency.clone())
|
||||
.await?;
|
||||
assert_eq!(readback, expect_1008);
|
||||
|
||||
Ok(())
|
||||
|
||||
@@ -37,6 +37,8 @@ use remote_timeline_client::manifest::{
|
||||
OffloadedTimelineManifest, TenantManifest, LATEST_TENANT_MANIFEST_VERSION,
|
||||
};
|
||||
use remote_timeline_client::UploadQueueNotReadyError;
|
||||
use remote_timeline_client::FAILED_REMOTE_OP_RETRIES;
|
||||
use remote_timeline_client::FAILED_UPLOAD_WARN_THRESHOLD;
|
||||
use std::collections::BTreeMap;
|
||||
use std::fmt;
|
||||
use std::future::Future;
|
||||
@@ -2558,7 +2560,12 @@ impl Tenant {
|
||||
// sizes etc. and that would get confused if the previous page versions
|
||||
// are not in the repository yet.
|
||||
ancestor_timeline
|
||||
.wait_lsn(*lsn, timeline::WaitLsnWaiter::Tenant, ctx)
|
||||
.wait_lsn(
|
||||
*lsn,
|
||||
timeline::WaitLsnWaiter::Tenant,
|
||||
timeline::WaitLsnTimeout::Default,
|
||||
ctx,
|
||||
)
|
||||
.await
|
||||
.map_err(|e| match e {
|
||||
e @ (WaitLsnError::Timeout(_) | WaitLsnError::BadState { .. }) => {
|
||||
@@ -3809,6 +3816,13 @@ impl Tenant {
|
||||
.unwrap_or(self.conf.default_tenant_conf.compaction_threshold)
|
||||
}
|
||||
|
||||
pub fn get_compaction_upper_limit(&self) -> usize {
|
||||
let tenant_conf = self.tenant_conf.load().tenant_conf.clone();
|
||||
tenant_conf
|
||||
.compaction_upper_limit
|
||||
.unwrap_or(self.conf.default_tenant_conf.compaction_upper_limit)
|
||||
}
|
||||
|
||||
pub fn get_gc_horizon(&self) -> u64 {
|
||||
let tenant_conf = self.tenant_conf.load().tenant_conf.clone();
|
||||
tenant_conf
|
||||
@@ -5308,27 +5322,37 @@ impl Tenant {
|
||||
return Ok(());
|
||||
}
|
||||
|
||||
upload_tenant_manifest(
|
||||
&self.remote_storage,
|
||||
&self.tenant_shard_id,
|
||||
self.generation,
|
||||
&manifest,
|
||||
// Remote storage does no retries internally, so wrap it
|
||||
match backoff::retry(
|
||||
|| async {
|
||||
upload_tenant_manifest(
|
||||
&self.remote_storage,
|
||||
&self.tenant_shard_id,
|
||||
self.generation,
|
||||
&manifest,
|
||||
&self.cancel,
|
||||
)
|
||||
.await
|
||||
},
|
||||
|_e| self.cancel.is_cancelled(),
|
||||
FAILED_UPLOAD_WARN_THRESHOLD,
|
||||
FAILED_REMOTE_OP_RETRIES,
|
||||
"uploading tenant manifest",
|
||||
&self.cancel,
|
||||
)
|
||||
.await
|
||||
.map_err(|e| {
|
||||
if self.cancel.is_cancelled() {
|
||||
TenantManifestError::Cancelled
|
||||
} else {
|
||||
TenantManifestError::RemoteStorage(e)
|
||||
{
|
||||
None => Err(TenantManifestError::Cancelled),
|
||||
Some(Err(_)) if self.cancel.is_cancelled() => Err(TenantManifestError::Cancelled),
|
||||
Some(Err(e)) => Err(TenantManifestError::RemoteStorage(e)),
|
||||
Some(Ok(_)) => {
|
||||
// Store the successfully uploaded manifest, so that future callers can avoid
|
||||
// re-uploading the same thing.
|
||||
*guard = Some(manifest);
|
||||
|
||||
Ok(())
|
||||
}
|
||||
})?;
|
||||
|
||||
// Store the successfully uploaded manifest, so that future callers can avoid
|
||||
// re-uploading the same thing.
|
||||
*guard = Some(manifest);
|
||||
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -5452,7 +5476,11 @@ pub(crate) mod harness {
|
||||
compaction_target_size: Some(tenant_conf.compaction_target_size),
|
||||
compaction_period: Some(tenant_conf.compaction_period),
|
||||
compaction_threshold: Some(tenant_conf.compaction_threshold),
|
||||
compaction_upper_limit: Some(tenant_conf.compaction_upper_limit),
|
||||
compaction_algorithm: Some(tenant_conf.compaction_algorithm),
|
||||
l0_flush_delay_threshold: tenant_conf.l0_flush_delay_threshold,
|
||||
l0_flush_stall_threshold: tenant_conf.l0_flush_stall_threshold,
|
||||
l0_flush_wait_upload: Some(tenant_conf.l0_flush_wait_upload),
|
||||
gc_horizon: Some(tenant_conf.gc_horizon),
|
||||
gc_period: Some(tenant_conf.gc_period),
|
||||
image_creation_threshold: Some(tenant_conf.image_creation_threshold),
|
||||
@@ -5476,6 +5504,11 @@ pub(crate) mod harness {
|
||||
timeline_offloading: Some(tenant_conf.timeline_offloading),
|
||||
wal_receiver_protocol_override: tenant_conf.wal_receiver_protocol_override,
|
||||
rel_size_v2_enabled: tenant_conf.rel_size_v2_enabled,
|
||||
gc_compaction_enabled: Some(tenant_conf.gc_compaction_enabled),
|
||||
gc_compaction_initial_threshold_kb: Some(
|
||||
tenant_conf.gc_compaction_initial_threshold_kb,
|
||||
),
|
||||
gc_compaction_ratio_percent: Some(tenant_conf.gc_compaction_ratio_percent),
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -5709,7 +5742,7 @@ mod tests {
|
||||
use pageserver_api::value::Value;
|
||||
use pageserver_compaction::helpers::overlaps_with;
|
||||
use rand::{thread_rng, Rng};
|
||||
use storage_layer::PersistentLayerKey;
|
||||
use storage_layer::{IoConcurrency, PersistentLayerKey};
|
||||
use tests::storage_layer::ValuesReconstructState;
|
||||
use tests::timeline::{GetVectoredError, ShutdownMode};
|
||||
use timeline::{CompactOptions, DeltaLayerTestDesc};
|
||||
@@ -6490,6 +6523,7 @@ mod tests {
|
||||
async fn test_get_vectored() -> anyhow::Result<()> {
|
||||
let harness = TenantHarness::create("test_get_vectored").await?;
|
||||
let (tenant, ctx) = harness.load().await;
|
||||
let io_concurrency = IoConcurrency::spawn_for_test();
|
||||
let tline = tenant
|
||||
.create_test_timeline(TIMELINE_ID, Lsn(0x08), DEFAULT_PG_VERSION, &ctx)
|
||||
.await?;
|
||||
@@ -6554,7 +6588,7 @@ mod tests {
|
||||
.get_vectored_impl(
|
||||
read.clone(),
|
||||
reads_lsn,
|
||||
&mut ValuesReconstructState::new(),
|
||||
&mut ValuesReconstructState::new(io_concurrency.clone()),
|
||||
&ctx,
|
||||
)
|
||||
.await;
|
||||
@@ -6601,6 +6635,7 @@ mod tests {
|
||||
let harness = TenantHarness::create("test_get_vectored_aux_files").await?;
|
||||
|
||||
let (tenant, ctx) = harness.load().await;
|
||||
let io_concurrency = IoConcurrency::spawn_for_test();
|
||||
let tline = tenant
|
||||
.create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION, &ctx)
|
||||
.await?;
|
||||
@@ -6635,7 +6670,7 @@ mod tests {
|
||||
.get_vectored_impl(
|
||||
aux_keyspace.clone(),
|
||||
read_lsn,
|
||||
&mut ValuesReconstructState::new(),
|
||||
&mut ValuesReconstructState::new(io_concurrency.clone()),
|
||||
&ctx,
|
||||
)
|
||||
.await;
|
||||
@@ -6683,6 +6718,7 @@ mod tests {
|
||||
)
|
||||
.await?;
|
||||
let (tenant, ctx) = harness.load().await;
|
||||
let io_concurrency = IoConcurrency::spawn_for_test();
|
||||
|
||||
let mut current_key = Key::from_hex("010000000033333333444444445500000000").unwrap();
|
||||
let gap_at_key = current_key.add(100);
|
||||
@@ -6783,7 +6819,7 @@ mod tests {
|
||||
.get_vectored_impl(
|
||||
read.clone(),
|
||||
current_lsn,
|
||||
&mut ValuesReconstructState::new(),
|
||||
&mut ValuesReconstructState::new(io_concurrency.clone()),
|
||||
&ctx,
|
||||
)
|
||||
.await?;
|
||||
@@ -6826,6 +6862,7 @@ mod tests {
|
||||
async fn test_get_vectored_ancestor_descent() -> anyhow::Result<()> {
|
||||
let harness = TenantHarness::create("test_get_vectored_on_lsn_axis").await?;
|
||||
let (tenant, ctx) = harness.load().await;
|
||||
let io_concurrency = IoConcurrency::spawn_for_test();
|
||||
|
||||
let start_key = Key::from_hex("010000000033333333444444445500000000").unwrap();
|
||||
let end_key = start_key.add(1000);
|
||||
@@ -6918,7 +6955,7 @@ mod tests {
|
||||
ranges: vec![child_gap_at_key..child_gap_at_key.next()],
|
||||
},
|
||||
query_lsn,
|
||||
&mut ValuesReconstructState::new(),
|
||||
&mut ValuesReconstructState::new(io_concurrency.clone()),
|
||||
&ctx,
|
||||
)
|
||||
.await;
|
||||
@@ -7364,6 +7401,7 @@ mod tests {
|
||||
async fn test_metadata_scan() -> anyhow::Result<()> {
|
||||
let harness = TenantHarness::create("test_metadata_scan").await?;
|
||||
let (tenant, ctx) = harness.load().await;
|
||||
let io_concurrency = IoConcurrency::spawn_for_test();
|
||||
let tline = tenant
|
||||
.create_test_timeline(TIMELINE_ID, Lsn(0x10), DEFAULT_PG_VERSION, &ctx)
|
||||
.await?;
|
||||
@@ -7417,7 +7455,7 @@ mod tests {
|
||||
.get_vectored_impl(
|
||||
keyspace.clone(),
|
||||
lsn,
|
||||
&mut ValuesReconstructState::default(),
|
||||
&mut ValuesReconstructState::new(io_concurrency.clone()),
|
||||
&ctx,
|
||||
)
|
||||
.await?
|
||||
@@ -7532,6 +7570,7 @@ mod tests {
|
||||
let harness = TenantHarness::create("test_aux_file_e2e").await.unwrap();
|
||||
|
||||
let (tenant, ctx) = harness.load().await;
|
||||
let io_concurrency = IoConcurrency::spawn_for_test();
|
||||
|
||||
let mut lsn = Lsn(0x08);
|
||||
|
||||
@@ -7551,7 +7590,10 @@ mod tests {
|
||||
}
|
||||
|
||||
// we can read everything from the storage
|
||||
let files = tline.list_aux_files(lsn, &ctx).await.unwrap();
|
||||
let files = tline
|
||||
.list_aux_files(lsn, &ctx, io_concurrency.clone())
|
||||
.await
|
||||
.unwrap();
|
||||
assert_eq!(
|
||||
files.get("pg_logical/mappings/test1"),
|
||||
Some(&bytes::Bytes::from_static(b"first"))
|
||||
@@ -7567,7 +7609,10 @@ mod tests {
|
||||
modification.commit(&ctx).await.unwrap();
|
||||
}
|
||||
|
||||
let files = tline.list_aux_files(lsn, &ctx).await.unwrap();
|
||||
let files = tline
|
||||
.list_aux_files(lsn, &ctx, io_concurrency.clone())
|
||||
.await
|
||||
.unwrap();
|
||||
assert_eq!(
|
||||
files.get("pg_logical/mappings/test2"),
|
||||
Some(&bytes::Bytes::from_static(b"second"))
|
||||
@@ -7578,7 +7623,10 @@ mod tests {
|
||||
.await
|
||||
.unwrap();
|
||||
|
||||
let files = child.list_aux_files(lsn, &ctx).await.unwrap();
|
||||
let files = child
|
||||
.list_aux_files(lsn, &ctx, io_concurrency.clone())
|
||||
.await
|
||||
.unwrap();
|
||||
assert_eq!(files.get("pg_logical/mappings/test1"), None);
|
||||
assert_eq!(files.get("pg_logical/mappings/test2"), None);
|
||||
}
|
||||
@@ -7587,6 +7635,7 @@ mod tests {
|
||||
async fn test_metadata_image_creation() -> anyhow::Result<()> {
|
||||
let harness = TenantHarness::create("test_metadata_image_creation").await?;
|
||||
let (tenant, ctx) = harness.load().await;
|
||||
let io_concurrency = IoConcurrency::spawn_for_test();
|
||||
let tline = tenant
|
||||
.create_test_timeline(TIMELINE_ID, Lsn(0x10), DEFAULT_PG_VERSION, &ctx)
|
||||
.await?;
|
||||
@@ -7606,8 +7655,9 @@ mod tests {
|
||||
keyspace: &KeySpace,
|
||||
lsn: Lsn,
|
||||
ctx: &RequestContext,
|
||||
io_concurrency: IoConcurrency,
|
||||
) -> anyhow::Result<(BTreeMap<Key, Result<Bytes, PageReconstructError>>, usize)> {
|
||||
let mut reconstruct_state = ValuesReconstructState::default();
|
||||
let mut reconstruct_state = ValuesReconstructState::new(io_concurrency);
|
||||
let res = tline
|
||||
.get_vectored_impl(keyspace.clone(), lsn, &mut reconstruct_state, ctx)
|
||||
.await?;
|
||||
@@ -7655,7 +7705,8 @@ mod tests {
|
||||
|
||||
if iter % 5 == 0 {
|
||||
let (_, before_delta_file_accessed) =
|
||||
scan_with_statistics(&tline, &keyspace, lsn, &ctx).await?;
|
||||
scan_with_statistics(&tline, &keyspace, lsn, &ctx, io_concurrency.clone())
|
||||
.await?;
|
||||
tline
|
||||
.compact(
|
||||
&cancel,
|
||||
@@ -7669,7 +7720,8 @@ mod tests {
|
||||
)
|
||||
.await?;
|
||||
let (_, after_delta_file_accessed) =
|
||||
scan_with_statistics(&tline, &keyspace, lsn, &ctx).await?;
|
||||
scan_with_statistics(&tline, &keyspace, lsn, &ctx, io_concurrency.clone())
|
||||
.await?;
|
||||
assert!(after_delta_file_accessed < before_delta_file_accessed, "after_delta_file_accessed={after_delta_file_accessed}, before_delta_file_accessed={before_delta_file_accessed}");
|
||||
// Given that we already produced an image layer, there should be no delta layer needed for the scan, but still setting a low threshold there for unforeseen circumstances.
|
||||
assert!(
|
||||
@@ -7758,6 +7810,7 @@ mod tests {
|
||||
async fn test_vectored_missing_metadata_key_reads() -> anyhow::Result<()> {
|
||||
let harness = TenantHarness::create("test_vectored_missing_metadata_key_reads").await?;
|
||||
let (tenant, ctx) = harness.load().await;
|
||||
let io_concurrency = IoConcurrency::spawn_for_test();
|
||||
|
||||
let base_key = Key::from_hex("620000000033333333444444445500000000").unwrap();
|
||||
let base_key_child = Key::from_hex("620000000033333333444444445500000001").unwrap();
|
||||
@@ -7896,7 +7949,7 @@ mod tests {
|
||||
);
|
||||
|
||||
// test vectored scan on parent timeline
|
||||
let mut reconstruct_state = ValuesReconstructState::new();
|
||||
let mut reconstruct_state = ValuesReconstructState::new(io_concurrency.clone());
|
||||
let res = tline
|
||||
.get_vectored_impl(
|
||||
KeySpace::single(Key::metadata_key_range()),
|
||||
@@ -7922,7 +7975,7 @@ mod tests {
|
||||
);
|
||||
|
||||
// test vectored scan on child timeline
|
||||
let mut reconstruct_state = ValuesReconstructState::new();
|
||||
let mut reconstruct_state = ValuesReconstructState::new(io_concurrency.clone());
|
||||
let res = child
|
||||
.get_vectored_impl(
|
||||
KeySpace::single(Key::metadata_key_range()),
|
||||
@@ -7960,7 +8013,9 @@ mod tests {
|
||||
lsn: Lsn,
|
||||
ctx: &RequestContext,
|
||||
) -> Result<Option<Bytes>, GetVectoredError> {
|
||||
let mut reconstruct_state = ValuesReconstructState::new();
|
||||
let io_concurrency =
|
||||
IoConcurrency::spawn_from_conf(tline.conf, tline.gate.enter().unwrap());
|
||||
let mut reconstruct_state = ValuesReconstructState::new(io_concurrency);
|
||||
let mut res = tline
|
||||
.get_vectored_impl(
|
||||
KeySpace::single(key..key.next()),
|
||||
@@ -8061,6 +8116,7 @@ mod tests {
|
||||
.await
|
||||
.unwrap();
|
||||
let (tenant, ctx) = harness.load().await;
|
||||
let io_concurrency = IoConcurrency::spawn_for_test();
|
||||
|
||||
let key0 = Key::from_hex("620000000033333333444444445500000000").unwrap();
|
||||
let key1 = Key::from_hex("620000000033333333444444445500000001").unwrap();
|
||||
@@ -8120,7 +8176,7 @@ mod tests {
|
||||
|
||||
// Image layers are created at last_record_lsn
|
||||
let images = tline
|
||||
.inspect_image_layers(Lsn(0x40), &ctx)
|
||||
.inspect_image_layers(Lsn(0x40), &ctx, io_concurrency.clone())
|
||||
.await
|
||||
.unwrap()
|
||||
.into_iter()
|
||||
@@ -8135,6 +8191,7 @@ mod tests {
|
||||
.await
|
||||
.unwrap();
|
||||
let (tenant, ctx) = harness.load().await;
|
||||
let io_concurrency = IoConcurrency::spawn_for_test();
|
||||
|
||||
let key1 = Key::from_hex("620000000033333333444444445500000001").unwrap();
|
||||
let key2 = Key::from_hex("620000000033333333444444445500000002").unwrap();
|
||||
@@ -8185,7 +8242,7 @@ mod tests {
|
||||
|
||||
// Image layers are created at last_record_lsn
|
||||
let images = tline
|
||||
.inspect_image_layers(Lsn(0x30), &ctx)
|
||||
.inspect_image_layers(Lsn(0x30), &ctx, io_concurrency.clone())
|
||||
.await
|
||||
.unwrap()
|
||||
.into_iter()
|
||||
@@ -8198,6 +8255,7 @@ mod tests {
|
||||
async fn test_simple_bottom_most_compaction_images() -> anyhow::Result<()> {
|
||||
let harness = TenantHarness::create("test_simple_bottom_most_compaction_images").await?;
|
||||
let (tenant, ctx) = harness.load().await;
|
||||
let io_concurrency = IoConcurrency::spawn_for_test();
|
||||
|
||||
fn get_key(id: u32) -> Key {
|
||||
// using aux key here b/c they are guaranteed to be inside `collect_keyspace`.
|
||||
@@ -8339,7 +8397,7 @@ mod tests {
|
||||
|
||||
// Check if the image layer at the GC horizon contains exactly what we want
|
||||
let image_at_gc_horizon = tline
|
||||
.inspect_image_layers(Lsn(0x30), &ctx)
|
||||
.inspect_image_layers(Lsn(0x30), &ctx, io_concurrency.clone())
|
||||
.await
|
||||
.unwrap()
|
||||
.into_iter()
|
||||
@@ -10052,7 +10110,12 @@ mod tests {
|
||||
|
||||
let keyspace = KeySpace::single(get_key(0)..get_key(10));
|
||||
let results = tline
|
||||
.get_vectored(keyspace, delta_layer_end_lsn, &ctx)
|
||||
.get_vectored(
|
||||
keyspace,
|
||||
delta_layer_end_lsn,
|
||||
IoConcurrency::sequential(),
|
||||
&ctx,
|
||||
)
|
||||
.await
|
||||
.expect("No vectored errors");
|
||||
for (key, res) in results {
|
||||
|
||||
@@ -277,10 +277,26 @@ pub struct TenantConfOpt {
|
||||
#[serde(default)]
|
||||
pub compaction_threshold: Option<usize>,
|
||||
|
||||
#[serde(skip_serializing_if = "Option::is_none")]
|
||||
#[serde(default)]
|
||||
pub compaction_upper_limit: Option<usize>,
|
||||
|
||||
#[serde(skip_serializing_if = "Option::is_none")]
|
||||
#[serde(default)]
|
||||
pub compaction_algorithm: Option<CompactionAlgorithmSettings>,
|
||||
|
||||
#[serde(skip_serializing_if = "Option::is_none")]
|
||||
#[serde(default)]
|
||||
pub l0_flush_delay_threshold: Option<usize>,
|
||||
|
||||
#[serde(skip_serializing_if = "Option::is_none")]
|
||||
#[serde(default)]
|
||||
pub l0_flush_stall_threshold: Option<usize>,
|
||||
|
||||
#[serde(skip_serializing_if = "Option::is_none")]
|
||||
#[serde(default)]
|
||||
pub l0_flush_wait_upload: Option<bool>,
|
||||
|
||||
#[serde(skip_serializing_if = "Option::is_none")]
|
||||
#[serde(default)]
|
||||
pub gc_horizon: Option<u64>,
|
||||
@@ -360,6 +376,15 @@ pub struct TenantConfOpt {
|
||||
|
||||
#[serde(skip_serializing_if = "Option::is_none")]
|
||||
pub rel_size_v2_enabled: Option<bool>,
|
||||
|
||||
#[serde(skip_serializing_if = "Option::is_none")]
|
||||
pub gc_compaction_enabled: Option<bool>,
|
||||
|
||||
#[serde(skip_serializing_if = "Option::is_none")]
|
||||
pub gc_compaction_initial_threshold_kb: Option<u64>,
|
||||
|
||||
#[serde(skip_serializing_if = "Option::is_none")]
|
||||
pub gc_compaction_ratio_percent: Option<u64>,
|
||||
}
|
||||
|
||||
impl TenantConfOpt {
|
||||
@@ -380,11 +405,23 @@ impl TenantConfOpt {
|
||||
compaction_threshold: self
|
||||
.compaction_threshold
|
||||
.unwrap_or(global_conf.compaction_threshold),
|
||||
compaction_upper_limit: self
|
||||
.compaction_upper_limit
|
||||
.unwrap_or(global_conf.compaction_upper_limit),
|
||||
compaction_algorithm: self
|
||||
.compaction_algorithm
|
||||
.as_ref()
|
||||
.unwrap_or(&global_conf.compaction_algorithm)
|
||||
.clone(),
|
||||
l0_flush_delay_threshold: self
|
||||
.l0_flush_delay_threshold
|
||||
.or(global_conf.l0_flush_delay_threshold),
|
||||
l0_flush_stall_threshold: self
|
||||
.l0_flush_stall_threshold
|
||||
.or(global_conf.l0_flush_stall_threshold),
|
||||
l0_flush_wait_upload: self
|
||||
.l0_flush_wait_upload
|
||||
.unwrap_or(global_conf.l0_flush_wait_upload),
|
||||
gc_horizon: self.gc_horizon.unwrap_or(global_conf.gc_horizon),
|
||||
gc_period: self.gc_period.unwrap_or(global_conf.gc_period),
|
||||
image_creation_threshold: self
|
||||
@@ -429,6 +466,15 @@ impl TenantConfOpt {
|
||||
.wal_receiver_protocol_override
|
||||
.or(global_conf.wal_receiver_protocol_override),
|
||||
rel_size_v2_enabled: self.rel_size_v2_enabled.or(global_conf.rel_size_v2_enabled),
|
||||
gc_compaction_enabled: self
|
||||
.gc_compaction_enabled
|
||||
.unwrap_or(global_conf.gc_compaction_enabled),
|
||||
gc_compaction_initial_threshold_kb: self
|
||||
.gc_compaction_initial_threshold_kb
|
||||
.unwrap_or(global_conf.gc_compaction_initial_threshold_kb),
|
||||
gc_compaction_ratio_percent: self
|
||||
.gc_compaction_ratio_percent
|
||||
.unwrap_or(global_conf.gc_compaction_ratio_percent),
|
||||
}
|
||||
}
|
||||
|
||||
@@ -439,7 +485,11 @@ impl TenantConfOpt {
|
||||
mut compaction_target_size,
|
||||
mut compaction_period,
|
||||
mut compaction_threshold,
|
||||
mut compaction_upper_limit,
|
||||
mut compaction_algorithm,
|
||||
mut l0_flush_delay_threshold,
|
||||
mut l0_flush_stall_threshold,
|
||||
mut l0_flush_wait_upload,
|
||||
mut gc_horizon,
|
||||
mut gc_period,
|
||||
mut image_creation_threshold,
|
||||
@@ -459,6 +509,9 @@ impl TenantConfOpt {
|
||||
mut timeline_offloading,
|
||||
mut wal_receiver_protocol_override,
|
||||
mut rel_size_v2_enabled,
|
||||
mut gc_compaction_enabled,
|
||||
mut gc_compaction_initial_threshold_kb,
|
||||
mut gc_compaction_ratio_percent,
|
||||
} = self;
|
||||
|
||||
patch.checkpoint_distance.apply(&mut checkpoint_distance);
|
||||
@@ -474,7 +527,17 @@ impl TenantConfOpt {
|
||||
.map(|v| humantime::parse_duration(&v))?
|
||||
.apply(&mut compaction_period);
|
||||
patch.compaction_threshold.apply(&mut compaction_threshold);
|
||||
patch
|
||||
.compaction_upper_limit
|
||||
.apply(&mut compaction_upper_limit);
|
||||
patch.compaction_algorithm.apply(&mut compaction_algorithm);
|
||||
patch
|
||||
.l0_flush_delay_threshold
|
||||
.apply(&mut l0_flush_delay_threshold);
|
||||
patch
|
||||
.l0_flush_stall_threshold
|
||||
.apply(&mut l0_flush_stall_threshold);
|
||||
patch.l0_flush_wait_upload.apply(&mut l0_flush_wait_upload);
|
||||
patch.gc_horizon.apply(&mut gc_horizon);
|
||||
patch
|
||||
.gc_period
|
||||
@@ -528,6 +591,15 @@ impl TenantConfOpt {
|
||||
.wal_receiver_protocol_override
|
||||
.apply(&mut wal_receiver_protocol_override);
|
||||
patch.rel_size_v2_enabled.apply(&mut rel_size_v2_enabled);
|
||||
patch
|
||||
.gc_compaction_enabled
|
||||
.apply(&mut gc_compaction_enabled);
|
||||
patch
|
||||
.gc_compaction_initial_threshold_kb
|
||||
.apply(&mut gc_compaction_initial_threshold_kb);
|
||||
patch
|
||||
.gc_compaction_ratio_percent
|
||||
.apply(&mut gc_compaction_ratio_percent);
|
||||
|
||||
Ok(Self {
|
||||
checkpoint_distance,
|
||||
@@ -535,7 +607,11 @@ impl TenantConfOpt {
|
||||
compaction_target_size,
|
||||
compaction_period,
|
||||
compaction_threshold,
|
||||
compaction_upper_limit,
|
||||
compaction_algorithm,
|
||||
l0_flush_delay_threshold,
|
||||
l0_flush_stall_threshold,
|
||||
l0_flush_wait_upload,
|
||||
gc_horizon,
|
||||
gc_period,
|
||||
image_creation_threshold,
|
||||
@@ -555,6 +631,9 @@ impl TenantConfOpt {
|
||||
timeline_offloading,
|
||||
wal_receiver_protocol_override,
|
||||
rel_size_v2_enabled,
|
||||
gc_compaction_enabled,
|
||||
gc_compaction_initial_threshold_kb,
|
||||
gc_compaction_ratio_percent,
|
||||
})
|
||||
}
|
||||
}
|
||||
@@ -590,6 +669,10 @@ impl From<TenantConfOpt> for models::TenantConfig {
|
||||
compaction_target_size: value.compaction_target_size,
|
||||
compaction_period: value.compaction_period.map(humantime),
|
||||
compaction_threshold: value.compaction_threshold,
|
||||
compaction_upper_limit: value.compaction_upper_limit,
|
||||
l0_flush_delay_threshold: value.l0_flush_delay_threshold,
|
||||
l0_flush_stall_threshold: value.l0_flush_stall_threshold,
|
||||
l0_flush_wait_upload: value.l0_flush_wait_upload,
|
||||
gc_horizon: value.gc_horizon,
|
||||
gc_period: value.gc_period.map(humantime),
|
||||
image_creation_threshold: value.image_creation_threshold,
|
||||
@@ -611,6 +694,9 @@ impl From<TenantConfOpt> for models::TenantConfig {
|
||||
timeline_offloading: value.timeline_offloading,
|
||||
wal_receiver_protocol_override: value.wal_receiver_protocol_override,
|
||||
rel_size_v2_enabled: value.rel_size_v2_enabled,
|
||||
gc_compaction_enabled: value.gc_compaction_enabled,
|
||||
gc_compaction_initial_threshold_kb: value.gc_compaction_initial_threshold_kb,
|
||||
gc_compaction_ratio_percent: value.gc_compaction_ratio_percent,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -57,6 +57,7 @@ use std::collections::{HashMap, VecDeque};
|
||||
use std::iter::Peekable;
|
||||
use std::ops::Range;
|
||||
use std::sync::Arc;
|
||||
use tokio::sync::watch;
|
||||
use utils::lsn::Lsn;
|
||||
|
||||
use historic_layer_coverage::BufferedHistoricLayerCoverage;
|
||||
@@ -67,7 +68,6 @@ use super::storage_layer::{LayerVisibilityHint, PersistentLayerDesc};
|
||||
///
|
||||
/// LayerMap tracks what layers exist on a timeline.
|
||||
///
|
||||
#[derive(Default)]
|
||||
pub struct LayerMap {
|
||||
//
|
||||
// 'open_layer' holds the current InMemoryLayer that is accepting new
|
||||
@@ -93,7 +93,25 @@ pub struct LayerMap {
|
||||
|
||||
/// L0 layers have key range Key::MIN..Key::MAX, and locating them using R-Tree search is very inefficient.
|
||||
/// So L0 layers are held in l0_delta_layers vector, in addition to the R-tree.
|
||||
///
|
||||
/// NB: make sure to notify `watch_l0_deltas` on changes.
|
||||
l0_delta_layers: Vec<Arc<PersistentLayerDesc>>,
|
||||
|
||||
/// Notifies about L0 delta layer changes, sending the current number of L0 layers.
|
||||
watch_l0_deltas: watch::Sender<usize>,
|
||||
}
|
||||
|
||||
impl Default for LayerMap {
|
||||
fn default() -> Self {
|
||||
Self {
|
||||
open_layer: Default::default(),
|
||||
next_open_layer_at: Default::default(),
|
||||
frozen_layers: Default::default(),
|
||||
historic: Default::default(),
|
||||
l0_delta_layers: Default::default(),
|
||||
watch_l0_deltas: watch::channel(0).0,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// The primary update API for the layer map.
|
||||
@@ -466,6 +484,8 @@ impl LayerMap {
|
||||
|
||||
if Self::is_l0(&layer_desc.key_range, layer_desc.is_delta) {
|
||||
self.l0_delta_layers.push(layer_desc.clone().into());
|
||||
self.watch_l0_deltas
|
||||
.send_replace(self.l0_delta_layers.len());
|
||||
}
|
||||
|
||||
self.historic.insert(
|
||||
@@ -488,6 +508,8 @@ impl LayerMap {
|
||||
let mut l0_delta_layers = std::mem::take(&mut self.l0_delta_layers);
|
||||
l0_delta_layers.retain(|other| other.key() != layer_key);
|
||||
self.l0_delta_layers = l0_delta_layers;
|
||||
self.watch_l0_deltas
|
||||
.send_replace(self.l0_delta_layers.len());
|
||||
// this assertion is related to use of Arc::ptr_eq in Self::compare_arced_layers,
|
||||
// there's a chance that the comparison fails at runtime due to it comparing (pointer,
|
||||
// vtable) pairs.
|
||||
@@ -850,6 +872,11 @@ impl LayerMap {
|
||||
&self.l0_delta_layers
|
||||
}
|
||||
|
||||
/// Subscribes to L0 delta layer changes, sending the current number of L0 delta layers.
|
||||
pub fn watch_level0_deltas(&self) -> watch::Receiver<usize> {
|
||||
self.watch_l0_deltas.subscribe()
|
||||
}
|
||||
|
||||
/// debugging function to print out the contents of the layer map
|
||||
#[allow(unused)]
|
||||
pub async fn dump(&self, verbose: bool, ctx: &RequestContext) -> Result<()> {
|
||||
|
||||
@@ -1643,6 +1643,7 @@ impl TenantManager {
|
||||
.wait_lsn(
|
||||
*target_lsn,
|
||||
crate::tenant::timeline::WaitLsnWaiter::Tenant,
|
||||
crate::tenant::timeline::WaitLsnTimeout::Default,
|
||||
ctx,
|
||||
)
|
||||
.await
|
||||
|
||||
@@ -77,11 +77,17 @@ pub struct IndexPart {
|
||||
///
|
||||
/// None means no aux files have been written to the storage before the point
|
||||
/// when this flag is introduced.
|
||||
///
|
||||
/// This flag is not used any more as all tenants have been transitioned to the new aux file policy.
|
||||
#[serde(skip_serializing_if = "Option::is_none", default)]
|
||||
pub(crate) last_aux_file_policy: Option<AuxFilePolicy>,
|
||||
|
||||
#[serde(skip_serializing_if = "Option::is_none", default)]
|
||||
pub(crate) rel_size_migration: Option<RelSizeMigration>,
|
||||
|
||||
/// The LSN of gc-compaction horizon. Once gc-compaction is finished for all layer files below an LSN, this LSN will be updated.
|
||||
#[serde(skip_serializing_if = "Option::is_none", default)]
|
||||
pub(crate) l2_lsn: Option<Lsn>,
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
|
||||
@@ -116,10 +122,11 @@ impl IndexPart {
|
||||
/// - 9: +gc_blocking
|
||||
/// - 10: +import_pgdata
|
||||
/// - 11: +rel_size_migration
|
||||
const LATEST_VERSION: usize = 11;
|
||||
/// - 12: +l2_lsn
|
||||
const LATEST_VERSION: usize = 12;
|
||||
|
||||
// Versions we may see when reading from a bucket.
|
||||
pub const KNOWN_VERSIONS: &'static [usize] = &[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11];
|
||||
pub const KNOWN_VERSIONS: &'static [usize] = &[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12];
|
||||
|
||||
pub const FILE_NAME: &'static str = "index_part.json";
|
||||
|
||||
@@ -136,6 +143,7 @@ impl IndexPart {
|
||||
last_aux_file_policy: None,
|
||||
import_pgdata: None,
|
||||
rel_size_migration: None,
|
||||
l2_lsn: None,
|
||||
}
|
||||
}
|
||||
|
||||
@@ -214,6 +222,10 @@ impl LayerFileMetadata {
|
||||
shard,
|
||||
}
|
||||
}
|
||||
/// Helper to get both generation and file size in a tuple
|
||||
pub fn generation_file_size(&self) -> (Generation, u64) {
|
||||
(self.generation, self.file_size)
|
||||
}
|
||||
}
|
||||
|
||||
/// Limited history of earlier ancestors.
|
||||
@@ -437,6 +449,7 @@ mod tests {
|
||||
last_aux_file_policy: None,
|
||||
import_pgdata: None,
|
||||
rel_size_migration: None,
|
||||
l2_lsn: None,
|
||||
};
|
||||
|
||||
let part = IndexPart::from_json_bytes(example.as_bytes()).unwrap();
|
||||
@@ -483,6 +496,7 @@ mod tests {
|
||||
last_aux_file_policy: None,
|
||||
import_pgdata: None,
|
||||
rel_size_migration: None,
|
||||
l2_lsn: None,
|
||||
};
|
||||
|
||||
let part = IndexPart::from_json_bytes(example.as_bytes()).unwrap();
|
||||
@@ -530,6 +544,7 @@ mod tests {
|
||||
last_aux_file_policy: None,
|
||||
import_pgdata: None,
|
||||
rel_size_migration: None,
|
||||
l2_lsn: None,
|
||||
};
|
||||
|
||||
let part = IndexPart::from_json_bytes(example.as_bytes()).unwrap();
|
||||
@@ -580,6 +595,7 @@ mod tests {
|
||||
last_aux_file_policy: None,
|
||||
import_pgdata: None,
|
||||
rel_size_migration: None,
|
||||
l2_lsn: None,
|
||||
};
|
||||
|
||||
let empty_layers_parsed = IndexPart::from_json_bytes(empty_layers_json.as_bytes()).unwrap();
|
||||
@@ -625,6 +641,7 @@ mod tests {
|
||||
last_aux_file_policy: None,
|
||||
import_pgdata: None,
|
||||
rel_size_migration: None,
|
||||
l2_lsn: None,
|
||||
};
|
||||
|
||||
let part = IndexPart::from_json_bytes(example.as_bytes()).unwrap();
|
||||
@@ -673,6 +690,7 @@ mod tests {
|
||||
last_aux_file_policy: None,
|
||||
import_pgdata: None,
|
||||
rel_size_migration: None,
|
||||
l2_lsn: None,
|
||||
};
|
||||
|
||||
let part = IndexPart::from_json_bytes(example.as_bytes()).unwrap();
|
||||
@@ -726,6 +744,7 @@ mod tests {
|
||||
last_aux_file_policy: Some(AuxFilePolicy::V2),
|
||||
import_pgdata: None,
|
||||
rel_size_migration: None,
|
||||
l2_lsn: None,
|
||||
};
|
||||
|
||||
let part = IndexPart::from_json_bytes(example.as_bytes()).unwrap();
|
||||
@@ -784,6 +803,7 @@ mod tests {
|
||||
last_aux_file_policy: Default::default(),
|
||||
import_pgdata: None,
|
||||
rel_size_migration: None,
|
||||
l2_lsn: None,
|
||||
};
|
||||
|
||||
let part = IndexPart::from_json_bytes(example.as_bytes()).unwrap();
|
||||
@@ -843,6 +863,7 @@ mod tests {
|
||||
last_aux_file_policy: Default::default(),
|
||||
import_pgdata: None,
|
||||
rel_size_migration: None,
|
||||
l2_lsn: None,
|
||||
};
|
||||
|
||||
let part = IndexPart::from_json_bytes(example.as_bytes()).unwrap();
|
||||
@@ -907,6 +928,7 @@ mod tests {
|
||||
archived_at: None,
|
||||
import_pgdata: None,
|
||||
rel_size_migration: None,
|
||||
l2_lsn: None,
|
||||
};
|
||||
|
||||
let part = IndexPart::from_json_bytes(example.as_bytes()).unwrap();
|
||||
@@ -984,6 +1006,7 @@ mod tests {
|
||||
idempotency_key: import_pgdata::index_part_format::IdempotencyKey::new("specified-by-client-218a5213-5044-4562-a28d-d024c5f057f5".to_string()),
|
||||
}))),
|
||||
rel_size_migration: None,
|
||||
l2_lsn: None,
|
||||
};
|
||||
|
||||
let part = IndexPart::from_json_bytes(example.as_bytes()).unwrap();
|
||||
@@ -1062,6 +1085,87 @@ mod tests {
|
||||
idempotency_key: import_pgdata::index_part_format::IdempotencyKey::new("specified-by-client-218a5213-5044-4562-a28d-d024c5f057f5".to_string()),
|
||||
}))),
|
||||
rel_size_migration: Some(RelSizeMigration::Legacy),
|
||||
l2_lsn: None,
|
||||
};
|
||||
|
||||
let part = IndexPart::from_json_bytes(example.as_bytes()).unwrap();
|
||||
assert_eq!(part, expected);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn v12_l2_lsn_is_parsed() {
|
||||
let example = r#"{
|
||||
"version": 12,
|
||||
"layer_metadata":{
|
||||
"000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9": { "file_size": 25600000 },
|
||||
"000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D8-00000000016B5A51": { "file_size": 9007199254741001 }
|
||||
},
|
||||
"disk_consistent_lsn":"0/16960E8",
|
||||
"metadata": {
|
||||
"disk_consistent_lsn": "0/16960E8",
|
||||
"prev_record_lsn": "0/1696070",
|
||||
"ancestor_timeline": "e45a7f37d3ee2ff17dc14bf4f4e3f52e",
|
||||
"ancestor_lsn": "0/0",
|
||||
"latest_gc_cutoff_lsn": "0/1696070",
|
||||
"initdb_lsn": "0/1696070",
|
||||
"pg_version": 14
|
||||
},
|
||||
"gc_blocking": {
|
||||
"started_at": "2024-07-19T09:00:00.123",
|
||||
"reasons": ["DetachAncestor"]
|
||||
},
|
||||
"import_pgdata": {
|
||||
"V1": {
|
||||
"Done": {
|
||||
"idempotency_key": "specified-by-client-218a5213-5044-4562-a28d-d024c5f057f5",
|
||||
"started_at": "2024-11-13T09:23:42.123",
|
||||
"finished_at": "2024-11-13T09:42:23.123"
|
||||
}
|
||||
}
|
||||
},
|
||||
"rel_size_migration": "legacy",
|
||||
"l2_lsn": "0/16960E8"
|
||||
}"#;
|
||||
|
||||
let expected = IndexPart {
|
||||
version: 12,
|
||||
layer_metadata: HashMap::from([
|
||||
("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9".parse().unwrap(), LayerFileMetadata {
|
||||
file_size: 25600000,
|
||||
generation: Generation::none(),
|
||||
shard: ShardIndex::unsharded()
|
||||
}),
|
||||
("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D8-00000000016B5A51".parse().unwrap(), LayerFileMetadata {
|
||||
file_size: 9007199254741001,
|
||||
generation: Generation::none(),
|
||||
shard: ShardIndex::unsharded()
|
||||
})
|
||||
]),
|
||||
disk_consistent_lsn: "0/16960E8".parse::<Lsn>().unwrap(),
|
||||
metadata: TimelineMetadata::new(
|
||||
Lsn::from_str("0/16960E8").unwrap(),
|
||||
Some(Lsn::from_str("0/1696070").unwrap()),
|
||||
Some(TimelineId::from_str("e45a7f37d3ee2ff17dc14bf4f4e3f52e").unwrap()),
|
||||
Lsn::INVALID,
|
||||
Lsn::from_str("0/1696070").unwrap(),
|
||||
Lsn::from_str("0/1696070").unwrap(),
|
||||
14,
|
||||
).with_recalculated_checksum().unwrap(),
|
||||
deleted_at: None,
|
||||
lineage: Default::default(),
|
||||
gc_blocking: Some(GcBlocking {
|
||||
started_at: parse_naive_datetime("2024-07-19T09:00:00.123000000"),
|
||||
reasons: enumset::EnumSet::from_iter([GcBlockingReason::DetachAncestor]),
|
||||
}),
|
||||
last_aux_file_policy: Default::default(),
|
||||
archived_at: None,
|
||||
import_pgdata: Some(import_pgdata::index_part_format::Root::V1(import_pgdata::index_part_format::V1::Done(import_pgdata::index_part_format::Done{
|
||||
started_at: parse_naive_datetime("2024-11-13T09:23:42.123000000"),
|
||||
finished_at: parse_naive_datetime("2024-11-13T09:42:23.123000000"),
|
||||
idempotency_key: import_pgdata::index_part_format::IdempotencyKey::new("specified-by-client-218a5213-5044-4562-a28d-d024c5f057f5".to_string()),
|
||||
}))),
|
||||
rel_size_migration: Some(RelSizeMigration::Legacy),
|
||||
l2_lsn: Some("0/16960E8".parse::<Lsn>().unwrap()),
|
||||
};
|
||||
|
||||
let part = IndexPart::from_json_bytes(example.as_bytes()).unwrap();
|
||||
|
||||
@@ -559,6 +559,13 @@ impl JobGenerator<PendingDownload, RunningDownload, CompleteDownload, DownloadCo
|
||||
}
|
||||
}
|
||||
|
||||
enum LayerAction {
|
||||
Download,
|
||||
NoAction,
|
||||
Skip,
|
||||
Touch,
|
||||
}
|
||||
|
||||
/// This type is a convenience to group together the various functions involved in
|
||||
/// freshening a secondary tenant.
|
||||
struct TenantDownloader<'a> {
|
||||
@@ -1008,69 +1015,17 @@ impl<'a> TenantDownloader<'a> {
|
||||
return (Err(UpdateError::Restart), touched);
|
||||
}
|
||||
|
||||
// Existing on-disk layers: just update their access time.
|
||||
if let Some(on_disk) = timeline_state.on_disk_layers.get(&layer.name) {
|
||||
tracing::debug!("Layer {} is already on disk", layer.name);
|
||||
|
||||
if cfg!(debug_assertions) {
|
||||
// Debug for https://github.com/neondatabase/neon/issues/6966: check that the files we think
|
||||
// are already present on disk are really there.
|
||||
match tokio::fs::metadata(&on_disk.local_path).await {
|
||||
Ok(meta) => {
|
||||
tracing::debug!(
|
||||
"Layer {} present at {}, size {}",
|
||||
layer.name,
|
||||
on_disk.local_path,
|
||||
meta.len(),
|
||||
);
|
||||
}
|
||||
Err(e) => {
|
||||
tracing::warn!(
|
||||
"Layer {} not found at {} ({})",
|
||||
layer.name,
|
||||
on_disk.local_path,
|
||||
e
|
||||
);
|
||||
debug_assert!(false);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if on_disk.metadata != layer.metadata || on_disk.access_time != layer.access_time {
|
||||
// We already have this layer on disk. Update its access time.
|
||||
tracing::debug!(
|
||||
"Access time updated for layer {}: {} -> {}",
|
||||
layer.name,
|
||||
strftime(&on_disk.access_time),
|
||||
strftime(&layer.access_time)
|
||||
);
|
||||
touched.push(layer);
|
||||
}
|
||||
continue;
|
||||
} else {
|
||||
tracing::debug!("Layer {} not present on disk yet", layer.name);
|
||||
}
|
||||
|
||||
// Eviction: if we evicted a layer, then do not re-download it unless it was accessed more
|
||||
// recently than it was evicted.
|
||||
if let Some(evicted_at) = timeline_state.evicted_at.get(&layer.name) {
|
||||
if &layer.access_time > evicted_at {
|
||||
tracing::info!(
|
||||
"Re-downloading evicted layer {}, accessed at {}, evicted at {}",
|
||||
layer.name,
|
||||
strftime(&layer.access_time),
|
||||
strftime(evicted_at)
|
||||
);
|
||||
} else {
|
||||
tracing::trace!(
|
||||
"Not re-downloading evicted layer {}, accessed at {}, evicted at {}",
|
||||
layer.name,
|
||||
strftime(&layer.access_time),
|
||||
strftime(evicted_at)
|
||||
);
|
||||
match self.layer_action(&timeline_state, &layer).await {
|
||||
LayerAction::Download => (),
|
||||
LayerAction::NoAction => continue,
|
||||
LayerAction::Skip => {
|
||||
self.skip_layer(layer);
|
||||
continue;
|
||||
}
|
||||
LayerAction::Touch => {
|
||||
touched.push(layer);
|
||||
continue;
|
||||
}
|
||||
}
|
||||
|
||||
match self
|
||||
@@ -1091,6 +1046,86 @@ impl<'a> TenantDownloader<'a> {
|
||||
(Ok(()), touched)
|
||||
}
|
||||
|
||||
async fn layer_action(
|
||||
&self,
|
||||
timeline_state: &SecondaryDetailTimeline,
|
||||
layer: &HeatMapLayer,
|
||||
) -> LayerAction {
|
||||
// Existing on-disk layers: just update their access time.
|
||||
if let Some(on_disk) = timeline_state.on_disk_layers.get(&layer.name) {
|
||||
tracing::debug!("Layer {} is already on disk", layer.name);
|
||||
|
||||
if cfg!(debug_assertions) {
|
||||
// Debug for https://github.com/neondatabase/neon/issues/6966: check that the files we think
|
||||
// are already present on disk are really there.
|
||||
match tokio::fs::metadata(&on_disk.local_path).await {
|
||||
Ok(meta) => {
|
||||
tracing::debug!(
|
||||
"Layer {} present at {}, size {}",
|
||||
layer.name,
|
||||
on_disk.local_path,
|
||||
meta.len(),
|
||||
);
|
||||
}
|
||||
Err(e) => {
|
||||
tracing::warn!(
|
||||
"Layer {} not found at {} ({})",
|
||||
layer.name,
|
||||
on_disk.local_path,
|
||||
e
|
||||
);
|
||||
debug_assert!(false);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if on_disk.metadata.generation_file_size() != on_disk.metadata.generation_file_size() {
|
||||
tracing::info!(
|
||||
"Re-downloading layer {} with changed size or generation: {:?}->{:?}",
|
||||
layer.name,
|
||||
on_disk.metadata.generation_file_size(),
|
||||
on_disk.metadata.generation_file_size()
|
||||
);
|
||||
return LayerAction::Download;
|
||||
}
|
||||
if on_disk.metadata != layer.metadata || on_disk.access_time != layer.access_time {
|
||||
// We already have this layer on disk. Update its access time.
|
||||
tracing::debug!(
|
||||
"Access time updated for layer {}: {} -> {}",
|
||||
layer.name,
|
||||
strftime(&on_disk.access_time),
|
||||
strftime(&layer.access_time)
|
||||
);
|
||||
return LayerAction::Touch;
|
||||
}
|
||||
return LayerAction::NoAction;
|
||||
} else {
|
||||
tracing::debug!("Layer {} not present on disk yet", layer.name);
|
||||
}
|
||||
|
||||
// Eviction: if we evicted a layer, then do not re-download it unless it was accessed more
|
||||
// recently than it was evicted.
|
||||
if let Some(evicted_at) = timeline_state.evicted_at.get(&layer.name) {
|
||||
if &layer.access_time > evicted_at {
|
||||
tracing::info!(
|
||||
"Re-downloading evicted layer {}, accessed at {}, evicted at {}",
|
||||
layer.name,
|
||||
strftime(&layer.access_time),
|
||||
strftime(evicted_at)
|
||||
);
|
||||
} else {
|
||||
tracing::trace!(
|
||||
"Not re-downloading evicted layer {}, accessed at {}, evicted at {}",
|
||||
layer.name,
|
||||
strftime(&layer.access_time),
|
||||
strftime(evicted_at)
|
||||
);
|
||||
return LayerAction::Skip;
|
||||
}
|
||||
}
|
||||
LayerAction::Download
|
||||
}
|
||||
|
||||
async fn download_timeline(
|
||||
&self,
|
||||
timeline: HeatMapTimeline,
|
||||
|
||||
@@ -10,21 +10,30 @@ mod layer_desc;
|
||||
mod layer_name;
|
||||
pub mod merge_iterator;
|
||||
|
||||
use crate::config::PageServerConf;
|
||||
use crate::context::{AccessStatsBehavior, RequestContext};
|
||||
use bytes::Bytes;
|
||||
use futures::stream::FuturesUnordered;
|
||||
use futures::StreamExt;
|
||||
use pageserver_api::key::Key;
|
||||
use pageserver_api::keyspace::{KeySpace, KeySpaceRandomAccum};
|
||||
use pageserver_api::record::NeonWalRecord;
|
||||
use pageserver_api::value::Value;
|
||||
use std::cmp::{Ordering, Reverse};
|
||||
use std::cmp::Ordering;
|
||||
use std::collections::hash_map::Entry;
|
||||
use std::collections::{BinaryHeap, HashMap};
|
||||
use std::future::Future;
|
||||
use std::ops::Range;
|
||||
use std::pin::Pin;
|
||||
use std::sync::atomic::AtomicUsize;
|
||||
use std::sync::Arc;
|
||||
use std::time::{Duration, SystemTime, UNIX_EPOCH};
|
||||
use tracing::{trace, Instrument};
|
||||
use utils::sync::gate::GateGuard;
|
||||
|
||||
use utils::lsn::Lsn;
|
||||
|
||||
pub use batch_split_writer::{BatchLayerWriter, SplitDeltaLayerWriter, SplitImageLayerWriter};
|
||||
pub use delta_layer::{DeltaLayer, DeltaLayerWriter, ValueRef};
|
||||
pub use image_layer::{ImageLayer, ImageLayerWriter};
|
||||
pub use inmemory_layer::InMemoryLayer;
|
||||
@@ -78,30 +87,151 @@ pub(crate) enum ValueReconstructSituation {
|
||||
Continue,
|
||||
}
|
||||
|
||||
/// Reconstruct data accumulated for a single key during a vectored get
|
||||
#[derive(Debug, Default, Clone)]
|
||||
pub(crate) struct VectoredValueReconstructState {
|
||||
pub(crate) records: Vec<(Lsn, NeonWalRecord)>,
|
||||
pub(crate) img: Option<(Lsn, Bytes)>,
|
||||
|
||||
situation: ValueReconstructSituation,
|
||||
/// On disk representation of a value loaded in a buffer
|
||||
#[derive(Debug)]
|
||||
pub(crate) enum OnDiskValue {
|
||||
/// Unencoded [`Value::Image`]
|
||||
RawImage(Bytes),
|
||||
/// Encoded [`Value`]. Can deserialize into an image or a WAL record
|
||||
WalRecordOrImage(Bytes),
|
||||
}
|
||||
|
||||
impl VectoredValueReconstructState {
|
||||
fn get_cached_lsn(&self) -> Option<Lsn> {
|
||||
self.img.as_ref().map(|img| img.0)
|
||||
/// Reconstruct data accumulated for a single key during a vectored get
|
||||
#[derive(Debug, Default)]
|
||||
pub(crate) struct VectoredValueReconstructState {
|
||||
pub(crate) on_disk_values: Vec<(Lsn, OnDiskValueIoWaiter)>,
|
||||
|
||||
pub(crate) situation: ValueReconstructSituation,
|
||||
}
|
||||
|
||||
#[derive(Debug)]
|
||||
pub(crate) struct OnDiskValueIoWaiter {
|
||||
rx: tokio::sync::oneshot::Receiver<OnDiskValueIoResult>,
|
||||
}
|
||||
|
||||
#[derive(Debug)]
|
||||
#[must_use]
|
||||
pub(crate) enum OnDiskValueIo {
|
||||
/// Traversal identified this IO as required to complete the vectored get.
|
||||
Required {
|
||||
num_active_ios: Arc<AtomicUsize>,
|
||||
tx: tokio::sync::oneshot::Sender<OnDiskValueIoResult>,
|
||||
},
|
||||
/// Sparse keyspace reads always read all the values for a given key,
|
||||
/// even though only the first value is needed.
|
||||
///
|
||||
/// This variant represents the unnecessary IOs for those values at lower LSNs
|
||||
/// that aren't needed, but are currently still being done.
|
||||
///
|
||||
/// The execution of unnecessary IOs was a pre-existing behavior before concurrent IO.
|
||||
/// We added this explicit representation here so that we can drop
|
||||
/// unnecessary IO results immediately, instead of buffering them in
|
||||
/// `oneshot` channels inside [`VectoredValueReconstructState`] until
|
||||
/// [`VectoredValueReconstructState::collect_pending_ios`] gets called.
|
||||
Unnecessary,
|
||||
}
|
||||
|
||||
type OnDiskValueIoResult = Result<OnDiskValue, std::io::Error>;
|
||||
|
||||
impl OnDiskValueIo {
|
||||
pub(crate) fn complete(self, res: OnDiskValueIoResult) {
|
||||
match self {
|
||||
OnDiskValueIo::Required { num_active_ios, tx } => {
|
||||
num_active_ios.fetch_sub(1, std::sync::atomic::Ordering::Release);
|
||||
let _ = tx.send(res);
|
||||
}
|
||||
OnDiskValueIo::Unnecessary => {
|
||||
// Nobody cared, see variant doc comment.
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl From<VectoredValueReconstructState> for ValueReconstructState {
|
||||
fn from(mut state: VectoredValueReconstructState) -> Self {
|
||||
// walredo expects the records to be descending in terms of Lsn
|
||||
state.records.sort_by_key(|(lsn, _)| Reverse(*lsn));
|
||||
#[derive(Debug, thiserror::Error)]
|
||||
pub(crate) enum WaitCompletionError {
|
||||
#[error("OnDiskValueIo was dropped without completing, likely the sidecar task panicked")]
|
||||
IoDropped,
|
||||
}
|
||||
|
||||
ValueReconstructState {
|
||||
records: state.records,
|
||||
img: state.img,
|
||||
impl OnDiskValueIoWaiter {
|
||||
pub(crate) async fn wait_completion(self) -> Result<OnDiskValueIoResult, WaitCompletionError> {
|
||||
// NB: for Unnecessary IOs, this method never gets called because we don't add them to `on_disk_values`.
|
||||
self.rx.await.map_err(|_| WaitCompletionError::IoDropped)
|
||||
}
|
||||
}
|
||||
|
||||
impl VectoredValueReconstructState {
|
||||
/// # Cancel-Safety
|
||||
///
|
||||
/// Technically fine to stop polling this future, but, the IOs will still
|
||||
/// be executed to completion by the sidecar task and hold on to / consume resources.
|
||||
/// Better not do it to make reasonsing about the system easier.
|
||||
pub(crate) async fn collect_pending_ios(
|
||||
self,
|
||||
) -> Result<ValueReconstructState, PageReconstructError> {
|
||||
use utils::bin_ser::BeSer;
|
||||
|
||||
let mut res = Ok(ValueReconstructState::default());
|
||||
|
||||
// We should try hard not to bail early, so that by the time we return from this
|
||||
// function, all IO for this value is done. It's not required -- we could totally
|
||||
// stop polling the IO futures in the sidecar task, they need to support that,
|
||||
// but just stopping to poll doesn't reduce the IO load on the disk. It's easier
|
||||
// to reason about the system if we just wait for all IO to complete, even if
|
||||
// we're no longer interested in the result.
|
||||
//
|
||||
// Revisit this when IO futures are replaced with a more sophisticated IO system
|
||||
// and an IO scheduler, where we know which IOs were submitted and which ones
|
||||
// just queued. Cf the comment on IoConcurrency::spawn_io.
|
||||
for (lsn, waiter) in self.on_disk_values {
|
||||
let value_recv_res = waiter
|
||||
.wait_completion()
|
||||
// we rely on the caller to poll us to completion, so this is not a bail point
|
||||
.await;
|
||||
// Force not bailing early by wrapping the code into a closure.
|
||||
#[allow(clippy::redundant_closure_call)]
|
||||
let _: () = (|| {
|
||||
match (&mut res, value_recv_res) {
|
||||
(Err(_), _) => {
|
||||
// We've already failed, no need to process more.
|
||||
}
|
||||
(Ok(_), Err(wait_err)) => {
|
||||
// This shouldn't happen - likely the sidecar task panicked.
|
||||
res = Err(PageReconstructError::Other(wait_err.into()));
|
||||
}
|
||||
(Ok(_), Ok(Err(err))) => {
|
||||
let err: std::io::Error = err;
|
||||
// TODO: returning IO error here will fail a compute query.
|
||||
// Probably not what we want, we're not doing `maybe_fatal_err`
|
||||
// in the IO futures.
|
||||
// But it's been like that for a long time, not changing it
|
||||
// as part of concurrent IO.
|
||||
// => https://github.com/neondatabase/neon/issues/10454
|
||||
res = Err(PageReconstructError::Other(err.into()));
|
||||
}
|
||||
(Ok(ok), Ok(Ok(OnDiskValue::RawImage(img)))) => {
|
||||
assert!(ok.img.is_none());
|
||||
ok.img = Some((lsn, img));
|
||||
}
|
||||
(Ok(ok), Ok(Ok(OnDiskValue::WalRecordOrImage(buf)))) => {
|
||||
match Value::des(&buf) {
|
||||
Ok(Value::WalRecord(rec)) => {
|
||||
ok.records.push((lsn, rec));
|
||||
}
|
||||
Ok(Value::Image(img)) => {
|
||||
assert!(ok.img.is_none());
|
||||
ok.img = Some((lsn, img));
|
||||
}
|
||||
Err(err) => {
|
||||
res = Err(PageReconstructError::Other(err.into()));
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
})();
|
||||
}
|
||||
|
||||
res
|
||||
}
|
||||
}
|
||||
|
||||
@@ -109,7 +239,7 @@ impl From<VectoredValueReconstructState> for ValueReconstructState {
|
||||
pub(crate) struct ValuesReconstructState {
|
||||
/// The keys will be removed after `get_vectored` completes. The caller outside `Timeline`
|
||||
/// should not expect to get anything from this hashmap.
|
||||
pub(crate) keys: HashMap<Key, Result<VectoredValueReconstructState, PageReconstructError>>,
|
||||
pub(crate) keys: HashMap<Key, VectoredValueReconstructState>,
|
||||
/// The keys which are already retrieved
|
||||
keys_done: KeySpaceRandomAccum,
|
||||
|
||||
@@ -119,27 +249,365 @@ pub(crate) struct ValuesReconstructState {
|
||||
// Statistics that are still accessible as a caller of `get_vectored_impl`.
|
||||
layers_visited: u32,
|
||||
delta_layers_visited: u32,
|
||||
|
||||
pub(crate) io_concurrency: IoConcurrency,
|
||||
num_active_ios: Arc<AtomicUsize>,
|
||||
}
|
||||
|
||||
/// The level of IO concurrency to be used on the read path
|
||||
///
|
||||
/// The desired end state is that we always do parallel IO.
|
||||
/// This struct and the dispatching in the impl will be removed once
|
||||
/// we've built enough confidence.
|
||||
pub(crate) enum IoConcurrency {
|
||||
Sequential,
|
||||
SidecarTask {
|
||||
task_id: usize,
|
||||
ios_tx: tokio::sync::mpsc::UnboundedSender<IoFuture>,
|
||||
},
|
||||
}
|
||||
|
||||
type IoFuture = Pin<Box<dyn Send + Future<Output = ()>>>;
|
||||
|
||||
pub(crate) enum SelectedIoConcurrency {
|
||||
Sequential,
|
||||
SidecarTask(GateGuard),
|
||||
}
|
||||
|
||||
impl std::fmt::Debug for IoConcurrency {
|
||||
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||
match self {
|
||||
IoConcurrency::Sequential => write!(f, "Sequential"),
|
||||
IoConcurrency::SidecarTask { .. } => write!(f, "SidecarTask"),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl std::fmt::Debug for SelectedIoConcurrency {
|
||||
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||
match self {
|
||||
SelectedIoConcurrency::Sequential => write!(f, "Sequential"),
|
||||
SelectedIoConcurrency::SidecarTask(_) => write!(f, "SidecarTask"),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl IoConcurrency {
|
||||
/// Force sequential IO. This is a temporary workaround until we have
|
||||
/// moved plumbing-through-the-call-stack
|
||||
/// of IoConcurrency into `RequestContextq.
|
||||
///
|
||||
/// DO NOT USE for new code.
|
||||
///
|
||||
/// Tracking issue: <https://github.com/neondatabase/neon/issues/10460>.
|
||||
pub(crate) fn sequential() -> Self {
|
||||
Self::spawn(SelectedIoConcurrency::Sequential)
|
||||
}
|
||||
|
||||
pub(crate) fn spawn_from_conf(
|
||||
conf: &'static PageServerConf,
|
||||
gate_guard: GateGuard,
|
||||
) -> IoConcurrency {
|
||||
use pageserver_api::config::GetVectoredConcurrentIo;
|
||||
let selected = match conf.get_vectored_concurrent_io {
|
||||
GetVectoredConcurrentIo::Sequential => SelectedIoConcurrency::Sequential,
|
||||
GetVectoredConcurrentIo::SidecarTask => SelectedIoConcurrency::SidecarTask(gate_guard),
|
||||
};
|
||||
Self::spawn(selected)
|
||||
}
|
||||
|
||||
pub(crate) fn spawn(io_concurrency: SelectedIoConcurrency) -> Self {
|
||||
match io_concurrency {
|
||||
SelectedIoConcurrency::Sequential => IoConcurrency::Sequential,
|
||||
SelectedIoConcurrency::SidecarTask(gate_guard) => {
|
||||
let (ios_tx, ios_rx) = tokio::sync::mpsc::unbounded_channel();
|
||||
static TASK_ID: AtomicUsize = AtomicUsize::new(0);
|
||||
let task_id = TASK_ID.fetch_add(1, std::sync::atomic::Ordering::Relaxed);
|
||||
// TODO: enrich the span with more context (tenant,shard,timeline) + (basebackup|pagestream|...)
|
||||
let span =
|
||||
tracing::info_span!(parent: None, "IoConcurrency_sidecar", task_id = task_id);
|
||||
trace!(task_id, "spawning sidecar task");
|
||||
tokio::spawn(async move {
|
||||
trace!("start");
|
||||
scopeguard::defer!{ trace!("end") };
|
||||
type IosRx = tokio::sync::mpsc::UnboundedReceiver<IoFuture>;
|
||||
enum State {
|
||||
Waiting {
|
||||
// invariant: is_empty(), but we recycle the allocation
|
||||
empty_futures: FuturesUnordered<IoFuture>,
|
||||
ios_rx: IosRx,
|
||||
},
|
||||
Executing {
|
||||
futures: FuturesUnordered<IoFuture>,
|
||||
ios_rx: IosRx,
|
||||
},
|
||||
ShuttingDown {
|
||||
futures: FuturesUnordered<IoFuture>,
|
||||
},
|
||||
}
|
||||
let mut state = State::Waiting {
|
||||
empty_futures: FuturesUnordered::new(),
|
||||
ios_rx,
|
||||
};
|
||||
loop {
|
||||
match state {
|
||||
State::Waiting {
|
||||
empty_futures,
|
||||
mut ios_rx,
|
||||
} => {
|
||||
assert!(empty_futures.is_empty());
|
||||
tokio::select! {
|
||||
fut = ios_rx.recv() => {
|
||||
if let Some(fut) = fut {
|
||||
trace!("received new io future");
|
||||
empty_futures.push(fut);
|
||||
state = State::Executing { futures: empty_futures, ios_rx };
|
||||
} else {
|
||||
state = State::ShuttingDown { futures: empty_futures }
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
State::Executing {
|
||||
mut futures,
|
||||
mut ios_rx,
|
||||
} => {
|
||||
tokio::select! {
|
||||
res = futures.next() => {
|
||||
trace!("io future completed");
|
||||
assert!(res.is_some());
|
||||
if futures.is_empty() {
|
||||
state = State::Waiting { empty_futures: futures, ios_rx};
|
||||
} else {
|
||||
state = State::Executing { futures, ios_rx };
|
||||
}
|
||||
}
|
||||
fut = ios_rx.recv() => {
|
||||
if let Some(fut) = fut {
|
||||
trace!("received new io future");
|
||||
futures.push(fut);
|
||||
state = State::Executing { futures, ios_rx};
|
||||
} else {
|
||||
state = State::ShuttingDown { futures };
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
State::ShuttingDown {
|
||||
mut futures,
|
||||
} => {
|
||||
trace!("shutting down");
|
||||
while let Some(()) = futures.next().await {
|
||||
trace!("io future completed (shutdown)");
|
||||
// drain
|
||||
}
|
||||
trace!("shutdown complete");
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
drop(gate_guard); // drop it right before we exit
|
||||
}.instrument(span));
|
||||
IoConcurrency::SidecarTask { task_id, ios_tx }
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
pub(crate) fn clone(&self) -> Self {
|
||||
match self {
|
||||
IoConcurrency::Sequential => IoConcurrency::Sequential,
|
||||
IoConcurrency::SidecarTask { task_id, ios_tx } => IoConcurrency::SidecarTask {
|
||||
task_id: *task_id,
|
||||
ios_tx: ios_tx.clone(),
|
||||
},
|
||||
}
|
||||
}
|
||||
|
||||
/// Submit an IO to be executed in the background. DEADLOCK RISK, read the full doc string.
|
||||
///
|
||||
/// The IO is represented as an opaque future.
|
||||
/// IO completion must be handled inside the future, e.g., through a oneshot channel.
|
||||
///
|
||||
/// The API seems simple but there are multiple **pitfalls** involving
|
||||
/// DEADLOCK RISK.
|
||||
///
|
||||
/// First, there are no guarantees about the exexecution of the IO.
|
||||
/// It may be `await`ed in-place before this function returns.
|
||||
/// It may be polled partially by this task and handed off to another task to be finished.
|
||||
/// It may be polled and then dropped before returning ready.
|
||||
///
|
||||
/// This means that submitted IOs must not be interedependent.
|
||||
/// Interdependence may be through shared limited resources, e.g.,
|
||||
/// - VirtualFile file descriptor cache slot acquisition
|
||||
/// - tokio-epoll-uring slot
|
||||
///
|
||||
/// # Why current usage is safe from deadlocks
|
||||
///
|
||||
/// Textbook condition for a deadlock is that _all_ of the following be given
|
||||
/// - Mutual exclusion
|
||||
/// - Hold and wait
|
||||
/// - No preemption
|
||||
/// - Circular wait
|
||||
///
|
||||
/// The current usage is safe because:
|
||||
/// - Mutual exclusion: IO futures definitely use mutexes, no way around that for now
|
||||
/// - Hold and wait: IO futures currently hold two kinds of locks/resources while waiting
|
||||
/// for acquisition of other resources:
|
||||
/// - VirtualFile file descriptor cache slot tokio mutex
|
||||
/// - tokio-epoll-uring slot (uses tokio notify => wait queue, much like mutex)
|
||||
/// - No preemption: there's no taking-away of acquired locks/resources => given
|
||||
/// - Circular wait: this is the part of the condition that isn't met: all IO futures
|
||||
/// first acquire VirtualFile mutex, then tokio-epoll-uring slot.
|
||||
/// There is no IO future that acquires slot before VirtualFile.
|
||||
/// Hence there can be no circular waiting.
|
||||
/// Hence there cannot be a deadlock.
|
||||
///
|
||||
/// This is a very fragile situation and must be revisited whenver any code called from
|
||||
/// inside the IO futures is changed.
|
||||
///
|
||||
/// We will move away from opaque IO futures towards well-defined IOs at some point in
|
||||
/// the future when we have shipped this first version of concurrent IO to production
|
||||
/// and are ready to retire the Sequential mode which runs the futures in place.
|
||||
/// Right now, while brittle, the opaque IO approach allows us to ship the feature
|
||||
/// with minimal changes to the code and minimal changes to existing behavior in Sequential mode.
|
||||
///
|
||||
/// Also read the comment in `collect_pending_ios`.
|
||||
pub(crate) async fn spawn_io<F>(&mut self, fut: F)
|
||||
where
|
||||
F: std::future::Future<Output = ()> + Send + 'static,
|
||||
{
|
||||
match self {
|
||||
IoConcurrency::Sequential => fut.await,
|
||||
IoConcurrency::SidecarTask { ios_tx, .. } => {
|
||||
let fut = Box::pin(fut);
|
||||
// NB: experiments showed that doing an opportunistic poll of `fut` here was bad for throughput
|
||||
// while insignificant for latency.
|
||||
// It would make sense to revisit the tokio-epoll-uring API in the future such that we can try
|
||||
// a submission here, but never poll the future. That way, io_uring can make proccess while
|
||||
// the future sits in the ios_tx queue.
|
||||
match ios_tx.send(fut) {
|
||||
Ok(()) => {}
|
||||
Err(_) => {
|
||||
unreachable!("the io task must have exited, likely it panicked")
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
pub(crate) fn spawn_for_test() -> impl std::ops::DerefMut<Target = Self> {
|
||||
use std::ops::{Deref, DerefMut};
|
||||
use tracing::info;
|
||||
use utils::sync::gate::Gate;
|
||||
|
||||
// Spawn needs a Gate, give it one.
|
||||
struct Wrapper {
|
||||
inner: IoConcurrency,
|
||||
#[allow(dead_code)]
|
||||
gate: Box<Gate>,
|
||||
}
|
||||
impl Deref for Wrapper {
|
||||
type Target = IoConcurrency;
|
||||
|
||||
fn deref(&self) -> &Self::Target {
|
||||
&self.inner
|
||||
}
|
||||
}
|
||||
impl DerefMut for Wrapper {
|
||||
fn deref_mut(&mut self) -> &mut Self::Target {
|
||||
&mut self.inner
|
||||
}
|
||||
}
|
||||
let gate = Box::new(Gate::default());
|
||||
|
||||
// The default behavior when running Rust unit tests without any further
|
||||
// flags is to use the new behavior.
|
||||
// The CI uses the following environment variable to unit test both old
|
||||
// and new behavior.
|
||||
// NB: the Python regression & perf tests take the `else` branch
|
||||
// below and have their own defaults management.
|
||||
let selected = {
|
||||
// The pageserver_api::config type is unsuitable because it's internally tagged.
|
||||
#[derive(serde::Deserialize)]
|
||||
#[serde(rename_all = "kebab-case")]
|
||||
enum TestOverride {
|
||||
Sequential,
|
||||
SidecarTask,
|
||||
}
|
||||
use once_cell::sync::Lazy;
|
||||
static TEST_OVERRIDE: Lazy<TestOverride> = Lazy::new(|| {
|
||||
utils::env::var_serde_json_string(
|
||||
"NEON_PAGESERVER_UNIT_TEST_GET_VECTORED_CONCURRENT_IO",
|
||||
)
|
||||
.unwrap_or(TestOverride::SidecarTask)
|
||||
});
|
||||
|
||||
match *TEST_OVERRIDE {
|
||||
TestOverride::Sequential => SelectedIoConcurrency::Sequential,
|
||||
TestOverride::SidecarTask => {
|
||||
SelectedIoConcurrency::SidecarTask(gate.enter().expect("just created it"))
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
info!(?selected, "get_vectored_concurrent_io test");
|
||||
|
||||
Wrapper {
|
||||
inner: Self::spawn(selected),
|
||||
gate,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Make noise in case the [`ValuesReconstructState`] gets dropped while
|
||||
/// there are still IOs in flight.
|
||||
/// Refer to `collect_pending_ios` for why we prefer not to do that.
|
||||
//
|
||||
/// We log from here instead of from the sidecar task because the [`ValuesReconstructState`]
|
||||
/// gets dropped in a tracing span with more context.
|
||||
/// We repeat the sidecar tasks's `task_id` so we can correlate what we emit here with
|
||||
/// the logs / panic handler logs from the sidecar task, which also logs the `task_id`.
|
||||
impl Drop for ValuesReconstructState {
|
||||
fn drop(&mut self) {
|
||||
let num_active_ios = self
|
||||
.num_active_ios
|
||||
.load(std::sync::atomic::Ordering::Acquire);
|
||||
if num_active_ios == 0 {
|
||||
return;
|
||||
}
|
||||
let sidecar_task_id = match &self.io_concurrency {
|
||||
IoConcurrency::Sequential => None,
|
||||
IoConcurrency::SidecarTask { task_id, .. } => Some(*task_id),
|
||||
};
|
||||
tracing::warn!(
|
||||
num_active_ios,
|
||||
?sidecar_task_id,
|
||||
backtrace=%std::backtrace::Backtrace::force_capture(),
|
||||
"dropping ValuesReconstructState while some IOs have not been completed",
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
impl ValuesReconstructState {
|
||||
pub(crate) fn new() -> Self {
|
||||
pub(crate) fn new(io_concurrency: IoConcurrency) -> Self {
|
||||
Self {
|
||||
keys: HashMap::new(),
|
||||
keys_done: KeySpaceRandomAccum::new(),
|
||||
keys_with_image_coverage: None,
|
||||
layers_visited: 0,
|
||||
delta_layers_visited: 0,
|
||||
io_concurrency,
|
||||
num_active_ios: Arc::new(AtomicUsize::new(0)),
|
||||
}
|
||||
}
|
||||
|
||||
/// Associate a key with the error which it encountered and mark it as done
|
||||
pub(crate) fn on_key_error(&mut self, key: Key, err: PageReconstructError) {
|
||||
let previous = self.keys.insert(key, Err(err));
|
||||
if let Some(Ok(state)) = previous {
|
||||
if state.situation == ValueReconstructSituation::Continue {
|
||||
self.keys_done.add_key(key);
|
||||
}
|
||||
}
|
||||
/// Absolutely read [`IoConcurrency::spawn_io`] to learn about assumptions & pitfalls.
|
||||
pub(crate) async fn spawn_io<F>(&mut self, fut: F)
|
||||
where
|
||||
F: std::future::Future<Output = ()> + Send + 'static,
|
||||
{
|
||||
self.io_concurrency.spawn_io(fut).await;
|
||||
}
|
||||
|
||||
pub(crate) fn on_layer_visited(&mut self, layer: &ReadableLayer) {
|
||||
@@ -159,29 +627,6 @@ impl ValuesReconstructState {
|
||||
self.layers_visited
|
||||
}
|
||||
|
||||
/// This function is called after reading a keyspace from a layer.
|
||||
/// It checks if the read path has now moved past the cached Lsn for any keys.
|
||||
///
|
||||
/// Implementation note: We intentionally iterate over the keys for which we've
|
||||
/// already collected some reconstruct data. This avoids scaling complexity with
|
||||
/// the size of the search space.
|
||||
pub(crate) fn on_lsn_advanced(&mut self, keyspace: &KeySpace, advanced_to: Lsn) {
|
||||
for (key, value) in self.keys.iter_mut() {
|
||||
if !keyspace.contains(key) {
|
||||
continue;
|
||||
}
|
||||
|
||||
if let Ok(state) = value {
|
||||
if state.situation != ValueReconstructSituation::Complete
|
||||
&& state.get_cached_lsn() >= Some(advanced_to)
|
||||
{
|
||||
state.situation = ValueReconstructSituation::Complete;
|
||||
self.keys_done.add_key(*key);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// On hitting image layer, we can mark all keys in this range as done, because
|
||||
/// if the image layer does not contain a key, it is deleted/never added.
|
||||
pub(crate) fn on_image_layer_visited(&mut self, key_range: &Range<Key>) {
|
||||
@@ -199,70 +644,42 @@ impl ValuesReconstructState {
|
||||
///
|
||||
/// If the key is in the sparse keyspace (i.e., aux files), we do not track them in
|
||||
/// `key_done`.
|
||||
pub(crate) fn update_key(
|
||||
&mut self,
|
||||
key: &Key,
|
||||
lsn: Lsn,
|
||||
value: Value,
|
||||
) -> ValueReconstructSituation {
|
||||
let state = self
|
||||
.keys
|
||||
.entry(*key)
|
||||
.or_insert(Ok(VectoredValueReconstructState::default()));
|
||||
// TODO: rename this method & update description.
|
||||
pub(crate) fn update_key(&mut self, key: &Key, lsn: Lsn, completes: bool) -> OnDiskValueIo {
|
||||
let state = self.keys.entry(*key).or_default();
|
||||
|
||||
let is_sparse_key = key.is_sparse();
|
||||
if let Ok(state) = state {
|
||||
let key_done = match state.situation {
|
||||
ValueReconstructSituation::Complete => {
|
||||
if is_sparse_key {
|
||||
// Sparse keyspace might be visited multiple times because
|
||||
// we don't track unmapped keyspaces.
|
||||
return ValueReconstructSituation::Complete;
|
||||
} else {
|
||||
unreachable!()
|
||||
}
|
||||
}
|
||||
ValueReconstructSituation::Continue => match value {
|
||||
Value::Image(img) => {
|
||||
state.img = Some((lsn, img));
|
||||
true
|
||||
}
|
||||
Value::WalRecord(rec) => {
|
||||
debug_assert!(
|
||||
Some(lsn) > state.get_cached_lsn(),
|
||||
"Attempt to collect a record below cached LSN for walredo: {} < {}",
|
||||
lsn,
|
||||
state
|
||||
.get_cached_lsn()
|
||||
.expect("Assertion can only fire if a cached lsn is present")
|
||||
);
|
||||
|
||||
let will_init = rec.will_init();
|
||||
state.records.push((lsn, rec));
|
||||
will_init
|
||||
}
|
||||
},
|
||||
};
|
||||
|
||||
if key_done && state.situation == ValueReconstructSituation::Continue {
|
||||
state.situation = ValueReconstructSituation::Complete;
|
||||
if !is_sparse_key {
|
||||
self.keys_done.add_key(*key);
|
||||
let required_io = match state.situation {
|
||||
ValueReconstructSituation::Complete => {
|
||||
if is_sparse_key {
|
||||
// Sparse keyspace might be visited multiple times because
|
||||
// we don't track unmapped keyspaces.
|
||||
return OnDiskValueIo::Unnecessary;
|
||||
} else {
|
||||
unreachable!()
|
||||
}
|
||||
}
|
||||
ValueReconstructSituation::Continue => {
|
||||
self.num_active_ios
|
||||
.fetch_add(1, std::sync::atomic::Ordering::Release);
|
||||
let (tx, rx) = tokio::sync::oneshot::channel();
|
||||
state.on_disk_values.push((lsn, OnDiskValueIoWaiter { rx }));
|
||||
OnDiskValueIo::Required {
|
||||
tx,
|
||||
num_active_ios: Arc::clone(&self.num_active_ios),
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
state.situation
|
||||
} else {
|
||||
ValueReconstructSituation::Complete
|
||||
if completes && state.situation == ValueReconstructSituation::Continue {
|
||||
state.situation = ValueReconstructSituation::Complete;
|
||||
if !is_sparse_key {
|
||||
self.keys_done.add_key(*key);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Returns the Lsn at which this key is cached if one exists.
|
||||
/// The read path should go no further than this Lsn for the given key.
|
||||
pub(crate) fn get_cached_lsn(&self, key: &Key) -> Option<Lsn> {
|
||||
self.keys
|
||||
.get(key)
|
||||
.and_then(|k| k.as_ref().ok())
|
||||
.and_then(|state| state.get_cached_lsn())
|
||||
required_io
|
||||
}
|
||||
|
||||
/// Returns the key space describing the keys that have
|
||||
@@ -276,12 +693,6 @@ impl ValuesReconstructState {
|
||||
}
|
||||
}
|
||||
|
||||
impl Default for ValuesReconstructState {
|
||||
fn default() -> Self {
|
||||
Self::new()
|
||||
}
|
||||
}
|
||||
|
||||
/// A key that uniquely identifies a layer in a timeline
|
||||
#[derive(Debug, PartialEq, Eq, Clone, Hash)]
|
||||
pub(crate) enum LayerId {
|
||||
@@ -720,3 +1131,78 @@ impl<T: std::fmt::Display> std::fmt::Debug for RangeDisplayDebug<'_, T> {
|
||||
write!(f, "{}..{}", self.0.start, self.0.end)
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests2 {
|
||||
use pageserver_api::key::DBDIR_KEY;
|
||||
use tracing::info;
|
||||
|
||||
use super::*;
|
||||
use crate::tenant::storage_layer::IoConcurrency;
|
||||
|
||||
/// TODO: currently this test relies on manual visual inspection of the --no-capture output.
|
||||
/// Should look like so:
|
||||
/// ```text
|
||||
/// RUST_LOG=trace cargo nextest run --features testing --no-capture test_io_concurrency_noise
|
||||
/// running 1 test
|
||||
/// 2025-01-21T17:42:01.335679Z INFO get_vectored_concurrent_io test selected=SidecarTask
|
||||
/// 2025-01-21T17:42:01.335680Z TRACE spawning sidecar task task_id=0
|
||||
/// 2025-01-21T17:42:01.335937Z TRACE IoConcurrency_sidecar{task_id=0}: start
|
||||
/// 2025-01-21T17:42:01.335972Z TRACE IoConcurrency_sidecar{task_id=0}: received new io future
|
||||
/// 2025-01-21T17:42:01.335999Z INFO IoConcurrency_sidecar{task_id=0}: waiting for signal to complete IO
|
||||
/// 2025-01-21T17:42:01.336229Z WARN dropping ValuesReconstructState while some IOs have not been completed num_active_ios=1 sidecar_task_id=Some(0) backtrace= 0: <pageserver::tenant::storage_layer::ValuesReconstructState as core::ops::drop::Drop>::drop
|
||||
/// at ./src/tenant/storage_layer.rs:553:24
|
||||
/// 1: core::ptr::drop_in_place<pageserver::tenant::storage_layer::ValuesReconstructState>
|
||||
/// at /home/christian/.rustup/toolchains/1.84.0-x86_64-unknown-linux-gnu/lib/rustlib/src/rust/library/core/src/ptr/mod.rs:521:1
|
||||
/// 2: core::mem::drop
|
||||
/// at /home/christian/.rustup/toolchains/1.84.0-x86_64-unknown-linux-gnu/lib/rustlib/src/rust/library/core/src/mem/mod.rs:942:24
|
||||
/// 3: pageserver::tenant::storage_layer::tests2::test_io_concurrency_noise::{{closure}}
|
||||
/// at ./src/tenant/storage_layer.rs:1159:9
|
||||
/// ...
|
||||
/// 49: <unknown>
|
||||
/// 2025-01-21T17:42:01.452293Z INFO IoConcurrency_sidecar{task_id=0}: completing IO
|
||||
/// 2025-01-21T17:42:01.452357Z TRACE IoConcurrency_sidecar{task_id=0}: io future completed
|
||||
/// 2025-01-21T17:42:01.452473Z TRACE IoConcurrency_sidecar{task_id=0}: end
|
||||
/// test tenant::storage_layer::tests2::test_io_concurrency_noise ... ok
|
||||
///
|
||||
/// ```
|
||||
#[tokio::test]
|
||||
async fn test_io_concurrency_noise() {
|
||||
crate::tenant::harness::setup_logging();
|
||||
|
||||
let io_concurrency = IoConcurrency::spawn_for_test();
|
||||
match *io_concurrency {
|
||||
IoConcurrency::Sequential => {
|
||||
// This test asserts behavior in sidecar mode, doesn't make sense in sequential mode.
|
||||
return;
|
||||
}
|
||||
IoConcurrency::SidecarTask { .. } => {}
|
||||
}
|
||||
let mut reconstruct_state = ValuesReconstructState::new(io_concurrency.clone());
|
||||
|
||||
let (io_fut_is_waiting_tx, io_fut_is_waiting) = tokio::sync::oneshot::channel();
|
||||
let (do_complete_io, should_complete_io) = tokio::sync::oneshot::channel();
|
||||
let (io_fut_exiting_tx, io_fut_exiting) = tokio::sync::oneshot::channel();
|
||||
|
||||
let io = reconstruct_state.update_key(&DBDIR_KEY, Lsn(8), true);
|
||||
reconstruct_state
|
||||
.spawn_io(async move {
|
||||
info!("waiting for signal to complete IO");
|
||||
io_fut_is_waiting_tx.send(()).unwrap();
|
||||
should_complete_io.await.unwrap();
|
||||
info!("completing IO");
|
||||
io.complete(Ok(OnDiskValue::RawImage(Bytes::new())));
|
||||
io_fut_exiting_tx.send(()).unwrap();
|
||||
})
|
||||
.await;
|
||||
|
||||
io_fut_is_waiting.await.unwrap();
|
||||
|
||||
// this is what makes the noise
|
||||
drop(reconstruct_state);
|
||||
|
||||
do_complete_io.send(()).unwrap();
|
||||
|
||||
io_fut_exiting.await.unwrap();
|
||||
}
|
||||
}
|
||||
|
||||
@@ -87,6 +87,23 @@ impl BatchLayerWriter {
|
||||
));
|
||||
}
|
||||
|
||||
pub(crate) async fn finish(
|
||||
self,
|
||||
tline: &Arc<Timeline>,
|
||||
ctx: &RequestContext,
|
||||
) -> anyhow::Result<Vec<ResidentLayer>> {
|
||||
let res = self
|
||||
.finish_with_discard_fn(tline, ctx, |_| async { false })
|
||||
.await?;
|
||||
let mut output = Vec::new();
|
||||
for r in res {
|
||||
if let BatchWriterResult::Produced(layer) = r {
|
||||
output.push(layer);
|
||||
}
|
||||
}
|
||||
Ok(output)
|
||||
}
|
||||
|
||||
pub(crate) async fn finish_with_discard_fn<D, F>(
|
||||
self,
|
||||
tline: &Arc<Timeline>,
|
||||
|
||||
@@ -41,13 +41,12 @@ use crate::tenant::vectored_blob_io::{
|
||||
BlobFlag, BufView, StreamingVectoredReadPlanner, VectoredBlobReader, VectoredRead,
|
||||
VectoredReadPlanner,
|
||||
};
|
||||
use crate::tenant::PageReconstructError;
|
||||
use crate::virtual_file::owned_buffers_io::io_buf_ext::{FullSlice, IoBufExt};
|
||||
use crate::virtual_file::IoBufferMut;
|
||||
use crate::virtual_file::{self, MaybeFatalIo, VirtualFile};
|
||||
use crate::TEMP_FILE_SUFFIX;
|
||||
use crate::{DELTA_FILE_MAGIC, STORAGE_FORMAT_VERSION};
|
||||
use anyhow::{anyhow, bail, ensure, Context, Result};
|
||||
use anyhow::{bail, ensure, Context, Result};
|
||||
use camino::{Utf8Path, Utf8PathBuf};
|
||||
use futures::StreamExt;
|
||||
use itertools::Itertools;
|
||||
@@ -60,7 +59,7 @@ use pageserver_api::shard::TenantShardId;
|
||||
use pageserver_api::value::Value;
|
||||
use rand::{distributions::Alphanumeric, Rng};
|
||||
use serde::{Deserialize, Serialize};
|
||||
use std::collections::VecDeque;
|
||||
use std::collections::{HashMap, VecDeque};
|
||||
use std::fs::File;
|
||||
use std::io::SeekFrom;
|
||||
use std::ops::Range;
|
||||
@@ -77,7 +76,10 @@ use utils::{
|
||||
lsn::Lsn,
|
||||
};
|
||||
|
||||
use super::{AsLayerDesc, LayerName, PersistentLayerDesc, ValuesReconstructState};
|
||||
use super::{
|
||||
AsLayerDesc, LayerName, OnDiskValue, OnDiskValueIo, PersistentLayerDesc, ResidentLayer,
|
||||
ValuesReconstructState,
|
||||
};
|
||||
|
||||
///
|
||||
/// Header stored in the beginning of the file
|
||||
@@ -226,7 +228,7 @@ pub struct DeltaLayerInner {
|
||||
index_start_blk: u32,
|
||||
index_root_blk: u32,
|
||||
|
||||
file: VirtualFile,
|
||||
file: Arc<VirtualFile>,
|
||||
file_id: FileId,
|
||||
|
||||
layer_key_range: Range<Key>,
|
||||
@@ -795,9 +797,11 @@ impl DeltaLayerInner {
|
||||
max_vectored_read_bytes: Option<MaxVectoredReadBytes>,
|
||||
ctx: &RequestContext,
|
||||
) -> anyhow::Result<Self> {
|
||||
let file = VirtualFile::open_v2(path, ctx)
|
||||
.await
|
||||
.context("open layer file")?;
|
||||
let file = Arc::new(
|
||||
VirtualFile::open_v2(path, ctx)
|
||||
.await
|
||||
.context("open layer file")?,
|
||||
);
|
||||
|
||||
let file_id = page_cache::next_file_id();
|
||||
|
||||
@@ -842,12 +846,11 @@ impl DeltaLayerInner {
|
||||
// Look up the keys in the provided keyspace and update
|
||||
// the reconstruct state with whatever is found.
|
||||
//
|
||||
// If the key is cached, go no further than the cached Lsn.
|
||||
//
|
||||
// Currently, the index is visited for each range, but this
|
||||
// can be further optimised to visit the index only once.
|
||||
pub(super) async fn get_values_reconstruct_data(
|
||||
&self,
|
||||
this: ResidentLayer,
|
||||
keyspace: KeySpace,
|
||||
lsn_range: Range<Lsn>,
|
||||
reconstruct_state: &mut ValuesReconstructState,
|
||||
@@ -875,17 +878,14 @@ impl DeltaLayerInner {
|
||||
data_end_offset,
|
||||
index_reader,
|
||||
planner,
|
||||
reconstruct_state,
|
||||
ctx,
|
||||
)
|
||||
.await
|
||||
.map_err(GetVectoredError::Other)?;
|
||||
|
||||
self.do_reads_and_update_state(reads, reconstruct_state, ctx)
|
||||
self.do_reads_and_update_state(this, reads, reconstruct_state, ctx)
|
||||
.await;
|
||||
|
||||
reconstruct_state.on_lsn_advanced(&keyspace, lsn_range.start);
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
@@ -895,7 +895,6 @@ impl DeltaLayerInner {
|
||||
data_end_offset: u64,
|
||||
index_reader: DiskBtreeReader<Reader, DELTA_KEY_SIZE>,
|
||||
mut planner: VectoredReadPlanner,
|
||||
reconstruct_state: &mut ValuesReconstructState,
|
||||
ctx: &RequestContext,
|
||||
) -> anyhow::Result<Vec<VectoredRead>>
|
||||
where
|
||||
@@ -922,10 +921,9 @@ impl DeltaLayerInner {
|
||||
assert!(key >= range.start);
|
||||
|
||||
let outside_lsn_range = !lsn_range.contains(&lsn);
|
||||
let below_cached_lsn = reconstruct_state.get_cached_lsn(&key) >= Some(lsn);
|
||||
|
||||
let flag = {
|
||||
if outside_lsn_range || below_cached_lsn {
|
||||
if outside_lsn_range {
|
||||
BlobFlag::Ignore
|
||||
} else if blob_ref.will_init() {
|
||||
BlobFlag::ReplaceAll
|
||||
@@ -994,98 +992,78 @@ impl DeltaLayerInner {
|
||||
|
||||
async fn do_reads_and_update_state(
|
||||
&self,
|
||||
this: ResidentLayer,
|
||||
reads: Vec<VectoredRead>,
|
||||
reconstruct_state: &mut ValuesReconstructState,
|
||||
ctx: &RequestContext,
|
||||
) {
|
||||
let vectored_blob_reader = VectoredBlobReader::new(&self.file);
|
||||
let mut ignore_key_with_err = None;
|
||||
|
||||
let max_vectored_read_bytes = self
|
||||
.max_vectored_read_bytes
|
||||
.expect("Layer is loaded with max vectored bytes config")
|
||||
.0
|
||||
.into();
|
||||
let buf_size = Self::get_min_read_buffer_size(&reads, max_vectored_read_bytes);
|
||||
let mut buf = Some(IoBufferMut::with_capacity(buf_size));
|
||||
|
||||
// Note that reads are processed in reverse order (from highest key+lsn).
|
||||
// This is the order that `ReconstructState` requires such that it can
|
||||
// track when a key is done.
|
||||
for read in reads.into_iter().rev() {
|
||||
let res = vectored_blob_reader
|
||||
.read_blobs(&read, buf.take().expect("Should have a buffer"), ctx)
|
||||
.await;
|
||||
|
||||
let blobs_buf = match res {
|
||||
Ok(blobs_buf) => blobs_buf,
|
||||
Err(err) => {
|
||||
let kind = err.kind();
|
||||
for (_, blob_meta) in read.blobs_at.as_slice() {
|
||||
reconstruct_state.on_key_error(
|
||||
blob_meta.key,
|
||||
PageReconstructError::Other(anyhow!(
|
||||
"Failed to read blobs from virtual file {}: {}",
|
||||
self.file.path(),
|
||||
kind
|
||||
)),
|
||||
);
|
||||
}
|
||||
|
||||
// We have "lost" the buffer since the lower level IO api
|
||||
// doesn't return the buffer on error. Allocate a new one.
|
||||
buf = Some(IoBufferMut::with_capacity(buf_size));
|
||||
|
||||
continue;
|
||||
}
|
||||
};
|
||||
let view = BufView::new_slice(&blobs_buf.buf);
|
||||
for meta in blobs_buf.blobs.iter().rev() {
|
||||
if Some(meta.meta.key) == ignore_key_with_err {
|
||||
continue;
|
||||
}
|
||||
let blob_read = meta.read(&view).await;
|
||||
let blob_read = match blob_read {
|
||||
Ok(buf) => buf,
|
||||
Err(e) => {
|
||||
reconstruct_state.on_key_error(
|
||||
meta.meta.key,
|
||||
PageReconstructError::Other(anyhow!(e).context(format!(
|
||||
"Failed to decompress blob from virtual file {}",
|
||||
self.file.path(),
|
||||
))),
|
||||
);
|
||||
|
||||
ignore_key_with_err = Some(meta.meta.key);
|
||||
continue;
|
||||
}
|
||||
};
|
||||
|
||||
let value = Value::des(&blob_read);
|
||||
|
||||
let value = match value {
|
||||
Ok(v) => v,
|
||||
Err(e) => {
|
||||
reconstruct_state.on_key_error(
|
||||
meta.meta.key,
|
||||
PageReconstructError::Other(anyhow!(e).context(format!(
|
||||
"Failed to deserialize blob from virtual file {}",
|
||||
self.file.path(),
|
||||
))),
|
||||
);
|
||||
|
||||
ignore_key_with_err = Some(meta.meta.key);
|
||||
continue;
|
||||
}
|
||||
};
|
||||
|
||||
// Invariant: once a key reaches [`ValueReconstructSituation::Complete`]
|
||||
// state, no further updates shall be made to it. The call below will
|
||||
// panic if the invariant is violated.
|
||||
reconstruct_state.update_key(&meta.meta.key, meta.meta.lsn, value);
|
||||
let mut ios: HashMap<(Key, Lsn), OnDiskValueIo> = Default::default();
|
||||
for (_, blob_meta) in read.blobs_at.as_slice().iter().rev() {
|
||||
let io = reconstruct_state.update_key(
|
||||
&blob_meta.key,
|
||||
blob_meta.lsn,
|
||||
blob_meta.will_init,
|
||||
);
|
||||
ios.insert((blob_meta.key, blob_meta.lsn), io);
|
||||
}
|
||||
|
||||
buf = Some(blobs_buf.buf);
|
||||
let read_extend_residency = this.clone();
|
||||
let read_from = self.file.clone();
|
||||
let read_ctx = ctx.attached_child();
|
||||
reconstruct_state
|
||||
.spawn_io(async move {
|
||||
let vectored_blob_reader = VectoredBlobReader::new(&read_from);
|
||||
let buf = IoBufferMut::with_capacity(buf_size);
|
||||
|
||||
let res = vectored_blob_reader.read_blobs(&read, buf, &read_ctx).await;
|
||||
match res {
|
||||
Ok(blobs_buf) => {
|
||||
let view = BufView::new_slice(&blobs_buf.buf);
|
||||
for meta in blobs_buf.blobs.iter().rev() {
|
||||
let io = ios.remove(&(meta.meta.key, meta.meta.lsn)).unwrap();
|
||||
|
||||
let blob_read = meta.read(&view).await;
|
||||
let blob_read = match blob_read {
|
||||
Ok(buf) => buf,
|
||||
Err(e) => {
|
||||
io.complete(Err(e));
|
||||
continue;
|
||||
}
|
||||
};
|
||||
|
||||
io.complete(Ok(OnDiskValue::WalRecordOrImage(
|
||||
blob_read.into_bytes(),
|
||||
)));
|
||||
}
|
||||
|
||||
assert!(ios.is_empty());
|
||||
}
|
||||
Err(err) => {
|
||||
for (_, sender) in ios {
|
||||
sender.complete(Err(std::io::Error::new(
|
||||
err.kind(),
|
||||
"vec read failed",
|
||||
)));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// keep layer resident until this IO is done; this spawned IO future generally outlives the
|
||||
// call to `self` / the `Arc<DownloadedLayer>` / the `ResidentLayer` that guarantees residency
|
||||
drop(read_extend_residency);
|
||||
})
|
||||
.await;
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1224,7 +1202,14 @@ impl DeltaLayerInner {
|
||||
let actionable = if let Some((key, lsn, start_offset)) = prev.take() {
|
||||
let end_offset = offset;
|
||||
|
||||
Some((BlobMeta { key, lsn }, start_offset..end_offset))
|
||||
Some((
|
||||
BlobMeta {
|
||||
key,
|
||||
lsn,
|
||||
will_init: false,
|
||||
},
|
||||
start_offset..end_offset,
|
||||
))
|
||||
} else {
|
||||
None
|
||||
};
|
||||
@@ -1560,7 +1545,9 @@ impl DeltaLayerIterator<'_> {
|
||||
let lsn = DeltaKey::extract_lsn_from_buf(&raw_key);
|
||||
let blob_ref = BlobRef(value);
|
||||
let offset = blob_ref.pos();
|
||||
if let Some(batch_plan) = self.planner.handle(key, lsn, offset) {
|
||||
if let Some(batch_plan) =
|
||||
self.planner.handle(key, lsn, offset, blob_ref.will_init())
|
||||
{
|
||||
break batch_plan;
|
||||
}
|
||||
} else {
|
||||
@@ -1673,7 +1660,6 @@ pub(crate) mod test {
|
||||
.expect("In memory disk finish should never fail");
|
||||
let reader = DiskBtreeReader::<_, DELTA_KEY_SIZE>::new(0, root_offset, disk);
|
||||
let planner = VectoredReadPlanner::new(100);
|
||||
let mut reconstruct_state = ValuesReconstructState::new();
|
||||
let ctx = RequestContext::new(TaskKind::UnitTest, DownloadBehavior::Error);
|
||||
|
||||
let keyspace = KeySpace {
|
||||
@@ -1691,7 +1677,6 @@ pub(crate) mod test {
|
||||
disk_offset,
|
||||
reader,
|
||||
planner,
|
||||
&mut reconstruct_state,
|
||||
&ctx,
|
||||
)
|
||||
.await
|
||||
@@ -1935,7 +1920,6 @@ pub(crate) mod test {
|
||||
);
|
||||
|
||||
let planner = VectoredReadPlanner::new(constants::MAX_VECTORED_READ_BYTES);
|
||||
let mut reconstruct_state = ValuesReconstructState::new();
|
||||
let keyspace = pick_random_keyspace(rng, &entries_meta.key_range);
|
||||
let data_end_offset = inner.index_start_blk as u64 * PAGE_SZ as u64;
|
||||
|
||||
@@ -1945,7 +1929,6 @@ pub(crate) mod test {
|
||||
data_end_offset,
|
||||
index_reader,
|
||||
planner,
|
||||
&mut reconstruct_state,
|
||||
&ctx,
|
||||
)
|
||||
.await?;
|
||||
|
||||
@@ -38,12 +38,11 @@ use crate::tenant::vectored_blob_io::{
|
||||
BlobFlag, BufView, StreamingVectoredReadPlanner, VectoredBlobReader, VectoredRead,
|
||||
VectoredReadPlanner,
|
||||
};
|
||||
use crate::tenant::PageReconstructError;
|
||||
use crate::virtual_file::owned_buffers_io::io_buf_ext::IoBufExt;
|
||||
use crate::virtual_file::IoBufferMut;
|
||||
use crate::virtual_file::{self, MaybeFatalIo, VirtualFile};
|
||||
use crate::{IMAGE_FILE_MAGIC, STORAGE_FORMAT_VERSION, TEMP_FILE_SUFFIX};
|
||||
use anyhow::{anyhow, bail, ensure, Context, Result};
|
||||
use anyhow::{bail, ensure, Context, Result};
|
||||
use bytes::Bytes;
|
||||
use camino::{Utf8Path, Utf8PathBuf};
|
||||
use hex;
|
||||
@@ -56,12 +55,13 @@ use pageserver_api::shard::{ShardIdentity, TenantShardId};
|
||||
use pageserver_api::value::Value;
|
||||
use rand::{distributions::Alphanumeric, Rng};
|
||||
use serde::{Deserialize, Serialize};
|
||||
use std::collections::VecDeque;
|
||||
use std::collections::{HashMap, VecDeque};
|
||||
use std::fs::File;
|
||||
use std::io::SeekFrom;
|
||||
use std::ops::Range;
|
||||
use std::os::unix::prelude::FileExt;
|
||||
use std::str::FromStr;
|
||||
use std::sync::Arc;
|
||||
use tokio::sync::OnceCell;
|
||||
use tokio_stream::StreamExt;
|
||||
use tracing::*;
|
||||
@@ -73,7 +73,10 @@ use utils::{
|
||||
};
|
||||
|
||||
use super::layer_name::ImageLayerName;
|
||||
use super::{AsLayerDesc, LayerName, PersistentLayerDesc, ValuesReconstructState};
|
||||
use super::{
|
||||
AsLayerDesc, LayerName, OnDiskValue, OnDiskValueIo, PersistentLayerDesc, ResidentLayer,
|
||||
ValuesReconstructState,
|
||||
};
|
||||
|
||||
///
|
||||
/// Header stored in the beginning of the file
|
||||
@@ -164,7 +167,7 @@ pub struct ImageLayerInner {
|
||||
key_range: Range<Key>,
|
||||
lsn: Lsn,
|
||||
|
||||
file: VirtualFile,
|
||||
file: Arc<VirtualFile>,
|
||||
file_id: FileId,
|
||||
|
||||
max_vectored_read_bytes: Option<MaxVectoredReadBytes>,
|
||||
@@ -391,9 +394,11 @@ impl ImageLayerInner {
|
||||
max_vectored_read_bytes: Option<MaxVectoredReadBytes>,
|
||||
ctx: &RequestContext,
|
||||
) -> anyhow::Result<Self> {
|
||||
let file = VirtualFile::open_v2(path, ctx)
|
||||
.await
|
||||
.context("open layer file")?;
|
||||
let file = Arc::new(
|
||||
VirtualFile::open_v2(path, ctx)
|
||||
.await
|
||||
.context("open layer file")?,
|
||||
);
|
||||
let file_id = page_cache::next_file_id();
|
||||
let block_reader = FileBlockReader::new(&file, file_id);
|
||||
let summary_blk = block_reader
|
||||
@@ -439,6 +444,7 @@ impl ImageLayerInner {
|
||||
// the reconstruct state with whatever is found.
|
||||
pub(super) async fn get_values_reconstruct_data(
|
||||
&self,
|
||||
this: ResidentLayer,
|
||||
keyspace: KeySpace,
|
||||
reconstruct_state: &mut ValuesReconstructState,
|
||||
ctx: &RequestContext,
|
||||
@@ -448,7 +454,7 @@ impl ImageLayerInner {
|
||||
.await
|
||||
.map_err(GetVectoredError::Other)?;
|
||||
|
||||
self.do_reads_and_update_state(reads, reconstruct_state, ctx)
|
||||
self.do_reads_and_update_state(this, reads, reconstruct_state, ctx)
|
||||
.await;
|
||||
|
||||
reconstruct_state.on_image_layer_visited(&self.key_range);
|
||||
@@ -570,6 +576,7 @@ impl ImageLayerInner {
|
||||
|
||||
async fn do_reads_and_update_state(
|
||||
&self,
|
||||
this: ResidentLayer,
|
||||
reads: Vec<VectoredRead>,
|
||||
reconstruct_state: &mut ValuesReconstructState,
|
||||
ctx: &RequestContext,
|
||||
@@ -580,8 +587,13 @@ impl ImageLayerInner {
|
||||
.0
|
||||
.into();
|
||||
|
||||
let vectored_blob_reader = VectoredBlobReader::new(&self.file);
|
||||
for read in reads.into_iter() {
|
||||
let mut ios: HashMap<(Key, Lsn), OnDiskValueIo> = Default::default();
|
||||
for (_, blob_meta) in read.blobs_at.as_slice() {
|
||||
let io = reconstruct_state.update_key(&blob_meta.key, blob_meta.lsn, true);
|
||||
ios.insert((blob_meta.key, blob_meta.lsn), io);
|
||||
}
|
||||
|
||||
let buf_size = read.size();
|
||||
|
||||
if buf_size > max_vectored_read_bytes {
|
||||
@@ -611,50 +623,51 @@ impl ImageLayerInner {
|
||||
}
|
||||
}
|
||||
|
||||
let buf = IoBufferMut::with_capacity(buf_size);
|
||||
let res = vectored_blob_reader.read_blobs(&read, buf, ctx).await;
|
||||
let read_extend_residency = this.clone();
|
||||
let read_from = self.file.clone();
|
||||
let read_ctx = ctx.attached_child();
|
||||
reconstruct_state
|
||||
.spawn_io(async move {
|
||||
let buf = IoBufferMut::with_capacity(buf_size);
|
||||
let vectored_blob_reader = VectoredBlobReader::new(&read_from);
|
||||
let res = vectored_blob_reader.read_blobs(&read, buf, &read_ctx).await;
|
||||
|
||||
match res {
|
||||
Ok(blobs_buf) => {
|
||||
let view = BufView::new_slice(&blobs_buf.buf);
|
||||
for meta in blobs_buf.blobs.iter() {
|
||||
let img_buf = meta.read(&view).await;
|
||||
match res {
|
||||
Ok(blobs_buf) => {
|
||||
let view = BufView::new_slice(&blobs_buf.buf);
|
||||
for meta in blobs_buf.blobs.iter() {
|
||||
let io: OnDiskValueIo =
|
||||
ios.remove(&(meta.meta.key, meta.meta.lsn)).unwrap();
|
||||
let img_buf = meta.read(&view).await;
|
||||
|
||||
let img_buf = match img_buf {
|
||||
Ok(img_buf) => img_buf,
|
||||
Err(e) => {
|
||||
reconstruct_state.on_key_error(
|
||||
meta.meta.key,
|
||||
PageReconstructError::Other(anyhow!(e).context(format!(
|
||||
"Failed to decompress blob from virtual file {}",
|
||||
self.file.path(),
|
||||
))),
|
||||
);
|
||||
let img_buf = match img_buf {
|
||||
Ok(img_buf) => img_buf,
|
||||
Err(e) => {
|
||||
io.complete(Err(e));
|
||||
continue;
|
||||
}
|
||||
};
|
||||
|
||||
continue;
|
||||
io.complete(Ok(OnDiskValue::RawImage(img_buf.into_bytes())));
|
||||
}
|
||||
};
|
||||
reconstruct_state.update_key(
|
||||
&meta.meta.key,
|
||||
self.lsn,
|
||||
Value::Image(img_buf.into_bytes()),
|
||||
);
|
||||
|
||||
assert!(ios.is_empty());
|
||||
}
|
||||
Err(err) => {
|
||||
for (_, io) in ios {
|
||||
io.complete(Err(std::io::Error::new(
|
||||
err.kind(),
|
||||
"vec read failed",
|
||||
)));
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
Err(err) => {
|
||||
let kind = err.kind();
|
||||
for (_, blob_meta) in read.blobs_at.as_slice() {
|
||||
reconstruct_state.on_key_error(
|
||||
blob_meta.key,
|
||||
PageReconstructError::from(anyhow!(
|
||||
"Failed to read blobs from virtual file {}: {}",
|
||||
self.file.path(),
|
||||
kind
|
||||
)),
|
||||
);
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
// keep layer resident until this IO is done; this spawned IO future generally outlives the
|
||||
// call to `self` / the `Arc<DownloadedLayer>` / the `ResidentLayer` that guarantees residency
|
||||
drop(read_extend_residency);
|
||||
})
|
||||
.await;
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1069,6 +1082,7 @@ impl ImageLayerIterator<'_> {
|
||||
Key::from_slice(&raw_key[..KEY_SIZE]),
|
||||
self.image_layer.lsn,
|
||||
offset,
|
||||
true,
|
||||
) {
|
||||
break batch_plan;
|
||||
}
|
||||
|
||||
@@ -8,23 +8,22 @@ use crate::assert_u64_eq_usize::{u64_to_usize, U64IsUsize, UsizeIsU64};
|
||||
use crate::config::PageServerConf;
|
||||
use crate::context::{PageContentKind, RequestContext, RequestContextBuilder};
|
||||
use crate::tenant::ephemeral_file::EphemeralFile;
|
||||
use crate::tenant::storage_layer::{OnDiskValue, OnDiskValueIo};
|
||||
use crate::tenant::timeline::GetVectoredError;
|
||||
use crate::tenant::PageReconstructError;
|
||||
use crate::virtual_file::owned_buffers_io::io_buf_ext::IoBufExt;
|
||||
use crate::{l0_flush, page_cache};
|
||||
use anyhow::{anyhow, Result};
|
||||
use anyhow::Result;
|
||||
use camino::Utf8PathBuf;
|
||||
use pageserver_api::key::CompactKey;
|
||||
use pageserver_api::key::Key;
|
||||
use pageserver_api::keyspace::KeySpace;
|
||||
use pageserver_api::models::InMemoryLayerInfo;
|
||||
use pageserver_api::shard::TenantShardId;
|
||||
use pageserver_api::value::Value;
|
||||
use std::collections::{BTreeMap, HashMap};
|
||||
use std::sync::{Arc, OnceLock};
|
||||
use std::time::Instant;
|
||||
use tracing::*;
|
||||
use utils::{bin_ser::BeSer, id::TimelineId, lsn::Lsn, vec_map::VecMap};
|
||||
use utils::{id::TimelineId, lsn::Lsn, vec_map::VecMap};
|
||||
use wal_decoder::serialized_batch::{SerializedValueBatch, SerializedValueMeta, ValueMeta};
|
||||
// avoid binding to Write (conflicts with std::io::Write)
|
||||
// while being able to use std::fmt::Write's methods
|
||||
@@ -36,9 +35,7 @@ use std::sync::atomic::Ordering as AtomicOrdering;
|
||||
use std::sync::atomic::{AtomicU64, AtomicUsize};
|
||||
use tokio::sync::RwLock;
|
||||
|
||||
use super::{
|
||||
DeltaLayerWriter, PersistentLayerDesc, ValueReconstructSituation, ValuesReconstructState,
|
||||
};
|
||||
use super::{DeltaLayerWriter, PersistentLayerDesc, ValuesReconstructState};
|
||||
|
||||
pub(crate) mod vectored_dio_read;
|
||||
|
||||
@@ -415,10 +412,8 @@ impl InMemoryLayer {
|
||||
|
||||
// Look up the keys in the provided keyspace and update
|
||||
// the reconstruct state with whatever is found.
|
||||
//
|
||||
// If the key is cached, go no further than the cached Lsn.
|
||||
pub(crate) async fn get_values_reconstruct_data(
|
||||
&self,
|
||||
self: &Arc<InMemoryLayer>,
|
||||
keyspace: KeySpace,
|
||||
end_lsn: Lsn,
|
||||
reconstruct_state: &mut ValuesReconstructState,
|
||||
@@ -435,6 +430,9 @@ impl InMemoryLayer {
|
||||
read: vectored_dio_read::LogicalRead<Vec<u8>>,
|
||||
}
|
||||
let mut reads: HashMap<Key, Vec<ValueRead>> = HashMap::new();
|
||||
let mut ios: HashMap<(Key, Lsn), OnDiskValueIo> = Default::default();
|
||||
|
||||
let lsn_range = self.start_lsn..end_lsn;
|
||||
|
||||
for range in keyspace.ranges.iter() {
|
||||
for (key, vec_map) in inner
|
||||
@@ -442,12 +440,7 @@ impl InMemoryLayer {
|
||||
.range(range.start.to_compact()..range.end.to_compact())
|
||||
{
|
||||
let key = Key::from_compact(*key);
|
||||
let lsn_range = match reconstruct_state.get_cached_lsn(&key) {
|
||||
Some(cached_lsn) => (cached_lsn + 1)..end_lsn,
|
||||
None => self.start_lsn..end_lsn,
|
||||
};
|
||||
|
||||
let slice = vec_map.slice_range(lsn_range);
|
||||
let slice = vec_map.slice_range(lsn_range.clone());
|
||||
|
||||
for (entry_lsn, index_entry) in slice.iter().rev() {
|
||||
let IndexEntryUnpacked {
|
||||
@@ -463,55 +456,59 @@ impl InMemoryLayer {
|
||||
Vec::with_capacity(len as usize),
|
||||
),
|
||||
});
|
||||
|
||||
let io = reconstruct_state.update_key(&key, *entry_lsn, will_init);
|
||||
ios.insert((key, *entry_lsn), io);
|
||||
|
||||
if will_init {
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
drop(inner); // release the lock before we spawn the IO; if it's serial-mode IO we will deadlock on the read().await below
|
||||
let read_from = Arc::clone(self);
|
||||
let read_ctx = ctx.attached_child();
|
||||
reconstruct_state
|
||||
.spawn_io(async move {
|
||||
let inner = read_from.inner.read().await;
|
||||
let f = vectored_dio_read::execute(
|
||||
&inner.file,
|
||||
reads
|
||||
.iter()
|
||||
.flat_map(|(_, value_reads)| value_reads.iter().map(|v| &v.read)),
|
||||
&read_ctx,
|
||||
);
|
||||
send_future::SendFuture::send(f) // https://github.com/rust-lang/rust/issues/96865
|
||||
.await;
|
||||
|
||||
// Execute the reads.
|
||||
|
||||
let f = vectored_dio_read::execute(
|
||||
&inner.file,
|
||||
reads
|
||||
.iter()
|
||||
.flat_map(|(_, value_reads)| value_reads.iter().map(|v| &v.read)),
|
||||
&ctx,
|
||||
);
|
||||
send_future::SendFuture::send(f) // https://github.com/rust-lang/rust/issues/96865
|
||||
.await;
|
||||
|
||||
// Process results into the reconstruct state
|
||||
'next_key: for (key, value_reads) in reads {
|
||||
for ValueRead { entry_lsn, read } in value_reads {
|
||||
match read.into_result().expect("we run execute() above") {
|
||||
Err(e) => {
|
||||
reconstruct_state.on_key_error(key, PageReconstructError::from(anyhow!(e)));
|
||||
continue 'next_key;
|
||||
}
|
||||
Ok(value_buf) => {
|
||||
let value = Value::des(&value_buf);
|
||||
if let Err(e) = value {
|
||||
reconstruct_state
|
||||
.on_key_error(key, PageReconstructError::from(anyhow!(e)));
|
||||
continue 'next_key;
|
||||
for (key, value_reads) in reads {
|
||||
for ValueRead { entry_lsn, read } in value_reads {
|
||||
let io = ios.remove(&(key, entry_lsn)).expect("sender must exist");
|
||||
match read.into_result().expect("we run execute() above") {
|
||||
Err(e) => {
|
||||
io.complete(Err(std::io::Error::new(
|
||||
e.kind(),
|
||||
"dio vec read failed",
|
||||
)));
|
||||
}
|
||||
Ok(value_buf) => {
|
||||
io.complete(Ok(OnDiskValue::WalRecordOrImage(value_buf.into())));
|
||||
}
|
||||
}
|
||||
|
||||
let key_situation =
|
||||
reconstruct_state.update_key(&key, entry_lsn, value.unwrap());
|
||||
if key_situation == ValueReconstructSituation::Complete {
|
||||
// TODO: metric to see if we fetched more values than necessary
|
||||
continue 'next_key;
|
||||
}
|
||||
|
||||
// process the next value in the next iteration of the loop
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
reconstruct_state.on_lsn_advanced(&keyspace, self.start_lsn);
|
||||
assert!(ios.is_empty());
|
||||
|
||||
// Keep layer existent until this IO is done;
|
||||
// This is kinda forced for InMemoryLayer because we need to inner.read() anyway,
|
||||
// but it's less obvious for DeltaLayer and ImageLayer. So, keep this explicit
|
||||
// drop for consistency among all three layer types.
|
||||
drop(inner);
|
||||
drop(read_from);
|
||||
})
|
||||
.await;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
@@ -606,6 +603,7 @@ impl InMemoryLayer {
|
||||
// Write the batch to the file
|
||||
inner.file.write_raw(&raw, ctx).await?;
|
||||
let new_size = inner.file.len();
|
||||
|
||||
let expected_new_len = base_offset
|
||||
.checked_add(raw.len().into_u64())
|
||||
// write_raw would error if we were to overflow u64.
|
||||
|
||||
@@ -308,7 +308,7 @@ impl Layer {
|
||||
reconstruct_data: &mut ValuesReconstructState,
|
||||
ctx: &RequestContext,
|
||||
) -> Result<(), GetVectoredError> {
|
||||
let layer = self
|
||||
let downloaded = self
|
||||
.0
|
||||
.get_or_maybe_download(true, Some(ctx))
|
||||
.await
|
||||
@@ -318,11 +318,15 @@ impl Layer {
|
||||
}
|
||||
other => GetVectoredError::Other(anyhow::anyhow!(other)),
|
||||
})?;
|
||||
let this = ResidentLayer {
|
||||
downloaded: downloaded.clone(),
|
||||
owner: self.clone(),
|
||||
};
|
||||
|
||||
self.record_access(ctx);
|
||||
|
||||
layer
|
||||
.get_values_reconstruct_data(keyspace, lsn_range, reconstruct_data, &self.0, ctx)
|
||||
downloaded
|
||||
.get_values_reconstruct_data(this, keyspace, lsn_range, reconstruct_data, ctx)
|
||||
.instrument(tracing::debug_span!("get_values_reconstruct_data", layer=%self))
|
||||
.await
|
||||
.map_err(|err| match err {
|
||||
@@ -1768,25 +1772,25 @@ impl DownloadedLayer {
|
||||
|
||||
async fn get_values_reconstruct_data(
|
||||
&self,
|
||||
this: ResidentLayer,
|
||||
keyspace: KeySpace,
|
||||
lsn_range: Range<Lsn>,
|
||||
reconstruct_data: &mut ValuesReconstructState,
|
||||
owner: &Arc<LayerInner>,
|
||||
ctx: &RequestContext,
|
||||
) -> Result<(), GetVectoredError> {
|
||||
use LayerKind::*;
|
||||
|
||||
match self
|
||||
.get(owner, ctx)
|
||||
.get(&this.owner.0, ctx)
|
||||
.await
|
||||
.map_err(GetVectoredError::Other)?
|
||||
{
|
||||
Delta(d) => {
|
||||
d.get_values_reconstruct_data(keyspace, lsn_range, reconstruct_data, ctx)
|
||||
d.get_values_reconstruct_data(this, keyspace, lsn_range, reconstruct_data, ctx)
|
||||
.await
|
||||
}
|
||||
Image(i) => {
|
||||
i.get_values_reconstruct_data(keyspace, reconstruct_data, ctx)
|
||||
i.get_values_reconstruct_data(this, keyspace, reconstruct_data, ctx)
|
||||
.await
|
||||
}
|
||||
}
|
||||
|
||||
@@ -11,7 +11,10 @@ use super::failpoints::{Failpoint, FailpointKind};
|
||||
use super::*;
|
||||
use crate::{
|
||||
context::DownloadBehavior,
|
||||
tenant::{harness::test_img, storage_layer::LayerVisibilityHint},
|
||||
tenant::{
|
||||
harness::test_img,
|
||||
storage_layer::{IoConcurrency, LayerVisibilityHint},
|
||||
},
|
||||
};
|
||||
use crate::{task_mgr::TaskKind, tenant::harness::TenantHarness};
|
||||
|
||||
@@ -31,6 +34,7 @@ async fn smoke_test() {
|
||||
let span = h.span();
|
||||
let download_span = span.in_scope(|| tracing::info_span!("downloading", timeline_id = 1));
|
||||
let (tenant, _) = h.load().await;
|
||||
let io_concurrency = IoConcurrency::spawn_for_test();
|
||||
|
||||
let ctx = RequestContext::new(TaskKind::UnitTest, DownloadBehavior::Download);
|
||||
|
||||
@@ -89,7 +93,7 @@ async fn smoke_test() {
|
||||
};
|
||||
|
||||
let img_before = {
|
||||
let mut data = ValuesReconstructState::default();
|
||||
let mut data = ValuesReconstructState::new(io_concurrency.clone());
|
||||
layer
|
||||
.get_values_reconstruct_data(
|
||||
controlfile_keyspace.clone(),
|
||||
@@ -99,10 +103,13 @@ async fn smoke_test() {
|
||||
)
|
||||
.await
|
||||
.unwrap();
|
||||
|
||||
data.keys
|
||||
.remove(&CONTROLFILE_KEY)
|
||||
.expect("must be present")
|
||||
.expect("should not error")
|
||||
.collect_pending_ios()
|
||||
.await
|
||||
.expect("must not error")
|
||||
.img
|
||||
.take()
|
||||
.expect("tenant harness writes the control file")
|
||||
@@ -121,7 +128,7 @@ async fn smoke_test() {
|
||||
|
||||
// on accesses when the layer is evicted, it will automatically be downloaded.
|
||||
let img_after = {
|
||||
let mut data = ValuesReconstructState::default();
|
||||
let mut data = ValuesReconstructState::new(io_concurrency.clone());
|
||||
layer
|
||||
.get_values_reconstruct_data(
|
||||
controlfile_keyspace.clone(),
|
||||
@@ -135,7 +142,9 @@ async fn smoke_test() {
|
||||
data.keys
|
||||
.remove(&CONTROLFILE_KEY)
|
||||
.expect("must be present")
|
||||
.expect("should not error")
|
||||
.collect_pending_ios()
|
||||
.await
|
||||
.expect("must not error")
|
||||
.img
|
||||
.take()
|
||||
.expect("tenant harness writes the control file")
|
||||
|
||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user