Merge pull request #8598 from neondatabase/rc/2024-08-05

Storage & Compute release 2024-08-05
This commit is contained in:
Arpad Müller
2024-08-05 14:21:20 +02:00
committed by GitHub
88 changed files with 5767 additions and 1214 deletions

View File

@@ -14,11 +14,8 @@ inputs:
api_host:
description: 'Neon API host'
default: console-stage.neon.build
provisioner:
description: 'k8s-pod or k8s-neonvm'
default: 'k8s-pod'
compute_units:
description: '[Min, Max] compute units; Min and Max are used for k8s-neonvm with autoscaling, for k8s-pod values Min and Max should be equal'
description: '[Min, Max] compute units'
default: '[1, 1]'
outputs:
@@ -37,10 +34,6 @@ runs:
# A shell without `set -x` to not to expose password/dsn in logs
shell: bash -euo pipefail {0}
run: |
if [ "${PROVISIONER}" == "k8s-pod" ] && [ "${MIN_CU}" != "${MAX_CU}" ]; then
echo >&2 "For k8s-pod provisioner MIN_CU should be equal to MAX_CU"
fi
project=$(curl \
"https://${API_HOST}/api/v2/projects" \
--fail \
@@ -52,7 +45,7 @@ runs:
\"name\": \"Created by actions/neon-project-create; GITHUB_RUN_ID=${GITHUB_RUN_ID}\",
\"pg_version\": ${POSTGRES_VERSION},
\"region_id\": \"${REGION_ID}\",
\"provisioner\": \"${PROVISIONER}\",
\"provisioner\": \"k8s-neonvm\",
\"autoscaling_limit_min_cu\": ${MIN_CU},
\"autoscaling_limit_max_cu\": ${MAX_CU},
\"settings\": { }
@@ -75,6 +68,5 @@ runs:
API_KEY: ${{ inputs.api_key }}
REGION_ID: ${{ inputs.region_id }}
POSTGRES_VERSION: ${{ inputs.postgres_version }}
PROVISIONER: ${{ inputs.provisioner }}
MIN_CU: ${{ fromJSON(inputs.compute_units)[0] }}
MAX_CU: ${{ fromJSON(inputs.compute_units)[1] }}

View File

@@ -19,6 +19,10 @@ on:
description: 'debug or release'
required: true
type: string
pg-versions:
description: 'a json array of postgres versions to run regression tests on'
required: true
type: string
defaults:
run:
@@ -254,7 +258,7 @@ jobs:
strategy:
fail-fast: false
matrix:
pg_version: [ v14, v15, v16 ]
pg_version: ${{ fromJson(inputs.pg-versions) }}
steps:
- uses: actions/checkout@v4
with:
@@ -284,5 +288,5 @@ jobs:
- name: Merge and upload coverage data
if: |
false &&
inputs.build-type == 'debug' && matrix.pg_version == 'v14'
inputs.build-type == 'debug' && matrix.pg_version == 'v16'
uses: ./.github/actions/save-coverage-data

View File

@@ -63,11 +63,9 @@ jobs:
- DEFAULT_PG_VERSION: 16
PLATFORM: "neon-staging"
region_id: ${{ github.event.inputs.region_id || 'aws-us-east-2' }}
provisioner: 'k8s-pod'
- DEFAULT_PG_VERSION: 16
PLATFORM: "azure-staging"
region_id: 'azure-eastus2'
provisioner: 'k8s-neonvm'
env:
TEST_PG_BENCH_DURATIONS_MATRIX: "300"
TEST_PG_BENCH_SCALES_MATRIX: "10,100"
@@ -100,7 +98,6 @@ jobs:
region_id: ${{ matrix.region_id }}
postgres_version: ${{ env.DEFAULT_PG_VERSION }}
api_key: ${{ secrets.NEON_STAGING_API_KEY }}
provisioner: ${{ matrix.provisioner }}
- name: Run benchmark
uses: ./.github/actions/run-python-test-set
@@ -216,11 +213,11 @@ jobs:
# Create matrices for the benchmarking jobs, so we run benchmarks on rds only once a week (on Saturday)
#
# Available platforms:
# - neon-captest-new: Freshly created project (1 CU)
# - neon-captest-freetier: Use freetier-sized compute (0.25 CU)
# - neonvm-captest-new: Freshly created project (1 CU)
# - neonvm-captest-freetier: Use freetier-sized compute (0.25 CU)
# - neonvm-captest-azure-new: Freshly created project (1 CU) in azure region
# - neonvm-captest-azure-freetier: Use freetier-sized compute (0.25 CU) in azure region
# - neon-captest-reuse: Reusing existing project
# - neonvm-captest-reuse: Reusing existing project
# - rds-aurora: Aurora Postgres Serverless v2 with autoscaling from 0.5 to 2 ACUs
# - rds-postgres: RDS Postgres db.m5.large instance (2 vCPU, 8 GiB) with gp3 EBS storage
env:
@@ -245,18 +242,16 @@ jobs:
"'"$region_id_default"'"
],
"platform": [
"neon-captest-new",
"neon-captest-reuse",
"neonvm-captest-new",
"neonvm-captest-reuse",
"neonvm-captest-new"
],
"db_size": [ "10gb" ],
"include": [{ "pg_version": 16, "region_id": "'"$region_id_default"'", "platform": "neon-captest-freetier", "db_size": "3gb" },
{ "pg_version": 16, "region_id": "'"$region_id_default"'", "platform": "neon-captest-new", "db_size": "50gb" },
{ "pg_version": 16, "region_id": "'"$region_id_default"'", "platform": "neonvm-captest-freetier", "db_size": "3gb" },
"include": [{ "pg_version": 16, "region_id": "'"$region_id_default"'", "platform": "neonvm-captest-freetier", "db_size": "3gb" },
{ "pg_version": 16, "region_id": "'"$region_id_default"'", "platform": "neonvm-captest-new", "db_size": "50gb" },
{ "pg_version": 16, "region_id": "azure-eastus2", "platform": "neonvm-azure-captest-freetier", "db_size": "3gb" },
{ "pg_version": 16, "region_id": "azure-eastus2", "platform": "neonvm-azure-captest-new", "db_size": "10gb" },
{ "pg_version": 16, "region_id": "azure-eastus2", "platform": "neonvm-azure-captest-new", "db_size": "50gb" },
{ "pg_version": 16, "region_id": "azure-eastus2", "platform": "neonvm-azure-captest-freetier", "db_size": "3gb" },
{ "pg_version": 16, "region_id": "azure-eastus2", "platform": "neonvm-azure-captest-new", "db_size": "10gb" },
{ "pg_version": 16, "region_id": "azure-eastus2", "platform": "neonvm-azure-captest-new", "db_size": "50gb" },
{ "pg_version": 16, "region_id": "'"$region_id_default"'", "platform": "neonvm-captest-sharding-reuse", "db_size": "50gb" }]
}'
@@ -271,7 +266,7 @@ jobs:
run: |
matrix='{
"platform": [
"neon-captest-reuse"
"neonvm-captest-reuse"
]
}'
@@ -287,7 +282,7 @@ jobs:
run: |
matrix='{
"platform": [
"neon-captest-reuse"
"neonvm-captest-reuse"
],
"scale": [
"10"
@@ -338,7 +333,7 @@ jobs:
prefix: latest
- name: Create Neon Project
if: contains(fromJson('["neon-captest-new", "neon-captest-freetier", "neonvm-captest-new", "neonvm-captest-freetier", "neonvm-azure-captest-freetier", "neonvm-azure-captest-new"]'), matrix.platform)
if: contains(fromJson('["neonvm-captest-new", "neonvm-captest-freetier", "neonvm-azure-captest-freetier", "neonvm-azure-captest-new"]'), matrix.platform)
id: create-neon-project
uses: ./.github/actions/neon-project-create
with:
@@ -346,19 +341,18 @@ jobs:
postgres_version: ${{ env.DEFAULT_PG_VERSION }}
api_key: ${{ secrets.NEON_STAGING_API_KEY }}
compute_units: ${{ (contains(matrix.platform, 'captest-freetier') && '[0.25, 0.25]') || '[1, 1]' }}
provisioner: ${{ (contains(matrix.platform, 'neonvm-') && 'k8s-neonvm') || 'k8s-pod' }}
- name: Set up Connection String
id: set-up-connstr
run: |
case "${PLATFORM}" in
neon-captest-reuse)
neonvm-captest-reuse)
CONNSTR=${{ secrets.BENCHMARK_CAPTEST_CONNSTR }}
;;
neonvm-captest-sharding-reuse)
CONNSTR=${{ secrets.BENCHMARK_CAPTEST_SHARDING_CONNSTR }}
;;
neon-captest-new | neon-captest-freetier | neonvm-captest-new | neonvm-captest-freetier | neonvm-azure-captest-new | neonvm-azure-captest-freetier)
neonvm-captest-new | neonvm-captest-freetier | neonvm-azure-captest-new | neonvm-azure-captest-freetier)
CONNSTR=${{ steps.create-neon-project.outputs.dsn }}
;;
rds-aurora)
@@ -442,9 +436,9 @@ jobs:
fail-fast: false
matrix:
include:
- PLATFORM: "neon-captest-pgvector"
- PLATFORM: "neonvm-captest-pgvector"
- PLATFORM: "azure-captest-pgvector"
env:
TEST_PG_BENCH_DURATIONS_MATRIX: "15m"
TEST_PG_BENCH_SCALES_MATRIX: "1"
@@ -486,7 +480,7 @@ jobs:
id: set-up-connstr
run: |
case "${PLATFORM}" in
neon-captest-pgvector)
neonvm-captest-pgvector)
CONNSTR=${{ secrets.BENCHMARK_PGVECTOR_CONNSTR }}
;;
azure-captest-pgvector)
@@ -585,7 +579,7 @@ jobs:
id: set-up-connstr
run: |
case "${PLATFORM}" in
neon-captest-reuse)
neonvm-captest-reuse)
CONNSTR=${{ secrets.BENCHMARK_CAPTEST_CLICKBENCH_10M_CONNSTR }}
;;
rds-aurora)
@@ -595,7 +589,7 @@ jobs:
CONNSTR=${{ secrets.BENCHMARK_RDS_POSTGRES_CLICKBENCH_10M_CONNSTR }}
;;
*)
echo >&2 "Unknown PLATFORM=${PLATFORM}. Allowed only 'neon-captest-reuse', 'rds-aurora', or 'rds-postgres'"
echo >&2 "Unknown PLATFORM=${PLATFORM}. Allowed only 'neonvm-captest-reuse', 'rds-aurora', or 'rds-postgres'"
exit 1
;;
esac
@@ -672,7 +666,7 @@ jobs:
- name: Get Connstring Secret Name
run: |
case "${PLATFORM}" in
neon-captest-reuse)
neonvm-captest-reuse)
ENV_PLATFORM=CAPTEST_TPCH
;;
rds-aurora)
@@ -682,7 +676,7 @@ jobs:
ENV_PLATFORM=RDS_AURORA_TPCH
;;
*)
echo >&2 "Unknown PLATFORM=${PLATFORM}. Allowed only 'neon-captest-reuse', 'rds-aurora', or 'rds-postgres'"
echo >&2 "Unknown PLATFORM=${PLATFORM}. Allowed only 'neonvm-captest-reuse', 'rds-aurora', or 'rds-postgres'"
exit 1
;;
esac
@@ -759,7 +753,7 @@ jobs:
id: set-up-connstr
run: |
case "${PLATFORM}" in
neon-captest-reuse)
neonvm-captest-reuse)
CONNSTR=${{ secrets.BENCHMARK_USER_EXAMPLE_CAPTEST_CONNSTR }}
;;
rds-aurora)
@@ -769,7 +763,7 @@ jobs:
CONNSTR=${{ secrets.BENCHMARK_USER_EXAMPLE_RDS_POSTGRES_CONNSTR }}
;;
*)
echo >&2 "Unknown PLATFORM=${PLATFORM}. Allowed only 'neon-captest-reuse', 'rds-aurora', or 'rds-postgres'"
echo >&2 "Unknown PLATFORM=${PLATFORM}. Allowed only 'neonvm-captest-reuse', 'rds-aurora', or 'rds-postgres'"
exit 1
;;
esac

View File

@@ -203,7 +203,8 @@ jobs:
fail-fast: false
matrix:
arch: [ x64 ]
build-type: [ debug, release ]
# Do not build or run tests in debug for release branches
build-type: ${{ fromJson((startsWith(github.ref_name, 'release' && github.event_name == 'push')) && '["release"]' || '["debug", "release"]') }}
include:
- build-type: release
arch: arm64
@@ -213,6 +214,8 @@ jobs:
build-tools-image: ${{ needs.build-build-tools-image.outputs.image }}
build-tag: ${{ needs.tag.outputs.build-tag }}
build-type: ${{ matrix.build-type }}
# Run tests on all Postgres versions in release builds and only on the latest version in debug builds
pg-versions: ${{ matrix.build-type == 'release' && '["v14", "v15", "v16"]' || '["v16"]' }}
secrets: inherit
# Keep `benchmarks` job outside of `build-and-test-locally` workflow to make job failures non-blocking
@@ -306,7 +309,7 @@ jobs:
SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }}
create-test-report:
needs: [ check-permissions, build-and-test-locally, coverage-report, build-build-tools-image ]
needs: [ check-permissions, build-and-test-locally, coverage-report, build-build-tools-image, benchmarks ]
if: ${{ !cancelled() && contains(fromJSON('["skipped", "success"]'), needs.check-permissions.result) }}
outputs:
report-url: ${{ steps.create-allure-report.outputs.report-url }}
@@ -833,6 +836,9 @@ jobs:
rm -rf .docker-custom
promote-images:
permissions:
contents: read # This is required for actions/checkout
id-token: write # This is required for Azure Login to work.
needs: [ check-permissions, tag, test-images, vm-compute-node-image ]
runs-on: ubuntu-22.04
@@ -859,6 +865,28 @@ jobs:
neondatabase/vm-compute-node-${version}:${{ needs.tag.outputs.build-tag }}
done
- name: Azure login
if: github.ref_name == 'main'
uses: azure/login@6c251865b4e6290e7b78be643ea2d005bc51f69a # @v2.1.1
with:
client-id: ${{ secrets.AZURE_DEV_CLIENT_ID }}
tenant-id: ${{ secrets.AZURE_TENANT_ID }}
subscription-id: ${{ secrets.AZURE_DEV_SUBSCRIPTION_ID }}
- name: Login to ACR
if: github.ref_name == 'main'
run: |
az acr login --name=neoneastus2
- name: Copy docker images to ACR-dev
if: github.ref_name == 'main'
run: |
for image in neon compute-tools {vm-,}compute-node-{v14,v15,v16}; do
docker buildx imagetools create \
-t neoneastus2.azurecr.io/neondatabase/${image}:${{ needs.tag.outputs.build-tag }} \
neondatabase/${image}:${{ needs.tag.outputs.build-tag }}
done
- name: Add latest tag to images
if: github.ref_name == 'main'
run: |

View File

@@ -13,6 +13,7 @@ on:
paths:
- '.github/workflows/pg-clients.yml'
- 'test_runner/pg_clients/**'
- 'test_runner/logical_repl/**'
- 'poetry.lock'
workflow_dispatch:
@@ -49,6 +50,77 @@ jobs:
image-tag: ${{ needs.check-build-tools-image.outputs.image-tag }}
secrets: inherit
test-logical-replication:
needs: [ build-build-tools-image ]
runs-on: ubuntu-22.04
container:
image: ${{ needs.build-build-tools-image.outputs.image }}
credentials:
username: ${{ secrets.NEON_DOCKERHUB_USERNAME }}
password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }}
options: --init --user root
services:
clickhouse:
image: clickhouse/clickhouse-server:24.6.3.64
ports:
- 9000:9000
- 8123:8123
steps:
- uses: actions/checkout@v4
- name: Download Neon artifact
uses: ./.github/actions/download
with:
name: neon-${{ runner.os }}-${{ runner.arch }}-release-artifact
path: /tmp/neon/
prefix: latest
- name: Create Neon Project
id: create-neon-project
uses: ./.github/actions/neon-project-create
with:
api_key: ${{ secrets.NEON_STAGING_API_KEY }}
postgres_version: ${{ env.DEFAULT_PG_VERSION }}
- name: Run tests
uses: ./.github/actions/run-python-test-set
with:
build_type: remote
test_selection: logical_repl
run_in_parallel: false
extra_params: -m remote_cluster
pg_version: ${{ env.DEFAULT_PG_VERSION }}
env:
BENCHMARK_CONNSTR: ${{ steps.create-neon-project.outputs.dsn }}
- name: Delete Neon Project
if: always()
uses: ./.github/actions/neon-project-delete
with:
project_id: ${{ steps.create-neon-project.outputs.project_id }}
api_key: ${{ secrets.NEON_STAGING_API_KEY }}
- name: Create Allure report
if: ${{ !cancelled() }}
id: create-allure-report
uses: ./.github/actions/allure-report-generate
with:
store-test-results-into-db: true
env:
REGRESS_TEST_RESULT_CONNSTR_NEW: ${{ secrets.REGRESS_TEST_RESULT_CONNSTR_NEW }}
- name: Post to a Slack channel
if: github.event.schedule && failure()
uses: slackapi/slack-github-action@v1
with:
channel-id: "C06KHQVQ7U3" # on-call-qa-staging-stream
slack-message: |
Testing the logical replication: <${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}|${{ job.status }}> (<${{ steps.create-allure-report.outputs.report-url }}|test report>)
env:
SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }}
test-postgres-client-libs:
needs: [ build-build-tools-image ]
runs-on: ubuntu-22.04

188
Cargo.lock generated
View File

@@ -1418,7 +1418,7 @@ dependencies = [
"clap",
"criterion-plot",
"is-terminal",
"itertools",
"itertools 0.10.5",
"num-traits",
"once_cell",
"oorandom",
@@ -1439,7 +1439,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "6b50826342786a51a89e2da3a28f1c32b06e387201bc2d19791f622c673706b1"
dependencies = [
"cast",
"itertools",
"itertools 0.10.5",
]
[[package]]
@@ -1672,6 +1672,7 @@ checksum = "62d6dcd069e7b5fe49a302411f759d4cf1cf2c27fe798ef46fb8baefc053dd2b"
dependencies = [
"bitflags 2.4.1",
"byteorder",
"chrono",
"diesel_derives",
"itoa",
"pq-sys",
@@ -2133,6 +2134,12 @@ dependencies = [
"slab",
]
[[package]]
name = "gen_ops"
version = "0.4.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "304de19db7028420975a296ab0fcbbc8e69438c4ed254a1e41e2a7f37d5f0e0a"
[[package]]
name = "generic-array"
version = "0.14.7"
@@ -2709,17 +2716,6 @@ version = "3.0.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "8bb03732005da905c88227371639bf1ad885cc712789c011c31c5fb3ab3ccf02"
[[package]]
name = "io-lifetimes"
version = "1.0.11"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "eae7b9aee968036d54dce06cebaefd919e4472e753296daccd6d344e3e2df0c2"
dependencies = [
"hermit-abi",
"libc",
"windows-sys 0.48.0",
]
[[package]]
name = "io-uring"
version = "0.6.2"
@@ -2738,14 +2734,13 @@ checksum = "8f518f335dce6725a761382244631d86cf0ccb2863413590b31338feb467f9c3"
[[package]]
name = "is-terminal"
version = "0.4.7"
version = "0.4.12"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "adcf93614601c8129ddf72e2d5633df827ba6551541c6d8c59520a371475be1f"
checksum = "f23ff5ef2b80d608d61efee834934d862cd92461afc0560dedf493e4c033738b"
dependencies = [
"hermit-abi",
"io-lifetimes",
"rustix 0.37.25",
"windows-sys 0.48.0",
"libc",
"windows-sys 0.52.0",
]
[[package]]
@@ -2757,6 +2752,15 @@ dependencies = [
"either",
]
[[package]]
name = "itertools"
version = "0.12.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ba291022dbbd398a455acf126c1e341954079855bc60dfdda641363bd6922569"
dependencies = [
"either",
]
[[package]]
name = "itoa"
version = "1.0.6"
@@ -2871,18 +2875,6 @@ version = "0.2.8"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "4ec2a862134d2a7d32d7983ddcdd1c4923530833c9f2ea1a44fc5fa473989058"
[[package]]
name = "linux-raw-sys"
version = "0.1.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f051f77a7c8e6957c0696eac88f26b0117e54f52d3fc682ab19397a8812846a4"
[[package]]
name = "linux-raw-sys"
version = "0.3.8"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ef53942eb7bf7ff43a617b3e2c1c4a5ecf5944a7c1bc12d7ee39bbb15e5c1519"
[[package]]
name = "linux-raw-sys"
version = "0.4.13"
@@ -3000,7 +2992,7 @@ checksum = "7c4b80445aeb08e832d87bf1830049a924cdc1d6b7ef40b6b9b365bff17bf8ec"
dependencies = [
"libc",
"measured",
"procfs 0.16.0",
"procfs",
]
[[package]]
@@ -3045,7 +3037,7 @@ dependencies = [
"measured",
"measured-process",
"once_cell",
"procfs 0.14.2",
"procfs",
"prometheus",
"rand 0.8.5",
"rand_distr",
@@ -3574,7 +3566,7 @@ dependencies = [
"humantime",
"humantime-serde",
"hyper 0.14.26",
"itertools",
"itertools 0.10.5",
"leaky-bucket",
"md5",
"metrics",
@@ -3592,8 +3584,9 @@ dependencies = [
"postgres_connection",
"postgres_ffi",
"pq_proto",
"procfs 0.14.2",
"procfs",
"rand 0.8.5",
"range-set-blaze",
"regex",
"remote_storage",
"reqwest 0.12.4",
@@ -3644,7 +3637,7 @@ dependencies = [
"hex",
"humantime",
"humantime-serde",
"itertools",
"itertools 0.10.5",
"postgres_ffi",
"rand 0.8.5",
"serde",
@@ -3702,7 +3695,7 @@ dependencies = [
"hex-literal",
"humantime",
"humantime-serde",
"itertools",
"itertools 0.10.5",
"metrics",
"once_cell",
"pageserver_api",
@@ -4034,7 +4027,7 @@ name = "postgres_connection"
version = "0.1.0"
dependencies = [
"anyhow",
"itertools",
"itertools 0.10.5",
"once_cell",
"postgres",
"tokio-postgres",
@@ -4092,7 +4085,7 @@ version = "0.1.0"
dependencies = [
"byteorder",
"bytes",
"itertools",
"itertools 0.10.5",
"pin-project-lite",
"postgres-protocol",
"rand 0.8.5",
@@ -4138,21 +4131,6 @@ dependencies = [
"unicode-ident",
]
[[package]]
name = "procfs"
version = "0.14.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b1de8dacb0873f77e6aefc6d71e044761fcc68060290f5b1089fcdf84626bb69"
dependencies = [
"bitflags 1.3.2",
"byteorder",
"chrono",
"flate2",
"hex",
"lazy_static",
"rustix 0.36.16",
]
[[package]]
name = "procfs"
version = "0.16.0"
@@ -4160,10 +4138,12 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "731e0d9356b0c25f16f33b5be79b1c57b562f141ebfcdb0ad8ac2c13a24293b4"
dependencies = [
"bitflags 2.4.1",
"chrono",
"flate2",
"hex",
"lazy_static",
"procfs-core",
"rustix 0.38.28",
"rustix",
]
[[package]]
@@ -4173,14 +4153,15 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "2d3554923a69f4ce04c4a754260c338f505ce22642d3830e049a399fc2059a29"
dependencies = [
"bitflags 2.4.1",
"chrono",
"hex",
]
[[package]]
name = "prometheus"
version = "0.13.3"
version = "0.13.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "449811d15fbdf5ceb5c1144416066429cf82316e2ec8ce0c1f6f8a02e7bbcf8c"
checksum = "3d33c28a30771f7f96db69893f78b857f7450d7e0237e9c8fc6427a81bae7ed1"
dependencies = [
"cfg-if",
"fnv",
@@ -4188,7 +4169,7 @@ dependencies = [
"libc",
"memchr",
"parking_lot 0.12.1",
"procfs 0.14.2",
"procfs",
"thiserror",
]
@@ -4210,7 +4191,7 @@ checksum = "119533552c9a7ffacc21e099c24a0ac8bb19c2a2a3f363de84cd9b844feab270"
dependencies = [
"bytes",
"heck 0.4.1",
"itertools",
"itertools 0.10.5",
"lazy_static",
"log",
"multimap",
@@ -4231,7 +4212,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e5d2d8d10f3c6ded6da8b05b5fb3b8a5082514344d56c9f871412d29b4e075b4"
dependencies = [
"anyhow",
"itertools",
"itertools 0.10.5",
"proc-macro2",
"quote",
"syn 1.0.109",
@@ -4288,7 +4269,7 @@ dependencies = [
"hyper-util",
"indexmap 2.0.1",
"ipnet",
"itertools",
"itertools 0.10.5",
"lasso",
"md5",
"measured",
@@ -4464,6 +4445,18 @@ dependencies = [
"rand_core 0.5.1",
]
[[package]]
name = "range-set-blaze"
version = "0.1.16"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "8421b5d459262eabbe49048d362897ff3e3830b44eac6cfe341d6acb2f0f13d2"
dependencies = [
"gen_ops",
"itertools 0.12.1",
"num-integer",
"num-traits",
]
[[package]]
name = "rayon"
version = "1.7.0"
@@ -4632,7 +4625,7 @@ dependencies = [
"humantime",
"humantime-serde",
"hyper 0.14.26",
"itertools",
"itertools 0.10.5",
"metrics",
"once_cell",
"pin-project-lite",
@@ -4942,34 +4935,6 @@ dependencies = [
"nom",
]
[[package]]
name = "rustix"
version = "0.36.16"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "6da3636faa25820d8648e0e31c5d519bbb01f72fdf57131f0f5f7da5fed36eab"
dependencies = [
"bitflags 1.3.2",
"errno",
"io-lifetimes",
"libc",
"linux-raw-sys 0.1.4",
"windows-sys 0.45.0",
]
[[package]]
name = "rustix"
version = "0.37.25"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d4eb579851244c2c03e7c24f501c3432bed80b8f720af1d6e5b0e0f01555a035"
dependencies = [
"bitflags 1.3.2",
"errno",
"io-lifetimes",
"libc",
"linux-raw-sys 0.3.8",
"windows-sys 0.48.0",
]
[[package]]
name = "rustix"
version = "0.38.28"
@@ -5718,6 +5683,7 @@ dependencies = [
"aws-config",
"bytes",
"camino",
"chrono",
"clap",
"control_plane",
"diesel",
@@ -5728,7 +5694,7 @@ dependencies = [
"hex",
"humantime",
"hyper 0.14.26",
"itertools",
"itertools 0.10.5",
"lasso",
"measured",
"metrics",
@@ -5737,6 +5703,7 @@ dependencies = [
"pageserver_client",
"postgres_connection",
"r2d2",
"rand 0.8.5",
"reqwest 0.12.4",
"routerify",
"scopeguard",
@@ -5792,9 +5759,10 @@ dependencies = [
"either",
"futures",
"futures-util",
"git-version",
"hex",
"humantime",
"itertools",
"itertools 0.10.5",
"once_cell",
"pageserver",
"pageserver_api",
@@ -5971,15 +5939,15 @@ dependencies = [
[[package]]
name = "tempfile"
version = "3.5.0"
version = "3.9.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b9fbec84f381d5795b08656e4912bec604d162bff9291d6189a78f4c8ab87998"
checksum = "01ce4141aa927a6d1bd34a041795abd0db1cccba5d5f24b009f694bdf3a1f3fa"
dependencies = [
"cfg-if",
"fastrand 1.9.0",
"redox_syscall 0.3.5",
"rustix 0.37.25",
"windows-sys 0.45.0",
"fastrand 2.0.0",
"redox_syscall 0.4.1",
"rustix",
"windows-sys 0.52.0",
]
[[package]]
@@ -7176,15 +7144,6 @@ dependencies = [
"windows_x86_64_msvc 0.42.2",
]
[[package]]
name = "windows-sys"
version = "0.45.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "75283be5efb2831d37ea142365f009c02ec203cd29a3ebecbc093d52315b66d0"
dependencies = [
"windows-targets 0.42.2",
]
[[package]]
name = "windows-sys"
version = "0.48.0"
@@ -7203,21 +7162,6 @@ dependencies = [
"windows-targets 0.52.4",
]
[[package]]
name = "windows-targets"
version = "0.42.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "8e5180c00cd44c9b1c88adb3693291f1cd93605ded80c250a75d472756b4d071"
dependencies = [
"windows_aarch64_gnullvm 0.42.2",
"windows_aarch64_msvc 0.42.2",
"windows_i686_gnu 0.42.2",
"windows_i686_msvc 0.42.2",
"windows_x86_64_gnu 0.42.2",
"windows_x86_64_gnullvm 0.42.2",
"windows_x86_64_msvc 0.42.2",
]
[[package]]
name = "windows-targets"
version = "0.48.0"
@@ -7447,7 +7391,7 @@ dependencies = [
"hmac",
"hyper 0.14.26",
"indexmap 1.9.3",
"itertools",
"itertools 0.10.5",
"libc",
"log",
"memchr",

View File

@@ -126,7 +126,7 @@ parquet = { version = "51.0.0", default-features = false, features = ["zstd"] }
parquet_derive = "51.0.0"
pbkdf2 = { version = "0.12.1", features = ["simple", "std"] }
pin-project-lite = "0.2"
procfs = "0.14"
procfs = "0.16"
prometheus = {version = "0.13", default-features=false, features = ["process"]} # removes protobuf dependency
prost = "0.11"
rand = "0.8"

View File

@@ -4,6 +4,11 @@ version = "0.1.0"
edition.workspace = true
license.workspace = true
[features]
default = []
# Enables test specific features.
testing = []
[dependencies]
anyhow.workspace = true
async-compression.workspace = true

View File

@@ -400,7 +400,15 @@ impl ComputeNode {
pub fn get_basebackup(&self, compute_state: &ComputeState, lsn: Lsn) -> Result<()> {
let mut retry_period_ms = 500.0;
let mut attempts = 0;
let max_attempts = 10;
const DEFAULT_ATTEMPTS: u16 = 10;
#[cfg(feature = "testing")]
let max_attempts = if let Ok(v) = env::var("NEON_COMPUTE_TESTING_BASEBACKUP_RETRIES") {
u16::from_str(&v).unwrap()
} else {
DEFAULT_ATTEMPTS
};
#[cfg(not(feature = "testing"))]
let max_attempts = DEFAULT_ATTEMPTS;
loop {
let result = self.try_get_basebackup(compute_state, lsn);
match result {

View File

@@ -289,7 +289,7 @@ fn fill_remote_storage_secrets_vars(mut cmd: &mut Command) -> &mut Command {
fn fill_env_vars_prefixed_neon(mut cmd: &mut Command) -> &mut Command {
for (var, val) in std::env::vars() {
if var.starts_with("NEON_PAGESERVER_") {
if var.starts_with("NEON_") {
cmd = cmd.env(var, val);
}
}

View File

@@ -1,5 +1,6 @@
use std::collections::HashSet;
use std::str::FromStr;
use std::time::Instant;
use std::time::{Duration, Instant};
/// Request/response types for the storage controller
/// API (`/control/v1` prefix). Implemented by the server
@@ -294,6 +295,42 @@ pub enum PlacementPolicy {
#[derive(Serialize, Deserialize, Debug)]
pub struct TenantShardMigrateResponse {}
/// Metadata health record posted from scrubber.
#[derive(Serialize, Deserialize, Debug)]
pub struct MetadataHealthRecord {
pub tenant_shard_id: TenantShardId,
pub healthy: bool,
pub last_scrubbed_at: chrono::DateTime<chrono::Utc>,
}
#[derive(Serialize, Deserialize, Debug)]
pub struct MetadataHealthUpdateRequest {
pub healthy_tenant_shards: HashSet<TenantShardId>,
pub unhealthy_tenant_shards: HashSet<TenantShardId>,
}
#[derive(Serialize, Deserialize, Debug)]
pub struct MetadataHealthUpdateResponse {}
#[derive(Serialize, Deserialize, Debug)]
pub struct MetadataHealthListUnhealthyResponse {
pub unhealthy_tenant_shards: Vec<TenantShardId>,
}
#[derive(Serialize, Deserialize, Debug)]
pub struct MetadataHealthListOutdatedRequest {
#[serde(with = "humantime_serde")]
pub not_scrubbed_for: Duration,
}
#[derive(Serialize, Deserialize, Debug)]
pub struct MetadataHealthListOutdatedResponse {
pub health_records: Vec<MetadataHealthRecord>,
}
#[cfg(test)]
mod test {
use super::*;

View File

@@ -355,7 +355,8 @@ impl RemoteStorage for AzureBlobStorage {
.blobs()
.map(|k| ListingObject{
key: self.name_to_relative_path(&k.name),
last_modified: k.properties.last_modified.into()
last_modified: k.properties.last_modified.into(),
size: k.properties.content_length,
}
);

View File

@@ -144,6 +144,7 @@ impl RemotePath {
///
/// The WithDelimiter mode will populate `prefixes` and `keys` in the result. The
/// NoDelimiter mode will only populate `keys`.
#[derive(Copy, Clone)]
pub enum ListingMode {
WithDelimiter,
NoDelimiter,
@@ -153,6 +154,7 @@ pub enum ListingMode {
pub struct ListingObject {
pub key: RemotePath,
pub last_modified: SystemTime,
pub size: u64,
}
#[derive(Default)]
@@ -194,7 +196,7 @@ pub trait RemoteStorage: Send + Sync + 'static {
mode: ListingMode,
max_keys: Option<NonZeroU32>,
cancel: &CancellationToken,
) -> impl Stream<Item = Result<Listing, DownloadError>>;
) -> impl Stream<Item = Result<Listing, DownloadError>> + Send;
async fn list(
&self,
@@ -351,10 +353,10 @@ impl<Other: RemoteStorage> GenericRemoteStorage<Arc<Other>> {
mode: ListingMode,
max_keys: Option<NonZeroU32>,
cancel: &'a CancellationToken,
) -> impl Stream<Item = Result<Listing, DownloadError>> + 'a {
) -> impl Stream<Item = Result<Listing, DownloadError>> + 'a + Send {
match self {
Self::LocalFs(s) => Box::pin(s.list_streaming(prefix, mode, max_keys, cancel))
as Pin<Box<dyn Stream<Item = Result<Listing, DownloadError>>>>,
as Pin<Box<dyn Stream<Item = Result<Listing, DownloadError>> + Send>>,
Self::AwsS3(s) => Box::pin(s.list_streaming(prefix, mode, max_keys, cancel)),
Self::AzureBlob(s) => Box::pin(s.list_streaming(prefix, mode, max_keys, cancel)),
Self::Unreliable(s) => Box::pin(s.list_streaming(prefix, mode, max_keys, cancel)),

View File

@@ -368,6 +368,7 @@ impl RemoteStorage for LocalFs {
key: k.clone(),
// LocalFs is just for testing, so just specify a dummy time
last_modified: SystemTime::now(),
size: 0,
})
}
})
@@ -411,6 +412,7 @@ impl RemoteStorage for LocalFs {
key: RemotePath::from_string(&relative_key).unwrap(),
// LocalFs is just for testing
last_modified: SystemTime::now(),
size: 0,
});
}
}

View File

@@ -565,9 +565,12 @@ impl RemoteStorage for S3Bucket {
}
};
let size = object.size.unwrap_or(0) as u64;
result.keys.push(ListingObject{
key,
last_modified
last_modified,
size,
});
if let Some(mut mk) = max_keys {
assert!(mk > 0);

View File

@@ -114,7 +114,7 @@ impl RemoteStorage for UnreliableWrapper {
mode: ListingMode,
max_keys: Option<NonZeroU32>,
cancel: &CancellationToken,
) -> impl Stream<Item = Result<Listing, DownloadError>> {
) -> impl Stream<Item = Result<Listing, DownloadError>> + Send {
async_stream::stream! {
self.attempt(RemoteOp::ListPrefixes(prefix.cloned()))
.map_err(DownloadError::Other)?;

View File

@@ -18,20 +18,20 @@ const STORAGE_TOKEN_ALGORITHM: Algorithm = Algorithm::EdDSA;
#[derive(Debug, Serialize, Deserialize, Clone, Copy, PartialEq)]
#[serde(rename_all = "lowercase")]
pub enum Scope {
// Provides access to all data for a specific tenant (specified in `struct Claims` below)
/// Provides access to all data for a specific tenant (specified in `struct Claims` below)
// TODO: join these two?
Tenant,
// Provides blanket access to all tenants on the pageserver plus pageserver-wide APIs.
// Should only be used e.g. for status check/tenant creation/list.
/// Provides blanket access to all tenants on the pageserver plus pageserver-wide APIs.
/// Should only be used e.g. for status check/tenant creation/list.
PageServerApi,
// Provides blanket access to all data on the safekeeper plus safekeeper-wide APIs.
// Should only be used e.g. for status check.
// Currently also used for connection from any pageserver to any safekeeper.
/// Provides blanket access to all data on the safekeeper plus safekeeper-wide APIs.
/// Should only be used e.g. for status check.
/// Currently also used for connection from any pageserver to any safekeeper.
SafekeeperData,
// The scope used by pageservers in upcalls to storage controller and cloud control plane
/// The scope used by pageservers in upcalls to storage controller and cloud control plane
#[serde(rename = "generations_api")]
GenerationsApi,
// Allows access to control plane managment API and some storage controller endpoints.
/// Allows access to control plane managment API and some storage controller endpoints.
Admin,
/// Allows access to storage controller APIs used by the scrubber, to interrogate the state

View File

@@ -49,6 +49,7 @@ postgres_backend.workspace = true
postgres-protocol.workspace = true
postgres-types.workspace = true
rand.workspace = true
range-set-blaze = { version = "0.1.16", features = ["alloc"] }
regex.workspace = true
scopeguard.workspace = true
serde.workspace = true

View File

@@ -1,3 +1,4 @@
use criterion::measurement::WallTime;
use pageserver::keyspace::{KeyPartitioning, KeySpace};
use pageserver::repository::Key;
use pageserver::tenant::layer_map::LayerMap;
@@ -15,7 +16,11 @@ use utils::id::{TenantId, TimelineId};
use utils::lsn::Lsn;
use criterion::{black_box, criterion_group, criterion_main, Criterion};
use criterion::{black_box, criterion_group, criterion_main, BenchmarkGroup, Criterion};
fn fixture_path(relative: &str) -> PathBuf {
PathBuf::from(env!("CARGO_MANIFEST_DIR")).join(relative)
}
fn build_layer_map(filename_dump: PathBuf) -> LayerMap {
let mut layer_map = LayerMap::default();
@@ -109,7 +114,7 @@ fn uniform_key_partitioning(layer_map: &LayerMap, _lsn: Lsn) -> KeyPartitioning
// between each test run.
fn bench_from_captest_env(c: &mut Criterion) {
// TODO consider compressing this file
let layer_map = build_layer_map(PathBuf::from("benches/odd-brook-layernames.txt"));
let layer_map = build_layer_map(fixture_path("benches/odd-brook-layernames.txt"));
let queries: Vec<(Key, Lsn)> = uniform_query_pattern(&layer_map);
// Test with uniform query pattern
@@ -139,7 +144,7 @@ fn bench_from_captest_env(c: &mut Criterion) {
fn bench_from_real_project(c: &mut Criterion) {
// Init layer map
let now = Instant::now();
let layer_map = build_layer_map(PathBuf::from("benches/odd-brook-layernames.txt"));
let layer_map = build_layer_map(fixture_path("benches/odd-brook-layernames.txt"));
println!("Finished layer map init in {:?}", now.elapsed());
// Choose uniformly distributed queries
@@ -242,7 +247,72 @@ fn bench_sequential(c: &mut Criterion) {
group.finish();
}
fn bench_visibility_with_map(
group: &mut BenchmarkGroup<WallTime>,
layer_map: LayerMap,
read_points: Vec<Lsn>,
bench_name: &str,
) {
group.bench_function(bench_name, |b| {
b.iter(|| black_box(layer_map.get_visibility(read_points.clone())));
});
}
// Benchmark using synthetic data. Arrange image layers on stacked diagonal lines.
fn bench_visibility(c: &mut Criterion) {
let mut group = c.benchmark_group("visibility");
{
// Init layer map. Create 100_000 layers arranged in 1000 diagonal lines.
let now = Instant::now();
let mut layer_map = LayerMap::default();
let mut updates = layer_map.batch_update();
for i in 0..100_000 {
let i32 = (i as u32) % 100;
let zero = Key::from_hex("000000000000000000000000000000000000").unwrap();
let layer = PersistentLayerDesc::new_img(
TenantShardId::unsharded(TenantId::generate()),
TimelineId::generate(),
zero.add(10 * i32)..zero.add(10 * i32 + 1),
Lsn(i),
0,
);
updates.insert_historic(layer);
}
updates.flush();
println!("Finished layer map init in {:?}", now.elapsed());
let mut read_points = Vec::new();
for i in (0..100_000).step_by(1000) {
read_points.push(Lsn(i));
}
bench_visibility_with_map(&mut group, layer_map, read_points, "sequential");
}
{
let layer_map = build_layer_map(fixture_path("benches/odd-brook-layernames.txt"));
let read_points = vec![Lsn(0x1C760FA190)];
bench_visibility_with_map(&mut group, layer_map, read_points, "real_map");
let layer_map = build_layer_map(fixture_path("benches/odd-brook-layernames.txt"));
let read_points = vec![
Lsn(0x1C760FA190),
Lsn(0x000000931BEAD539),
Lsn(0x000000931BF63011),
Lsn(0x000000931B33AE68),
Lsn(0x00000038E67ABFA0),
Lsn(0x000000931B33AE68),
Lsn(0x000000914E3F38F0),
Lsn(0x000000931B33AE68),
];
bench_visibility_with_map(&mut group, layer_map, read_points, "real_map_many_branches");
}
group.finish();
}
criterion_group!(group_1, bench_from_captest_env);
criterion_group!(group_2, bench_from_real_project);
criterion_group!(group_3, bench_sequential);
criterion_main!(group_1, group_2, group_3);
criterion_group!(group_4, bench_visibility);
criterion_main!(group_1, group_2, group_3, group_4);

View File

@@ -17,11 +17,9 @@ use pageserver::config::PageserverIdentity;
use pageserver::control_plane_client::ControlPlaneClient;
use pageserver::disk_usage_eviction_task::{self, launch_disk_usage_global_eviction_task};
use pageserver::metrics::{STARTUP_DURATION, STARTUP_IS_LOADING};
use pageserver::task_mgr::WALRECEIVER_RUNTIME;
use pageserver::task_mgr::{COMPUTE_REQUEST_RUNTIME, WALRECEIVER_RUNTIME};
use pageserver::tenant::{secondary, TenantSharedResources};
use pageserver::{
CancellableTask, ConsumptionMetricsTasks, HttpEndpointListener, LibpqEndpointListener,
};
use pageserver::{CancellableTask, ConsumptionMetricsTasks, HttpEndpointListener};
use remote_storage::GenericRemoteStorage;
use tokio::signal::unix::SignalKind;
use tokio::time::Instant;
@@ -31,11 +29,9 @@ use tracing::*;
use metrics::set_build_info_metric;
use pageserver::{
config::PageServerConf,
context::{DownloadBehavior, RequestContext},
deletion_queue::DeletionQueue,
http, page_cache, page_service, task_mgr,
task_mgr::TaskKind,
task_mgr::{BACKGROUND_RUNTIME, COMPUTE_REQUEST_RUNTIME, MGMT_REQUEST_RUNTIME},
task_mgr::{BACKGROUND_RUNTIME, MGMT_REQUEST_RUNTIME},
tenant::mgr,
virtual_file,
};
@@ -129,6 +125,7 @@ fn main() -> anyhow::Result<()> {
info!(?conf.virtual_file_io_engine, "starting with virtual_file IO engine");
info!(?conf.get_impl, "starting with get page implementation");
info!(?conf.get_vectored_impl, "starting with vectored get page implementation");
info!(?conf.compact_level0_phase1_value_access, "starting with setting for compact_level0_phase1_value_access");
let tenants_path = conf.tenants_path();
if !tenants_path.exists() {
@@ -593,30 +590,13 @@ fn start_pageserver(
// Spawn a task to listen for libpq connections. It will spawn further tasks
// for each connection. We created the listener earlier already.
let libpq_listener = {
let cancel = CancellationToken::new();
let libpq_ctx = RequestContext::todo_child(
TaskKind::LibpqEndpointListener,
// listener task shouldn't need to download anything. (We will
// create a separate sub-contexts for each connection, with their
// own download behavior. This context is used only to listen and
// accept connections.)
DownloadBehavior::Error,
);
let task = COMPUTE_REQUEST_RUNTIME.spawn(task_mgr::exit_on_panic_or_error(
"libpq listener",
page_service::libpq_listener_main(
tenant_manager.clone(),
pg_auth,
pageserver_listener,
conf.pg_auth_type,
libpq_ctx,
cancel.clone(),
),
));
LibpqEndpointListener(CancellableTask { task, cancel })
};
let page_service = page_service::spawn(conf, tenant_manager.clone(), pg_auth, {
let _entered = COMPUTE_REQUEST_RUNTIME.enter(); // TcpListener::from_std requires it
pageserver_listener
.set_nonblocking(true)
.context("set listener to nonblocking")?;
tokio::net::TcpListener::from_std(pageserver_listener).context("create tokio listener")?
});
let mut shutdown_pageserver = Some(shutdown_pageserver.drop_guard());
@@ -644,7 +624,7 @@ fn start_pageserver(
shutdown_pageserver.take();
pageserver::shutdown_pageserver(
http_endpoint_listener,
libpq_listener,
page_service,
consumption_metrics_tasks,
disk_usage_eviction_task,
&tenant_manager,

View File

@@ -29,6 +29,7 @@ use utils::{
logging::LogFormat,
};
use crate::tenant::timeline::compaction::CompactL0Phase1ValueAccess;
use crate::tenant::vectored_blob_io::MaxVectoredReadBytes;
use crate::tenant::{config::TenantConfOpt, timeline::GetImpl};
use crate::tenant::{TENANTS_SEGMENT_NAME, TIMELINES_SEGMENT_NAME};
@@ -295,6 +296,10 @@ pub struct PageServerConf {
pub ephemeral_bytes_per_memory_kb: usize,
pub l0_flush: L0FlushConfig,
/// This flag is temporary and will be removed after gradual rollout.
/// See <https://github.com/neondatabase/neon/issues/8184>.
pub compact_level0_phase1_value_access: CompactL0Phase1ValueAccess,
}
/// We do not want to store this in a PageServerConf because the latter may be logged
@@ -401,6 +406,8 @@ struct PageServerConfigBuilder {
ephemeral_bytes_per_memory_kb: BuilderValue<usize>,
l0_flush: BuilderValue<L0FlushConfig>,
compact_level0_phase1_value_access: BuilderValue<CompactL0Phase1ValueAccess>,
}
impl PageServerConfigBuilder {
@@ -490,6 +497,7 @@ impl PageServerConfigBuilder {
validate_vectored_get: Set(DEFAULT_VALIDATE_VECTORED_GET),
ephemeral_bytes_per_memory_kb: Set(DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB),
l0_flush: Set(L0FlushConfig::default()),
compact_level0_phase1_value_access: Set(CompactL0Phase1ValueAccess::default()),
}
}
}
@@ -673,6 +681,10 @@ impl PageServerConfigBuilder {
self.l0_flush = BuilderValue::Set(value);
}
pub fn compact_level0_phase1_value_access(&mut self, value: CompactL0Phase1ValueAccess) {
self.compact_level0_phase1_value_access = BuilderValue::Set(value);
}
pub fn build(self, id: NodeId) -> anyhow::Result<PageServerConf> {
let default = Self::default_values();
@@ -730,6 +742,7 @@ impl PageServerConfigBuilder {
image_compression,
ephemeral_bytes_per_memory_kb,
l0_flush,
compact_level0_phase1_value_access,
}
CUSTOM LOGIC
{
@@ -1002,6 +1015,9 @@ impl PageServerConf {
"l0_flush" => {
builder.l0_flush(utils::toml_edit_ext::deserialize_item(item).context("l0_flush")?)
}
"compact_level0_phase1_value_access" => {
builder.compact_level0_phase1_value_access(utils::toml_edit_ext::deserialize_item(item).context("compact_level0_phase1_value_access")?)
}
_ => bail!("unrecognized pageserver option '{key}'"),
}
}
@@ -1086,6 +1102,7 @@ impl PageServerConf {
validate_vectored_get: defaults::DEFAULT_VALIDATE_VECTORED_GET,
ephemeral_bytes_per_memory_kb: defaults::DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB,
l0_flush: L0FlushConfig::default(),
compact_level0_phase1_value_access: CompactL0Phase1ValueAccess::default(),
}
}
}
@@ -1327,6 +1344,7 @@ background_task_maximum_delay = '334 s'
image_compression: defaults::DEFAULT_IMAGE_COMPRESSION,
ephemeral_bytes_per_memory_kb: defaults::DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB,
l0_flush: L0FlushConfig::default(),
compact_level0_phase1_value_access: CompactL0Phase1ValueAccess::default(),
},
"Correct defaults should be used when no config values are provided"
);
@@ -1401,6 +1419,7 @@ background_task_maximum_delay = '334 s'
image_compression: defaults::DEFAULT_IMAGE_COMPRESSION,
ephemeral_bytes_per_memory_kb: defaults::DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB,
l0_flush: L0FlushConfig::default(),
compact_level0_phase1_value_access: CompactL0Phase1ValueAccess::default(),
},
"Should be able to parse all basic config values correctly"
);

View File

@@ -296,6 +296,11 @@ impl From<GetActiveTenantError> for ApiError {
GetActiveTenantError::WaitForActiveTimeout { .. } => {
ApiError::ResourceUnavailable(format!("{}", e).into())
}
GetActiveTenantError::SwitchedTenant => {
// in our HTTP handlers, this error doesn't happen
// TODO: separate error types
ApiError::ResourceUnavailable("switched tenant".into())
}
}
}
}
@@ -2129,14 +2134,24 @@ async fn secondary_download_handler(
let timeout = wait.unwrap_or(Duration::MAX);
let status = match tokio::time::timeout(
let result = tokio::time::timeout(
timeout,
state.secondary_controller.download_tenant(tenant_shard_id),
)
.await
{
// Download job ran to completion.
Ok(Ok(())) => StatusCode::OK,
.await;
let progress = secondary_tenant.progress.lock().unwrap().clone();
let status = match result {
Ok(Ok(())) => {
if progress.layers_downloaded >= progress.layers_total {
// Download job ran to completion
StatusCode::OK
} else {
// Download dropped out without errors because it ran out of time budget
StatusCode::ACCEPTED
}
}
// Edge case: downloads aren't usually fallible: things like a missing heatmap are considered
// okay. We could get an error here in the unlikely edge case that the tenant
// was detached between our check above and executing the download job.
@@ -2146,8 +2161,6 @@ async fn secondary_download_handler(
Err(_) => StatusCode::ACCEPTED,
};
let progress = secondary_tenant.progress.lock().unwrap().clone();
json_response(status, progress)
}

View File

@@ -2,13 +2,23 @@ use std::{num::NonZeroUsize, sync::Arc};
use crate::tenant::ephemeral_file;
#[derive(Default, Debug, PartialEq, Eq, Clone, serde::Deserialize)]
#[derive(Debug, PartialEq, Eq, Clone, serde::Deserialize)]
#[serde(tag = "mode", rename_all = "kebab-case", deny_unknown_fields)]
pub enum L0FlushConfig {
#[default]
PageCached,
#[serde(rename_all = "snake_case")]
Direct { max_concurrency: NonZeroUsize },
Direct {
max_concurrency: NonZeroUsize,
},
}
impl Default for L0FlushConfig {
fn default() -> Self {
Self::Direct {
// TODO: using num_cpus results in different peak memory usage on different instance types.
max_concurrency: NonZeroUsize::new(usize::max(1, num_cpus::get())).unwrap(),
}
}
}
#[derive(Clone)]

View File

@@ -12,6 +12,8 @@ pub mod disk_usage_eviction_task;
pub mod http;
pub mod import_datadir;
pub mod l0_flush;
use futures::{stream::FuturesUnordered, StreamExt};
pub use pageserver_api::keyspace;
use tokio_util::sync::CancellationToken;
pub mod aux_file;
@@ -30,14 +32,13 @@ pub mod walingest;
pub mod walrecord;
pub mod walredo;
use crate::task_mgr::TaskKind;
use camino::Utf8Path;
use deletion_queue::DeletionQueue;
use tenant::{
mgr::{BackgroundPurges, TenantManager},
secondary,
};
use tracing::info;
use tracing::{info, info_span};
/// Current storage format version
///
@@ -63,7 +64,6 @@ pub struct CancellableTask {
pub cancel: CancellationToken,
}
pub struct HttpEndpointListener(pub CancellableTask);
pub struct LibpqEndpointListener(pub CancellableTask);
pub struct ConsumptionMetricsTasks(pub CancellableTask);
pub struct DiskUsageEvictionTask(pub CancellableTask);
impl CancellableTask {
@@ -77,7 +77,7 @@ impl CancellableTask {
#[allow(clippy::too_many_arguments)]
pub async fn shutdown_pageserver(
http_listener: HttpEndpointListener,
libpq_listener: LibpqEndpointListener,
page_service: page_service::Listener,
consumption_metrics_worker: ConsumptionMetricsTasks,
disk_usage_eviction_task: Option<DiskUsageEvictionTask>,
tenant_manager: &TenantManager,
@@ -87,10 +87,83 @@ pub async fn shutdown_pageserver(
exit_code: i32,
) {
use std::time::Duration;
// If the orderly shutdown below takes too long, we still want to make
// sure that all walredo processes are killed and wait()ed on by us, not systemd.
//
// (Leftover walredo processes are the hypothesized trigger for the systemd freezes
// that we keep seeing in prod => https://github.com/neondatabase/cloud/issues/11387.
//
// We use a thread instead of a tokio task because the background runtime is likely busy
// with the final flushing / uploads. This activity here has priority, and due to lack
// of scheduling priority feature sin the tokio scheduler, using a separate thread is
// an effective priority booster.
let walredo_extraordinary_shutdown_thread_span = {
let span = info_span!(parent: None, "walredo_extraordinary_shutdown_thread");
span.follows_from(tracing::Span::current());
span
};
let walredo_extraordinary_shutdown_thread_cancel = CancellationToken::new();
let walredo_extraordinary_shutdown_thread = std::thread::spawn({
let walredo_extraordinary_shutdown_thread_cancel =
walredo_extraordinary_shutdown_thread_cancel.clone();
move || {
let rt = tokio::runtime::Builder::new_current_thread()
.enable_all()
.build()
.unwrap();
let _entered = rt.enter();
let _entered = walredo_extraordinary_shutdown_thread_span.enter();
if let Ok(()) = rt.block_on(tokio::time::timeout(
Duration::from_secs(8),
walredo_extraordinary_shutdown_thread_cancel.cancelled(),
)) {
info!("cancellation requested");
return;
}
let managers = tenant::WALREDO_MANAGERS
.lock()
.unwrap()
// prevents new walredo managers from being inserted
.take()
.expect("only we take()");
// Use FuturesUnordered to get in queue early for each manager's
// heavier_once_cell semaphore wait list.
// Also, for idle tenants that for some reason haven't
// shut down yet, it's quite likely that we're not going
// to get Poll::Pending once.
let mut futs: FuturesUnordered<_> = managers
.into_iter()
.filter_map(|(_, mgr)| mgr.upgrade())
.map(|mgr| async move { tokio::task::unconstrained(mgr.shutdown()).await })
.collect();
info!(count=%futs.len(), "built FuturesUnordered");
let mut last_log_at = std::time::Instant::now();
#[derive(Debug, Default)]
struct Results {
initiated: u64,
already: u64,
}
let mut results = Results::default();
while let Some(we_initiated) = rt.block_on(futs.next()) {
if we_initiated {
results.initiated += 1;
} else {
results.already += 1;
}
if last_log_at.elapsed() > Duration::from_millis(100) {
info!(remaining=%futs.len(), ?results, "progress");
last_log_at = std::time::Instant::now();
}
}
info!(?results, "done");
}
});
// Shut down the libpq endpoint task. This prevents new connections from
// being accepted.
timed(
libpq_listener.0.shutdown(),
let remaining_connections = timed(
page_service.stop_accepting(),
"shutdown LibpqEndpointListener",
Duration::from_secs(1),
)
@@ -108,7 +181,7 @@ pub async fn shutdown_pageserver(
// Shut down any page service tasks: any in-progress work for particular timelines or tenants
// should already have been canclled via mgr::shutdown_all_tenants
timed(
task_mgr::shutdown_tasks(Some(TaskKind::PageRequestHandler), None, None),
remaining_connections.shutdown(),
"shutdown PageRequestHandlers",
Duration::from_secs(1),
)
@@ -162,6 +235,12 @@ pub async fn shutdown_pageserver(
Duration::from_secs(1),
)
.await;
info!("cancel & join walredo_extraordinary_shutdown_thread");
walredo_extraordinary_shutdown_thread_cancel.cancel();
walredo_extraordinary_shutdown_thread.join().unwrap();
info!("walredo_extraordinary_shutdown_thread done");
info!("Shut down successfully completed");
std::process::exit(exit_code);
}

View File

@@ -525,6 +525,15 @@ static RESIDENT_PHYSICAL_SIZE: Lazy<UIntGaugeVec> = Lazy::new(|| {
.expect("failed to define a metric")
});
static VISIBLE_PHYSICAL_SIZE: Lazy<UIntGaugeVec> = Lazy::new(|| {
register_uint_gauge_vec!(
"pageserver_visible_physical_size",
"The size of the layer files present in the pageserver's filesystem.",
&["tenant_id", "shard_id", "timeline_id"]
)
.expect("failed to define a metric")
});
pub(crate) static RESIDENT_PHYSICAL_SIZE_GLOBAL: Lazy<UIntGauge> = Lazy::new(|| {
register_uint_gauge!(
"pageserver_resident_physical_size_global",
@@ -613,7 +622,23 @@ pub(crate) static CIRCUIT_BREAKERS_UNBROKEN: Lazy<IntCounter> = Lazy::new(|| {
pub(crate) static COMPRESSION_IMAGE_INPUT_BYTES: Lazy<IntCounter> = Lazy::new(|| {
register_int_counter!(
"pageserver_compression_image_in_bytes_total",
"Size of uncompressed data written into image layers"
"Size of data written into image layers before compression"
)
.expect("failed to define a metric")
});
pub(crate) static COMPRESSION_IMAGE_INPUT_BYTES_CONSIDERED: Lazy<IntCounter> = Lazy::new(|| {
register_int_counter!(
"pageserver_compression_image_in_bytes_considered",
"Size of potentially compressible data written into image layers before compression"
)
.expect("failed to define a metric")
});
pub(crate) static COMPRESSION_IMAGE_INPUT_BYTES_CHOSEN: Lazy<IntCounter> = Lazy::new(|| {
register_int_counter!(
"pageserver_compression_image_in_bytes_chosen",
"Size of data whose compressed form was written into image layers"
)
.expect("failed to define a metric")
});
@@ -2188,6 +2213,7 @@ pub(crate) struct TimelineMetrics {
pub(crate) layer_count_delta: UIntGauge,
pub standby_horizon_gauge: IntGauge,
pub resident_physical_size_gauge: UIntGauge,
pub visible_physical_size_gauge: UIntGauge,
/// copy of LayeredTimeline.current_logical_size
pub current_logical_size_gauge: UIntGauge,
pub aux_file_size_gauge: IntGauge,
@@ -2310,6 +2336,9 @@ impl TimelineMetrics {
let resident_physical_size_gauge = RESIDENT_PHYSICAL_SIZE
.get_metric_with_label_values(&[&tenant_id, &shard_id, &timeline_id])
.unwrap();
let visible_physical_size_gauge = VISIBLE_PHYSICAL_SIZE
.get_metric_with_label_values(&[&tenant_id, &shard_id, &timeline_id])
.unwrap();
// TODO: we shouldn't expose this metric
let current_logical_size_gauge = CURRENT_LOGICAL_SIZE
.get_metric_with_label_values(&[&tenant_id, &shard_id, &timeline_id])
@@ -2364,6 +2393,7 @@ impl TimelineMetrics {
layer_count_delta,
standby_horizon_gauge,
resident_physical_size_gauge,
visible_physical_size_gauge,
current_logical_size_gauge,
aux_file_size_gauge,
directory_entries_count_gauge,
@@ -2415,6 +2445,7 @@ impl TimelineMetrics {
RESIDENT_PHYSICAL_SIZE_GLOBAL.sub(self.resident_physical_size_get());
let _ = RESIDENT_PHYSICAL_SIZE.remove_label_values(&[tenant_id, shard_id, timeline_id]);
}
let _ = VISIBLE_PHYSICAL_SIZE.remove_label_values(&[tenant_id, shard_id, timeline_id]);
let _ = CURRENT_LOGICAL_SIZE.remove_label_values(&[tenant_id, shard_id, timeline_id]);
if let Some(metric) = Lazy::get(&DIRECTORY_ENTRIES_COUNT) {
let _ = metric.remove_label_values(&[tenant_id, shard_id, timeline_id]);

File diff suppressed because it is too large Load Diff

View File

@@ -8,8 +8,7 @@ use std::time::Duration;
pub use pageserver_api::key::{Key, KEY_SIZE};
/// A 'value' stored for a one Key.
#[derive(Debug, Clone, Serialize, Deserialize)]
#[cfg_attr(test, derive(PartialEq))]
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
pub enum Value {
/// An Image value contains a full copy of the value
Image(Bytes),

View File

@@ -33,6 +33,7 @@ use remote_storage::GenericRemoteStorage;
use remote_storage::TimeoutOrCancel;
use std::collections::BTreeMap;
use std::fmt;
use std::sync::Weak;
use std::time::SystemTime;
use storage_broker::BrokerClientChannel;
use tokio::io::BufReader;
@@ -312,14 +313,66 @@ impl std::fmt::Debug for Tenant {
}
pub(crate) enum WalRedoManager {
Prod(PostgresRedoManager),
Prod(WalredoManagerId, PostgresRedoManager),
#[cfg(test)]
Test(harness::TestRedoManager),
}
impl From<PostgresRedoManager> for WalRedoManager {
fn from(mgr: PostgresRedoManager) -> Self {
Self::Prod(mgr)
#[derive(thiserror::Error, Debug)]
#[error("pageserver is shutting down")]
pub(crate) struct GlobalShutDown;
impl WalRedoManager {
pub(crate) fn new(mgr: PostgresRedoManager) -> Result<Arc<Self>, GlobalShutDown> {
let id = WalredoManagerId::next();
let arc = Arc::new(Self::Prod(id, mgr));
let mut guard = WALREDO_MANAGERS.lock().unwrap();
match &mut *guard {
Some(map) => {
map.insert(id, Arc::downgrade(&arc));
Ok(arc)
}
None => Err(GlobalShutDown),
}
}
}
impl Drop for WalRedoManager {
fn drop(&mut self) {
match self {
Self::Prod(id, _) => {
let mut guard = WALREDO_MANAGERS.lock().unwrap();
if let Some(map) = &mut *guard {
map.remove(id).expect("new() registers, drop() unregisters");
}
}
#[cfg(test)]
Self::Test(_) => {
// Not applicable to test redo manager
}
}
}
}
/// Global registry of all walredo managers so that [`crate::shutdown_pageserver`] can shut down
/// the walredo processes outside of the regular order.
///
/// This is necessary to work around a systemd bug where it freezes if there are
/// walredo processes left => <https://github.com/neondatabase/cloud/issues/11387>
#[allow(clippy::type_complexity)]
pub(crate) static WALREDO_MANAGERS: once_cell::sync::Lazy<
Mutex<Option<HashMap<WalredoManagerId, Weak<WalRedoManager>>>>,
> = once_cell::sync::Lazy::new(|| Mutex::new(Some(HashMap::new())));
#[derive(PartialEq, Eq, Hash, Clone, Copy, Debug)]
pub(crate) struct WalredoManagerId(u64);
impl WalredoManagerId {
pub fn next() -> Self {
static NEXT: std::sync::atomic::AtomicU64 = std::sync::atomic::AtomicU64::new(1);
let id = NEXT.fetch_add(1, std::sync::atomic::Ordering::Relaxed);
if id == 0 {
panic!("WalredoManagerId::new() returned 0, indicating wraparound, risking it's no longer unique");
}
Self(id)
}
}
@@ -331,19 +384,20 @@ impl From<harness::TestRedoManager> for WalRedoManager {
}
impl WalRedoManager {
pub(crate) async fn shutdown(&self) {
pub(crate) async fn shutdown(&self) -> bool {
match self {
Self::Prod(mgr) => mgr.shutdown().await,
Self::Prod(_, mgr) => mgr.shutdown().await,
#[cfg(test)]
Self::Test(_) => {
// Not applicable to test redo manager
true
}
}
}
pub(crate) fn maybe_quiesce(&self, idle_timeout: Duration) {
match self {
Self::Prod(mgr) => mgr.maybe_quiesce(idle_timeout),
Self::Prod(_, mgr) => mgr.maybe_quiesce(idle_timeout),
#[cfg(test)]
Self::Test(_) => {
// Not applicable to test redo manager
@@ -363,7 +417,7 @@ impl WalRedoManager {
pg_version: u32,
) -> Result<bytes::Bytes, walredo::Error> {
match self {
Self::Prod(mgr) => {
Self::Prod(_, mgr) => {
mgr.request_redo(key, lsn, base_img, records, pg_version)
.await
}
@@ -377,7 +431,7 @@ impl WalRedoManager {
pub(crate) fn status(&self) -> Option<WalRedoManagerStatus> {
match self {
WalRedoManager::Prod(m) => Some(m.status()),
WalRedoManager::Prod(_, m) => Some(m.status()),
#[cfg(test)]
WalRedoManager::Test(_) => None,
}
@@ -386,6 +440,8 @@ impl WalRedoManager {
#[derive(Debug, thiserror::Error, PartialEq, Eq)]
pub enum GetTimelineError {
#[error("Timeline is shutting down")]
ShuttingDown,
#[error("Timeline {tenant_id}/{timeline_id} is not active, state: {state:?}")]
NotActive {
tenant_id: TenantShardId,
@@ -675,11 +731,9 @@ impl Tenant {
init_order: Option<InitializationOrder>,
mode: SpawnMode,
ctx: &RequestContext,
) -> Arc<Tenant> {
let wal_redo_manager = Arc::new(WalRedoManager::from(PostgresRedoManager::new(
conf,
tenant_shard_id,
)));
) -> Result<Arc<Tenant>, GlobalShutDown> {
let wal_redo_manager =
WalRedoManager::new(PostgresRedoManager::new(conf, tenant_shard_id))?;
let TenantSharedResources {
broker_client,
@@ -878,7 +932,7 @@ impl Tenant {
}
.instrument(tracing::info_span!(parent: None, "attach", tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(), gen=?generation)),
);
tenant
Ok(tenant)
}
#[instrument(skip_all)]
@@ -1580,7 +1634,7 @@ impl Tenant {
self: Arc<Self>,
timeline_id: TimelineId,
) -> Result<(), DeleteTimelineError> {
DeleteTimelineFlow::run(&self, timeline_id, false).await?;
DeleteTimelineFlow::run(&self, timeline_id).await?;
Ok(())
}
@@ -6909,7 +6963,11 @@ mod tests {
vec![
// Image layer at GC horizon
PersistentLayerKey {
key_range: Key::MIN..Key::MAX,
key_range: {
let mut key = Key::MAX;
key.field6 -= 1;
Key::MIN..key
},
lsn_range: Lsn(0x30)..Lsn(0x31),
is_delta: false
},
@@ -6928,6 +6986,15 @@ mod tests {
]
);
// increase GC horizon and compact again
{
// Update GC info
let mut guard = tline.gc_info.write().unwrap();
guard.cutoffs.time = Lsn(0x40);
guard.cutoffs.space = Lsn(0x40);
}
tline.compact_with_gc(&cancel, &ctx).await.unwrap();
Ok(())
}
@@ -7279,6 +7346,15 @@ mod tests {
);
}
// increase GC horizon and compact again
{
// Update GC info
let mut guard = tline.gc_info.write().unwrap();
guard.cutoffs.time = Lsn(0x40);
guard.cutoffs.space = Lsn(0x40);
}
tline.compact_with_gc(&cancel, &ctx).await.unwrap();
Ok(())
}
@@ -7347,6 +7423,7 @@ mod tests {
Lsn(0x60),
&[Lsn(0x20), Lsn(0x40), Lsn(0x50)],
3,
None,
)
.await
.unwrap();
@@ -7471,7 +7548,7 @@ mod tests {
),
];
let res = tline
.generate_key_retention(key, &history, Lsn(0x60), &[Lsn(0x40), Lsn(0x50)], 3)
.generate_key_retention(key, &history, Lsn(0x60), &[Lsn(0x40), Lsn(0x50)], 3, None)
.await
.unwrap();
let expected_res = KeyHistoryRetention {
@@ -7517,6 +7594,114 @@ mod tests {
};
assert_eq!(res, expected_res);
// In case of branch compaction, the branch itself does not have the full history, and we need to provide
// the ancestor image in the test case.
let history = vec![
(
key,
Lsn(0x20),
Value::WalRecord(NeonWalRecord::wal_append(";0x20")),
),
(
key,
Lsn(0x30),
Value::WalRecord(NeonWalRecord::wal_append(";0x30")),
),
(
key,
Lsn(0x40),
Value::WalRecord(NeonWalRecord::wal_append(";0x40")),
),
(
key,
Lsn(0x70),
Value::WalRecord(NeonWalRecord::wal_append(";0x70")),
),
];
let res = tline
.generate_key_retention(
key,
&history,
Lsn(0x60),
&[],
3,
Some((key, Lsn(0x10), Bytes::copy_from_slice(b"0x10"))),
)
.await
.unwrap();
let expected_res = KeyHistoryRetention {
below_horizon: vec![(
Lsn(0x60),
KeyLogAtLsn(vec![(
Lsn(0x60),
Value::Image(Bytes::copy_from_slice(b"0x10;0x20;0x30;0x40")), // use the ancestor image to reconstruct the page
)]),
)],
above_horizon: KeyLogAtLsn(vec![(
Lsn(0x70),
Value::WalRecord(NeonWalRecord::wal_append(";0x70")),
)]),
};
assert_eq!(res, expected_res);
let history = vec![
(
key,
Lsn(0x20),
Value::WalRecord(NeonWalRecord::wal_append(";0x20")),
),
(
key,
Lsn(0x40),
Value::WalRecord(NeonWalRecord::wal_append(";0x40")),
),
(
key,
Lsn(0x60),
Value::WalRecord(NeonWalRecord::wal_append(";0x60")),
),
(
key,
Lsn(0x70),
Value::WalRecord(NeonWalRecord::wal_append(";0x70")),
),
];
let res = tline
.generate_key_retention(
key,
&history,
Lsn(0x60),
&[Lsn(0x30)],
3,
Some((key, Lsn(0x10), Bytes::copy_from_slice(b"0x10"))),
)
.await
.unwrap();
let expected_res = KeyHistoryRetention {
below_horizon: vec![
(
Lsn(0x30),
KeyLogAtLsn(vec![(
Lsn(0x20),
Value::WalRecord(NeonWalRecord::wal_append(";0x20")),
)]),
),
(
Lsn(0x60),
KeyLogAtLsn(vec![(
Lsn(0x60),
Value::Image(Bytes::copy_from_slice(b"0x10;0x20;0x40;0x60")),
)]),
),
],
above_horizon: KeyLogAtLsn(vec![(
Lsn(0x70),
Value::WalRecord(NeonWalRecord::wal_append(";0x70")),
)]),
};
assert_eq!(res, expected_res);
Ok(())
}
@@ -7674,6 +7859,10 @@ mod tests {
];
let verify_result = || async {
let gc_horizon = {
let gc_info = tline.gc_info.read().unwrap();
gc_info.cutoffs.time
};
for idx in 0..10 {
assert_eq!(
tline
@@ -7684,7 +7873,7 @@ mod tests {
);
assert_eq!(
tline
.get(get_key(idx as u32), Lsn(0x30), &ctx)
.get(get_key(idx as u32), gc_horizon, &ctx)
.await
.unwrap(),
&expected_result_at_gc_horizon[idx]
@@ -7710,6 +7899,205 @@ mod tests {
let cancel = CancellationToken::new();
tline.compact_with_gc(&cancel, &ctx).await.unwrap();
verify_result().await;
// compact again
tline.compact_with_gc(&cancel, &ctx).await.unwrap();
verify_result().await;
// increase GC horizon and compact again
{
// Update GC info
let mut guard = tline.gc_info.write().unwrap();
guard.cutoffs.time = Lsn(0x38);
guard.cutoffs.space = Lsn(0x38);
}
tline.compact_with_gc(&cancel, &ctx).await.unwrap();
verify_result().await; // no wals between 0x30 and 0x38, so we should obtain the same result
// not increasing the GC horizon and compact again
tline.compact_with_gc(&cancel, &ctx).await.unwrap();
verify_result().await;
Ok(())
}
#[tokio::test]
async fn test_simple_bottom_most_compaction_on_branch() -> anyhow::Result<()> {
let harness = TenantHarness::create("test_simple_bottom_most_compaction_on_branch").await?;
let (tenant, ctx) = harness.load().await;
fn get_key(id: u32) -> Key {
let mut key = Key::from_hex("000000000033333333444444445500000000").unwrap();
key.field6 = id;
key
}
let img_layer = (0..10)
.map(|id| (get_key(id), Bytes::from(format!("value {id}@0x10"))))
.collect_vec();
let delta1 = vec![
(
get_key(1),
Lsn(0x20),
Value::WalRecord(NeonWalRecord::wal_append("@0x20")),
),
(
get_key(2),
Lsn(0x30),
Value::WalRecord(NeonWalRecord::wal_append("@0x30")),
),
(
get_key(3),
Lsn(0x28),
Value::WalRecord(NeonWalRecord::wal_append("@0x28")),
),
(
get_key(3),
Lsn(0x30),
Value::WalRecord(NeonWalRecord::wal_append("@0x30")),
),
(
get_key(3),
Lsn(0x40),
Value::WalRecord(NeonWalRecord::wal_append("@0x40")),
),
];
let delta2 = vec![
(
get_key(5),
Lsn(0x20),
Value::WalRecord(NeonWalRecord::wal_append("@0x20")),
),
(
get_key(6),
Lsn(0x20),
Value::WalRecord(NeonWalRecord::wal_append("@0x20")),
),
];
let delta3 = vec![
(
get_key(8),
Lsn(0x48),
Value::WalRecord(NeonWalRecord::wal_append("@0x48")),
),
(
get_key(9),
Lsn(0x48),
Value::WalRecord(NeonWalRecord::wal_append("@0x48")),
),
];
let parent_tline = tenant
.create_test_timeline_with_layers(
TIMELINE_ID,
Lsn(0x10),
DEFAULT_PG_VERSION,
&ctx,
vec![], // delta layers
vec![(Lsn(0x18), img_layer)], // image layers
Lsn(0x18),
)
.await?;
parent_tline.add_extra_test_dense_keyspace(KeySpace::single(get_key(0)..get_key(10)));
let branch_tline = tenant
.branch_timeline_test_with_layers(
&parent_tline,
NEW_TIMELINE_ID,
Some(Lsn(0x18)),
&ctx,
vec![
DeltaLayerTestDesc::new_with_inferred_key_range(Lsn(0x20)..Lsn(0x48), delta1),
DeltaLayerTestDesc::new_with_inferred_key_range(Lsn(0x20)..Lsn(0x48), delta2),
DeltaLayerTestDesc::new_with_inferred_key_range(Lsn(0x48)..Lsn(0x50), delta3),
], // delta layers
vec![], // image layers
Lsn(0x50),
)
.await?;
branch_tline.add_extra_test_dense_keyspace(KeySpace::single(get_key(0)..get_key(10)));
{
// Update GC info
let mut guard = parent_tline.gc_info.write().unwrap();
*guard = GcInfo {
retain_lsns: vec![(Lsn(0x18), branch_tline.timeline_id)],
cutoffs: GcCutoffs {
time: Lsn(0x10),
space: Lsn(0x10),
},
leases: Default::default(),
within_ancestor_pitr: false,
};
}
{
// Update GC info
let mut guard = branch_tline.gc_info.write().unwrap();
*guard = GcInfo {
retain_lsns: vec![(Lsn(0x40), branch_tline.timeline_id)],
cutoffs: GcCutoffs {
time: Lsn(0x50),
space: Lsn(0x50),
},
leases: Default::default(),
within_ancestor_pitr: false,
};
}
let expected_result_at_gc_horizon = [
Bytes::from_static(b"value 0@0x10"),
Bytes::from_static(b"value 1@0x10@0x20"),
Bytes::from_static(b"value 2@0x10@0x30"),
Bytes::from_static(b"value 3@0x10@0x28@0x30@0x40"),
Bytes::from_static(b"value 4@0x10"),
Bytes::from_static(b"value 5@0x10@0x20"),
Bytes::from_static(b"value 6@0x10@0x20"),
Bytes::from_static(b"value 7@0x10"),
Bytes::from_static(b"value 8@0x10@0x48"),
Bytes::from_static(b"value 9@0x10@0x48"),
];
let expected_result_at_lsn_40 = [
Bytes::from_static(b"value 0@0x10"),
Bytes::from_static(b"value 1@0x10@0x20"),
Bytes::from_static(b"value 2@0x10@0x30"),
Bytes::from_static(b"value 3@0x10@0x28@0x30@0x40"),
Bytes::from_static(b"value 4@0x10"),
Bytes::from_static(b"value 5@0x10@0x20"),
Bytes::from_static(b"value 6@0x10@0x20"),
Bytes::from_static(b"value 7@0x10"),
Bytes::from_static(b"value 8@0x10"),
Bytes::from_static(b"value 9@0x10"),
];
let verify_result = || async {
for idx in 0..10 {
assert_eq!(
branch_tline
.get(get_key(idx as u32), Lsn(0x50), &ctx)
.await
.unwrap(),
&expected_result_at_gc_horizon[idx]
);
assert_eq!(
branch_tline
.get(get_key(idx as u32), Lsn(0x40), &ctx)
.await
.unwrap(),
&expected_result_at_lsn_40[idx]
);
}
};
verify_result().await;
let cancel = CancellationToken::new();
branch_tline.compact_with_gc(&cancel, &ctx).await.unwrap();
verify_result().await;

View File

@@ -28,6 +28,12 @@ use crate::virtual_file::VirtualFile;
use std::cmp::min;
use std::io::{Error, ErrorKind};
#[derive(Copy, Clone, Debug)]
pub struct CompressionInfo {
pub written_compressed: bool,
pub compressed_size: Option<usize>,
}
impl<'a> BlockCursor<'a> {
/// Read a blob into a new buffer.
pub async fn read_blob(
@@ -273,8 +279,10 @@ impl<const BUFFERED: bool> BlobWriter<BUFFERED> {
srcbuf: B,
ctx: &RequestContext,
) -> (B::Buf, Result<u64, Error>) {
self.write_blob_maybe_compressed(srcbuf, ctx, ImageCompressionAlgorithm::Disabled)
.await
let (buf, res) = self
.write_blob_maybe_compressed(srcbuf, ctx, ImageCompressionAlgorithm::Disabled)
.await;
(buf, res.map(|(off, _compression_info)| off))
}
/// Write a blob of data. Returns the offset that it was written to,
@@ -284,8 +292,12 @@ impl<const BUFFERED: bool> BlobWriter<BUFFERED> {
srcbuf: B,
ctx: &RequestContext,
algorithm: ImageCompressionAlgorithm,
) -> (B::Buf, Result<u64, Error>) {
) -> (B::Buf, Result<(u64, CompressionInfo), Error>) {
let offset = self.offset;
let mut compression_info = CompressionInfo {
written_compressed: false,
compressed_size: None,
};
let len = srcbuf.bytes_init();
@@ -328,7 +340,9 @@ impl<const BUFFERED: bool> BlobWriter<BUFFERED> {
encoder.write_all(&slice[..]).await.unwrap();
encoder.shutdown().await.unwrap();
let compressed = encoder.into_inner();
compression_info.compressed_size = Some(compressed.len());
if compressed.len() < len {
compression_info.written_compressed = true;
let compressed_len = compressed.len();
compressed_buf = Some(compressed);
(BYTE_ZSTD, compressed_len, slice.into_inner())
@@ -359,7 +373,7 @@ impl<const BUFFERED: bool> BlobWriter<BUFFERED> {
} else {
self.write_all(srcbuf, ctx).await
};
(srcbuf, res.map(|_| offset))
(srcbuf, res.map(|_| (offset, compression_info)))
}
}
@@ -416,12 +430,14 @@ pub(crate) mod tests {
let mut wtr = BlobWriter::<BUFFERED>::new(file, 0);
for blob in blobs.iter() {
let (_, res) = if compression {
wtr.write_blob_maybe_compressed(
blob.clone(),
ctx,
ImageCompressionAlgorithm::Zstd { level: Some(1) },
)
.await
let res = wtr
.write_blob_maybe_compressed(
blob.clone(),
ctx,
ImageCompressionAlgorithm::Zstd { level: Some(1) },
)
.await;
(res.0, res.1.map(|(off, _)| off))
} else {
wtr.write_blob(blob.clone(), ctx).await
};

View File

@@ -296,13 +296,19 @@ where
let mut stack = Vec::new();
stack.push((self.root_blk, None));
let block_cursor = self.reader.block_cursor();
let mut node_buf = [0_u8; PAGE_SZ];
while let Some((node_blknum, opt_iter)) = stack.pop() {
// Locate the node.
let node_buf = block_cursor
// Read the node, through the PS PageCache, into local variable `node_buf`.
// We could keep the page cache read guard alive, but, at the time of writing,
// we run quite small PS PageCache s => can't risk running out of
// PageCache space because this stream isn't consumed fast enough.
let page_read_guard = block_cursor
.read_blk(self.start_blk + node_blknum, ctx)
.await?;
node_buf.copy_from_slice(page_read_guard.as_ref());
drop(page_read_guard); // drop page cache read guard early
let node = OnDiskNode::deparse(node_buf.as_ref())?;
let node = OnDiskNode::deparse(&node_buf)?;
let prefix_len = node.prefix_len as usize;
let suffix_len = node.suffix_len as usize;
@@ -345,6 +351,7 @@ where
Either::Left(idx..node.num_children.into())
};
// idx points to the first match now. Keep going from there
while let Some(idx) = iter.next() {
let key_off = idx * suffix_len;

View File

@@ -51,7 +51,8 @@ use crate::keyspace::KeyPartitioning;
use crate::repository::Key;
use crate::tenant::storage_layer::InMemoryLayer;
use anyhow::Result;
use pageserver_api::keyspace::KeySpaceAccum;
use pageserver_api::keyspace::{KeySpace, KeySpaceAccum};
use range_set_blaze::{CheckSortedDisjoint, RangeSetBlaze};
use std::collections::{HashMap, VecDeque};
use std::iter::Peekable;
use std::ops::Range;
@@ -61,7 +62,7 @@ use utils::lsn::Lsn;
use historic_layer_coverage::BufferedHistoricLayerCoverage;
pub use historic_layer_coverage::LayerKey;
use super::storage_layer::PersistentLayerDesc;
use super::storage_layer::{LayerVisibilityHint, PersistentLayerDesc};
///
/// LayerMap tracks what layers exist on a timeline.
@@ -871,11 +872,183 @@ impl LayerMap {
println!("End dump LayerMap");
Ok(())
}
/// `read_points` represent the tip of a timeline and any branch points, i.e. the places
/// where we expect to serve reads.
///
/// This function is O(N) and should be called infrequently. The caller is responsible for
/// looking up and updating the Layer objects for these layer descriptors.
pub fn get_visibility(
&self,
mut read_points: Vec<Lsn>,
) -> (
Vec<(Arc<PersistentLayerDesc>, LayerVisibilityHint)>,
KeySpace,
) {
// This is like a KeySpace, but this type is intended for efficient unions with image layer ranges, whereas
// KeySpace is intended to be composed statically and iterated over.
struct KeyShadow {
// Map of range start to range end
inner: RangeSetBlaze<i128>,
}
impl KeyShadow {
fn new() -> Self {
Self {
inner: Default::default(),
}
}
fn contains(&self, range: Range<Key>) -> bool {
let range_incl = range.start.to_i128()..=range.end.to_i128() - 1;
self.inner.is_superset(&RangeSetBlaze::from_sorted_disjoint(
CheckSortedDisjoint::from([range_incl]),
))
}
/// Add the input range to the keys covered by self.
///
/// Return true if inserting this range covered some keys that were previously not covered
fn cover(&mut self, insert: Range<Key>) -> bool {
let range_incl = insert.start.to_i128()..=insert.end.to_i128() - 1;
self.inner.ranges_insert(range_incl)
}
fn reset(&mut self) {
self.inner = Default::default();
}
fn to_keyspace(&self) -> KeySpace {
let mut accum = KeySpaceAccum::new();
for range_incl in self.inner.ranges() {
let range = Range {
start: Key::from_i128(*range_incl.start()),
end: Key::from_i128(range_incl.end() + 1),
};
accum.add_range(range)
}
accum.to_keyspace()
}
}
// The 'shadow' will be updated as we sweep through the layers: an image layer subtracts from the shadow,
// and a ReadPoint
read_points.sort_by_key(|rp| rp.0);
let mut shadow = KeyShadow::new();
// We will interleave all our read points and layers into a sorted collection
enum Item {
ReadPoint { lsn: Lsn },
Layer(Arc<PersistentLayerDesc>),
}
let mut items = Vec::with_capacity(self.historic.len() + read_points.len());
items.extend(self.iter_historic_layers().map(Item::Layer));
items.extend(
read_points
.into_iter()
.map(|rp| Item::ReadPoint { lsn: rp }),
);
// Ordering: we want to iterate like this:
// 1. Highest LSNs first
// 2. Consider images before deltas if they end at the same LSNs (images cover deltas)
// 3. Consider ReadPoints before image layers if they're at the same LSN (readpoints make that image visible)
items.sort_by_key(|item| {
std::cmp::Reverse(match item {
Item::Layer(layer) => {
if layer.is_delta() {
(Lsn(layer.get_lsn_range().end.0 - 1), 0)
} else {
(layer.image_layer_lsn(), 1)
}
}
Item::ReadPoint { lsn } => (*lsn, 2),
})
});
let mut results = Vec::with_capacity(self.historic.len());
let mut maybe_covered_deltas: Vec<Arc<PersistentLayerDesc>> = Vec::new();
for item in items {
let (reached_lsn, is_readpoint) = match &item {
Item::ReadPoint { lsn } => (lsn, true),
Item::Layer(layer) => (&layer.lsn_range.start, false),
};
maybe_covered_deltas.retain(|d| {
if *reached_lsn >= d.lsn_range.start && is_readpoint {
// We encountered a readpoint within the delta layer: it is visible
results.push((d.clone(), LayerVisibilityHint::Visible));
false
} else if *reached_lsn < d.lsn_range.start {
// We passed the layer's range without encountering a read point: it is not visible
results.push((d.clone(), LayerVisibilityHint::Covered));
false
} else {
// We're still in the delta layer: continue iterating
true
}
});
match item {
Item::ReadPoint { lsn: _lsn } => {
// TODO: propagate the child timeline's shadow from their own run of this function, so that we don't have
// to assume that the whole key range is visible at the branch point.
shadow.reset();
}
Item::Layer(layer) => {
let visibility = if layer.is_delta() {
if shadow.contains(layer.get_key_range()) {
// If a layer isn't visible based on current state, we must defer deciding whether
// it is truly not visible until we have advanced past the delta's range: we might
// encounter another branch point within this delta layer's LSN range.
maybe_covered_deltas.push(layer);
continue;
} else {
LayerVisibilityHint::Visible
}
} else {
let modified = shadow.cover(layer.get_key_range());
if modified {
// An image layer in a region which wasn't fully covered yet: this layer is visible, but layers below it will be covered
LayerVisibilityHint::Visible
} else {
// An image layer in a region that was already covered
LayerVisibilityHint::Covered
}
};
results.push((layer, visibility));
}
}
}
// Drain any remaining maybe_covered deltas
results.extend(
maybe_covered_deltas
.into_iter()
.map(|d| (d, LayerVisibilityHint::Covered)),
);
(results, shadow.to_keyspace())
}
}
#[cfg(test)]
mod tests {
use pageserver_api::keyspace::KeySpace;
use crate::tenant::{storage_layer::LayerName, IndexPart};
use pageserver_api::{
key::DBDIR_KEY,
keyspace::{KeySpace, KeySpaceRandomAccum},
};
use std::{collections::HashMap, path::PathBuf};
use utils::{
id::{TenantId, TimelineId},
shard::TenantShardId,
};
use super::*;
@@ -1002,4 +1175,299 @@ mod tests {
}
}
}
#[test]
fn layer_visibility_basic() {
// A simple synthetic input, as a smoke test.
let tenant_shard_id = TenantShardId::unsharded(TenantId::generate());
let timeline_id = TimelineId::generate();
let mut layer_map = LayerMap::default();
let mut updates = layer_map.batch_update();
const FAKE_LAYER_SIZE: u64 = 1024;
let inject_delta = |updates: &mut BatchedUpdates,
key_start: i128,
key_end: i128,
lsn_start: u64,
lsn_end: u64| {
let desc = PersistentLayerDesc::new_delta(
tenant_shard_id,
timeline_id,
Range {
start: Key::from_i128(key_start),
end: Key::from_i128(key_end),
},
Range {
start: Lsn(lsn_start),
end: Lsn(lsn_end),
},
1024,
);
updates.insert_historic(desc.clone());
desc
};
let inject_image =
|updates: &mut BatchedUpdates, key_start: i128, key_end: i128, lsn: u64| {
let desc = PersistentLayerDesc::new_img(
tenant_shard_id,
timeline_id,
Range {
start: Key::from_i128(key_start),
end: Key::from_i128(key_end),
},
Lsn(lsn),
FAKE_LAYER_SIZE,
);
updates.insert_historic(desc.clone());
desc
};
//
// Construct our scenario: the following lines go in backward-LSN order, constructing the various scenarios
// we expect to handle. You can follow these examples through in the same order as they would be processed
// by the function under test.
//
let mut read_points = vec![Lsn(1000)];
// A delta ahead of any image layer
let ahead_layer = inject_delta(&mut updates, 10, 20, 101, 110);
// An image layer is visible and covers some layers beneath itself
let visible_covering_img = inject_image(&mut updates, 5, 25, 99);
// A delta layer covered by the image layer: should be covered
let covered_delta = inject_delta(&mut updates, 10, 20, 90, 100);
// A delta layer partially covered by an image layer: should be visible
let partially_covered_delta = inject_delta(&mut updates, 1, 7, 90, 100);
// A delta layer not covered by an image layer: should be visible
let not_covered_delta = inject_delta(&mut updates, 1, 4, 90, 100);
// An image layer covered by the image layer above: should be covered
let covered_image = inject_image(&mut updates, 10, 20, 89);
// An image layer partially covered by an image layer: should be visible
let partially_covered_image = inject_image(&mut updates, 1, 7, 89);
// An image layer not covered by an image layer: should be visible
let not_covered_image = inject_image(&mut updates, 1, 4, 89);
// A read point: this will make subsequent layers below here visible, even if there are
// more recent layers covering them.
read_points.push(Lsn(80));
// A delta layer covered by an earlier image layer, but visible to a readpoint below that covering layer
let covered_delta_below_read_point = inject_delta(&mut updates, 10, 20, 70, 79);
// A delta layer whose end LSN is covered, but where a read point is present partway through its LSN range:
// the read point should make it visible, even though its end LSN is covered
let covering_img_between_read_points = inject_image(&mut updates, 10, 20, 69);
let covered_delta_between_read_points = inject_delta(&mut updates, 10, 15, 67, 69);
read_points.push(Lsn(65));
let covered_delta_intersects_read_point = inject_delta(&mut updates, 15, 20, 60, 69);
let visible_img_after_last_read_point = inject_image(&mut updates, 10, 20, 65);
updates.flush();
let (layer_visibilities, shadow) = layer_map.get_visibility(read_points);
let layer_visibilities = layer_visibilities.into_iter().collect::<HashMap<_, _>>();
assert_eq!(
layer_visibilities.get(&ahead_layer),
Some(&LayerVisibilityHint::Visible)
);
assert_eq!(
layer_visibilities.get(&visible_covering_img),
Some(&LayerVisibilityHint::Visible)
);
assert_eq!(
layer_visibilities.get(&covered_delta),
Some(&LayerVisibilityHint::Covered)
);
assert_eq!(
layer_visibilities.get(&partially_covered_delta),
Some(&LayerVisibilityHint::Visible)
);
assert_eq!(
layer_visibilities.get(&not_covered_delta),
Some(&LayerVisibilityHint::Visible)
);
assert_eq!(
layer_visibilities.get(&covered_image),
Some(&LayerVisibilityHint::Covered)
);
assert_eq!(
layer_visibilities.get(&partially_covered_image),
Some(&LayerVisibilityHint::Visible)
);
assert_eq!(
layer_visibilities.get(&not_covered_image),
Some(&LayerVisibilityHint::Visible)
);
assert_eq!(
layer_visibilities.get(&covered_delta_below_read_point),
Some(&LayerVisibilityHint::Visible)
);
assert_eq!(
layer_visibilities.get(&covering_img_between_read_points),
Some(&LayerVisibilityHint::Visible)
);
assert_eq!(
layer_visibilities.get(&covered_delta_between_read_points),
Some(&LayerVisibilityHint::Covered)
);
assert_eq!(
layer_visibilities.get(&covered_delta_intersects_read_point),
Some(&LayerVisibilityHint::Visible)
);
assert_eq!(
layer_visibilities.get(&visible_img_after_last_read_point),
Some(&LayerVisibilityHint::Visible)
);
// Shadow should include all the images below the last read point
let expected_shadow = KeySpace {
ranges: vec![Key::from_i128(10)..Key::from_i128(20)],
};
assert_eq!(shadow, expected_shadow);
}
fn fixture_path(relative: &str) -> PathBuf {
PathBuf::from(env!("CARGO_MANIFEST_DIR")).join(relative)
}
#[test]
fn layer_visibility_realistic() {
// Load a large example layermap
let index_raw = std::fs::read_to_string(fixture_path(
"test_data/indices/mixed_workload/index_part.json",
))
.unwrap();
let index: IndexPart = serde_json::from_str::<IndexPart>(&index_raw).unwrap();
let tenant_id = TenantId::generate();
let tenant_shard_id = TenantShardId::unsharded(tenant_id);
let timeline_id = TimelineId::generate();
let mut layer_map = LayerMap::default();
let mut updates = layer_map.batch_update();
for (layer_name, layer_metadata) in index.layer_metadata {
let layer_desc = match layer_name {
LayerName::Image(layer_name) => PersistentLayerDesc {
key_range: layer_name.key_range.clone(),
lsn_range: layer_name.lsn_as_range(),
tenant_shard_id,
timeline_id,
is_delta: false,
file_size: layer_metadata.file_size,
},
LayerName::Delta(layer_name) => PersistentLayerDesc {
key_range: layer_name.key_range,
lsn_range: layer_name.lsn_range,
tenant_shard_id,
timeline_id,
is_delta: true,
file_size: layer_metadata.file_size,
},
};
updates.insert_historic(layer_desc);
}
updates.flush();
let read_points = vec![index.metadata.disk_consistent_lsn()];
let (layer_visibilities, shadow) = layer_map.get_visibility(read_points);
for (layer_desc, visibility) in &layer_visibilities {
tracing::info!("{layer_desc:?}: {visibility:?}");
eprintln!("{layer_desc:?}: {visibility:?}");
}
// The shadow should be non-empty, since there were some image layers
assert!(!shadow.ranges.is_empty());
// At least some layers should be marked covered
assert!(layer_visibilities
.iter()
.any(|i| matches!(i.1, LayerVisibilityHint::Covered)));
let layer_visibilities = layer_visibilities.into_iter().collect::<HashMap<_, _>>();
// Brute force validation: a layer should be marked covered if and only if there are image layers above it in LSN order which cover it
for (layer_desc, visible) in &layer_visibilities {
let mut coverage = KeySpaceRandomAccum::new();
let mut covered_by = Vec::new();
for other_layer in layer_map.iter_historic_layers() {
if &other_layer == layer_desc {
continue;
}
if !other_layer.is_delta()
&& other_layer.image_layer_lsn() >= Lsn(layer_desc.get_lsn_range().end.0 - 1)
&& other_layer.key_range.start <= layer_desc.key_range.end
&& layer_desc.key_range.start <= other_layer.key_range.end
{
coverage.add_range(other_layer.get_key_range());
covered_by.push((*other_layer).clone());
}
}
let coverage = coverage.to_keyspace();
let expect_visible = if coverage.ranges.len() == 1
&& coverage.contains(&layer_desc.key_range.start)
&& coverage.contains(&Key::from_i128(layer_desc.key_range.end.to_i128() - 1))
{
LayerVisibilityHint::Covered
} else {
LayerVisibilityHint::Visible
};
if expect_visible != *visible {
eprintln!(
"Layer {}..{} @ {}..{} (delta={}) is {visible:?}, should be {expect_visible:?}",
layer_desc.key_range.start,
layer_desc.key_range.end,
layer_desc.lsn_range.start,
layer_desc.lsn_range.end,
layer_desc.is_delta()
);
if expect_visible == LayerVisibilityHint::Covered {
eprintln!("Covered by:");
for other in covered_by {
eprintln!(
" {}..{} @ {}",
other.get_key_range().start,
other.get_key_range().end,
other.image_layer_lsn()
);
}
if let Some(range) = coverage.ranges.first() {
eprintln!(
"Total coverage from contributing layers: {}..{}",
range.start, range.end
);
} else {
eprintln!(
"Total coverage from contributing layers: {:?}",
coverage.ranges
);
}
}
}
assert_eq!(expect_visible, *visible);
}
// Sanity: the layer that holds latest data for the DBDIR key should always be visible
// (just using this key as a key that will always exist for any layermap fixture)
let dbdir_layer = layer_map
.search(DBDIR_KEY, index.metadata.disk_consistent_lsn())
.unwrap();
assert!(matches!(
layer_visibilities.get(&dbdir_layer.layer).unwrap(),
LayerVisibilityHint::Visible
));
}
}

View File

@@ -521,6 +521,10 @@ impl<Value: Clone> BufferedHistoricLayerCoverage<Value> {
Ok(&self.historic_coverage)
}
pub(crate) fn len(&self) -> usize {
self.layers.len()
}
}
#[test]

View File

@@ -55,7 +55,7 @@ use utils::id::{TenantId, TimelineId};
use super::remote_timeline_client::remote_tenant_path;
use super::secondary::SecondaryTenant;
use super::timeline::detach_ancestor::PreparedTimelineDetach;
use super::TenantSharedResources;
use super::{GlobalShutDown, TenantSharedResources};
/// For a tenant that appears in TenantsMap, it may either be
/// - `Attached`: has a full Tenant object, is elegible to service
@@ -116,8 +116,6 @@ pub(crate) enum ShardSelector {
/// Only return the 0th shard, if it is present. If a non-0th shard is present,
/// ignore it.
Zero,
/// Pick the first shard we find for the TenantId
First,
/// Pick the shard that holds this key
Page(Key),
/// The shard ID is known: pick the given shard
@@ -667,17 +665,20 @@ pub async fn init_tenant_mgr(
let tenant_dir_path = conf.tenant_path(&tenant_shard_id);
let shard_identity = location_conf.shard;
let slot = match location_conf.mode {
LocationMode::Attached(attached_conf) => TenantSlot::Attached(tenant_spawn(
conf,
tenant_shard_id,
&tenant_dir_path,
resources.clone(),
AttachedTenantConf::new(location_conf.tenant_conf, attached_conf),
shard_identity,
Some(init_order.clone()),
SpawnMode::Lazy,
&ctx,
)),
LocationMode::Attached(attached_conf) => TenantSlot::Attached(
tenant_spawn(
conf,
tenant_shard_id,
&tenant_dir_path,
resources.clone(),
AttachedTenantConf::new(location_conf.tenant_conf, attached_conf),
shard_identity,
Some(init_order.clone()),
SpawnMode::Lazy,
&ctx,
)
.expect("global shutdown during init_tenant_mgr cannot happen"),
),
LocationMode::Secondary(secondary_conf) => {
info!(
tenant_id = %tenant_shard_id.tenant_id,
@@ -725,7 +726,7 @@ fn tenant_spawn(
init_order: Option<InitializationOrder>,
mode: SpawnMode,
ctx: &RequestContext,
) -> Arc<Tenant> {
) -> Result<Arc<Tenant>, GlobalShutDown> {
// All these conditions should have been satisfied by our caller: the tenant dir exists, is a well formed
// path, and contains a configuration file. Assertions that do synchronous I/O are limited to debug mode
// to avoid impacting prod runtime performance.
@@ -1192,7 +1193,10 @@ impl TenantManager {
None,
spawn_mode,
ctx,
);
)
.map_err(|_: GlobalShutDown| {
UpsertLocationError::Unavailable(TenantMapError::ShuttingDown)
})?;
TenantSlot::Attached(tenant)
}
@@ -1313,7 +1317,7 @@ impl TenantManager {
None,
SpawnMode::Eager,
ctx,
);
)?;
slot_guard.upsert(TenantSlot::Attached(tenant))?;
@@ -1384,34 +1388,32 @@ impl TenantManager {
tenant_shard_id: TenantShardId,
) -> Result<(), DeleteTenantError> {
let remote_path = remote_tenant_path(&tenant_shard_id);
let keys = match self
.resources
.remote_storage
.list(
Some(&remote_path),
remote_storage::ListingMode::NoDelimiter,
None,
&self.cancel,
)
.await
{
Ok(listing) => listing.keys,
Err(remote_storage::DownloadError::Cancelled) => {
return Err(DeleteTenantError::Cancelled)
}
Err(remote_storage::DownloadError::NotFound) => return Ok(()),
Err(other) => return Err(DeleteTenantError::Other(anyhow::anyhow!(other))),
};
let mut keys_stream = self.resources.remote_storage.list_streaming(
Some(&remote_path),
remote_storage::ListingMode::NoDelimiter,
None,
&self.cancel,
);
while let Some(chunk) = keys_stream.next().await {
let keys = match chunk {
Ok(listing) => listing.keys,
Err(remote_storage::DownloadError::Cancelled) => {
return Err(DeleteTenantError::Cancelled)
}
Err(remote_storage::DownloadError::NotFound) => return Ok(()),
Err(other) => return Err(DeleteTenantError::Other(anyhow::anyhow!(other))),
};
if keys.is_empty() {
tracing::info!("Remote storage already deleted");
} else {
tracing::info!("Deleting {} keys from remote storage", keys.len());
let keys = keys.into_iter().map(|o| o.key).collect::<Vec<_>>();
self.resources
.remote_storage
.delete_objects(&keys, &self.cancel)
.await?;
if keys.is_empty() {
tracing::info!("Remote storage already deleted");
} else {
tracing::info!("Deleting {} keys from remote storage", keys.len());
let keys = keys.into_iter().map(|o| o.key).collect::<Vec<_>>();
self.resources
.remote_storage
.delete_objects(&keys, &self.cancel)
.await?;
}
}
Ok(())
@@ -2049,7 +2051,7 @@ impl TenantManager {
None,
SpawnMode::Eager,
ctx,
);
)?;
slot_guard.upsert(TenantSlot::Attached(tenant))?;
@@ -2090,7 +2092,6 @@ impl TenantManager {
};
match selector {
ShardSelector::First => return ShardResolveResult::Found(tenant.clone()),
ShardSelector::Zero if slot.0.shard_number == ShardNumber(0) => {
return ShardResolveResult::Found(tenant.clone())
}
@@ -2172,6 +2173,9 @@ pub(crate) enum GetActiveTenantError {
/// never happen.
#[error("Tenant is broken: {0}")]
Broken(String),
#[error("reconnect to switch tenant id")]
SwitchedTenant,
}
#[derive(Debug, thiserror::Error)]

View File

@@ -1378,6 +1378,18 @@ impl RemoteTimelineClient {
.dirty
.layer_metadata
.drain()
.filter(|(_file_name, meta)| {
// Filter out layers that belonged to an ancestor shard. Since we are deleting the whole timeline from
// all shards anyway, we _could_ delete these, but
// - it creates a potential race if other shards are still
// using the layers while this shard deletes them.
// - it means that if we rolled back the shard split, the ancestor shards would be in a state where
// these timelines are present but corrupt (their index exists but some layers don't)
//
// These layers will eventually be cleaned up by the scrubber when it does physical GC.
meta.shard.shard_number == self.tenant_shard_id.shard_number
&& meta.shard.shard_count == self.tenant_shard_id.shard_count
})
.map(|(file_name, meta)| {
remote_layer_path(
&self.tenant_shard_id.tenant_id,

View File

@@ -8,6 +8,9 @@ mod layer_desc;
mod layer_name;
pub mod merge_iterator;
#[cfg(test)]
pub mod split_writer;
use crate::context::{AccessStatsBehavior, RequestContext};
use crate::repository::Value;
use crate::walrecord::NeonWalRecord;
@@ -451,20 +454,14 @@ pub enum ValueReconstructResult {
/// than an authoritative value, so that we do not have to update it synchronously when changing the visibility
/// of layers (for example when creating a branch that makes some previously covered layers visible). It should
/// be used for cache management but not for correctness-critical checks.
#[derive(Default, Debug, Clone, PartialEq, Eq)]
pub(crate) enum LayerVisibilityHint {
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum LayerVisibilityHint {
/// A Visible layer might be read while serving a read, because there is not an image layer between it
/// and a readable LSN (the tip of the branch or a child's branch point)
Visible,
/// A Covered layer probably won't be read right now, but _can_ be read in future if someone creates
/// a branch or ephemeral endpoint at an LSN below the layer that covers this.
#[allow(unused)]
Covered,
/// Calculating layer visibilty requires I/O, so until this has happened layers are loaded
/// in this state. Note that newly written layers may be called Visible immediately, this uninitialized
/// state is for when existing layers are constructed while loading a timeline.
#[default]
Uninitialized,
}
pub(crate) struct LayerAccessStats(std::sync::atomic::AtomicU64);
@@ -626,23 +623,30 @@ impl LayerAccessStats {
}
}
pub(crate) fn set_visibility(&self, visibility: LayerVisibilityHint) {
let value = match visibility {
LayerVisibilityHint::Visible => 0x1 << Self::VISIBILITY_SHIFT,
LayerVisibilityHint::Covered | LayerVisibilityHint::Uninitialized => 0x0,
};
self.write_bits(0x1 << Self::VISIBILITY_SHIFT, value);
}
pub(crate) fn visibility(&self) -> LayerVisibilityHint {
let read = self.0.load(std::sync::atomic::Ordering::Relaxed);
match (read >> Self::VISIBILITY_SHIFT) & 0x1 {
/// Helper for extracting the visibility hint from the literal value of our inner u64
fn decode_visibility(&self, bits: u64) -> LayerVisibilityHint {
match (bits >> Self::VISIBILITY_SHIFT) & 0x1 {
1 => LayerVisibilityHint::Visible,
0 => LayerVisibilityHint::Covered,
_ => unreachable!(),
}
}
/// Returns the old value which has been replaced
pub(crate) fn set_visibility(&self, visibility: LayerVisibilityHint) -> LayerVisibilityHint {
let value = match visibility {
LayerVisibilityHint::Visible => 0x1 << Self::VISIBILITY_SHIFT,
LayerVisibilityHint::Covered => 0x0,
};
let old_bits = self.write_bits(0x1 << Self::VISIBILITY_SHIFT, value);
self.decode_visibility(old_bits)
}
pub(crate) fn visibility(&self) -> LayerVisibilityHint {
let read = self.0.load(std::sync::atomic::Ordering::Relaxed);
self.decode_visibility(read)
}
}
/// Get a layer descriptor from a layer.

View File

@@ -467,7 +467,7 @@ impl DeltaLayerWriterInner {
.write_blob_maybe_compressed(val, ctx, compression)
.await;
let off = match res {
Ok(off) => off,
Ok((off, _)) => off,
Err(e) => return (val, Err(anyhow::anyhow!(e))),
};

View File

@@ -734,8 +734,22 @@ struct ImageLayerWriterInner {
// Total uncompressed bytes passed into put_image
uncompressed_bytes: u64,
// Like `uncompressed_bytes`,
// but only of images we might consider for compression
uncompressed_bytes_eligible: u64,
// Like `uncompressed_bytes`, but only of images
// where we have chosen their compressed form
uncompressed_bytes_chosen: u64,
// Number of keys in the layer.
num_keys: usize,
blob_writer: BlobWriter<false>,
tree: DiskBtreeBuilder<BlockBuf, KEY_SIZE>,
#[cfg_attr(not(feature = "testing"), allow(dead_code))]
last_written_key: Key,
}
impl ImageLayerWriterInner {
@@ -790,6 +804,10 @@ impl ImageLayerWriterInner {
tree: tree_builder,
blob_writer,
uncompressed_bytes: 0,
uncompressed_bytes_eligible: 0,
uncompressed_bytes_chosen: 0,
num_keys: 0,
last_written_key: Key::MIN,
};
Ok(writer)
@@ -808,18 +826,33 @@ impl ImageLayerWriterInner {
) -> anyhow::Result<()> {
ensure!(self.key_range.contains(&key));
let compression = self.conf.image_compression;
self.uncompressed_bytes += img.len() as u64;
let uncompressed_len = img.len() as u64;
self.uncompressed_bytes += uncompressed_len;
self.num_keys += 1;
let (_img, res) = self
.blob_writer
.write_blob_maybe_compressed(img, ctx, compression)
.await;
// TODO: re-use the buffer for `img` further upstack
let off = res?;
let (off, compression_info) = res?;
if compression_info.compressed_size.is_some() {
// The image has been considered for compression at least
self.uncompressed_bytes_eligible += uncompressed_len;
}
if compression_info.written_compressed {
// The image has been compressed
self.uncompressed_bytes_chosen += uncompressed_len;
}
let mut keybuf: [u8; KEY_SIZE] = [0u8; KEY_SIZE];
key.write_to_byte_slice(&mut keybuf);
self.tree.append(&keybuf, off)?;
#[cfg(feature = "testing")]
{
self.last_written_key = key;
}
Ok(())
}
@@ -830,6 +863,7 @@ impl ImageLayerWriterInner {
self,
timeline: &Arc<Timeline>,
ctx: &RequestContext,
end_key: Option<Key>,
) -> anyhow::Result<ResidentLayer> {
let index_start_blk =
((self.blob_writer.size() + PAGE_SZ as u64 - 1) / PAGE_SZ as u64) as u32;
@@ -837,6 +871,9 @@ impl ImageLayerWriterInner {
// Calculate compression ratio
let compressed_size = self.blob_writer.size() - PAGE_SZ as u64; // Subtract PAGE_SZ for header
crate::metrics::COMPRESSION_IMAGE_INPUT_BYTES.inc_by(self.uncompressed_bytes);
crate::metrics::COMPRESSION_IMAGE_INPUT_BYTES_CONSIDERED
.inc_by(self.uncompressed_bytes_eligible);
crate::metrics::COMPRESSION_IMAGE_INPUT_BYTES_CHOSEN.inc_by(self.uncompressed_bytes_chosen);
crate::metrics::COMPRESSION_IMAGE_OUTPUT_BYTES.inc_by(compressed_size);
let mut file = self.blob_writer.into_inner();
@@ -877,11 +914,23 @@ impl ImageLayerWriterInner {
let desc = PersistentLayerDesc::new_img(
self.tenant_shard_id,
self.timeline_id,
self.key_range.clone(),
if let Some(end_key) = end_key {
self.key_range.start..end_key
} else {
self.key_range.clone()
},
self.lsn,
metadata.len(),
);
#[cfg(feature = "testing")]
if let Some(end_key) = end_key {
assert!(
self.last_written_key < end_key,
"written key violates end_key range"
);
}
// Note: Because we open the file in write-only mode, we cannot
// reuse the same VirtualFile for reading later. That's why we don't
// set inner.file here. The first read will have to re-open it.
@@ -958,6 +1007,18 @@ impl ImageLayerWriter {
self.inner.as_mut().unwrap().put_image(key, img, ctx).await
}
#[cfg(test)]
/// Estimated size of the image layer.
pub(crate) fn estimated_size(&self) -> u64 {
let inner = self.inner.as_ref().unwrap();
inner.blob_writer.size() + inner.tree.borrow_writer().size() + PAGE_SZ as u64
}
#[cfg(test)]
pub(crate) fn num_keys(&self) -> usize {
self.inner.as_ref().unwrap().num_keys
}
///
/// Finish writing the image layer.
///
@@ -966,7 +1027,22 @@ impl ImageLayerWriter {
timeline: &Arc<Timeline>,
ctx: &RequestContext,
) -> anyhow::Result<super::ResidentLayer> {
self.inner.take().unwrap().finish(timeline, ctx).await
self.inner.take().unwrap().finish(timeline, ctx, None).await
}
#[cfg(test)]
/// Finish writing the image layer with an end key, used in [`super::split_writer::SplitImageLayerWriter`]. The end key determines the end of the image layer's covered range and is exclusive.
pub(super) async fn finish_with_end_key(
mut self,
timeline: &Arc<Timeline>,
end_key: Key,
ctx: &RequestContext,
) -> anyhow::Result<super::ResidentLayer> {
self.inner
.take()
.unwrap()
.finish(timeline, ctx, Some(end_key))
.await
}
}

View File

@@ -24,7 +24,8 @@ use super::delta_layer::{self, DeltaEntry};
use super::image_layer::{self};
use super::{
AsLayerDesc, ImageLayerWriter, LayerAccessStats, LayerAccessStatsReset, LayerName,
PersistentLayerDesc, ValueReconstructResult, ValueReconstructState, ValuesReconstructState,
LayerVisibilityHint, PersistentLayerDesc, ValueReconstructResult, ValueReconstructState,
ValuesReconstructState,
};
use utils::generation::Generation;
@@ -246,7 +247,7 @@ impl Layer {
&timeline.generation,
);
let layer = LayerInner::new(
LayerInner::new(
conf,
timeline,
local_path,
@@ -254,14 +255,7 @@ impl Layer {
Some(inner),
timeline.generation,
timeline.get_shard_index(),
);
// Newly created layers are marked visible by default: the usual case is that they were created to be read.
layer
.access_stats
.set_visibility(super::LayerVisibilityHint::Visible);
layer
)
}));
let downloaded = resident.expect("just initialized");
@@ -493,6 +487,32 @@ impl Layer {
}
}
}
pub(crate) fn set_visibility(&self, visibility: LayerVisibilityHint) {
let old_visibility = self.access_stats().set_visibility(visibility.clone());
use LayerVisibilityHint::*;
match (old_visibility, visibility) {
(Visible, Covered) => {
// Subtract this layer's contribution to the visible size metric
if let Some(tl) = self.0.timeline.upgrade() {
tl.metrics
.visible_physical_size_gauge
.sub(self.0.desc.file_size)
}
}
(Covered, Visible) => {
// Add this layer's contribution to the visible size metric
if let Some(tl) = self.0.timeline.upgrade() {
tl.metrics
.visible_physical_size_gauge
.add(self.0.desc.file_size)
}
}
(Covered, Covered) | (Visible, Visible) => {
// no change
}
}
}
}
/// The download-ness ([`DownloadedLayer`]) can be either resident or wanted evicted.
@@ -693,6 +713,13 @@ impl Drop for LayerInner {
timeline.metrics.layer_count_image.dec();
timeline.metrics.layer_size_image.sub(self.desc.file_size);
}
if matches!(self.access_stats.visibility(), LayerVisibilityHint::Visible) {
timeline
.metrics
.visible_physical_size_gauge
.sub(self.desc.file_size);
}
}
if !*self.wanted_deleted.get_mut() {
@@ -801,6 +828,12 @@ impl LayerInner {
timeline.metrics.layer_size_image.add(desc.file_size);
}
// New layers are visible by default. This metric is later updated on drop or in set_visibility
timeline
.metrics
.visible_physical_size_gauge
.add(desc.file_size);
LayerInner {
conf,
debug_str: {

View File

@@ -41,6 +41,20 @@ pub struct PersistentLayerKey {
pub is_delta: bool,
}
impl std::fmt::Display for PersistentLayerKey {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
write!(
f,
"{}..{} {}..{} is_delta={}",
self.key_range.start,
self.key_range.end,
self.lsn_range.start,
self.lsn_range.end,
self.is_delta
)
}
}
impl PersistentLayerDesc {
pub fn key(&self) -> PersistentLayerKey {
PersistentLayerKey {

View File

@@ -0,0 +1,244 @@
use std::sync::Arc;
use bytes::Bytes;
use pageserver_api::key::{Key, KEY_SIZE};
use utils::{id::TimelineId, lsn::Lsn, shard::TenantShardId};
use crate::{config::PageServerConf, context::RequestContext, tenant::Timeline};
use super::{ImageLayerWriter, ResidentLayer};
/// An image writer that takes images and produces multiple image layers. The interface does not
/// guarantee atomicity (i.e., if the image layer generation fails, there might be leftover files
/// to be cleaned up)
#[must_use]
pub struct SplitImageLayerWriter {
inner: ImageLayerWriter,
target_layer_size: u64,
generated_layers: Vec<ResidentLayer>,
conf: &'static PageServerConf,
timeline_id: TimelineId,
tenant_shard_id: TenantShardId,
lsn: Lsn,
}
impl SplitImageLayerWriter {
pub async fn new(
conf: &'static PageServerConf,
timeline_id: TimelineId,
tenant_shard_id: TenantShardId,
start_key: Key,
lsn: Lsn,
target_layer_size: u64,
ctx: &RequestContext,
) -> anyhow::Result<Self> {
Ok(Self {
target_layer_size,
inner: ImageLayerWriter::new(
conf,
timeline_id,
tenant_shard_id,
&(start_key..Key::MAX),
lsn,
ctx,
)
.await?,
generated_layers: Vec::new(),
conf,
timeline_id,
tenant_shard_id,
lsn,
})
}
pub async fn put_image(
&mut self,
key: Key,
img: Bytes,
tline: &Arc<Timeline>,
ctx: &RequestContext,
) -> anyhow::Result<()> {
// The current estimation is an upper bound of the space that the key/image could take
// because we did not consider compression in this estimation. The resulting image layer
// could be smaller than the target size.
let addition_size_estimation = KEY_SIZE as u64 + img.len() as u64;
if self.inner.num_keys() >= 1
&& self.inner.estimated_size() + addition_size_estimation >= self.target_layer_size
{
let next_image_writer = ImageLayerWriter::new(
self.conf,
self.timeline_id,
self.tenant_shard_id,
&(key..Key::MAX),
self.lsn,
ctx,
)
.await?;
let prev_image_writer = std::mem::replace(&mut self.inner, next_image_writer);
self.generated_layers.push(
prev_image_writer
.finish_with_end_key(tline, key, ctx)
.await?,
);
}
self.inner.put_image(key, img, ctx).await
}
pub(crate) async fn finish(
self,
tline: &Arc<Timeline>,
ctx: &RequestContext,
end_key: Key,
) -> anyhow::Result<Vec<ResidentLayer>> {
let Self {
mut generated_layers,
inner,
..
} = self;
generated_layers.push(inner.finish_with_end_key(tline, end_key, ctx).await?);
Ok(generated_layers)
}
}
#[cfg(test)]
mod tests {
use crate::{
tenant::{
harness::{TenantHarness, TIMELINE_ID},
storage_layer::AsLayerDesc,
},
DEFAULT_PG_VERSION,
};
use super::*;
fn get_key(id: u32) -> Key {
let mut key = Key::from_hex("000000000033333333444444445500000000").unwrap();
key.field6 = id;
key
}
fn get_img(id: u32) -> Bytes {
format!("{id:064}").into()
}
fn get_large_img() -> Bytes {
vec![0; 8192].into()
}
#[tokio::test]
async fn write_one_image() {
let harness = TenantHarness::create("split_writer_write_one_image")
.await
.unwrap();
let (tenant, ctx) = harness.load().await;
let tline = tenant
.create_test_timeline(TIMELINE_ID, Lsn(0x10), DEFAULT_PG_VERSION, &ctx)
.await
.unwrap();
let mut writer = SplitImageLayerWriter::new(
tenant.conf,
tline.timeline_id,
tenant.tenant_shard_id,
get_key(0),
Lsn(0x18),
4 * 1024 * 1024,
&ctx,
)
.await
.unwrap();
writer
.put_image(get_key(0), get_img(0), &tline, &ctx)
.await
.unwrap();
let layers = writer.finish(&tline, &ctx, get_key(10)).await.unwrap();
assert_eq!(layers.len(), 1);
}
#[tokio::test]
async fn write_split() {
let harness = TenantHarness::create("split_writer_write_split")
.await
.unwrap();
let (tenant, ctx) = harness.load().await;
let tline = tenant
.create_test_timeline(TIMELINE_ID, Lsn(0x10), DEFAULT_PG_VERSION, &ctx)
.await
.unwrap();
let mut writer = SplitImageLayerWriter::new(
tenant.conf,
tline.timeline_id,
tenant.tenant_shard_id,
get_key(0),
Lsn(0x18),
4 * 1024 * 1024,
&ctx,
)
.await
.unwrap();
const N: usize = 2000;
for i in 0..N {
let i = i as u32;
writer
.put_image(get_key(i), get_large_img(), &tline, &ctx)
.await
.unwrap();
}
let layers = writer
.finish(&tline, &ctx, get_key(N as u32))
.await
.unwrap();
assert_eq!(layers.len(), N / 512 + 1);
for idx in 0..layers.len() {
assert_ne!(layers[idx].layer_desc().key_range.start, Key::MIN);
assert_ne!(layers[idx].layer_desc().key_range.end, Key::MAX);
if idx > 0 {
assert_eq!(
layers[idx - 1].layer_desc().key_range.end,
layers[idx].layer_desc().key_range.start
);
}
}
}
#[tokio::test]
async fn write_large_img() {
let harness = TenantHarness::create("split_writer_write_large_img")
.await
.unwrap();
let (tenant, ctx) = harness.load().await;
let tline = tenant
.create_test_timeline(TIMELINE_ID, Lsn(0x10), DEFAULT_PG_VERSION, &ctx)
.await
.unwrap();
let mut writer = SplitImageLayerWriter::new(
tenant.conf,
tline.timeline_id,
tenant.tenant_shard_id,
get_key(0),
Lsn(0x18),
4 * 1024,
&ctx,
)
.await
.unwrap();
writer
.put_image(get_key(0), get_img(0), &tline, &ctx)
.await
.unwrap();
writer
.put_image(get_key(1), get_large_img(), &tline, &ctx)
.await
.unwrap();
let layers = writer.finish(&tline, &ctx, get_key(10)).await.unwrap();
assert_eq!(layers.len(), 2);
}
}

View File

@@ -3,6 +3,7 @@ pub(crate) mod compaction;
pub mod delete;
pub(crate) mod detach_ancestor;
mod eviction_task;
pub(crate) mod handle;
mod init;
pub mod layer_manager;
pub(crate) mod logical_size;
@@ -17,6 +18,7 @@ use camino::Utf8Path;
use chrono::{DateTime, Utc};
use enumset::EnumSet;
use fail::fail_point;
use handle::ShardTimelineId;
use once_cell::sync::Lazy;
use pageserver_api::{
key::{
@@ -58,7 +60,7 @@ use std::{
sync::atomic::AtomicU64,
};
use std::{
cmp::{max, min, Ordering},
cmp::{max, min},
ops::ControlFlow,
};
use std::{
@@ -74,6 +76,7 @@ use crate::{
metadata::TimelineMetadata,
storage_layer::PersistentLayerDesc,
},
walredo,
};
use crate::{
context::{DownloadBehavior, RequestContext},
@@ -140,7 +143,10 @@ use self::walreceiver::{WalReceiver, WalReceiverConf};
use super::{config::TenantConf, upload_queue::NotInitialized};
use super::{debug_assert_current_span_has_tenant_and_timeline_id, AttachedTenantConf};
use super::{remote_timeline_client::index::IndexPart, storage_layer::LayerFringe};
use super::{remote_timeline_client::RemoteTimelineClient, storage_layer::ReadableLayer};
use super::{
remote_timeline_client::RemoteTimelineClient, remote_timeline_client::WaitCompletionError,
storage_layer::ReadableLayer,
};
use super::{
secondary::heatmap::{HeatMapLayer, HeatMapTimeline},
GcError,
@@ -177,25 +183,6 @@ impl std::fmt::Display for ImageLayerCreationMode {
}
}
/// Wrapper for key range to provide reverse ordering by range length for BinaryHeap
#[derive(Debug, Clone, PartialEq, Eq)]
pub(crate) struct Hole {
key_range: Range<Key>,
coverage_size: usize,
}
impl Ord for Hole {
fn cmp(&self, other: &Self) -> Ordering {
other.coverage_size.cmp(&self.coverage_size) // inverse order
}
}
impl PartialOrd for Hole {
fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
Some(self.cmp(other))
}
}
/// Temporary function for immutable storage state refactor, ensures we are dropping mutex guard instead of other things.
/// Can be removed after all refactors are done.
fn drop_rlock<T>(rlock: tokio::sync::RwLockReadGuard<T>) {
@@ -443,6 +430,8 @@ pub struct Timeline {
pub(crate) extra_test_dense_keyspace: ArcSwap<KeySpace>,
pub(crate) l0_flush_global_state: L0FlushGlobalState,
pub(crate) handles: handle::PerTimelineState<crate::page_service::TenantManagerTypes>,
}
pub struct WalReceiverInfo {
@@ -548,7 +537,6 @@ impl GetVectoredError {
}
}
#[derive(Debug)]
pub struct MissingKeyError {
key: Key,
shard: ShardNumber,
@@ -559,6 +547,12 @@ pub struct MissingKeyError {
backtrace: Option<std::backtrace::Backtrace>,
}
impl std::fmt::Debug for MissingKeyError {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
write!(f, "{}", self)
}
}
impl std::fmt::Display for MissingKeyError {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
write!(
@@ -1010,7 +1004,10 @@ impl Timeline {
.for_get_kind(GetKind::Singular)
.observe(elapsed.as_secs_f64());
if cfg!(feature = "testing") && res.is_err() {
if cfg!(feature = "testing")
&& res.is_err()
&& !matches!(res, Err(PageReconstructError::Cancelled))
{
// it can only be walredo issue
use std::fmt::Write;
@@ -1929,6 +1926,9 @@ impl Timeline {
tracing::debug!("Cancelling CancellationToken");
self.cancel.cancel();
// Ensure Prevent new page service requests from starting.
self.handles.shutdown();
// Transition the remote_client into a state where it's only useful for timeline deletion.
// (The deletion use case is why we can't just hook up remote_client to Self::cancel).)
self.remote_client.stop();
@@ -2454,6 +2454,8 @@ impl Timeline {
extra_test_dense_keyspace: ArcSwap::new(Arc::new(KeySpace::default())),
l0_flush_global_state: resources.l0_flush_global_state,
handles: Default::default(),
};
result.repartition_threshold =
result.get_checkpoint_distance() / REPARTITION_FREQ_IN_CHECKPOINT_DISTANCE;
@@ -2737,6 +2739,10 @@ impl Timeline {
// Tenant::create_timeline will wait for these uploads to happen before returning, or
// on retry.
// Now that we have the full layer map, we may calculate the visibility of layers within it (a global scan)
drop(guard); // drop write lock, update_layer_visibility will take a read lock.
self.update_layer_visibility().await;
info!(
"loaded layer map with {} layers at {}, total physical size: {}",
num_layers, disk_consistent_lsn, total_physical_size
@@ -3723,6 +3729,17 @@ impl Timeline {
&self.shard_identity
}
#[inline(always)]
pub(crate) fn shard_timeline_id(&self) -> ShardTimelineId {
ShardTimelineId {
shard_index: ShardIndex {
shard_number: self.shard_identity.number,
shard_count: self.shard_identity.count,
},
timeline_id: self.timeline_id,
}
}
///
/// Get a handle to the latest layer for appending.
///
@@ -4075,6 +4092,21 @@ impl Timeline {
// release lock on 'layers'
};
// Backpressure mechanism: wait with continuation of the flush loop until we have uploaded all layer files.
// This makes us refuse ingest until the new layers have been persisted to the remote.
self.remote_client
.wait_completion()
.await
.map_err(|e| match e {
WaitCompletionError::UploadQueueShutDownOrStopped
| WaitCompletionError::NotInitialized(
NotInitialized::ShuttingDown | NotInitialized::Stopped,
) => FlushLayerError::Cancelled,
WaitCompletionError::NotInitialized(NotInitialized::Uninitialized) => {
FlushLayerError::Other(anyhow!(e).into())
}
})?;
// FIXME: between create_delta_layer and the scheduling of the upload in `update_metadata_file`,
// a compaction can delete the file and then it won't be available for uploads any more.
// We still schedule the upload, resulting in an error, but ideally we'd somehow avoid this
@@ -4667,27 +4699,6 @@ impl Timeline {
}
}
// The writer.finish() above already did the fsync of the inodes.
// We just need to fsync the directory in which these inodes are linked,
// which we know to be the timeline directory.
if !image_layers.is_empty() {
// We use fatal_err() below because the after writer.finish() returns with success,
// the in-memory state of the filesystem already has the layer file in its final place,
// and subsequent pageserver code could think it's durable while it really isn't.
let timeline_dir = VirtualFile::open(
&self
.conf
.timeline_path(&self.tenant_shard_id, &self.timeline_id),
ctx,
)
.await
.fatal_err("VirtualFile::open for timeline dir fsync");
timeline_dir
.sync_all()
.await
.fatal_err("VirtualFile::sync_all timeline dir");
}
let mut guard = self.layers.write().await;
// FIXME: we could add the images to be uploaded *before* returning from here, but right
@@ -4696,6 +4707,9 @@ impl Timeline {
drop_wlock(guard);
timer.stop_and_record();
// Creating image layers may have caused some previously visible layers to be covered
self.update_layer_visibility().await;
Ok(image_layers)
}
@@ -5460,20 +5474,22 @@ impl Timeline {
} else {
trace!("found {} WAL records that will init the page for {} at {}, performing WAL redo", data.records.len(), key, request_lsn);
};
let img = match self
let res = self
.walredo_mgr
.as_ref()
.context("timeline has no walredo manager")
.map_err(PageReconstructError::WalRedo)?
.request_redo(key, request_lsn, data.img, data.records, self.pg_version)
.await
.context("reconstruct a page image")
{
.await;
let img = match res {
Ok(img) => img,
Err(e) => return Err(PageReconstructError::WalRedo(e)),
Err(walredo::Error::Cancelled) => return Err(PageReconstructError::Cancelled),
Err(walredo::Error::Other(e)) => {
return Err(PageReconstructError::WalRedo(
e.context("reconstruct a page image"),
))
}
};
Ok(img)
}
}

File diff suppressed because it is too large Load Diff

View File

@@ -63,10 +63,19 @@ pub(super) async fn delete_local_timeline_directory(
tenant_shard_id: TenantShardId,
timeline: &Timeline,
) -> anyhow::Result<()> {
let guards = async { tokio::join!(timeline.gc_lock.lock(), timeline.compaction_lock.lock()) };
let guards = crate::timed(
guards,
"acquire gc and compaction locks",
// Always ensure the lock order is compaction -> gc.
let compaction_lock = timeline.compaction_lock.lock();
let compaction_lock = crate::timed(
compaction_lock,
"acquires compaction lock",
std::time::Duration::from_secs(5),
)
.await;
let gc_lock = timeline.gc_lock.lock();
let gc_lock = crate::timed(
gc_lock,
"acquires gc lock",
std::time::Duration::from_secs(5),
)
.await;
@@ -107,7 +116,8 @@ pub(super) async fn delete_local_timeline_directory(
.context("fsync_pre_mark_remove")?;
info!("finished deleting layer files, releasing locks");
drop(guards);
drop(gc_lock);
drop(compaction_lock);
fail::fail_point!("timeline-delete-after-rm", |_| {
Err(anyhow::anyhow!("failpoint: timeline-delete-after-rm"))?
@@ -206,11 +216,10 @@ impl DeleteTimelineFlow {
// NB: If this fails half-way through, and is retried, the retry will go through
// all the same steps again. Make sure the code here is idempotent, and don't
// error out if some of the shutdown tasks have already been completed!
#[instrument(skip_all, fields(%inplace))]
#[instrument(skip_all)]
pub async fn run(
tenant: &Arc<Tenant>,
timeline_id: TimelineId,
inplace: bool,
) -> Result<(), DeleteTimelineError> {
super::debug_assert_current_span_has_tenant_and_timeline_id();
@@ -235,11 +244,7 @@ impl DeleteTimelineFlow {
))?
});
if inplace {
Self::background(guard, tenant.conf, tenant, &timeline).await?
} else {
Self::schedule_background(guard, tenant.conf, Arc::clone(tenant), timeline);
}
Self::schedule_background(guard, tenant.conf, Arc::clone(tenant), timeline);
Ok(())
}

View File

@@ -0,0 +1,967 @@
//! An efficient way to keep the timeline gate open without preventing
//! timeline shutdown for longer than a single call to a timeline method.
//!
//! # Motivation
//!
//! On a single page service connection, we're typically serving a single TenantTimelineId.
//!
//! Without sharding, there is a single Timeline object to which we dispatch
//! all requests. For example, a getpage request gets dispatched to the
//! Timeline::get method of the Timeline object that represents the
//! (tenant,timeline) of that connection.
//!
//! With sharding, for each request that comes in on the connection,
//! we first have to perform shard routing based on the requested key (=~ page number).
//! The result of shard routing is a Timeline object.
//! We then dispatch the request to that Timeline object.
//!
//! Regardless of whether the tenant is sharded or not, we want to ensure that
//! we hold the Timeline gate open while we're invoking the method on the
//! Timeline object.
//!
//! However, we want to avoid the overhead of entering the gate for every
//! method invocation.
//!
//! Further, for shard routing, we want to avoid calling the tenant manager to
//! resolve the shard for every request. Instead, we want to cache the
//! routing result so we can bypass the tenant manager for all subsequent requests
//! that get routed to that shard.
//!
//! Regardless of how we accomplish the above, it should not
//! prevent the Timeline from shutting down promptly.
//!
//! # Design
//!
//! There are three user-facing data structures:
//! - `PerTimelineState`: a struct embedded into each Timeline struct. Lifetime == Timeline lifetime.
//! - `Cache`: a struct private to each connection handler; Lifetime == connection lifetime.
//! - `Handle`: a smart pointer that holds the Timeline gate open and derefs to `&Timeline`.
//! Lifetime: for a single request dispatch on the Timeline (i.e., one getpage request)
//!
//! The `Handle` is just a wrapper around an `Arc<HandleInner>`.
//!
//! There is one long-lived `Arc<HandleInner>`, which is stored in the `PerTimelineState`.
//! The `Cache` stores a `Weak<HandleInner>` for each cached Timeline.
//!
//! To dispatch a request, the page service connection calls `Cache::get`.
//!
//! A cache miss means we consult the tenant manager for shard routing,
//! resulting in an `Arc<Timeline>`. We enter its gate _once_ and construct an
//! `Arc<HandleInner>`. We store a `Weak<HandleInner>` in the cache
//! and the `Arc<HandleInner>` in the `PerTimelineState`.
//!
//! For subsequent requests, `Cache::get` will perform a "fast path" shard routing
//! and find the `Weak<HandleInner>` in the cache.
//! We upgrade the `Weak<HandleInner>` to an `Arc<HandleInner>` and wrap it in the user-facing `Handle` type.
//!
//! The request handler dispatches the request to the right `<Handle as Deref<Target = Timeline>>::$request_method`.
//! It then drops the `Handle`, which drops the `Arc<HandleInner>`.
//!
//! # Memory Management / How The Reference Cycle Is Broken
//!
//! The attentive reader may have noticed the strong reference cycle
//! from `Arc<HandleInner>` to `PerTimelineState` to `Arc<Timeline>`.
//!
//! This cycle is intentional: while it exists, the `Cache` can upgrade its
//! `Weak<HandleInner>` to an `Arc<HandleInner>` in a single atomic operation.
//!
//! The cycle is broken by either
//! - `PerTimelineState::shutdown` or
//! - dropping the `Cache`.
//!
//! Concurrently existing `Handle`s will extend the existence of the cycle.
//! However, since `Handle`s are short-lived and new `Handle`s are not
//! handed out after either `PerTimelineState::shutdown` or `Cache` drop,
//! that extension of the cycle is bounded.
//!
//! # Fast Path for Shard Routing
//!
//! The `Cache` has a fast path for shard routing to avoid calling into
//! the tenant manager for every request.
//!
//! The `Cache` maintains a hash map of `ShardTimelineId` to `Weak<HandleInner>`.
//!
//! The current implementation uses the first entry in the hash map
//! to determine the `ShardParameters` and derive the correct
//! `ShardIndex` for the requested key.
//!
//! It then looks up the hash map for that `ShardTimelineId := {ShardIndex,TimelineId}`.
//!
//! If the lookup is successful and the `Weak<HandleInner>` can be upgraded,
//! it's a hit.
//!
//! ## Cache invalidation
//!
//! The insight is that cache invalidation is sufficient and most efficiently done lazily.
//! The only reasons why an entry in the cache can become stale are:
//! 1. The `PerTimelineState` / Timeline is shutting down e.g. because the shard is
//! being detached, timeline or shard deleted, or pageserver is shutting down.
//! 2. We're doing a shard split and new traffic should be routed to the child shards.
//!
//! Regarding (1), we will eventually fail to upgrade the `Weak<HandleInner>` once the
//! timeline has shut down, and when that happens, we remove the entry from the cache.
//!
//! Regarding (2), the insight is that it is toally fine to keep dispatching requests
//! to the parent shard during a shard split. Eventually, the shard split task will
//! shut down the parent => case (1).
use std::collections::hash_map;
use std::collections::HashMap;
use std::sync::atomic::AtomicBool;
use std::sync::atomic::Ordering;
use std::sync::Arc;
use std::sync::Mutex;
use std::sync::Weak;
use pageserver_api::shard::ShardIdentity;
use tracing::instrument;
use tracing::trace;
use utils::id::TimelineId;
use utils::shard::ShardIndex;
use utils::shard::ShardNumber;
use crate::tenant::mgr::ShardSelector;
/// The requirement for Debug is so that #[derive(Debug)] works in some places.
pub(crate) trait Types: Sized + std::fmt::Debug {
type TenantManagerError: Sized + std::fmt::Debug;
type TenantManager: TenantManager<Self> + Sized;
type Timeline: ArcTimeline<Self> + Sized;
}
/// Uniquely identifies a [`Cache`] instance over the lifetime of the process.
/// Required so [`Cache::drop`] can take out the handles from the [`PerTimelineState`].
/// Alternative to this would be to allocate [`Cache`] in a `Box` and identify it by the pointer.
#[derive(Debug, Hash, PartialEq, Eq, Clone, Copy)]
struct CacheId(u64);
impl CacheId {
fn next() -> Self {
static NEXT_ID: std::sync::atomic::AtomicU64 = std::sync::atomic::AtomicU64::new(1);
let id = NEXT_ID.fetch_add(1, std::sync::atomic::Ordering::Relaxed);
if id == 0 {
panic!("CacheId::new() returned 0, overflow");
}
Self(id)
}
}
/// See module-level comment.
pub(crate) struct Cache<T: Types> {
id: CacheId,
map: Map<T>,
}
type Map<T> = HashMap<ShardTimelineId, Weak<HandleInner<T>>>;
impl<T: Types> Default for Cache<T> {
fn default() -> Self {
Self {
id: CacheId::next(),
map: Default::default(),
}
}
}
#[derive(PartialEq, Eq, Debug, Hash, Clone, Copy)]
pub(crate) struct ShardTimelineId {
pub(crate) shard_index: ShardIndex,
pub(crate) timeline_id: TimelineId,
}
/// See module-level comment.
pub(crate) struct Handle<T: Types>(Arc<HandleInner<T>>);
struct HandleInner<T: Types> {
shut_down: AtomicBool,
timeline: T::Timeline,
// The timeline's gate held open.
_gate_guard: utils::sync::gate::GateGuard,
}
/// Embedded in each [`Types::Timeline`] as the anchor for the only long-lived strong ref to `HandleInner`.
///
/// See module-level comment for details.
pub struct PerTimelineState<T: Types> {
// None = shutting down
handles: Mutex<Option<HashMap<CacheId, Arc<HandleInner<T>>>>>,
}
impl<T: Types> Default for PerTimelineState<T> {
fn default() -> Self {
Self {
handles: Mutex::new(Some(Default::default())),
}
}
}
/// Abstract view of [`crate::tenant::mgr`], for testability.
pub(crate) trait TenantManager<T: Types> {
/// Invoked by [`Cache::get`] to resolve a [`ShardTimelineId`] to a [`Types::Timeline`].
/// Errors are returned as [`GetError::TenantManager`].
async fn resolve(
&self,
timeline_id: TimelineId,
shard_selector: ShardSelector,
) -> Result<T::Timeline, T::TenantManagerError>;
}
/// Abstract view of an [`Arc<Timeline>`], for testability.
pub(crate) trait ArcTimeline<T: Types>: Clone {
fn gate(&self) -> &utils::sync::gate::Gate;
fn shard_timeline_id(&self) -> ShardTimelineId;
fn get_shard_identity(&self) -> &ShardIdentity;
fn per_timeline_state(&self) -> &PerTimelineState<T>;
}
/// Errors returned by [`Cache::get`].
#[derive(Debug)]
pub(crate) enum GetError<T: Types> {
TenantManager(T::TenantManagerError),
TimelineGateClosed,
PerTimelineStateShutDown,
}
/// Internal type used in [`Cache::get`].
enum RoutingResult<T: Types> {
FastPath(Handle<T>),
SlowPath(ShardTimelineId),
NeedConsultTenantManager,
}
impl<T: Types> Cache<T> {
/// See module-level comment for details.
///
/// Does NOT check for the shutdown state of [`Types::Timeline`].
/// Instead, the methods of [`Types::Timeline`] that are invoked through
/// the [`Handle`] are responsible for checking these conditions
/// and if so, return an error that causes the page service to
/// close the connection.
#[instrument(level = "trace", skip_all)]
pub(crate) async fn get(
&mut self,
timeline_id: TimelineId,
shard_selector: ShardSelector,
tenant_manager: &T::TenantManager,
) -> Result<Handle<T>, GetError<T>> {
// terminates because each iteration removes an element from the map
loop {
let handle = self
.get_impl(timeline_id, shard_selector, tenant_manager)
.await?;
if handle.0.shut_down.load(Ordering::Relaxed) {
let removed = self
.map
.remove(&handle.0.timeline.shard_timeline_id())
.expect("invariant of get_impl is that the returned handle is in the map");
assert!(
Weak::ptr_eq(&removed, &Arc::downgrade(&handle.0)),
"shard_timeline_id() incorrect?"
);
} else {
return Ok(handle);
}
}
}
#[instrument(level = "trace", skip_all)]
async fn get_impl(
&mut self,
timeline_id: TimelineId,
shard_selector: ShardSelector,
tenant_manager: &T::TenantManager,
) -> Result<Handle<T>, GetError<T>> {
let miss: ShardSelector = {
let routing_state = self.shard_routing(timeline_id, shard_selector);
match routing_state {
RoutingResult::FastPath(handle) => return Ok(handle),
RoutingResult::SlowPath(key) => match self.map.get(&key) {
Some(cached) => match cached.upgrade() {
Some(upgraded) => return Ok(Handle(upgraded)),
None => {
trace!("handle cache stale");
self.map.remove(&key).unwrap();
ShardSelector::Known(key.shard_index)
}
},
None => ShardSelector::Known(key.shard_index),
},
RoutingResult::NeedConsultTenantManager => shard_selector,
}
};
self.get_miss(timeline_id, miss, tenant_manager).await
}
#[inline(always)]
fn shard_routing(
&mut self,
timeline_id: TimelineId,
shard_selector: ShardSelector,
) -> RoutingResult<T> {
loop {
// terminates because when every iteration we remove an element from the map
let Some((first_key, first_handle)) = self.map.iter().next() else {
return RoutingResult::NeedConsultTenantManager;
};
let Some(first_handle) = first_handle.upgrade() else {
// TODO: dedup with get()
trace!("handle cache stale");
let first_key_owned = *first_key;
self.map.remove(&first_key_owned).unwrap();
continue;
};
let first_handle_shard_identity = first_handle.timeline.get_shard_identity();
let make_shard_index = |shard_num: ShardNumber| ShardIndex {
shard_number: shard_num,
shard_count: first_handle_shard_identity.count,
};
let need_idx = match shard_selector {
ShardSelector::Page(key) => {
make_shard_index(first_handle_shard_identity.get_shard_number(&key))
}
ShardSelector::Zero => make_shard_index(ShardNumber(0)),
ShardSelector::Known(shard_idx) => shard_idx,
};
let need_shard_timeline_id = ShardTimelineId {
shard_index: need_idx,
timeline_id,
};
let first_handle_shard_timeline_id = ShardTimelineId {
shard_index: first_handle_shard_identity.shard_index(),
timeline_id: first_handle.timeline.shard_timeline_id().timeline_id,
};
if need_shard_timeline_id == first_handle_shard_timeline_id {
return RoutingResult::FastPath(Handle(first_handle));
} else {
return RoutingResult::SlowPath(need_shard_timeline_id);
}
}
}
#[instrument(level = "trace", skip_all)]
#[inline(always)]
async fn get_miss(
&mut self,
timeline_id: TimelineId,
shard_selector: ShardSelector,
tenant_manager: &T::TenantManager,
) -> Result<Handle<T>, GetError<T>> {
match tenant_manager.resolve(timeline_id, shard_selector).await {
Ok(timeline) => {
let key = timeline.shard_timeline_id();
match &shard_selector {
ShardSelector::Zero => assert_eq!(key.shard_index.shard_number, ShardNumber(0)),
ShardSelector::Page(_) => (), // gotta trust tenant_manager
ShardSelector::Known(idx) => assert_eq!(idx, &key.shard_index),
}
let gate_guard = match timeline.gate().enter() {
Ok(guard) => guard,
Err(_) => {
return Err(GetError::TimelineGateClosed);
}
};
trace!("creating new HandleInner");
let handle = Arc::new(
// TODO: global metric that keeps track of the number of live HandlerTimeline instances
// so we can identify reference cycle bugs.
HandleInner {
shut_down: AtomicBool::new(false),
_gate_guard: gate_guard,
timeline: timeline.clone(),
},
);
let handle = {
let mut lock_guard = timeline
.per_timeline_state()
.handles
.lock()
.expect("mutex poisoned");
match &mut *lock_guard {
Some(per_timeline_state) => {
let replaced = per_timeline_state.insert(self.id, Arc::clone(&handle));
assert!(replaced.is_none(), "some earlier code left a stale handle");
match self.map.entry(key) {
hash_map::Entry::Occupied(_o) => {
// This cannot not happen because
// 1. we're the _miss_ handle, i.e., `self.map` didn't contain an entry and
// 2. we were holding &mut self during .resolve().await above, so, no other thread can have inserted a handle
// while we were waiting for the tenant manager.
unreachable!()
}
hash_map::Entry::Vacant(v) => {
v.insert(Arc::downgrade(&handle));
handle
}
}
}
None => {
return Err(GetError::PerTimelineStateShutDown);
}
}
};
Ok(Handle(handle))
}
Err(e) => Err(GetError::TenantManager(e)),
}
}
}
impl<T: Types> PerTimelineState<T> {
/// After this method returns, [`Cache::get`] will never again return a [`Handle`]
/// to the [`Types::Timeline`] that embeds this per-timeline state.
/// Even if [`TenantManager::resolve`] would still resolve to it.
///
/// Already-alive [`Handle`]s for will remain open, usable, and keeping the [`ArcTimeline`] alive.
/// That's ok because they're short-lived. See module-level comment for details.
#[instrument(level = "trace", skip_all)]
pub(super) fn shutdown(&self) {
let handles = self
.handles
.lock()
.expect("mutex poisoned")
// NB: this .take() sets locked to None.
// That's what makes future `Cache::get` misses fail.
// Cache hits are taken care of below.
.take();
let Some(handles) = handles else {
trace!("already shut down");
return;
};
for handle in handles.values() {
// Make hits fail.
handle.shut_down.store(true, Ordering::Relaxed);
}
drop(handles);
}
}
impl<T: Types> std::ops::Deref for Handle<T> {
type Target = T::Timeline;
fn deref(&self) -> &Self::Target {
&self.0.timeline
}
}
#[cfg(test)]
impl<T: Types> Drop for HandleInner<T> {
fn drop(&mut self) {
trace!("HandleInner dropped");
}
}
// When dropping a [`Cache`], prune its handles in the [`PerTimelineState`] to break the reference cycle.
impl<T: Types> Drop for Cache<T> {
fn drop(&mut self) {
for (_, weak) in self.map.drain() {
if let Some(strong) = weak.upgrade() {
// handle is still being kept alive in PerTimelineState
let timeline = strong.timeline.per_timeline_state();
let mut handles = timeline.handles.lock().expect("mutex poisoned");
if let Some(handles) = &mut *handles {
let Some(removed) = handles.remove(&self.id) else {
// There could have been a shutdown inbetween us upgrading the weak and locking the mutex.
continue;
};
assert!(Arc::ptr_eq(&removed, &strong));
}
}
}
}
}
#[cfg(test)]
mod tests {
use pageserver_api::{
key::{rel_block_to_key, Key, DBDIR_KEY},
models::ShardParameters,
reltag::RelTag,
shard::ShardStripeSize,
};
use utils::shard::ShardCount;
use super::*;
const FOREVER: std::time::Duration = std::time::Duration::from_secs(u64::MAX);
#[derive(Debug)]
struct TestTypes;
impl Types for TestTypes {
type TenantManagerError = anyhow::Error;
type TenantManager = StubManager;
type Timeline = Arc<StubTimeline>;
}
struct StubManager {
shards: Vec<Arc<StubTimeline>>,
}
struct StubTimeline {
gate: utils::sync::gate::Gate,
id: TimelineId,
shard: ShardIdentity,
per_timeline_state: PerTimelineState<TestTypes>,
myself: Weak<StubTimeline>,
}
impl StubTimeline {
fn getpage(&self) {
// do nothing
}
}
impl ArcTimeline<TestTypes> for Arc<StubTimeline> {
fn gate(&self) -> &utils::sync::gate::Gate {
&self.gate
}
fn shard_timeline_id(&self) -> ShardTimelineId {
ShardTimelineId {
shard_index: self.shard.shard_index(),
timeline_id: self.id,
}
}
fn get_shard_identity(&self) -> &ShardIdentity {
&self.shard
}
fn per_timeline_state(&self) -> &PerTimelineState<TestTypes> {
&self.per_timeline_state
}
}
impl TenantManager<TestTypes> for StubManager {
async fn resolve(
&self,
timeline_id: TimelineId,
shard_selector: ShardSelector,
) -> anyhow::Result<Arc<StubTimeline>> {
for timeline in &self.shards {
if timeline.id == timeline_id {
match &shard_selector {
ShardSelector::Zero if timeline.shard.is_shard_zero() => {
return Ok(Arc::clone(timeline));
}
ShardSelector::Zero => continue,
ShardSelector::Page(key) if timeline.shard.is_key_local(key) => {
return Ok(Arc::clone(timeline));
}
ShardSelector::Page(_) => continue,
ShardSelector::Known(idx) if idx == &timeline.shard.shard_index() => {
return Ok(Arc::clone(timeline));
}
ShardSelector::Known(_) => continue,
}
}
}
anyhow::bail!("not found")
}
}
#[tokio::test(start_paused = true)]
async fn test_timeline_shutdown() {
crate::tenant::harness::setup_logging();
let timeline_id = TimelineId::generate();
let shard0 = Arc::new_cyclic(|myself| StubTimeline {
gate: Default::default(),
id: timeline_id,
shard: ShardIdentity::unsharded(),
per_timeline_state: PerTimelineState::default(),
myself: myself.clone(),
});
let mgr = StubManager {
shards: vec![shard0.clone()],
};
let key = DBDIR_KEY;
let mut cache = Cache::<TestTypes>::default();
//
// fill the cache
//
assert_eq!(
(Arc::strong_count(&shard0), Arc::weak_count(&shard0)),
(2, 1),
"strong: shard0, mgr; weak: myself"
);
let handle: Handle<_> = cache
.get(timeline_id, ShardSelector::Page(key), &mgr)
.await
.expect("we have the timeline");
let handle_inner_weak = Arc::downgrade(&handle.0);
assert!(Weak::ptr_eq(&handle.myself, &shard0.myself));
assert_eq!(
(
Weak::strong_count(&handle_inner_weak),
Weak::weak_count(&handle_inner_weak)
),
(2, 2),
"strong: handle, per_timeline_state, weak: handle_inner_weak, cache"
);
assert_eq!(cache.map.len(), 1);
assert_eq!(
(Arc::strong_count(&shard0), Arc::weak_count(&shard0)),
(3, 1),
"strong: handleinner(per_timeline_state), shard0, mgr; weak: myself"
);
drop(handle);
assert_eq!(
(Arc::strong_count(&shard0), Arc::weak_count(&shard0)),
(3, 1),
"strong: handleinner(per_timeline_state), shard0, mgr; weak: myself"
);
//
// demonstrate that Handle holds up gate closure
// but shutdown prevents new handles from being handed out
//
tokio::select! {
_ = shard0.gate.close() => {
panic!("cache and per-timeline handler state keep cache open");
}
_ = tokio::time::sleep(FOREVER) => {
// NB: first poll of close() makes it enter closing state
}
}
let handle = cache
.get(timeline_id, ShardSelector::Page(key), &mgr)
.await
.expect("we have the timeline");
assert!(Weak::ptr_eq(&handle.myself, &shard0.myself));
// SHUTDOWN
shard0.per_timeline_state.shutdown(); // keeping handle alive across shutdown
assert_eq!(
1,
Weak::strong_count(&handle_inner_weak),
"through local var handle"
);
assert_eq!(
cache.map.len(),
1,
"this is an implementation detail but worth pointing out: we can't clear the cache from shutdown(), it's cleared on first access after"
);
assert_eq!(
(Arc::strong_count(&shard0), Arc::weak_count(&shard0)),
(3, 1),
"strong: handleinner(via handle), shard0, mgr; weak: myself"
);
// this handle is perfectly usable
handle.getpage();
cache
.get(timeline_id, ShardSelector::Page(key), &mgr)
.await
.err()
.expect("documented behavior: can't get new handle after shutdown, even if there is an alive Handle");
assert_eq!(
cache.map.len(),
0,
"first access after shutdown cleans up the Weak's from the cache"
);
tokio::select! {
_ = shard0.gate.close() => {
panic!("handle is keeping gate open");
}
_ = tokio::time::sleep(FOREVER) => { }
}
drop(handle);
assert_eq!(
0,
Weak::strong_count(&handle_inner_weak),
"the HandleInner destructor already ran"
);
assert_eq!(
(Arc::strong_count(&shard0), Arc::weak_count(&shard0)),
(2, 1),
"strong: shard0, mgr; weak: myself"
);
// closing gate succeeds after dropping handle
tokio::select! {
_ = shard0.gate.close() => { }
_ = tokio::time::sleep(FOREVER) => {
panic!("handle is dropped, no other gate holders exist")
}
}
// map gets cleaned on next lookup
cache
.get(timeline_id, ShardSelector::Page(key), &mgr)
.await
.err()
.expect("documented behavior: can't get new handle after shutdown");
assert_eq!(cache.map.len(), 0);
// ensure all refs to shard0 are gone and we're not leaking anything
let myself = Weak::clone(&shard0.myself);
drop(shard0);
drop(mgr);
assert_eq!(Weak::strong_count(&myself), 0);
}
#[tokio::test]
async fn test_multiple_timelines_and_deletion() {
crate::tenant::harness::setup_logging();
let timeline_a = TimelineId::generate();
let timeline_b = TimelineId::generate();
assert_ne!(timeline_a, timeline_b);
let timeline_a = Arc::new_cyclic(|myself| StubTimeline {
gate: Default::default(),
id: timeline_a,
shard: ShardIdentity::unsharded(),
per_timeline_state: PerTimelineState::default(),
myself: myself.clone(),
});
let timeline_b = Arc::new_cyclic(|myself| StubTimeline {
gate: Default::default(),
id: timeline_b,
shard: ShardIdentity::unsharded(),
per_timeline_state: PerTimelineState::default(),
myself: myself.clone(),
});
let mut mgr = StubManager {
shards: vec![timeline_a.clone(), timeline_b.clone()],
};
let key = DBDIR_KEY;
let mut cache = Cache::<TestTypes>::default();
cache
.get(timeline_a.id, ShardSelector::Page(key), &mgr)
.await
.expect("we have it");
cache
.get(timeline_b.id, ShardSelector::Page(key), &mgr)
.await
.expect("we have it");
assert_eq!(cache.map.len(), 2);
// delete timeline A
timeline_a.per_timeline_state.shutdown();
mgr.shards.retain(|t| t.id != timeline_a.id);
assert!(
mgr.resolve(timeline_a.id, ShardSelector::Page(key))
.await
.is_err(),
"broken StubManager implementation"
);
assert_eq!(
cache.map.len(),
2,
"cache still has a Weak handle to Timeline A"
);
cache
.get(timeline_a.id, ShardSelector::Page(key), &mgr)
.await
.err()
.expect("documented behavior: can't get new handle after shutdown");
assert_eq!(cache.map.len(), 1, "next access cleans up the cache");
cache
.get(timeline_b.id, ShardSelector::Page(key), &mgr)
.await
.expect("we still have it");
}
fn make_relation_key_for_shard(shard: ShardNumber, params: &ShardParameters) -> Key {
rel_block_to_key(
RelTag {
spcnode: 1663,
dbnode: 208101,
relnode: 2620,
forknum: 0,
},
shard.0 as u32 * params.stripe_size.0,
)
}
#[tokio::test(start_paused = true)]
async fn test_shard_split() {
crate::tenant::harness::setup_logging();
let timeline_id = TimelineId::generate();
let parent = Arc::new_cyclic(|myself| StubTimeline {
gate: Default::default(),
id: timeline_id,
shard: ShardIdentity::unsharded(),
per_timeline_state: PerTimelineState::default(),
myself: myself.clone(),
});
let child_params = ShardParameters {
count: ShardCount(2),
stripe_size: ShardStripeSize::default(),
};
let child0 = Arc::new_cyclic(|myself| StubTimeline {
gate: Default::default(),
id: timeline_id,
shard: ShardIdentity::from_params(ShardNumber(0), &child_params),
per_timeline_state: PerTimelineState::default(),
myself: myself.clone(),
});
let child1 = Arc::new_cyclic(|myself| StubTimeline {
gate: Default::default(),
id: timeline_id,
shard: ShardIdentity::from_params(ShardNumber(1), &child_params),
per_timeline_state: PerTimelineState::default(),
myself: myself.clone(),
});
let child_shards_by_shard_number = [child0.clone(), child1.clone()];
let mut cache = Cache::<TestTypes>::default();
// fill the cache with the parent
for i in 0..2 {
let handle = cache
.get(
timeline_id,
ShardSelector::Page(make_relation_key_for_shard(ShardNumber(i), &child_params)),
&StubManager {
shards: vec![parent.clone()],
},
)
.await
.expect("we have it");
assert!(
Weak::ptr_eq(&handle.myself, &parent.myself),
"mgr returns parent first"
);
drop(handle);
}
//
// SHARD SPLIT: tenant manager changes, but the cache isn't informed
//
// while we haven't shut down the parent, the cache will return the cached parent, even
// if the tenant manager returns the child
for i in 0..2 {
let handle = cache
.get(
timeline_id,
ShardSelector::Page(make_relation_key_for_shard(ShardNumber(i), &child_params)),
&StubManager {
shards: vec![], // doesn't matter what's in here, the cache is fully loaded
},
)
.await
.expect("we have it");
assert!(
Weak::ptr_eq(&handle.myself, &parent.myself),
"mgr returns parent"
);
drop(handle);
}
let parent_handle = cache
.get(
timeline_id,
ShardSelector::Page(make_relation_key_for_shard(ShardNumber(0), &child_params)),
&StubManager {
shards: vec![parent.clone()],
},
)
.await
.expect("we have it");
assert!(Weak::ptr_eq(&parent_handle.myself, &parent.myself));
// invalidate the cache
parent.per_timeline_state.shutdown();
// the cache will now return the child, even though the parent handle still exists
for i in 0..2 {
let handle = cache
.get(
timeline_id,
ShardSelector::Page(make_relation_key_for_shard(ShardNumber(i), &child_params)),
&StubManager {
shards: vec![child0.clone(), child1.clone()], // <====== this changed compared to previous loop
},
)
.await
.expect("we have it");
assert!(
Weak::ptr_eq(
&handle.myself,
&child_shards_by_shard_number[i as usize].myself
),
"mgr returns child"
);
drop(handle);
}
// all the while the parent handle kept the parent gate open
tokio::select! {
_ = parent_handle.gate.close() => {
panic!("parent handle is keeping gate open");
}
_ = tokio::time::sleep(FOREVER) => { }
}
drop(parent_handle);
tokio::select! {
_ = parent.gate.close() => { }
_ = tokio::time::sleep(FOREVER) => {
panic!("parent handle is dropped, no other gate holders exist")
}
}
}
#[tokio::test(start_paused = true)]
async fn test_connection_handler_exit() {
crate::tenant::harness::setup_logging();
let timeline_id = TimelineId::generate();
let shard0 = Arc::new_cyclic(|myself| StubTimeline {
gate: Default::default(),
id: timeline_id,
shard: ShardIdentity::unsharded(),
per_timeline_state: PerTimelineState::default(),
myself: myself.clone(),
});
let mgr = StubManager {
shards: vec![shard0.clone()],
};
let key = DBDIR_KEY;
// Simulate 10 connections that's opened, used, and closed
let mut used_handles = vec![];
for _ in 0..10 {
let mut cache = Cache::<TestTypes>::default();
let handle = {
let handle = cache
.get(timeline_id, ShardSelector::Page(key), &mgr)
.await
.expect("we have the timeline");
assert!(Weak::ptr_eq(&handle.myself, &shard0.myself));
handle
};
handle.getpage();
used_handles.push(Arc::downgrade(&handle.0));
}
// No handles exist, thus gates are closed and don't require shutdown
assert!(used_handles
.iter()
.all(|weak| Weak::strong_count(weak) == 0));
// ... thus the gate should close immediately, even without shutdown
tokio::select! {
_ = shard0.gate.close() => { }
_ = tokio::time::sleep(FOREVER) => {
panic!("handle is dropped, no other gate holders exist")
}
}
}
}

View File

@@ -35,6 +35,10 @@ impl LayerManager {
self.layer_fmgr.get_from_desc(desc)
}
pub(crate) fn get_from_key(&self, desc: &PersistentLayerKey) -> Layer {
self.layer_fmgr.get_from_key(desc)
}
/// Get an immutable reference to the layer map.
///
/// We expect users only to be able to get an immutable layer map. If users want to make modifications,
@@ -365,16 +369,20 @@ impl<T> Default for LayerFileManager<T> {
}
impl<T: AsLayerDesc + Clone> LayerFileManager<T> {
fn get_from_desc(&self, desc: &PersistentLayerDesc) -> T {
fn get_from_key(&self, key: &PersistentLayerKey) -> T {
// The assumption for the `expect()` is that all code maintains the following invariant:
// A layer's descriptor is present in the LayerMap => the LayerFileManager contains a layer for the descriptor.
self.0
.get(&desc.key())
.with_context(|| format!("get layer from desc: {}", desc.layer_name()))
.get(key)
.with_context(|| format!("get layer from key: {}", key))
.expect("not found")
.clone()
}
fn get_from_desc(&self, desc: &PersistentLayerDesc) -> T {
self.get_from_key(&desc.key())
}
fn contains_key(&self, key: &PersistentLayerKey) -> bool {
self.0.contains_key(key)
}

View File

@@ -241,6 +241,9 @@ impl PostgresRedoManager {
/// Shut down the WAL redo manager.
///
/// Returns `true` if this call was the one that initiated shutdown.
/// `true` may be observed by no caller if the first caller stops polling.
///
/// After this future completes
/// - no redo process is running
/// - no new redo process will be spawned
@@ -250,22 +253,32 @@ impl PostgresRedoManager {
/// # Cancel-Safety
///
/// This method is cancellation-safe.
pub async fn shutdown(&self) {
pub async fn shutdown(&self) -> bool {
// prevent new processes from being spawned
let permit = match self.redo_process.get_or_init_detached().await {
let maybe_permit = match self.redo_process.get_or_init_detached().await {
Ok(guard) => {
let (proc, permit) = guard.take_and_deinit();
drop(proc); // this just drops the Arc, its refcount may not be zero yet
permit
if matches!(&*guard, ProcessOnceCell::ManagerShutDown) {
None
} else {
let (proc, permit) = guard.take_and_deinit();
drop(proc); // this just drops the Arc, its refcount may not be zero yet
Some(permit)
}
}
Err(permit) => permit,
Err(permit) => Some(permit),
};
let it_was_us = if let Some(permit) = maybe_permit {
self.redo_process
.set(ProcessOnceCell::ManagerShutDown, permit);
true
} else {
false
};
self.redo_process
.set(ProcessOnceCell::ManagerShutDown, permit);
// wait for ongoing requests to drain and the refcounts of all Arc<WalRedoProcess> that
// we ever launched to drop to zero, which when it happens synchronously kill()s & wait()s
// for the underlying process.
self.launched_processes.close().await;
it_was_us
}
/// This type doesn't have its own background task to check for idleness: we

View File

@@ -0,0 +1,7 @@
# This was captured from one shard of a large tenant in staging.
# It has a mixture of deltas and image layers, >1000 layers in total.
# This is suitable for general smoke tests that want an index which is not
# trivially small, but doesn't contain weird/pathological cases.

File diff suppressed because one or more lines are too long

153
poetry.lock generated
View File

@@ -870,6 +870,96 @@ files = [
[package.dependencies]
colorama = {version = "*", markers = "platform_system == \"Windows\""}
[[package]]
name = "clickhouse-connect"
version = "0.7.17"
description = "ClickHouse Database Core Driver for Python, Pandas, and Superset"
optional = false
python-versions = "~=3.8"
files = [
{file = "clickhouse-connect-0.7.17.tar.gz", hash = "sha256:854f1f9f3e024e7f89ae5d57cd3289d7a4c3dc91a9f24c4d233014f0ea19cb2d"},
{file = "clickhouse_connect-0.7.17-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:aca36f5f28be1ada2981fce87724bbf451f267c918015baec59e527de3c9c882"},
{file = "clickhouse_connect-0.7.17-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:66209e4634f457604c263bea176336079d26c284e251e68a8435b0b80c1a25ff"},
{file = "clickhouse_connect-0.7.17-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:e4d86c5a561a2a99321c8b4af22257461b8e67142f34cfea6e70f39b45b1f406"},
{file = "clickhouse_connect-0.7.17-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d200c9afa2725a96f9f3718221f641276b80c11bf504d8a2fbaafb5a05b2f0d3"},
{file = "clickhouse_connect-0.7.17-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:004d867b1005445a46e6742db1054bf2a717a451372663b46e09b5e9e90a31e3"},
{file = "clickhouse_connect-0.7.17-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:4ef94a4a8e008882259151833c3c47cfbb9c8f08de0f100aaf3b95c366dcfb24"},
{file = "clickhouse_connect-0.7.17-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:ee732c3df50c8b07d16b5836ff85e6b84569922455c03837c3add5cf1388fe1f"},
{file = "clickhouse_connect-0.7.17-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:d9dbe1235465bb946e24b90b0ca5b8800b5d645acb2d7d6ee819448c3e2fd959"},
{file = "clickhouse_connect-0.7.17-cp310-cp310-win32.whl", hash = "sha256:e5db0d68dfb63db0297d44dc91406bcfd7d333708d7cd55086c8550fbf870b78"},
{file = "clickhouse_connect-0.7.17-cp310-cp310-win_amd64.whl", hash = "sha256:800750f568c097ea312887785025006d6098bffd8ed2dd6a57048fb3ced6d778"},
{file = "clickhouse_connect-0.7.17-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:4eb390623b3d15dc9cda78f5c68f83ef9ad11743797e70af8fabc384b015a73c"},
{file = "clickhouse_connect-0.7.17-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:35f172ca950f218f63072024c81d5b4ff6e5399620c255506c321ccc7b17c9a5"},
{file = "clickhouse_connect-0.7.17-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:ae7918f060f7576fc931c692e0122b1b07576fabd81444af22e1f8582300d200"},
{file = "clickhouse_connect-0.7.17-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ff2881b93c7a1afb9c99fb59ad5fd666850421325d0931e2b77f3f4ba872303d"},
{file = "clickhouse_connect-0.7.17-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:8a4d9b4f97271addf66aadbaf7f154f19a0ad6c22026d575a995c55ebd8576db"},
{file = "clickhouse_connect-0.7.17-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:e431469b1ff2d5c3e4c406d55c6afdf7102f5d2524c2ceb5481b94ac24412aa3"},
{file = "clickhouse_connect-0.7.17-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:2b6f80115176559f181a6b3ecad11aa3d70ef6014c3d2905b90fcef3f27d25c2"},
{file = "clickhouse_connect-0.7.17-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:d8ac694f40dfafc8a3cc877116b4bc73e8877ebf66d4d96ee092484ee4c0b481"},
{file = "clickhouse_connect-0.7.17-cp311-cp311-win32.whl", hash = "sha256:78b7a3f6b0fad4eaf8afb5f9a2e855bde53e82ea5804960e9cf779538f4606a1"},
{file = "clickhouse_connect-0.7.17-cp311-cp311-win_amd64.whl", hash = "sha256:efd390cc045334ecc3f2a9c18cc07c041d0288b145967805fdcab65abeefa75f"},
{file = "clickhouse_connect-0.7.17-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:9228334a17dc0a7842222f54ba5b89fc563532424aad4f66be799df70ab37e9f"},
{file = "clickhouse_connect-0.7.17-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:e432a42bb788bda77e88eda2774392a60fbbb5ee2a79cb2881d182d26c45fe49"},
{file = "clickhouse_connect-0.7.17-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:c85152ed2879965ee1fa2bd5e31fb27d281fd5f50d6e86a401efd95cd85b29ef"},
{file = "clickhouse_connect-0.7.17-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:29a126104aa5e11df570cbd89fca4988784084602ba77d17b2396b334c54fd75"},
{file = "clickhouse_connect-0.7.17-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:882d8f9570549258e6eb6a97915fbf64ed29fe395d5e360866ea8d42c8283a35"},
{file = "clickhouse_connect-0.7.17-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:06ebf99111171442f462fb8b357364c3e276da3e8f8557b2e8fee9eb55ab37d1"},
{file = "clickhouse_connect-0.7.17-cp312-cp312-musllinux_1_1_i686.whl", hash = "sha256:e0cf6f99b2777b0d164bf8b65ec39104cdc0789a56bcb52d98289bbd6f5cc70e"},
{file = "clickhouse_connect-0.7.17-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:ee46c508fddfff3b7ac52326788e0c6dd8dfb416b6d7e02e5d30e8110749dac2"},
{file = "clickhouse_connect-0.7.17-cp312-cp312-win32.whl", hash = "sha256:eb708b590a37d56b069a6088254ffa55d73b8cb65527339df81ef03fe67ffdec"},
{file = "clickhouse_connect-0.7.17-cp312-cp312-win_amd64.whl", hash = "sha256:17f00dccddaeaf43733faa1fa21f7d24641454a73669fda862545ba7c88627f5"},
{file = "clickhouse_connect-0.7.17-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:ab5d4b37a6dcc39e94c63beac0f22d9dda914f5eb865d166c64cf04dfadb7d16"},
{file = "clickhouse_connect-0.7.17-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:32aa90387f45f34cbc5a984789ed4c12760a3c0056c190ab0123ceafc36b1002"},
{file = "clickhouse_connect-0.7.17-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:21277b6bdd6c8ff14170bfcd52125c5c39f442ec4bafbb643ad7d0ca915f0029"},
{file = "clickhouse_connect-0.7.17-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ca68d8b7dee3fb4e7229e06152f5b0faaccafb4c87d9c2d48fa5bd117a3cc1c0"},
{file = "clickhouse_connect-0.7.17-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:841c56282102b2fba1e0b332bb1c7a0c50992fbc321746af8d3e0e6ca2450e8b"},
{file = "clickhouse_connect-0.7.17-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:8d7ffde5a4b95d8fe9ed38e08e504e497310e3d7a17691bd40bf65734648fdfc"},
{file = "clickhouse_connect-0.7.17-cp38-cp38-musllinux_1_1_i686.whl", hash = "sha256:055960086b6b92b6e44f5ba04c81c40c10b038588e4b3908b033c99f66125332"},
{file = "clickhouse_connect-0.7.17-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:36491fec63ceb8503b6344c23477647030139f346b749dc5ee672c505939dbbe"},
{file = "clickhouse_connect-0.7.17-cp38-cp38-win32.whl", hash = "sha256:8779a907e026db32e6bc0bc0c8d5de0e2e3afd166afc2d4adcc0603399af5539"},
{file = "clickhouse_connect-0.7.17-cp38-cp38-win_amd64.whl", hash = "sha256:309854fa197885c6278438ddd032ab52e6fec56f162074e343c3635ca7266078"},
{file = "clickhouse_connect-0.7.17-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:e8009f94550178dc971aeb4f8787ba7a5b473c22647490428b7229f540a51d2b"},
{file = "clickhouse_connect-0.7.17-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:70f8422f407b13a404b3670fd097855abd5adaf890c710d6678d2b46ab61ac48"},
{file = "clickhouse_connect-0.7.17-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:082783eb1e8baf7b3465dd045132dc5cb5a91432c899dc4e19891c5f782d8d23"},
{file = "clickhouse_connect-0.7.17-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:c1c30aad2a9c7584c4ee19e646a087b3bbd2d4daab3d88a2afeeae1a7f6febf9"},
{file = "clickhouse_connect-0.7.17-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:fc8e245a9f4f0dce39f155e626405f60f1d3cf4d1e52dd2c793ea6b603ca111b"},
{file = "clickhouse_connect-0.7.17-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:802372cb8a69c9ffdf4260e9f01616c8601ba531825ed6f08834827e0b880cd1"},
{file = "clickhouse_connect-0.7.17-cp39-cp39-musllinux_1_1_i686.whl", hash = "sha256:193a60271a3b105cdbde96fb20b40eab8a50fca3bb1f397546f7a18b53d9aa9c"},
{file = "clickhouse_connect-0.7.17-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:59d58932916792fdbd09cb961a245a0c2d87b07b8296f9138915b998f4522941"},
{file = "clickhouse_connect-0.7.17-cp39-cp39-win32.whl", hash = "sha256:3cfd0edabb589f640636a97ffc38d1b3d760faef208d44e50829cc1ad3f0d3e5"},
{file = "clickhouse_connect-0.7.17-cp39-cp39-win_amd64.whl", hash = "sha256:5661b4629aac228481219abf2e149119af1a71d897f191665e182d9d192d7033"},
{file = "clickhouse_connect-0.7.17-pp310-pypy310_pp73-macosx_10_9_x86_64.whl", hash = "sha256:7429d309109e7e4a70fd867d69fcfea9ddcb1a1e910caa6b0e2c3776b71f4613"},
{file = "clickhouse_connect-0.7.17-pp310-pypy310_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:e5ae619151006da84a0b1585a9bcc81be32459d8061aeb2e116bad5bbaa7d108"},
{file = "clickhouse_connect-0.7.17-pp310-pypy310_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ec0c84a0880621cb2389656a89886ef3133f0b3f8dc016eee6f25bbb49ff6f70"},
{file = "clickhouse_connect-0.7.17-pp310-pypy310_pp73-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:705464c23f821666b76f8f619cf2870225156276562756b3933aaa24708e0ff8"},
{file = "clickhouse_connect-0.7.17-pp310-pypy310_pp73-win_amd64.whl", hash = "sha256:1822016f4b769e89264fe26cefe0bc5e50e4c3ca0747d89bb52d57dc4f1e5ffb"},
{file = "clickhouse_connect-0.7.17-pp38-pypy38_pp73-macosx_10_9_x86_64.whl", hash = "sha256:6c92b0c342c1fbfa666010e8175e05026dc570a7ef91d8fa81ce503180f318aa"},
{file = "clickhouse_connect-0.7.17-pp38-pypy38_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d2e106536540e906c3c866f8615fcf870a9a77c1bfab9ef4b042febfd2fdb953"},
{file = "clickhouse_connect-0.7.17-pp38-pypy38_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:bac9a32e62384b4341ba51a451084eb3b00c6e59aaac1499145dd8b897cb585c"},
{file = "clickhouse_connect-0.7.17-pp38-pypy38_pp73-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:0feed93b9912b7862a8c41be1febcd44b68a824a5c1059b19d5c567afdaa6273"},
{file = "clickhouse_connect-0.7.17-pp38-pypy38_pp73-win_amd64.whl", hash = "sha256:2e2dd6db52e799f065fd565143fde5a872cfe903de1bee7775bc3a349856a790"},
{file = "clickhouse_connect-0.7.17-pp39-pypy39_pp73-macosx_10_9_x86_64.whl", hash = "sha256:ed13add5d579a5960155f3000420544368501c9703d2fb94f103b4a6126081f6"},
{file = "clickhouse_connect-0.7.17-pp39-pypy39_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:c257a23ed3bf1858593fb03927d9d073fbbdfa24dc2afee537c3314bd66b4e24"},
{file = "clickhouse_connect-0.7.17-pp39-pypy39_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d47866f64cbdc2d5cc4f8a7a8c49e3ee90c9e487091b9eda7c3a3576418e1cbe"},
{file = "clickhouse_connect-0.7.17-pp39-pypy39_pp73-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:9b850e2f17e0a0b5a37d996d3fb728050227489d64d271d678d166abea94f26e"},
{file = "clickhouse_connect-0.7.17-pp39-pypy39_pp73-win_amd64.whl", hash = "sha256:349682288987dc84ac7695f7cd6b510be8d0ec0eee7c1b72dbf2146b4e9efdb8"},
]
[package.dependencies]
certifi = "*"
lz4 = "*"
pytz = "*"
urllib3 = ">=1.26"
zstandard = "*"
[package.extras]
arrow = ["pyarrow"]
numpy = ["numpy"]
orjson = ["orjson"]
pandas = ["pandas"]
sqlalchemy = ["sqlalchemy (>1.3.21,<2.0)"]
tzlocal = ["tzlocal (>=4.0)"]
[[package]]
name = "colorama"
version = "0.4.5"
@@ -1470,6 +1560,56 @@ files = [
{file = "lazy_object_proxy-1.10.0-pp310.pp311.pp312.pp38.pp39-none-any.whl", hash = "sha256:80fa48bd89c8f2f456fc0765c11c23bf5af827febacd2f523ca5bc1893fcc09d"},
]
[[package]]
name = "lz4"
version = "4.3.3"
description = "LZ4 Bindings for Python"
optional = false
python-versions = ">=3.8"
files = [
{file = "lz4-4.3.3-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:b891880c187e96339474af2a3b2bfb11a8e4732ff5034be919aa9029484cd201"},
{file = "lz4-4.3.3-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:222a7e35137d7539c9c33bb53fcbb26510c5748779364014235afc62b0ec797f"},
{file = "lz4-4.3.3-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f76176492ff082657ada0d0f10c794b6da5800249ef1692b35cf49b1e93e8ef7"},
{file = "lz4-4.3.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f1d18718f9d78182c6b60f568c9a9cec8a7204d7cb6fad4e511a2ef279e4cb05"},
{file = "lz4-4.3.3-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:6cdc60e21ec70266947a48839b437d46025076eb4b12c76bd47f8e5eb8a75dcc"},
{file = "lz4-4.3.3-cp310-cp310-win32.whl", hash = "sha256:c81703b12475da73a5d66618856d04b1307e43428a7e59d98cfe5a5d608a74c6"},
{file = "lz4-4.3.3-cp310-cp310-win_amd64.whl", hash = "sha256:43cf03059c0f941b772c8aeb42a0813d68d7081c009542301637e5782f8a33e2"},
{file = "lz4-4.3.3-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:30e8c20b8857adef7be045c65f47ab1e2c4fabba86a9fa9a997d7674a31ea6b6"},
{file = "lz4-4.3.3-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:2f7b1839f795315e480fb87d9bc60b186a98e3e5d17203c6e757611ef7dcef61"},
{file = "lz4-4.3.3-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:edfd858985c23523f4e5a7526ca6ee65ff930207a7ec8a8f57a01eae506aaee7"},
{file = "lz4-4.3.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0e9c410b11a31dbdc94c05ac3c480cb4b222460faf9231f12538d0074e56c563"},
{file = "lz4-4.3.3-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:d2507ee9c99dbddd191c86f0e0c8b724c76d26b0602db9ea23232304382e1f21"},
{file = "lz4-4.3.3-cp311-cp311-win32.whl", hash = "sha256:f180904f33bdd1e92967923a43c22899e303906d19b2cf8bb547db6653ea6e7d"},
{file = "lz4-4.3.3-cp311-cp311-win_amd64.whl", hash = "sha256:b14d948e6dce389f9a7afc666d60dd1e35fa2138a8ec5306d30cd2e30d36b40c"},
{file = "lz4-4.3.3-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:e36cd7b9d4d920d3bfc2369840da506fa68258f7bb176b8743189793c055e43d"},
{file = "lz4-4.3.3-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:31ea4be9d0059c00b2572d700bf2c1bc82f241f2c3282034a759c9a4d6ca4dc2"},
{file = "lz4-4.3.3-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:33c9a6fd20767ccaf70649982f8f3eeb0884035c150c0b818ea660152cf3c809"},
{file = "lz4-4.3.3-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:bca8fccc15e3add173da91be8f34121578dc777711ffd98d399be35487c934bf"},
{file = "lz4-4.3.3-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:e7d84b479ddf39fe3ea05387f10b779155fc0990125f4fb35d636114e1c63a2e"},
{file = "lz4-4.3.3-cp312-cp312-win32.whl", hash = "sha256:337cb94488a1b060ef1685187d6ad4ba8bc61d26d631d7ba909ee984ea736be1"},
{file = "lz4-4.3.3-cp312-cp312-win_amd64.whl", hash = "sha256:5d35533bf2cee56f38ced91f766cd0038b6abf46f438a80d50c52750088be93f"},
{file = "lz4-4.3.3-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:363ab65bf31338eb364062a15f302fc0fab0a49426051429866d71c793c23394"},
{file = "lz4-4.3.3-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:0a136e44a16fc98b1abc404fbabf7f1fada2bdab6a7e970974fb81cf55b636d0"},
{file = "lz4-4.3.3-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:abc197e4aca8b63f5ae200af03eb95fb4b5055a8f990079b5bdf042f568469dd"},
{file = "lz4-4.3.3-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:56f4fe9c6327adb97406f27a66420b22ce02d71a5c365c48d6b656b4aaeb7775"},
{file = "lz4-4.3.3-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:f0e822cd7644995d9ba248cb4b67859701748a93e2ab7fc9bc18c599a52e4604"},
{file = "lz4-4.3.3-cp38-cp38-win32.whl", hash = "sha256:24b3206de56b7a537eda3a8123c644a2b7bf111f0af53bc14bed90ce5562d1aa"},
{file = "lz4-4.3.3-cp38-cp38-win_amd64.whl", hash = "sha256:b47839b53956e2737229d70714f1d75f33e8ac26e52c267f0197b3189ca6de24"},
{file = "lz4-4.3.3-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:6756212507405f270b66b3ff7f564618de0606395c0fe10a7ae2ffcbbe0b1fba"},
{file = "lz4-4.3.3-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:ee9ff50557a942d187ec85462bb0960207e7ec5b19b3b48949263993771c6205"},
{file = "lz4-4.3.3-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:2b901c7784caac9a1ded4555258207d9e9697e746cc8532129f150ffe1f6ba0d"},
{file = "lz4-4.3.3-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:b6d9ec061b9eca86e4dcc003d93334b95d53909afd5a32c6e4f222157b50c071"},
{file = "lz4-4.3.3-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:f4c7bf687303ca47d69f9f0133274958fd672efaa33fb5bcde467862d6c621f0"},
{file = "lz4-4.3.3-cp39-cp39-win32.whl", hash = "sha256:054b4631a355606e99a42396f5db4d22046a3397ffc3269a348ec41eaebd69d2"},
{file = "lz4-4.3.3-cp39-cp39-win_amd64.whl", hash = "sha256:eac9af361e0d98335a02ff12fb56caeb7ea1196cf1a49dbf6f17828a131da807"},
{file = "lz4-4.3.3.tar.gz", hash = "sha256:01fe674ef2889dbb9899d8a67361e0c4a2c833af5aeb37dd505727cf5d2a131e"},
]
[package.extras]
docs = ["sphinx (>=1.6.0)", "sphinx-bootstrap-theme"]
flake8 = ["flake8"]
tests = ["psutil", "pytest (!=3.3.0)", "pytest-cov"]
[[package]]
name = "markupsafe"
version = "2.1.1"
@@ -2361,6 +2501,17 @@ files = [
[package.dependencies]
six = ">=1.5"
[[package]]
name = "pytz"
version = "2024.1"
description = "World timezone definitions, modern and historical"
optional = false
python-versions = "*"
files = [
{file = "pytz-2024.1-py2.py3-none-any.whl", hash = "sha256:328171f4e3623139da4983451950b28e95ac706e13f3f2630a879749e7a8b319"},
{file = "pytz-2024.1.tar.gz", hash = "sha256:2a29735ea9c18baf14b448846bde5a48030ed267578472d8955cd0e7443a9812"},
]
[[package]]
name = "pywin32"
version = "301"
@@ -3206,4 +3357,4 @@ cffi = ["cffi (>=1.11)"]
[metadata]
lock-version = "2.0"
python-versions = "^3.9"
content-hash = "16ebd6a46768be7f67dbdb4ee5903b167d94edc9965f29252f038c67e9e907b0"
content-hash = "7cee6a8c30bc7f4bfb0a87c6bad3952dfb4da127fad853d2710a93ac3eab8a00"

View File

@@ -41,6 +41,7 @@ zstandard = "^0.21.0"
httpx = {extras = ["http2"], version = "^0.26.0"}
pytest-repeat = "^0.9.3"
websockets = "^12.0"
clickhouse-connect = "^0.7.16"
[tool.poetry.group.dev.dependencies]
mypy = "==1.3.0"

View File

@@ -21,6 +21,7 @@ pub mod json_ctrl;
pub mod metrics;
pub mod patch_control_file;
pub mod pull_timeline;
pub mod rate_limit;
pub mod receive_wal;
pub mod recovery;
pub mod remove_wal;
@@ -53,6 +54,7 @@ pub mod defaults {
pub const DEFAULT_PARTIAL_BACKUP_TIMEOUT: &str = "15m";
pub const DEFAULT_CONTROL_FILE_SAVE_INTERVAL: &str = "300s";
pub const DEFAULT_PARTIAL_BACKUP_CONCURRENCY: &str = "5";
pub const DEFAULT_EVICTION_CONCURRENCY: usize = 2;
// By default, our required residency before eviction is the same as the period that passes
// before uploading a partial segment, so that in normal operation the eviction can happen

View File

@@ -0,0 +1,49 @@
use std::sync::Arc;
use rand::Rng;
use crate::metrics::MISC_OPERATION_SECONDS;
/// Global rate limiter for background tasks.
#[derive(Clone)]
pub struct RateLimiter {
partial_backup: Arc<tokio::sync::Semaphore>,
eviction: Arc<tokio::sync::Semaphore>,
}
impl RateLimiter {
/// Create a new rate limiter.
/// - `partial_backup_max`: maximum number of concurrent partial backups.
/// - `eviction_max`: maximum number of concurrent timeline evictions.
pub fn new(partial_backup_max: usize, eviction_max: usize) -> Self {
Self {
partial_backup: Arc::new(tokio::sync::Semaphore::new(partial_backup_max)),
eviction: Arc::new(tokio::sync::Semaphore::new(eviction_max)),
}
}
/// Get a permit for partial backup. This will block if the maximum number of concurrent
/// partial backups is reached.
pub async fn acquire_partial_backup(&self) -> tokio::sync::OwnedSemaphorePermit {
let _timer = MISC_OPERATION_SECONDS
.with_label_values(&["partial_permit_acquire"])
.start_timer();
self.partial_backup
.clone()
.acquire_owned()
.await
.expect("semaphore is closed")
}
/// Try to get a permit for timeline eviction. This will return None if the maximum number of
/// concurrent timeline evictions is reached.
pub fn try_acquire_eviction(&self) -> Option<tokio::sync::OwnedSemaphorePermit> {
self.eviction.clone().try_acquire_owned().ok()
}
}
/// Generate a random duration that is a fraction of the given duration.
pub fn rand_duration(duration: &std::time::Duration) -> std::time::Duration {
let randf64 = rand::thread_rng().gen_range(0.0..1.0);
duration.mul_f64(randf64)
}

View File

@@ -25,6 +25,7 @@ use utils::{
use storage_broker::proto::SafekeeperTimelineInfo;
use storage_broker::proto::TenantTimelineId as ProtoTenantTimelineId;
use crate::rate_limit::RateLimiter;
use crate::receive_wal::WalReceivers;
use crate::safekeeper::{
AcceptorProposerMessage, ProposerAcceptorMessage, SafeKeeper, ServerInfo, Term, TermLsn,
@@ -36,7 +37,7 @@ use crate::timeline_guard::ResidenceGuard;
use crate::timeline_manager::{AtomicStatus, ManagerCtl};
use crate::timelines_set::TimelinesSet;
use crate::wal_backup::{self};
use crate::wal_backup_partial::{PartialRemoteSegment, RateLimiter};
use crate::wal_backup_partial::PartialRemoteSegment;
use crate::{control_file, safekeeper::UNKNOWN_SERVER_VERSION};
use crate::metrics::{FullTimelineInfo, WalStorageMetrics, MISC_OPERATION_SECONDS};

View File

@@ -5,7 +5,6 @@
use anyhow::Context;
use camino::Utf8PathBuf;
use remote_storage::RemotePath;
use std::time::Instant;
use tokio::{
fs::File,
io::{AsyncRead, AsyncWriteExt},
@@ -15,6 +14,7 @@ use utils::crashsafe::durable_rename;
use crate::{
metrics::{EvictionEvent, EVICTION_EVENTS_COMPLETED, EVICTION_EVENTS_STARTED},
rate_limit::rand_duration,
timeline_manager::{Manager, StateSnapshot},
wal_backup,
wal_backup_partial::{self, PartialRemoteSegment},
@@ -50,7 +50,6 @@ impl Manager {
.flush_lsn
.segment_number(self.wal_seg_size)
== self.last_removed_segno + 1
&& self.resident_since.elapsed() >= self.conf.eviction_min_resident
}
/// Evict the timeline to remote storage.
@@ -112,7 +111,8 @@ impl Manager {
return;
}
self.resident_since = Instant::now();
self.evict_not_before =
tokio::time::Instant::now() + rand_duration(&self.conf.eviction_min_resident);
info!("successfully restored evicted timeline");
}

View File

@@ -23,6 +23,7 @@ use utils::lsn::Lsn;
use crate::{
control_file::{FileStorage, Storage},
metrics::{MANAGER_ACTIVE_CHANGES, MANAGER_ITERATIONS_TOTAL, MISC_OPERATION_SECONDS},
rate_limit::{rand_duration, RateLimiter},
recovery::recovery_main,
remove_wal::calc_horizon_lsn,
safekeeper::Term,
@@ -32,7 +33,7 @@ use crate::{
timeline_guard::{AccessService, GuardId, ResidenceGuard},
timelines_set::{TimelineSetGuard, TimelinesSet},
wal_backup::{self, WalBackupTaskHandle},
wal_backup_partial::{self, PartialRemoteSegment, RateLimiter},
wal_backup_partial::{self, PartialRemoteSegment},
SafeKeeperConf,
};
@@ -185,11 +186,11 @@ pub(crate) struct Manager {
// misc
pub(crate) access_service: AccessService,
pub(crate) partial_backup_rate_limiter: RateLimiter,
pub(crate) global_rate_limiter: RateLimiter,
// Anti-flapping state: we evict timelines eagerly if they are inactive, but should not
// evict them if they go inactive very soon after being restored.
pub(crate) resident_since: std::time::Instant,
pub(crate) evict_not_before: Instant,
}
/// This task gets spawned alongside each timeline and is responsible for managing the timeline's
@@ -202,7 +203,7 @@ pub async fn main_task(
broker_active_set: Arc<TimelinesSet>,
manager_tx: tokio::sync::mpsc::UnboundedSender<ManagerCtlMessage>,
mut manager_rx: tokio::sync::mpsc::UnboundedReceiver<ManagerCtlMessage>,
partial_backup_rate_limiter: RateLimiter,
global_rate_limiter: RateLimiter,
) {
tli.set_status(Status::Started);
@@ -220,7 +221,7 @@ pub async fn main_task(
conf,
broker_active_set,
manager_tx,
partial_backup_rate_limiter,
global_rate_limiter,
)
.await;
@@ -254,9 +255,29 @@ pub async fn main_task(
mgr.set_status(Status::UpdatePartialBackup);
mgr.update_partial_backup(&state_snapshot).await;
if mgr.conf.enable_offload && mgr.ready_for_eviction(&next_event, &state_snapshot) {
mgr.set_status(Status::EvictTimeline);
mgr.evict_timeline().await;
let now = Instant::now();
if mgr.evict_not_before > now {
// we should wait until evict_not_before
update_next_event(&mut next_event, mgr.evict_not_before);
}
if mgr.conf.enable_offload
&& mgr.evict_not_before <= now
&& mgr.ready_for_eviction(&next_event, &state_snapshot)
{
// check rate limiter and evict timeline if possible
match mgr.global_rate_limiter.try_acquire_eviction() {
Some(_permit) => {
mgr.set_status(Status::EvictTimeline);
mgr.evict_timeline().await;
}
None => {
// we can't evict timeline now, will try again later
mgr.evict_not_before =
Instant::now() + rand_duration(&mgr.conf.eviction_min_resident);
update_next_event(&mut next_event, mgr.evict_not_before);
}
}
}
}
@@ -334,11 +355,10 @@ impl Manager {
conf: SafeKeeperConf,
broker_active_set: Arc<TimelinesSet>,
manager_tx: tokio::sync::mpsc::UnboundedSender<ManagerCtlMessage>,
partial_backup_rate_limiter: RateLimiter,
global_rate_limiter: RateLimiter,
) -> Manager {
let (is_offloaded, partial_backup_uploaded) = tli.bootstrap_mgr().await;
Manager {
conf,
wal_seg_size: tli.get_wal_seg_size().await,
walsenders: tli.get_walsenders().clone(),
state_version_rx: tli.get_state_version_rx(),
@@ -353,8 +373,10 @@ impl Manager {
partial_backup_uploaded,
access_service: AccessService::new(manager_tx),
tli,
partial_backup_rate_limiter,
resident_since: std::time::Instant::now(),
global_rate_limiter,
// to smooth out evictions spike after restart
evict_not_before: Instant::now() + rand_duration(&conf.eviction_min_resident),
conf,
}
}
@@ -541,7 +563,7 @@ impl Manager {
self.partial_backup_task = Some(tokio::spawn(wal_backup_partial::main_task(
self.wal_resident_timeline(),
self.conf.clone(),
self.partial_backup_rate_limiter.clone(),
self.global_rate_limiter.clone(),
)));
}

View File

@@ -2,10 +2,11 @@
//! All timelines should always be present in this map, this is done by loading them
//! all from the disk on startup and keeping them in memory.
use crate::defaults::DEFAULT_EVICTION_CONCURRENCY;
use crate::rate_limit::RateLimiter;
use crate::safekeeper::ServerInfo;
use crate::timeline::{get_tenant_dir, get_timeline_dir, Timeline, TimelineError};
use crate::timelines_set::TimelinesSet;
use crate::wal_backup_partial::RateLimiter;
use crate::SafeKeeperConf;
use anyhow::{bail, Context, Result};
use camino::Utf8PathBuf;
@@ -31,7 +32,7 @@ struct GlobalTimelinesState {
conf: Option<SafeKeeperConf>,
broker_active_set: Arc<TimelinesSet>,
load_lock: Arc<tokio::sync::Mutex<TimelineLoadLock>>,
partial_backup_rate_limiter: RateLimiter,
global_rate_limiter: RateLimiter,
}
// Used to prevent concurrent timeline loading.
@@ -50,7 +51,7 @@ impl GlobalTimelinesState {
(
self.get_conf().clone(),
self.broker_active_set.clone(),
self.partial_backup_rate_limiter.clone(),
self.global_rate_limiter.clone(),
)
}
@@ -85,7 +86,7 @@ static TIMELINES_STATE: Lazy<Mutex<GlobalTimelinesState>> = Lazy::new(|| {
conf: None,
broker_active_set: Arc::new(TimelinesSet::default()),
load_lock: Arc::new(tokio::sync::Mutex::new(TimelineLoadLock)),
partial_backup_rate_limiter: RateLimiter::new(1),
global_rate_limiter: RateLimiter::new(1, 1),
})
});
@@ -99,7 +100,10 @@ impl GlobalTimelines {
// lock, so use explicit block
let tenants_dir = {
let mut state = TIMELINES_STATE.lock().unwrap();
state.partial_backup_rate_limiter = RateLimiter::new(conf.partial_backup_concurrency);
state.global_rate_limiter = RateLimiter::new(
conf.partial_backup_concurrency,
DEFAULT_EVICTION_CONCURRENCY,
);
state.conf = Some(conf);
// Iterate through all directories and load tenants for all directories

View File

@@ -18,8 +18,6 @@
//! This way control file stores information about all potentially existing
//! remote partial segments and can clean them up after uploading a newer version.
use std::sync::Arc;
use camino::Utf8PathBuf;
use postgres_ffi::{XLogFileName, XLogSegNo, PG_TLI};
use remote_storage::RemotePath;
@@ -30,6 +28,7 @@ use utils::lsn::Lsn;
use crate::{
metrics::{MISC_OPERATION_SECONDS, PARTIAL_BACKUP_UPLOADED_BYTES, PARTIAL_BACKUP_UPLOADS},
rate_limit::{rand_duration, RateLimiter},
safekeeper::Term,
timeline::WalResidentTimeline,
timeline_manager::StateSnapshot,
@@ -37,30 +36,6 @@ use crate::{
SafeKeeperConf,
};
#[derive(Clone)]
pub struct RateLimiter {
semaphore: Arc<tokio::sync::Semaphore>,
}
impl RateLimiter {
pub fn new(permits: usize) -> Self {
Self {
semaphore: Arc::new(tokio::sync::Semaphore::new(permits)),
}
}
async fn acquire_owned(&self) -> tokio::sync::OwnedSemaphorePermit {
let _timer = MISC_OPERATION_SECONDS
.with_label_values(&["partial_permit_acquire"])
.start_timer();
self.semaphore
.clone()
.acquire_owned()
.await
.expect("semaphore is closed")
}
}
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
pub enum UploadStatus {
/// Upload is in progress. This status should be used only for garbage collection,
@@ -352,6 +327,7 @@ pub async fn main_task(
) -> Option<PartialRemoteSegment> {
debug!("started");
let await_duration = conf.partial_backup_timeout;
let mut first_iteration = true;
let (_, persistent_state) = tli.get_state().await;
let mut commit_lsn_rx = tli.get_commit_lsn_watch_rx();
@@ -419,6 +395,15 @@ pub async fn main_task(
}
}
// smoothing the load after restart, by sleeping for a random time.
// if this is not the first iteration, we will wait for the full await_duration
let await_duration = if first_iteration {
first_iteration = false;
rand_duration(&await_duration)
} else {
await_duration
};
// fixing the segno and waiting some time to prevent reuploading the same segment too often
let pending_segno = backup.segno(flush_lsn_rx.borrow().lsn);
let timeout = tokio::time::sleep(await_duration);
@@ -454,7 +439,7 @@ pub async fn main_task(
}
// limit concurrent uploads
let _upload_permit = limiter.acquire_owned().await;
let _upload_permit = limiter.acquire_partial_backup().await;
let prepared = backup.prepare_upload().await;
if let Some(seg) = &uploaded_segment {

View File

@@ -67,6 +67,7 @@ FALLBACK_DURATION = {
"test_runner/performance/test_copy.py::test_copy[neon]": 13.817,
"test_runner/performance/test_copy.py::test_copy[vanilla]": 11.736,
"test_runner/performance/test_gc_feedback.py::test_gc_feedback": 575.735,
"test_runner/performance/test_gc_feedback.py::test_gc_feedback_with_snapshots": 575.735,
"test_runner/performance/test_gist_build.py::test_gist_buffering_build[neon]": 14.868,
"test_runner/performance/test_gist_build.py::test_gist_buffering_build[vanilla]": 14.393,
"test_runner/performance/test_latency.py::test_measure_read_latency_heavy_write_workload[neon-1]": 20.588,

View File

@@ -18,6 +18,7 @@ anyhow.workspace = true
aws-config.workspace = true
bytes.workspace = true
camino.workspace = true
chrono.workspace = true
clap.workspace = true
fail.workspace = true
futures.workspace = true
@@ -31,6 +32,7 @@ once_cell.workspace = true
pageserver_api.workspace = true
pageserver_client.workspace = true
postgres_connection.workspace = true
rand.workspace = true
reqwest = { workspace = true, features = ["stream"] }
routerify.workspace = true
serde.workspace = true
@@ -44,7 +46,12 @@ scopeguard.workspace = true
strum.workspace = true
strum_macros.workspace = true
diesel = { version = "2.1.4", features = ["serde_json", "postgres", "r2d2"] }
diesel = { version = "2.1.4", features = [
"serde_json",
"postgres",
"r2d2",
"chrono",
] }
diesel_migrations = { version = "2.1.0" }
r2d2 = { version = "0.8.10" }
@@ -52,4 +59,3 @@ utils = { path = "../libs/utils/" }
metrics = { path = "../libs/metrics/" }
control_plane = { path = "../control_plane" }
workspace_hack = { version = "0.1", path = "../workspace_hack" }

View File

@@ -0,0 +1 @@
DROP TABLE metadata_health;

View File

@@ -0,0 +1,14 @@
CREATE TABLE metadata_health (
tenant_id VARCHAR NOT NULL,
shard_number INTEGER NOT NULL,
shard_count INTEGER NOT NULL,
PRIMARY KEY(tenant_id, shard_number, shard_count),
-- Rely on cascade behavior for delete
FOREIGN KEY(tenant_id, shard_number, shard_count) REFERENCES tenant_shards ON DELETE CASCADE,
healthy BOOLEAN NOT NULL DEFAULT TRUE,
last_scrubbed_at TIMESTAMPTZ NOT NULL DEFAULT NOW()
);
INSERT INTO metadata_health(tenant_id, shard_number, shard_count)
SELECT tenant_id, shard_number, shard_count FROM tenant_shards;

View File

@@ -10,7 +10,11 @@ use hyper::header::CONTENT_TYPE;
use hyper::{Body, Request, Response};
use hyper::{StatusCode, Uri};
use metrics::{BuildInfo, NeonMetrics};
use pageserver_api::controller_api::TenantCreateRequest;
use pageserver_api::controller_api::{
MetadataHealthListOutdatedRequest, MetadataHealthListOutdatedResponse,
MetadataHealthListUnhealthyResponse, MetadataHealthUpdateRequest, MetadataHealthUpdateResponse,
TenantCreateRequest,
};
use pageserver_api::models::{
TenantConfigRequest, TenantLocationConfigRequest, TenantShardSplitRequest,
TenantTimeTravelRequest, TimelineCreateRequest,
@@ -560,6 +564,51 @@ async fn handle_cancel_node_fill(req: Request<Body>) -> Result<Response<Body>, A
json_response(StatusCode::ACCEPTED, ())
}
async fn handle_metadata_health_update(mut req: Request<Body>) -> Result<Response<Body>, ApiError> {
check_permissions(&req, Scope::Scrubber)?;
let update_req = json_request::<MetadataHealthUpdateRequest>(&mut req).await?;
let state = get_state(&req);
state.service.metadata_health_update(update_req).await?;
json_response(StatusCode::OK, MetadataHealthUpdateResponse {})
}
async fn handle_metadata_health_list_unhealthy(
req: Request<Body>,
) -> Result<Response<Body>, ApiError> {
check_permissions(&req, Scope::Admin)?;
let state = get_state(&req);
let unhealthy_tenant_shards = state.service.metadata_health_list_unhealthy().await?;
json_response(
StatusCode::OK,
MetadataHealthListUnhealthyResponse {
unhealthy_tenant_shards,
},
)
}
async fn handle_metadata_health_list_outdated(
mut req: Request<Body>,
) -> Result<Response<Body>, ApiError> {
check_permissions(&req, Scope::Admin)?;
let list_outdated_req = json_request::<MetadataHealthListOutdatedRequest>(&mut req).await?;
let state = get_state(&req);
let health_records = state
.service
.metadata_health_list_outdated(list_outdated_req.not_scrubbed_for)
.await?;
json_response(
StatusCode::OK,
MetadataHealthListOutdatedResponse { health_records },
)
}
async fn handle_tenant_shard_split(
service: Arc<Service>,
mut req: Request<Body>,
@@ -987,6 +1036,28 @@ pub fn make_router(
RequestName("control_v1_cancel_node_fill"),
)
})
// Metadata health operations
.post("/control/v1/metadata_health/update", |r| {
named_request_span(
r,
handle_metadata_health_update,
RequestName("control_v1_metadata_health_update"),
)
})
.get("/control/v1/metadata_health/unhealthy", |r| {
named_request_span(
r,
handle_metadata_health_list_unhealthy,
RequestName("control_v1_metadata_health_list_unhealthy"),
)
})
.post("/control/v1/metadata_health/outdated", |r| {
named_request_span(
r,
handle_metadata_health_list_outdated,
RequestName("control_v1_metadata_health_list_outdated"),
)
})
// TODO(vlad): endpoint for cancelling drain and fill
// Tenant Shard operations
.put("/control/v1/tenant/:tenant_shard_id/migrate", |r| {

View File

@@ -9,12 +9,14 @@ use std::time::Duration;
use storage_controller::http::make_router;
use storage_controller::metrics::preinitialize_metrics;
use storage_controller::persistence::Persistence;
use storage_controller::service::chaos_injector::ChaosInjector;
use storage_controller::service::{
Config, Service, MAX_OFFLINE_INTERVAL_DEFAULT, MAX_WARMING_UP_INTERVAL_DEFAULT,
RECONCILER_CONCURRENCY_DEFAULT,
};
use tokio::signal::unix::SignalKind;
use tokio_util::sync::CancellationToken;
use tracing::Instrument;
use utils::auth::{JwtAuth, SwappableJwtAuth};
use utils::logging::{self, LogFormat};
@@ -86,6 +88,10 @@ struct Cli {
// TODO: make `cfg(feature = "testing")`
#[arg(long)]
neon_local_repo_dir: Option<PathBuf>,
/// Chaos testing
#[arg(long)]
chaos_interval: Option<humantime::Duration>,
}
enum StrictMode {
@@ -309,6 +315,22 @@ async fn async_main() -> anyhow::Result<()> {
tracing::info!("Serving on {0}", args.listen);
let server_task = tokio::task::spawn(server);
let chaos_task = args.chaos_interval.map(|interval| {
let service = service.clone();
let cancel = CancellationToken::new();
let cancel_bg = cancel.clone();
(
tokio::task::spawn(
async move {
let mut chaos_injector = ChaosInjector::new(service, interval.into());
chaos_injector.run(cancel_bg).await
}
.instrument(tracing::info_span!("chaos_injector")),
),
cancel,
)
});
// Wait until we receive a signal
let mut sigint = tokio::signal::unix::signal(SignalKind::interrupt())?;
let mut sigquit = tokio::signal::unix::signal(SignalKind::quit())?;
@@ -337,6 +359,12 @@ async fn async_main() -> anyhow::Result<()> {
}
}
// If we were injecting chaos, stop that so that we're not calling into Service while it shuts down
if let Some((chaos_jh, chaos_cancel)) = chaos_task {
chaos_cancel.cancel();
chaos_jh.await.ok();
}
service.shutdown().await;
tracing::info!("Service shutdown complete");

View File

@@ -8,6 +8,7 @@ use self::split_state::SplitState;
use diesel::pg::PgConnection;
use diesel::prelude::*;
use diesel::Connection;
use pageserver_api::controller_api::MetadataHealthRecord;
use pageserver_api::controller_api::ShardSchedulingPolicy;
use pageserver_api::controller_api::{NodeSchedulingPolicy, PlacementPolicy};
use pageserver_api::models::TenantConfig;
@@ -90,6 +91,10 @@ pub(crate) enum DatabaseOperation {
UpdateTenantShard,
DeleteTenant,
UpdateTenantConfig,
UpdateMetadataHealth,
ListMetadataHealth,
ListMetadataHealthUnhealthy,
ListMetadataHealthOutdated,
}
#[must_use]
@@ -307,15 +312,32 @@ impl Persistence {
&self,
shards: Vec<TenantShardPersistence>,
) -> DatabaseResult<()> {
use crate::schema::tenant_shards::dsl::*;
use crate::schema::metadata_health;
use crate::schema::tenant_shards;
let now = chrono::Utc::now();
let metadata_health_records = shards
.iter()
.map(|t| MetadataHealthPersistence {
tenant_id: t.tenant_id.clone(),
shard_number: t.shard_number,
shard_count: t.shard_count,
healthy: true,
last_scrubbed_at: now,
})
.collect::<Vec<_>>();
self.with_measured_conn(
DatabaseOperation::InsertTenantShards,
move |conn| -> DatabaseResult<()> {
for tenant in &shards {
diesel::insert_into(tenant_shards)
.values(tenant)
.execute(conn)?;
}
diesel::insert_into(tenant_shards::table)
.values(&shards)
.execute(conn)?;
diesel::insert_into(metadata_health::table)
.values(&metadata_health_records)
.execute(conn)?;
Ok(())
},
)
@@ -329,10 +351,10 @@ impl Persistence {
self.with_measured_conn(
DatabaseOperation::DeleteTenant,
move |conn| -> DatabaseResult<()> {
// `metadata_health` status (if exists) is also deleted based on the cascade behavior.
diesel::delete(tenant_shards)
.filter(tenant_id.eq(del_tenant_id.to_string()))
.execute(conn)?;
Ok(())
},
)
@@ -675,6 +697,94 @@ impl Persistence {
)
.await
}
/// Stores all the latest metadata health updates durably. Updates existing entry on conflict.
///
/// **Correctness:** `metadata_health_updates` should all belong the tenant shards managed by the storage controller.
#[allow(dead_code)]
pub(crate) async fn update_metadata_health_records(
&self,
healthy_records: Vec<MetadataHealthPersistence>,
unhealthy_records: Vec<MetadataHealthPersistence>,
now: chrono::DateTime<chrono::Utc>,
) -> DatabaseResult<()> {
use crate::schema::metadata_health::dsl::*;
self.with_measured_conn(
DatabaseOperation::UpdateMetadataHealth,
move |conn| -> DatabaseResult<_> {
diesel::insert_into(metadata_health)
.values(&healthy_records)
.on_conflict((tenant_id, shard_number, shard_count))
.do_update()
.set((healthy.eq(true), last_scrubbed_at.eq(now)))
.execute(conn)?;
diesel::insert_into(metadata_health)
.values(&unhealthy_records)
.on_conflict((tenant_id, shard_number, shard_count))
.do_update()
.set((healthy.eq(false), last_scrubbed_at.eq(now)))
.execute(conn)?;
Ok(())
},
)
.await
}
/// Lists all the metadata health records.
#[allow(dead_code)]
pub(crate) async fn list_metadata_health_records(
&self,
) -> DatabaseResult<Vec<MetadataHealthPersistence>> {
self.with_measured_conn(
DatabaseOperation::ListMetadataHealth,
move |conn| -> DatabaseResult<_> {
Ok(
crate::schema::metadata_health::table
.load::<MetadataHealthPersistence>(conn)?,
)
},
)
.await
}
/// Lists all the metadata health records that is unhealthy.
#[allow(dead_code)]
pub(crate) async fn list_unhealthy_metadata_health_records(
&self,
) -> DatabaseResult<Vec<MetadataHealthPersistence>> {
use crate::schema::metadata_health::dsl::*;
self.with_measured_conn(
DatabaseOperation::ListMetadataHealthUnhealthy,
move |conn| -> DatabaseResult<_> {
Ok(crate::schema::metadata_health::table
.filter(healthy.eq(false))
.load::<MetadataHealthPersistence>(conn)?)
},
)
.await
}
/// Lists all the metadata health records that have not been updated since an `earlier` time.
#[allow(dead_code)]
pub(crate) async fn list_outdated_metadata_health_records(
&self,
earlier: chrono::DateTime<chrono::Utc>,
) -> DatabaseResult<Vec<MetadataHealthPersistence>> {
use crate::schema::metadata_health::dsl::*;
self.with_measured_conn(
DatabaseOperation::ListMetadataHealthOutdated,
move |conn| -> DatabaseResult<_> {
let query = metadata_health.filter(last_scrubbed_at.lt(earlier));
let res = query.load::<MetadataHealthPersistence>(conn)?;
Ok(res)
},
)
.await
}
}
/// Parts of [`crate::tenant_shard::TenantShard`] that are stored durably
@@ -744,3 +854,59 @@ pub(crate) struct NodePersistence {
pub(crate) listen_pg_addr: String,
pub(crate) listen_pg_port: i32,
}
/// Tenant metadata health status that are stored durably.
#[derive(Queryable, Selectable, Insertable, Serialize, Deserialize, Clone, Eq, PartialEq)]
#[diesel(table_name = crate::schema::metadata_health)]
pub(crate) struct MetadataHealthPersistence {
#[serde(default)]
pub(crate) tenant_id: String,
#[serde(default)]
pub(crate) shard_number: i32,
#[serde(default)]
pub(crate) shard_count: i32,
pub(crate) healthy: bool,
pub(crate) last_scrubbed_at: chrono::DateTime<chrono::Utc>,
}
impl MetadataHealthPersistence {
pub fn new(
tenant_shard_id: TenantShardId,
healthy: bool,
last_scrubbed_at: chrono::DateTime<chrono::Utc>,
) -> Self {
let tenant_id = tenant_shard_id.tenant_id.to_string();
let shard_number = tenant_shard_id.shard_number.0 as i32;
let shard_count = tenant_shard_id.shard_count.literal() as i32;
MetadataHealthPersistence {
tenant_id,
shard_number,
shard_count,
healthy,
last_scrubbed_at,
}
}
#[allow(dead_code)]
pub(crate) fn get_tenant_shard_id(&self) -> Result<TenantShardId, hex::FromHexError> {
Ok(TenantShardId {
tenant_id: TenantId::from_str(self.tenant_id.as_str())?,
shard_number: ShardNumber(self.shard_number as u8),
shard_count: ShardCount::new(self.shard_count as u8),
})
}
}
impl From<MetadataHealthPersistence> for MetadataHealthRecord {
fn from(value: MetadataHealthPersistence) -> Self {
MetadataHealthRecord {
tenant_shard_id: value
.get_tenant_shard_id()
.expect("stored tenant id should be valid"),
healthy: value.healthy,
last_scrubbed_at: value.last_scrubbed_at,
}
}
}

View File

@@ -656,11 +656,8 @@ impl Reconciler {
// reconcile this location. This includes locations with different configurations, as well
// as locations with unknown (None) observed state.
// The general case is to increment the generation. However, there are cases
// where this is not necessary:
// - if we are only updating the TenantConf part of the location
// - if we are only changing the attachment mode (e.g. going to attachedmulti or attachedstale)
// and the location was already in the correct generation
// Incrementing generation is the safe general case, but is inefficient for changes that only
// modify some details (e.g. the tenant's config).
let increment_generation = match observed {
None => true,
Some(ObservedStateLocation { conf: None }) => true,
@@ -669,18 +666,11 @@ impl Reconciler {
}) => {
let generations_match = observed.generation == wanted_conf.generation;
use LocationConfigMode::*;
let mode_transition_requires_gen_inc =
match (observed.mode, wanted_conf.mode) {
// Usually the short-lived attachment modes (multi and stale) are only used
// in the case of [`Self::live_migrate`], but it is simple to handle them correctly
// here too. Locations are allowed to go Single->Stale and Multi->Single within the same generation.
(AttachedSingle, AttachedStale) => false,
(AttachedMulti, AttachedSingle) => false,
(lhs, rhs) => lhs != rhs,
};
!generations_match || mode_transition_requires_gen_inc
// We may skip incrementing the generation if the location is already in the expected mode and
// generation. In principle it would also be safe to skip from certain other modes (e.g. AttachedStale),
// but such states are handled inside `live_migrate`, and if we see that state here we're cleaning up
// after a restart/crash, so fall back to the universally safe path of incrementing generation.
!generations_match || (observed.mode != wanted_conf.mode)
}
};

View File

@@ -1,5 +1,15 @@
// @generated automatically by Diesel CLI.
diesel::table! {
metadata_health (tenant_id, shard_number, shard_count) {
tenant_id -> Varchar,
shard_number -> Int4,
shard_count -> Int4,
healthy -> Bool,
last_scrubbed_at -> Timestamptz,
}
}
diesel::table! {
nodes (node_id) {
node_id -> Int8,
@@ -26,4 +36,4 @@ diesel::table! {
}
}
diesel::allow_tables_to_appear_in_same_query!(nodes, tenant_shards,);
diesel::allow_tables_to_appear_in_same_query!(metadata_health, nodes, tenant_shards,);

View File

@@ -16,7 +16,7 @@ use crate::{
compute_hook::NotifyError,
id_lock_map::{trace_exclusive_lock, trace_shared_lock, IdLockMap, TracingExclusiveGuard},
metrics::LeadershipStatusGroup,
persistence::{AbortShardSplitStatus, TenantFilter},
persistence::{AbortShardSplitStatus, MetadataHealthPersistence, TenantFilter},
reconciler::{ReconcileError, ReconcileUnits},
scheduler::{MaySchedule, ScheduleContext, ScheduleMode},
tenant_shard::{
@@ -33,11 +33,11 @@ use futures::{stream::FuturesUnordered, StreamExt};
use itertools::Itertools;
use pageserver_api::{
controller_api::{
NodeAvailability, NodeRegisterRequest, NodeSchedulingPolicy, PlacementPolicy,
ShardSchedulingPolicy, TenantCreateRequest, TenantCreateResponse,
TenantCreateResponseShard, TenantDescribeResponse, TenantDescribeResponseShard,
TenantLocateResponse, TenantPolicyRequest, TenantShardMigrateRequest,
TenantShardMigrateResponse, UtilizationScore,
MetadataHealthRecord, MetadataHealthUpdateRequest, NodeAvailability, NodeRegisterRequest,
NodeSchedulingPolicy, PlacementPolicy, ShardSchedulingPolicy, TenantCreateRequest,
TenantCreateResponse, TenantCreateResponseShard, TenantDescribeResponse,
TenantDescribeResponseShard, TenantLocateResponse, TenantPolicyRequest,
TenantShardMigrateRequest, TenantShardMigrateResponse, UtilizationScore,
},
models::{SecondaryProgress, TenantConfigRequest, TopTenantShardsRequest},
};
@@ -84,6 +84,8 @@ use crate::{
};
use serde::{Deserialize, Serialize};
pub mod chaos_injector;
// For operations that should be quick, like attaching a new tenant
const SHORT_RECONCILE_TIMEOUT: Duration = Duration::from_secs(5);
@@ -6095,6 +6097,68 @@ impl Service {
Ok(())
}
/// Updates scrubber metadata health check results.
pub(crate) async fn metadata_health_update(
&self,
update_req: MetadataHealthUpdateRequest,
) -> Result<(), ApiError> {
let now = chrono::offset::Utc::now();
let (healthy_records, unhealthy_records) = {
let locked = self.inner.read().unwrap();
let healthy_records = update_req
.healthy_tenant_shards
.into_iter()
// Retain only health records associated with tenant shards managed by storage controller.
.filter(|tenant_shard_id| locked.tenants.contains_key(tenant_shard_id))
.map(|tenant_shard_id| MetadataHealthPersistence::new(tenant_shard_id, true, now))
.collect();
let unhealthy_records = update_req
.unhealthy_tenant_shards
.into_iter()
.filter(|tenant_shard_id| locked.tenants.contains_key(tenant_shard_id))
.map(|tenant_shard_id| MetadataHealthPersistence::new(tenant_shard_id, false, now))
.collect();
(healthy_records, unhealthy_records)
};
self.persistence
.update_metadata_health_records(healthy_records, unhealthy_records, now)
.await?;
Ok(())
}
/// Lists the tenant shards that has unhealthy metadata status.
pub(crate) async fn metadata_health_list_unhealthy(
&self,
) -> Result<Vec<TenantShardId>, ApiError> {
let result = self
.persistence
.list_unhealthy_metadata_health_records()
.await?
.iter()
.map(|p| p.get_tenant_shard_id().unwrap())
.collect();
Ok(result)
}
/// Lists the tenant shards that have not been scrubbed for some duration.
pub(crate) async fn metadata_health_list_outdated(
&self,
not_scrubbed_for: Duration,
) -> Result<Vec<MetadataHealthRecord>, ApiError> {
let earlier = chrono::offset::Utc::now() - not_scrubbed_for;
let result = self
.persistence
.list_outdated_metadata_health_records(earlier)
.await?
.into_iter()
.map(|record| record.into())
.collect();
Ok(result)
}
pub(crate) fn get_leadership_status(&self) -> LeadershipStatus {
self.inner.read().unwrap().get_leadership_status()
}

View File

@@ -0,0 +1,71 @@
use std::{sync::Arc, time::Duration};
use rand::seq::SliceRandom;
use rand::thread_rng;
use tokio_util::sync::CancellationToken;
use super::Service;
pub struct ChaosInjector {
service: Arc<Service>,
interval: Duration,
}
impl ChaosInjector {
pub fn new(service: Arc<Service>, interval: Duration) -> Self {
Self { service, interval }
}
pub async fn run(&mut self, cancel: CancellationToken) {
let mut interval = tokio::time::interval(self.interval);
loop {
tokio::select! {
_ = interval.tick() => {}
_ = cancel.cancelled() => {
tracing::info!("Shutting down");
return;
}
}
self.inject_chaos().await;
tracing::info!("Chaos iteration...");
}
}
async fn inject_chaos(&mut self) {
// Pick some shards to interfere with
let batch_size = 128;
let mut inner = self.service.inner.write().unwrap();
let (nodes, tenants, scheduler) = inner.parts_mut();
let tenant_ids = tenants.keys().cloned().collect::<Vec<_>>();
let victims = tenant_ids.choose_multiple(&mut thread_rng(), batch_size);
for victim in victims {
let shard = tenants
.get_mut(victim)
.expect("Held lock between choosing ID and this get");
// Pick a secondary to promote
let Some(new_location) = shard
.intent
.get_secondary()
.choose(&mut thread_rng())
.cloned()
else {
tracing::info!("Skipping shard {victim}: no secondary location, can't migrate");
continue;
};
let Some(old_location) = *shard.intent.get_attached() else {
tracing::info!("Skipping shard {victim}: currently has no attached location");
continue;
};
shard.intent.demote_attached(scheduler, old_location);
shard.intent.promote_attached(scheduler, new_location);
self.service.maybe_reconcile_shard(shard, nodes);
}
}
}

View File

@@ -10,6 +10,7 @@ aws-smithy-async.workspace = true
either.workspace = true
tokio-rustls.workspace = true
anyhow.workspace = true
git-version.workspace = true
hex.workspace = true
humantime.workspace = true
thiserror.workspace = true

View File

@@ -40,6 +40,11 @@ impl TimelineAnalysis {
garbage_keys: Vec::new(),
}
}
/// Whether a timeline is healthy.
pub(crate) fn is_healthy(&self) -> bool {
self.errors.is_empty() && self.warnings.is_empty()
}
}
pub(crate) async fn branch_cleanup_and_check_errors(

View File

@@ -1,10 +1,13 @@
use std::pin::pin;
use futures::{StreamExt, TryStreamExt};
use pageserver::tenant::storage_layer::LayerName;
use remote_storage::ListingMode;
use serde::{Deserialize, Serialize};
use crate::{
checks::parse_layer_object_name, init_remote, list_objects_with_retries,
metadata_stream::stream_tenants, BucketConfig, NodeKind,
checks::parse_layer_object_name, init_remote_generic, metadata_stream::stream_tenants_generic,
stream_objects_with_retries, BucketConfig, NodeKind,
};
#[derive(Serialize, Deserialize, Clone, Copy, PartialEq, Eq)]
@@ -47,45 +50,38 @@ pub async fn find_large_objects(
ignore_deltas: bool,
concurrency: usize,
) -> anyhow::Result<LargeObjectListing> {
let (s3_client, target) = init_remote(bucket_config.clone(), NodeKind::Pageserver).await?;
let tenants = std::pin::pin!(stream_tenants(&s3_client, &target));
let (remote_client, target) =
init_remote_generic(bucket_config.clone(), NodeKind::Pageserver).await?;
let tenants = pin!(stream_tenants_generic(&remote_client, &target));
let objects_stream = tenants.map_ok(|tenant_shard_id| {
let mut tenant_root = target.tenant_root(&tenant_shard_id);
let s3_client = s3_client.clone();
let remote_client = remote_client.clone();
async move {
let mut objects = Vec::new();
let mut total_objects_ctr = 0u64;
// We want the objects and not just common prefixes
tenant_root.delimiter.clear();
let mut continuation_token = None;
loop {
let fetch_response =
list_objects_with_retries(&s3_client, &tenant_root, continuation_token.clone())
.await?;
for obj in fetch_response.contents().iter().filter(|o| {
if let Some(obj_size) = o.size {
min_size as i64 <= obj_size
} else {
false
}
}) {
let key = obj.key().expect("couldn't get key").to_owned();
let mut objects_stream = pin!(stream_objects_with_retries(
&remote_client,
ListingMode::NoDelimiter,
&tenant_root
));
while let Some(listing) = objects_stream.next().await {
let listing = listing?;
for obj in listing.keys.iter().filter(|obj| min_size <= obj.size) {
let key = obj.key.to_string();
let kind = LargeObjectKind::from_key(&key);
if ignore_deltas && kind == LargeObjectKind::DeltaLayer {
continue;
}
objects.push(LargeObject {
key,
size: obj.size.unwrap() as u64,
size: obj.size,
kind,
})
}
total_objects_ctr += fetch_response.contents().len() as u64;
match fetch_response.next_continuation_token {
Some(new_token) => continuation_token = Some(new_token),
None => break,
}
total_objects_ctr += listing.keys.len() as u64;
}
Ok((tenant_shard_id, objects, total_objects_ctr))

View File

@@ -5,6 +5,7 @@
use std::{
collections::{HashMap, HashSet},
sync::Arc,
time::Duration,
};
use anyhow::Context;
@@ -18,8 +19,8 @@ use utils::id::TenantId;
use crate::{
cloud_admin_api::{CloudAdminApiClient, MaybeDeleted, ProjectData},
init_remote, init_remote_generic,
metadata_stream::{stream_tenant_timelines, stream_tenants},
init_remote_generic, list_objects_with_retries_generic,
metadata_stream::{stream_tenant_timelines_generic, stream_tenants_generic},
BucketConfig, ConsoleConfig, NodeKind, TenantShardTimelineId, TraversingDepth,
};
@@ -27,6 +28,11 @@ use crate::{
enum GarbageReason {
DeletedInConsole,
MissingInConsole,
// The remaining data relates to a known deletion issue, and we're sure that purging this
// will not delete any real data, for example https://github.com/neondatabase/neon/pull/7928 where
// there is nothing in a tenant path apart from a heatmap file.
KnownBug,
}
#[derive(Serialize, Deserialize, Debug)]
@@ -72,6 +78,15 @@ impl GarbageList {
}
}
/// If an entity has been identified as requiring purge due to a known bug, e.g.
/// a particular type of object left behind after an incomplete deletion.
fn append_buggy(&mut self, entity: GarbageEntity) {
self.items.push(GarbageItem {
entity,
reason: GarbageReason::KnownBug,
});
}
/// Return true if appended, false if not. False means the result was not garbage.
fn maybe_append<T>(&mut self, entity: GarbageEntity, result: Option<T>) -> bool
where
@@ -138,7 +153,7 @@ async fn find_garbage_inner(
node_kind: NodeKind,
) -> anyhow::Result<GarbageList> {
// Construct clients for S3 and for Console API
let (s3_client, target) = init_remote(bucket_config.clone(), node_kind).await?;
let (remote_client, target) = init_remote_generic(bucket_config.clone(), node_kind).await?;
let cloud_admin_api_client = Arc::new(CloudAdminApiClient::new(console_config));
// Build a set of console-known tenants, for quickly eliminating known-active tenants without having
@@ -164,7 +179,7 @@ async fn find_garbage_inner(
// Enumerate Tenants in S3, and check if each one exists in Console
tracing::info!("Finding all tenants in bucket {}...", bucket_config.bucket);
let tenants = stream_tenants(&s3_client, &target);
let tenants = stream_tenants_generic(&remote_client, &target);
let tenants_checked = tenants.map_ok(|t| {
let api_client = cloud_admin_api_client.clone();
let console_cache = console_cache.clone();
@@ -219,6 +234,66 @@ async fn find_garbage_inner(
assert!(project.tenant == tenant_shard_id.tenant_id);
}
// Special case: If it's missing in console, check for known bugs that would enable us to conclusively
// identify it as purge-able anyway
if console_result.is_none() {
let timelines =
stream_tenant_timelines_generic(&remote_client, &target, tenant_shard_id)
.await?
.collect::<Vec<_>>()
.await;
if timelines.is_empty() {
// No timelines, but a heatmap: the deletion bug where we deleted everything but heatmaps
let tenant_objects = list_objects_with_retries_generic(
&remote_client,
ListingMode::WithDelimiter,
&target.tenant_root(&tenant_shard_id),
)
.await?;
let object = tenant_objects.keys.first().unwrap();
if object.key.get_path().as_str().ends_with("heatmap-v1.json") {
tracing::info!("Tenant {tenant_shard_id}: is missing in console and is only a heatmap (known historic deletion bug)");
garbage.append_buggy(GarbageEntity::Tenant(tenant_shard_id));
continue;
} else {
tracing::info!("Tenant {tenant_shard_id} is missing in console and contains one object: {}", object.key);
}
} else {
// A console-unknown tenant with timelines: check if these timelines only contain initdb.tar.zst, from the initial
// rollout of WAL DR in which we never deleted these.
let mut any_non_initdb = false;
for timeline_r in timelines {
let timeline = timeline_r?;
let timeline_objects = list_objects_with_retries_generic(
&remote_client,
ListingMode::WithDelimiter,
&target.timeline_root(&timeline),
)
.await?;
if !timeline_objects.prefixes.is_empty() {
// Sub-paths? Unexpected
any_non_initdb = true;
} else {
let object = timeline_objects.keys.first().unwrap();
if object.key.get_path().as_str().ends_with("initdb.tar.zst") {
tracing::info!("Timeline {timeline} contains only initdb.tar.zst");
} else {
any_non_initdb = true;
}
}
}
if any_non_initdb {
tracing::info!("Tenant {tenant_shard_id}: is missing in console and contains timelines, one or more of which are more than just initdb");
} else {
tracing::info!("Tenant {tenant_shard_id}: is missing in console and contains only timelines that only contain initdb");
garbage.append_buggy(GarbageEntity::Tenant(tenant_shard_id));
continue;
}
}
}
if garbage.maybe_append(GarbageEntity::Tenant(tenant_shard_id), console_result) {
tracing::debug!("Tenant {tenant_shard_id} is garbage");
} else {
@@ -256,7 +331,8 @@ async fn find_garbage_inner(
// Construct a stream of all timelines within active tenants
let active_tenants = tokio_stream::iter(active_tenants.iter().map(Ok));
let timelines = active_tenants.map_ok(|t| stream_tenant_timelines(&s3_client, &target, *t));
let timelines =
active_tenants.map_ok(|t| stream_tenant_timelines_generic(&remote_client, &target, *t));
let timelines = timelines.try_buffer_unordered(S3_CONCURRENCY);
let timelines = timelines.try_flatten();
@@ -349,9 +425,6 @@ pub async fn get_timeline_objects(
tracing::debug!("Listing objects in timeline {ttid}");
let timeline_root = super::remote_timeline_path_id(&ttid);
// TODO: apply extra validation based on object modification time. Don't purge
// timelines whose index_part.json has been touched recently.
let list = s3_client
.list(
Some(&timeline_root),
@@ -422,6 +495,7 @@ impl DeletionProgressTracker {
pub async fn purge_garbage(
input_path: String,
mode: PurgeMode,
min_age: Duration,
dry_run: bool,
) -> anyhow::Result<()> {
let list_bytes = tokio::fs::read(&input_path).await?;
@@ -432,7 +506,7 @@ pub async fn purge_garbage(
input_path
);
let remote_client =
let (remote_client, _target) =
init_remote_generic(garbage_list.bucket_config.clone(), garbage_list.node_kind).await?;
assert_eq!(
@@ -459,6 +533,7 @@ pub async fn purge_garbage(
.filter(|i| match (&mode, &i.reason) {
(PurgeMode::DeletedAndMissing, _) => true,
(PurgeMode::DeletedOnly, GarbageReason::DeletedInConsole) => true,
(PurgeMode::DeletedOnly, GarbageReason::KnownBug) => true,
(PurgeMode::DeletedOnly, GarbageReason::MissingInConsole) => false,
});
@@ -487,6 +562,37 @@ pub async fn purge_garbage(
let mut progress_tracker = DeletionProgressTracker::default();
while let Some(result) = get_objects_results.next().await {
let mut object_list = result?;
// Extra safety check: even if a collection of objects is garbage, check max() of modification
// times before purging, so that if we incorrectly marked a live tenant as garbage then we would
// notice that its index has been written recently and would omit deleting it.
if object_list.is_empty() {
// Simplify subsequent code by ensuring list always has at least one item
// Usually, this only occurs if there is parallel deletions racing us, as there is no empty prefixes
continue;
}
let max_mtime = object_list.iter().map(|o| o.last_modified).max().unwrap();
let age = max_mtime.elapsed();
match age {
Err(_) => {
tracing::warn!("Bad last_modified time");
continue;
}
Ok(a) if a < min_age => {
// Failed age check. This doesn't mean we did something wrong: a tenant might really be garbage and recently
// written, but out of an abundance of caution we still don't purge it.
tracing::info!(
"Skipping tenant with young objects {}..{}",
object_list.first().as_ref().unwrap().key,
object_list.last().as_ref().unwrap().key
);
continue;
}
Ok(_) => {
// Passed age check
}
}
objects_to_delete.append(&mut object_list);
if objects_to_delete.len() >= MAX_KEYS_PER_DELETE {
do_delete(

View File

@@ -16,22 +16,26 @@ use std::sync::Arc;
use std::time::Duration;
use anyhow::{anyhow, Context};
use aws_config::retry::{RetryConfigBuilder, RetryMode};
use aws_sdk_s3::config::Region;
use aws_sdk_s3::error::DisplayErrorContext;
use aws_sdk_s3::Client;
use camino::{Utf8Path, Utf8PathBuf};
use clap::ValueEnum;
use futures::{Stream, StreamExt};
use pageserver::tenant::remote_timeline_client::{remote_tenant_path, remote_timeline_path};
use pageserver::tenant::TENANTS_SEGMENT_NAME;
use pageserver_api::shard::TenantShardId;
use remote_storage::{
GenericRemoteStorage, RemotePath, RemoteStorageConfig, RemoteStorageKind, S3Config,
DEFAULT_MAX_KEYS_PER_LIST_RESPONSE, DEFAULT_REMOTE_STORAGE_S3_CONCURRENCY_LIMIT,
GenericRemoteStorage, Listing, ListingMode, RemotePath, RemoteStorageConfig, RemoteStorageKind,
S3Config, DEFAULT_MAX_KEYS_PER_LIST_RESPONSE, DEFAULT_REMOTE_STORAGE_S3_CONCURRENCY_LIMIT,
};
use reqwest::Url;
use serde::{Deserialize, Serialize};
use storage_controller_client::control_api;
use tokio::io::AsyncReadExt;
use tokio_util::sync::CancellationToken;
use tracing::error;
use tracing_appender::non_blocking::WorkerGuard;
use tracing_subscriber::{fmt, prelude::*, EnvFilter};
@@ -253,6 +257,12 @@ pub struct ControllerClientConfig {
pub controller_jwt: String,
}
impl ControllerClientConfig {
pub fn build_client(self) -> control_api::Client {
control_api::Client::new(self.controller_api, Some(self.controller_jwt))
}
}
pub struct ConsoleConfig {
pub token: String,
pub base_url: Url,
@@ -305,8 +315,15 @@ pub fn init_logging(file_name: &str) -> Option<WorkerGuard> {
}
async fn init_s3_client(bucket_region: Region) -> Client {
let mut retry_config_builder = RetryConfigBuilder::new();
retry_config_builder
.set_max_attempts(Some(3))
.set_mode(Some(RetryMode::Adaptive));
let config = aws_config::defaults(aws_config::BehaviorVersion::v2024_03_28())
.region(bucket_region)
.retry_config(retry_config_builder.build())
.load()
.await;
Client::new(&config)
@@ -319,27 +336,35 @@ fn default_prefix_in_bucket(node_kind: NodeKind) -> &'static str {
}
}
fn make_root_target(
bucket_name: String,
prefix_in_bucket: String,
node_kind: NodeKind,
) -> RootTarget {
let s3_target = S3Target {
bucket_name,
prefix_in_bucket,
delimiter: "/".to_string(),
};
match node_kind {
NodeKind::Pageserver => RootTarget::Pageserver(s3_target),
NodeKind::Safekeeper => RootTarget::Safekeeper(s3_target),
}
}
async fn init_remote(
bucket_config: BucketConfig,
node_kind: NodeKind,
) -> anyhow::Result<(Arc<Client>, RootTarget)> {
let bucket_region = Region::new(bucket_config.region);
let delimiter = "/".to_string();
let s3_client = Arc::new(init_s3_client(bucket_region).await);
let default_prefix = default_prefix_in_bucket(node_kind).to_string();
let s3_root = match node_kind {
NodeKind::Pageserver => RootTarget::Pageserver(S3Target {
bucket_name: bucket_config.bucket,
prefix_in_bucket: bucket_config.prefix_in_bucket.unwrap_or(default_prefix),
delimiter,
}),
NodeKind::Safekeeper => RootTarget::Safekeeper(S3Target {
bucket_name: bucket_config.bucket,
prefix_in_bucket: bucket_config.prefix_in_bucket.unwrap_or(default_prefix),
delimiter,
}),
};
let s3_root = make_root_target(
bucket_config.bucket,
bucket_config.prefix_in_bucket.unwrap_or(default_prefix),
node_kind,
);
Ok((s3_client, s3_root))
}
@@ -347,12 +372,12 @@ async fn init_remote(
async fn init_remote_generic(
bucket_config: BucketConfig,
node_kind: NodeKind,
) -> anyhow::Result<GenericRemoteStorage> {
) -> anyhow::Result<(GenericRemoteStorage, RootTarget)> {
let endpoint = env::var("AWS_ENDPOINT_URL").ok();
let default_prefix = default_prefix_in_bucket(node_kind).to_string();
let prefix_in_bucket = Some(bucket_config.prefix_in_bucket.unwrap_or(default_prefix));
let storage = S3Config {
bucket_name: bucket_config.bucket,
bucket_name: bucket_config.bucket.clone(),
bucket_region: bucket_config.region,
prefix_in_bucket,
endpoint,
@@ -366,7 +391,13 @@ async fn init_remote_generic(
storage: RemoteStorageKind::AwsS3(storage),
timeout: RemoteStorageConfig::DEFAULT_TIMEOUT,
};
GenericRemoteStorage::from_config(&storage_config).await
// We already pass the prefix to the remote client above
let prefix_in_root_target = String::new();
let s3_root = make_root_target(bucket_config.bucket, prefix_in_root_target, node_kind);
let client = GenericRemoteStorage::from_config(&storage_config).await?;
Ok((client, s3_root))
}
async fn list_objects_with_retries(
@@ -404,6 +435,84 @@ async fn list_objects_with_retries(
Err(anyhow!("unreachable unless MAX_RETRIES==0"))
}
/// Listing possibly large amounts of keys in a streaming fashion.
fn stream_objects_with_retries<'a>(
storage_client: &'a GenericRemoteStorage,
listing_mode: ListingMode,
s3_target: &'a S3Target,
) -> impl Stream<Item = Result<Listing, anyhow::Error>> + 'a {
async_stream::stream! {
let mut trial = 0;
let cancel = CancellationToken::new();
let prefix_str = &s3_target
.prefix_in_bucket
.strip_prefix("/")
.unwrap_or(&s3_target.prefix_in_bucket);
let prefix = RemotePath::from_string(prefix_str)?;
let mut list_stream =
storage_client.list_streaming(Some(&prefix), listing_mode, None, &cancel);
while let Some(res) = list_stream.next().await {
if let Err(err) = res {
let yield_err = if err.is_permanent() {
true
} else {
let backoff_time = 1 << trial.max(5);
tokio::time::sleep(Duration::from_secs(backoff_time)).await;
trial += 1;
trial == MAX_RETRIES - 1
};
if yield_err {
yield Err(err)
.with_context(|| format!("Failed to list objects {MAX_RETRIES} times"));
break;
}
} else {
trial = 0;
yield res.map_err(anyhow::Error::from);
}
}
}
}
/// If you want to list a bounded amount of prefixes or keys. For larger numbers of keys/prefixes,
/// use [`stream_objects_with_retries`] instead.
async fn list_objects_with_retries_generic(
remote_client: &GenericRemoteStorage,
listing_mode: ListingMode,
s3_target: &S3Target,
) -> anyhow::Result<Listing> {
let cancel = CancellationToken::new();
let prefix_str = &s3_target
.prefix_in_bucket
.strip_prefix("/")
.unwrap_or(&s3_target.prefix_in_bucket);
let prefix = RemotePath::from_string(prefix_str)?;
for trial in 0..MAX_RETRIES {
match remote_client
.list(Some(&prefix), listing_mode, None, &cancel)
.await
{
Ok(response) => return Ok(response),
Err(e) => {
if trial == MAX_RETRIES - 1 {
return Err(e)
.with_context(|| format!("Failed to list objects {MAX_RETRIES} times"));
}
error!(
"list_objects_v2 query failed: bucket_name={}, prefix={}, delimiter={}, error={}",
s3_target.bucket_name,
s3_target.prefix_in_bucket,
s3_target.delimiter,
DisplayErrorContext(e),
);
let backoff_time = 1 << trial.max(5);
tokio::time::sleep(Duration::from_secs(backoff_time)).await;
}
}
}
panic!("MAX_RETRIES is not allowed to be 0");
}
async fn download_object_with_retries(
s3_client: &Client,
bucket_name: &str,

View File

@@ -1,7 +1,8 @@
use anyhow::{anyhow, bail};
use camino::Utf8PathBuf;
use pageserver_api::controller_api::{MetadataHealthUpdateRequest, MetadataHealthUpdateResponse};
use pageserver_api::shard::TenantShardId;
use reqwest::Url;
use reqwest::{Method, Url};
use storage_scrubber::garbage::{find_garbage, purge_garbage, PurgeMode};
use storage_scrubber::pageserver_physical_gc::GcMode;
use storage_scrubber::scan_pageserver_metadata::scan_metadata;
@@ -16,6 +17,11 @@ use storage_scrubber::{
use clap::{Parser, Subcommand};
use utils::id::TenantId;
use utils::{project_build_tag, project_git_version};
project_git_version!(GIT_VERSION);
project_build_tag!(BUILD_TAG);
#[derive(Parser)]
#[command(author, version, about, long_about = None)]
#[command(arg_required_else_help(true))]
@@ -50,6 +56,8 @@ enum Command {
input_path: String,
#[arg(short, long, default_value_t = PurgeMode::DeletedOnly)]
mode: PurgeMode,
#[arg(long = "min-age")]
min_age: humantime::Duration,
},
#[command(verbatim_doc_comment)]
ScanMetadata {
@@ -59,6 +67,8 @@ enum Command {
json: bool,
#[arg(long = "tenant-id", num_args = 0..)]
tenant_ids: Vec<TenantShardId>,
#[arg(long = "post", default_value_t = false)]
post_to_storage_controller: bool,
#[arg(long, default_value = None)]
/// For safekeeper node_kind only, points to db with debug dump
dump_db_connstr: Option<String>,
@@ -96,6 +106,8 @@ enum Command {
async fn main() -> anyhow::Result<()> {
let cli = Cli::parse();
tracing::info!("version: {}, build_tag {}", GIT_VERSION, BUILD_TAG);
let bucket_config = BucketConfig::from_env()?;
let command_log_name = match &cli.command {
@@ -114,11 +126,20 @@ async fn main() -> anyhow::Result<()> {
chrono::Utc::now().format("%Y_%m_%d__%H_%M_%S")
));
let controller_client_conf = cli.controller_api.map(|controller_api| {
ControllerClientConfig {
controller_api,
// Default to no key: this is a convenience when working in a development environment
controller_jwt: cli.controller_jwt.unwrap_or("".to_owned()),
}
});
match cli.command {
Command::ScanMetadata {
json,
tenant_ids,
node_kind,
post_to_storage_controller,
dump_db_connstr,
dump_db_table,
} => {
@@ -157,6 +178,9 @@ async fn main() -> anyhow::Result<()> {
}
Ok(())
} else {
if controller_client_conf.is_none() && post_to_storage_controller {
return Err(anyhow!("Posting pageserver scan health status to storage controller requires `--controller-api` and `--controller-jwt` to run"));
}
match scan_metadata(bucket_config.clone(), tenant_ids).await {
Err(e) => {
tracing::error!("Failed: {e}");
@@ -168,6 +192,21 @@ async fn main() -> anyhow::Result<()> {
} else {
println!("{}", summary.summary_string());
}
if post_to_storage_controller {
if let Some(conf) = controller_client_conf {
let controller_client = conf.build_client();
let body = summary.build_health_update_request();
controller_client
.dispatch::<MetadataHealthUpdateRequest, MetadataHealthUpdateResponse>(
Method::POST,
"control/v1/metadata_health/update".to_string(),
Some(body),
)
.await?;
}
}
if summary.is_fatal() {
Err(anyhow::anyhow!("Fatal scrub errors detected"))
} else if summary.is_empty() {
@@ -196,9 +235,11 @@ async fn main() -> anyhow::Result<()> {
let console_config = ConsoleConfig::from_env()?;
find_garbage(bucket_config, console_config, depth, node_kind, output_path).await
}
Command::PurgeGarbage { input_path, mode } => {
purge_garbage(input_path, mode, !cli.delete).await
}
Command::PurgeGarbage {
input_path,
mode,
min_age,
} => purge_garbage(input_path, mode, min_age.into(), !cli.delete).await,
Command::TenantSnapshot {
tenant_id,
output_path,
@@ -213,14 +254,6 @@ async fn main() -> anyhow::Result<()> {
min_age,
mode,
} => {
let controller_client_conf = cli.controller_api.map(|controller_api| {
ControllerClientConfig {
controller_api,
// Default to no key: this is a convenience when working in a development environment
controller_jwt: cli.controller_jwt.unwrap_or("".to_owned()),
}
});
match (&controller_client_conf, mode) {
(Some(_), _) => {
// Any mode may run when controller API is set

View File

@@ -1,12 +1,41 @@
use anyhow::Context;
use std::str::FromStr;
use anyhow::{anyhow, Context};
use async_stream::{stream, try_stream};
use aws_sdk_s3::{types::ObjectIdentifier, Client};
use futures::StreamExt;
use remote_storage::{GenericRemoteStorage, ListingMode};
use tokio_stream::Stream;
use crate::{list_objects_with_retries, RootTarget, S3Target, TenantShardTimelineId};
use crate::{
list_objects_with_retries, stream_objects_with_retries, RootTarget, S3Target,
TenantShardTimelineId,
};
use pageserver_api::shard::TenantShardId;
use utils::id::{TenantId, TimelineId};
/// Given a remote storage and a target, output a stream of TenantIds discovered via listing prefixes
pub fn stream_tenants_generic<'a>(
remote_client: &'a GenericRemoteStorage,
target: &'a RootTarget,
) -> impl Stream<Item = anyhow::Result<TenantShardId>> + 'a {
try_stream! {
let tenants_target = target.tenants_root();
let mut tenants_stream =
std::pin::pin!(stream_objects_with_retries(remote_client, ListingMode::WithDelimiter, &tenants_target));
while let Some(chunk) = tenants_stream.next().await {
let chunk = chunk?;
let entry_ids = chunk.prefixes.iter()
.map(|prefix| prefix.get_path().file_name().ok_or_else(|| anyhow!("no final component in path '{prefix}'")));
for dir_name_res in entry_ids {
let dir_name = dir_name_res?;
let id = TenantShardId::from_str(dir_name)?;
yield id;
}
}
}
}
/// Given an S3 bucket, output a stream of TenantIds discovered via ListObjectsv2
pub fn stream_tenants<'a>(
s3_client: &'a Client,
@@ -160,6 +189,63 @@ pub async fn stream_tenant_timelines<'a>(
})
}
/// Given a `TenantShardId`, output a stream of the timelines within that tenant, discovered
/// using a listing. The listing is done before the stream is built, so that this
/// function can be used to generate concurrency on a stream using buffer_unordered.
pub async fn stream_tenant_timelines_generic<'a>(
remote_client: &'a GenericRemoteStorage,
target: &'a RootTarget,
tenant: TenantShardId,
) -> anyhow::Result<impl Stream<Item = Result<TenantShardTimelineId, anyhow::Error>> + 'a> {
let mut timeline_ids: Vec<Result<TimelineId, anyhow::Error>> = Vec::new();
let timelines_target = target.timelines_root(&tenant);
let mut objects_stream = std::pin::pin!(stream_objects_with_retries(
remote_client,
ListingMode::WithDelimiter,
&timelines_target
));
loop {
tracing::debug!("Listing in {tenant}");
let fetch_response = match objects_stream.next().await {
None => break,
Some(Err(e)) => {
timeline_ids.push(Err(e));
break;
}
Some(Ok(r)) => r,
};
let new_entry_ids = fetch_response
.prefixes
.iter()
.filter_map(|prefix| -> Option<&str> {
prefix
.get_path()
.as_str()
.strip_prefix(&timelines_target.prefix_in_bucket)?
.strip_suffix('/')
})
.map(|entry_id_str| {
entry_id_str
.parse::<TimelineId>()
.with_context(|| format!("Incorrect entry id str: {entry_id_str}"))
});
for i in new_entry_ids {
timeline_ids.push(i);
}
}
tracing::debug!("Yielding for {}", tenant);
Ok(stream! {
for i in timeline_ids {
let id = i?;
yield Ok(TenantShardTimelineId::new(tenant, id));
}
})
}
pub(crate) fn stream_listing<'a>(
s3_client: &'a Client,
target: &'a S3Target,

View File

@@ -567,13 +567,7 @@ pub async fn pageserver_physical_gc(
}
// Execute cross-shard GC, using the accumulator's full view of all the shards built in the per-shard GC
let Some(controller_client) = controller_client_conf.as_ref().map(|c| {
let ControllerClientConfig {
controller_api,
controller_jwt,
} = c;
control_api::Client::new(controller_api.clone(), Some(controller_jwt.clone()))
}) else {
let Some(controller_client) = controller_client_conf.map(|c| c.build_client()) else {
tracing::info!("Skipping ancestor layer GC, because no `--controller-api` was specified");
return Ok(summary);
};

View File

@@ -9,12 +9,13 @@ use crate::{init_remote, BucketConfig, NodeKind, RootTarget, TenantShardTimeline
use aws_sdk_s3::Client;
use futures_util::{StreamExt, TryStreamExt};
use pageserver::tenant::remote_timeline_client::remote_layer_path;
use pageserver_api::controller_api::MetadataHealthUpdateRequest;
use pageserver_api::shard::TenantShardId;
use serde::Serialize;
use utils::id::TenantId;
use utils::shard::ShardCount;
#[derive(Serialize)]
#[derive(Serialize, Default)]
pub struct MetadataSummary {
tenant_count: usize,
timeline_count: usize,
@@ -23,19 +24,16 @@ pub struct MetadataSummary {
with_warnings: HashSet<TenantShardTimelineId>,
with_orphans: HashSet<TenantShardTimelineId>,
indices_by_version: HashMap<usize, usize>,
#[serde(skip)]
pub(crate) healthy_tenant_shards: HashSet<TenantShardId>,
#[serde(skip)]
pub(crate) unhealthy_tenant_shards: HashSet<TenantShardId>,
}
impl MetadataSummary {
fn new() -> Self {
Self {
tenant_count: 0,
timeline_count: 0,
timeline_shard_count: 0,
with_errors: HashSet::new(),
with_warnings: HashSet::new(),
with_orphans: HashSet::new(),
indices_by_version: HashMap::new(),
}
Self::default()
}
fn update_data(&mut self, data: &S3TimelineBlobData) {
@@ -54,6 +52,13 @@ impl MetadataSummary {
}
fn update_analysis(&mut self, id: &TenantShardTimelineId, analysis: &TimelineAnalysis) {
if analysis.is_healthy() {
self.healthy_tenant_shards.insert(id.tenant_shard_id);
} else {
self.healthy_tenant_shards.remove(&id.tenant_shard_id);
self.unhealthy_tenant_shards.insert(id.tenant_shard_id);
}
if !analysis.errors.is_empty() {
self.with_errors.insert(*id);
}
@@ -101,6 +106,13 @@ Index versions: {version_summary}
pub fn is_empty(&self) -> bool {
self.timeline_shard_count == 0
}
pub fn build_health_update_request(&self) -> MetadataHealthUpdateRequest {
MetadataHealthUpdateRequest {
healthy_tenant_shards: self.healthy_tenant_shards.clone(),
unhealthy_tenant_shards: self.unhealthy_tenant_shards.clone(),
}
}
}
/// Scan the pageserver metadata in an S3 bucket, reporting errors and statistics.

View File

@@ -150,6 +150,7 @@ PAGESERVER_PER_TENANT_METRICS: Tuple[str, ...] = (
"pageserver_pitr_history_size",
"pageserver_layer_bytes",
"pageserver_layer_count",
"pageserver_visible_physical_size",
"pageserver_storage_operations_seconds_count_total",
"pageserver_storage_operations_seconds_sum_total",
"pageserver_evictions_total",

View File

@@ -449,6 +449,7 @@ class TokenScope(str, Enum):
GENERATIONS_API = "generations_api"
SAFEKEEPER_DATA = "safekeeperdata"
TENANT = "tenant"
SCRUBBER = "scrubber"
class NeonEnvBuilder:
@@ -1942,11 +1943,15 @@ class NeonCli(AbstractNeonCli):
remote_ext_config: Optional[str] = None,
pageserver_id: Optional[int] = None,
allow_multiple=False,
basebackup_request_tries: Optional[int] = None,
) -> "subprocess.CompletedProcess[str]":
args = [
"endpoint",
"start",
]
extra_env_vars = {}
if basebackup_request_tries is not None:
extra_env_vars["NEON_COMPUTE_TESTING_BASEBACKUP_TRIES"] = str(basebackup_request_tries)
if remote_ext_config is not None:
args.extend(["--remote-ext-config", remote_ext_config])
@@ -1959,7 +1964,7 @@ class NeonCli(AbstractNeonCli):
if allow_multiple:
args.extend(["--allow-multiple"])
res = self.raw_cli(args)
res = self.raw_cli(args, extra_env_vars)
res.check_returncode()
return res
@@ -2586,6 +2591,51 @@ class NeonStorageController(MetricsGetter, LogUtils):
time.sleep(backoff)
def metadata_health_update(self, healthy: List[TenantShardId], unhealthy: List[TenantShardId]):
body: Dict[str, Any] = {
"healthy_tenant_shards": [str(t) for t in healthy],
"unhealthy_tenant_shards": [str(t) for t in unhealthy],
}
self.request(
"POST",
f"{self.env.storage_controller_api}/control/v1/metadata_health/update",
json=body,
headers=self.headers(TokenScope.SCRUBBER),
)
def metadata_health_list_unhealthy(self):
response = self.request(
"GET",
f"{self.env.storage_controller_api}/control/v1/metadata_health/unhealthy",
headers=self.headers(TokenScope.ADMIN),
)
return response.json()
def metadata_health_list_outdated(self, duration: str):
body: Dict[str, Any] = {"not_scrubbed_for": duration}
response = self.request(
"POST",
f"{self.env.storage_controller_api}/control/v1/metadata_health/outdated",
json=body,
headers=self.headers(TokenScope.ADMIN),
)
return response.json()
def metadata_health_is_healthy(self, outdated_duration: str = "1h") -> bool:
"""Metadata is healthy if there is no unhealthy or outdated health records."""
unhealthy = self.metadata_health_list_unhealthy()
outdated = self.metadata_health_list_outdated(outdated_duration)
healthy = (
len(unhealthy["unhealthy_tenant_shards"]) == 0 and len(outdated["health_records"]) == 0
)
if not healthy:
log.info(f"{unhealthy=}, {outdated=}")
return healthy
def step_down(self):
log.info("Asking storage controller to step down")
response = self.request(
@@ -3766,6 +3816,7 @@ class Endpoint(PgProtocol, LogUtils):
pageserver_id: Optional[int] = None,
safekeepers: Optional[List[int]] = None,
allow_multiple: bool = False,
basebackup_request_tries: Optional[int] = None,
) -> "Endpoint":
"""
Start the Postgres instance.
@@ -3787,6 +3838,7 @@ class Endpoint(PgProtocol, LogUtils):
remote_ext_config=remote_ext_config,
pageserver_id=pageserver_id,
allow_multiple=allow_multiple,
basebackup_request_tries=basebackup_request_tries,
)
self._running.release(1)
@@ -3933,6 +3985,7 @@ class Endpoint(PgProtocol, LogUtils):
remote_ext_config: Optional[str] = None,
pageserver_id: Optional[int] = None,
allow_multiple=False,
basebackup_request_tries: Optional[int] = None,
) -> "Endpoint":
"""
Create an endpoint, apply config, and start Postgres.
@@ -3953,6 +4006,7 @@ class Endpoint(PgProtocol, LogUtils):
remote_ext_config=remote_ext_config,
pageserver_id=pageserver_id,
allow_multiple=allow_multiple,
basebackup_request_tries=basebackup_request_tries,
)
log.info(f"Postgres startup took {time.time() - started_at} seconds")
@@ -3996,6 +4050,7 @@ class EndpointFactory:
config_lines: Optional[List[str]] = None,
remote_ext_config: Optional[str] = None,
pageserver_id: Optional[int] = None,
basebackup_request_tries: Optional[int] = None,
) -> Endpoint:
ep = Endpoint(
self.env,
@@ -4014,6 +4069,7 @@ class EndpointFactory:
lsn=lsn,
remote_ext_config=remote_ext_config,
pageserver_id=pageserver_id,
basebackup_request_tries=basebackup_request_tries,
)
def create(
@@ -4355,10 +4411,11 @@ class StorageScrubber:
assert stdout is not None
return stdout
def scan_metadata(self) -> Any:
stdout = self.scrubber_cli(
["scan-metadata", "--node-kind", "pageserver", "--json"], timeout=30
)
def scan_metadata(self, post_to_storage_controller: bool = False) -> Any:
args = ["scan-metadata", "--node-kind", "pageserver", "--json"]
if post_to_storage_controller:
args.append("--post")
stdout = self.scrubber_cli(args, timeout=30)
try:
return json.loads(stdout)
@@ -4482,6 +4539,13 @@ def test_output_dir(
yield test_dir
# Allure artifacts creation might involve the creation of `.tar.zst` archives,
# which aren't going to be used if Allure results collection is not enabled
# (i.e. --alluredir is not set).
# Skip `allure_attach_from_dir` in this case
if not request.config.getoption("--alluredir"):
return
preserve_database_files = False
for k, v in request.node.user_properties:
# NB: the neon_env_builder fixture uses this fixture (test_output_dir).

View File

@@ -663,6 +663,7 @@ class PageserverHttpClient(requests.Session, MetricsGetter):
force_image_layer_creation=False,
wait_until_uploaded=False,
compact: Optional[bool] = None,
**kwargs,
):
self.is_testing_enabled_or_skip()
query = {}
@@ -680,6 +681,7 @@ class PageserverHttpClient(requests.Session, MetricsGetter):
res = self.put(
f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}/checkpoint",
params=query,
**kwargs,
)
log.info(f"Got checkpoint request response code: {res.status_code}")
self.verbose_error(res)

View File

@@ -0,0 +1,88 @@
"""
Test the logical replication in Neon with the different consumers
"""
import hashlib
import time
import clickhouse_connect
import psycopg2
import pytest
from fixtures.log_helper import log
from fixtures.neon_fixtures import RemotePostgres
from fixtures.utils import wait_until
def query_clickhouse(
client,
query: str,
digest: str,
) -> None:
"""
Run the query on the client
return answer if successful, raise an exception otherwise
"""
log.debug("Query: %s", query)
res = client.query(query)
log.debug(res.result_rows)
m = hashlib.sha1()
m.update(repr(tuple(res.result_rows)).encode())
hash_res = m.hexdigest()
log.debug("Hash: %s", hash_res)
if hash_res == digest:
return
raise ValueError("Hash mismatch")
@pytest.mark.remote_cluster
def test_clickhouse(remote_pg: RemotePostgres):
"""
Test the logical replication having ClickHouse as a client
"""
conn_options = remote_pg.conn_options()
for _ in range(5):
try:
conn = psycopg2.connect(remote_pg.connstr())
except psycopg2.OperationalError as perr:
log.debug(perr)
time.sleep(1)
else:
break
raise TimeoutError
cur = conn.cursor()
cur.execute("DROP TABLE IF EXISTS table1")
cur.execute("CREATE TABLE table1 (id integer primary key, column1 varchar(10));")
cur.execute("INSERT INTO table1 (id, column1) VALUES (1, 'abc'), (2, 'def');")
conn.commit()
client = clickhouse_connect.get_client(host="clickhouse")
client.command("SET allow_experimental_database_materialized_postgresql=1")
client.command(
"CREATE DATABASE db1_postgres ENGINE = "
f"MaterializedPostgreSQL('{conn_options['host']}', "
f"'{conn_options['dbname']}', "
f"'{conn_options['user']}', '{conn_options['password']}') "
"SETTINGS materialized_postgresql_tables_list = 'table1';"
)
wait_until(
120,
0.5,
lambda: query_clickhouse(
client,
"select * from db1_postgres.table1 order by 1",
"ee600d8f7cd05bd0b169fa81f44300a9dd10085a",
),
)
cur.execute("INSERT INTO table1 (id, column1) VALUES (3, 'ghi'), (4, 'jkl');")
conn.commit()
wait_until(
120,
0.5,
lambda: query_clickhouse(
client,
"select * from db1_postgres.table1 order by 1",
"9eba2daaf7e4d7d27ac849525f68b562ab53947d",
),
)
log.debug("Sleeping before final checking if Neon is still alive")
time.sleep(3)
cur.execute("SELECT 1")

View File

@@ -6,21 +6,8 @@ from fixtures.log_helper import log
from fixtures.neon_fixtures import NeonEnvBuilder
@pytest.mark.timeout(10000)
def test_gc_feedback(neon_env_builder: NeonEnvBuilder, zenbenchmark: NeonBenchmarker):
"""
Test that GC is able to collect all old layers even if them are forming
"stairs" and there are not three delta layers since last image layer.
Information about image layers needed to collect old layers should
be propagated by GC to compaction task which should take in in account
when make a decision which new image layers needs to be created.
NB: this test demonstrates the problem. The source tree contained the
`gc_feedback` mechanism for about 9 months, but, there were problems
with it and it wasn't enabled at runtime.
This PR removed the code: https://github.com/neondatabase/neon/pull/6863
"""
def gc_feedback_impl(neon_env_builder: NeonEnvBuilder, zenbenchmark: NeonBenchmarker, mode: str):
assert mode == "normal" or mode == "with_snapshots"
env = neon_env_builder.init_start()
client = env.pageserver.http_client()
@@ -74,6 +61,9 @@ def test_gc_feedback(neon_env_builder: NeonEnvBuilder, zenbenchmark: NeonBenchma
physical_size = client.timeline_detail(tenant_id, timeline_id)["current_physical_size"]
log.info(f"Physical storage size {physical_size}")
if mode == "with_snapshots":
if step == n_steps / 2:
env.neon_cli.create_branch("child")
max_num_of_deltas_above_image = 0
max_total_num_of_deltas = 0
@@ -149,3 +139,37 @@ def test_gc_feedback(neon_env_builder: NeonEnvBuilder, zenbenchmark: NeonBenchma
log.info(f"Writing layer map to {layer_map_path}")
with layer_map_path.open("w") as f:
f.write(json.dumps(client.timeline_layer_map_info(tenant_id, timeline_id)))
@pytest.mark.timeout(10000)
def test_gc_feedback(neon_env_builder: NeonEnvBuilder, zenbenchmark: NeonBenchmarker):
"""
Test that GC is able to collect all old layers even if them are forming
"stairs" and there are not three delta layers since last image layer.
Information about image layers needed to collect old layers should
be propagated by GC to compaction task which should take in in account
when make a decision which new image layers needs to be created.
NB: this test demonstrates the problem. The source tree contained the
`gc_feedback` mechanism for about 9 months, but, there were problems
with it and it wasn't enabled at runtime.
This PR removed the code: https://github.com/neondatabase/neon/pull/6863
And the bottom-most GC-compaction epic resolves the problem.
https://github.com/neondatabase/neon/issues/8002
"""
gc_feedback_impl(neon_env_builder, zenbenchmark, "normal")
@pytest.mark.timeout(10000)
def test_gc_feedback_with_snapshots(
neon_env_builder: NeonEnvBuilder, zenbenchmark: NeonBenchmarker
):
"""
Compared with `test_gc_feedback`, we create a branch without written data (=snapshot) in the middle
of the benchmark, and the bottom-most compaction should collect as much garbage as possible below the GC
horizon. Ideally, there should be images (in an image layer) covering the full range at the branch point,
and images covering the full key range (in a delta layer) at the GC horizon.
"""
gc_feedback_impl(neon_env_builder, zenbenchmark, "with_snapshots")

View File

@@ -18,7 +18,6 @@ from fixtures.pageserver.utils import wait_until_tenant_active
from fixtures.utils import query_scalar
from performance.test_perf_pgbench import get_scales_matrix
from requests import RequestException
from requests.exceptions import RetryError
# Test branch creation
@@ -151,7 +150,7 @@ def test_cannot_create_endpoint_on_non_uploaded_timeline(neon_env_builder: NeonE
env.pageserver.allowed_errors.extend(
[
".*request{method=POST path=/v1/tenant/.*/timeline request_id=.*}: request was dropped before completing.*",
".*page_service_conn_main.*: query handler for 'basebackup .* is not active, state: Loading",
".*page_service_conn_main.*: query handler for 'basebackup .* ERROR: Not found: Timeline",
]
)
ps_http = env.pageserver.http_client()
@@ -176,10 +175,12 @@ def test_cannot_create_endpoint_on_non_uploaded_timeline(neon_env_builder: NeonE
env.neon_cli.map_branch(initial_branch, env.initial_tenant, env.initial_timeline)
with pytest.raises(RuntimeError, match="is not active, state: Loading"):
env.endpoints.create_start(initial_branch, tenant_id=env.initial_tenant)
with pytest.raises(RuntimeError, match="ERROR: Not found: Timeline"):
env.endpoints.create_start(
initial_branch, tenant_id=env.initial_tenant, basebackup_request_tries=2
)
ps_http.configure_failpoints(("before-upload-index-pausable", "off"))
finally:
# FIXME: paused uploads bother shutdown
env.pageserver.stop(immediate=True)
t.join()
@@ -193,8 +194,11 @@ def test_cannot_branch_from_non_uploaded_branch(neon_env_builder: NeonEnvBuilder
env = neon_env_builder.init_configs()
env.start()
env.pageserver.allowed_errors.append(
".*request{method=POST path=/v1/tenant/.*/timeline request_id=.*}: request was dropped before completing.*"
env.pageserver.allowed_errors.extend(
[
".*request{method=POST path=/v1/tenant/.*/timeline request_id=.*}: request was dropped before completing.*",
".*request{method=POST path=/v1/tenant/.*/timeline request_id=.*}: .*Cannot branch off the timeline that's not present in pageserver.*",
]
)
ps_http = env.pageserver.http_client()
@@ -216,7 +220,10 @@ def test_cannot_branch_from_non_uploaded_branch(neon_env_builder: NeonEnvBuilder
branch_id = TimelineId.generate()
with pytest.raises(RetryError, match="too many 503 error responses"):
with pytest.raises(
PageserverApiException,
match="Cannot branch off the timeline that's not present in pageserver",
):
ps_http.timeline_create(
env.pg_version,
env.initial_tenant,
@@ -389,6 +396,11 @@ def test_duplicate_creation(neon_env_builder: NeonEnvBuilder):
repeat_result = ps_http.timeline_create(
env.pg_version, env.initial_tenant, success_timeline, timeout=60
)
# remote_consistent_lsn_visible will be published only after we've
# confirmed the generation, which is not part of what we await during
# timeline creation (uploads). mask it out here to avoid flakyness.
del success_result["remote_consistent_lsn_visible"]
del repeat_result["remote_consistent_lsn_visible"]
assert repeat_result == success_result
finally:
env.pageserver.stop(immediate=True)

View File

@@ -3,18 +3,15 @@ import re
import shutil
import subprocess
import tempfile
from dataclasses import dataclass
from pathlib import Path
from typing import List, Optional
import pytest
import toml
from fixtures.common_types import Lsn
from fixtures.common_types import Lsn, TenantId, TimelineId
from fixtures.log_helper import log
from fixtures.neon_fixtures import (
NeonEnv,
NeonEnvBuilder,
PgBin,
)
from fixtures.neon_fixtures import NeonEnv, NeonEnvBuilder, PgBin
from fixtures.pageserver.http import PageserverApiException
from fixtures.pageserver.utils import (
timeline_delete_wait_completed,
@@ -22,7 +19,8 @@ from fixtures.pageserver.utils import (
wait_for_upload,
)
from fixtures.pg_version import PgVersion
from fixtures.remote_storage import RemoteStorageKind
from fixtures.remote_storage import RemoteStorageKind, S3Storage, s3_storage
from fixtures.workload import Workload
#
# A test suite that help to prevent unintentionally breaking backward or forward compatibility between Neon releases.
@@ -409,3 +407,133 @@ def dump_differs(
break
return differs
@dataclass
class HistoricDataSet:
name: str
tenant_id: TenantId
pg_version: PgVersion
url: str
def __str__(self):
return self.name
HISTORIC_DATA_SETS = [
# From before we enabled image layer compression.
# - IndexPart::LATEST_VERSION 7
# - STORAGE_FORMAT_VERSION 3
HistoricDataSet(
"2024-07-18",
TenantId("17bf64a53509714687664b3a84e9b3ba"),
PgVersion.V16,
"https://neon-github-public-dev.s3.eu-central-1.amazonaws.com/compatibility-data-snapshots/2024-07-18-pgv16.tar.zst",
),
]
@pytest.mark.parametrize("dataset", HISTORIC_DATA_SETS)
@pytest.mark.xdist_group("compatibility")
def test_historic_storage_formats(
neon_env_builder: NeonEnvBuilder,
test_output_dir: Path,
pg_version: PgVersion,
dataset: HistoricDataSet,
):
"""
This test is like test_backward_compatibility, but it looks back further to examples of our storage format from long ago.
"""
ARTIFACT_CACHE_DIR = "./artifact_cache"
import tarfile
from contextlib import closing
import requests
import zstandard
artifact_unpack_path = ARTIFACT_CACHE_DIR / Path("unpacked") / Path(dataset.name)
# Note: we assume that when running across a matrix of PG versions, the matrix includes all the versions needed by
# HISTORIC_DATA_SETS. If we ever remove a PG version from the matrix, then historic datasets built using that version
# will no longer be covered by this test.
if pg_version != dataset.pg_version:
pytest.skip(f"Dataset {dataset} is for different PG version, skipping")
with closing(requests.get(dataset.url, stream=True)) as r:
unzstd = zstandard.ZstdDecompressor()
with unzstd.stream_reader(r.raw) as stream:
with tarfile.open(mode="r|", fileobj=stream) as tf:
tf.extractall(artifact_unpack_path)
neon_env_builder.enable_pageserver_remote_storage(s3_storage())
neon_env_builder.pg_version = dataset.pg_version
env = neon_env_builder.init_configs()
env.start()
assert isinstance(env.pageserver_remote_storage, S3Storage)
# Link artifact data into test's remote storage. We don't want the whole repo dir, just the remote storage part: we are not testing
# compat of local disk data across releases (test_backward_compat does that), we're testing really long-lived data in S3 like layer files and indices.
#
# The code generating the snapshot uses local_fs, but this test uses S3Storage, so we are copying a tree of files into a bucket. We use
# S3Storage so that the scrubber can run (the scrubber doesn't speak local_fs)
artifact_pageserver_path = (
artifact_unpack_path / Path("repo") / Path("local_fs_remote_storage") / Path("pageserver")
)
for root, _dirs, files in os.walk(artifact_pageserver_path):
for file in files:
local_path = os.path.join(root, file)
remote_key = (
env.pageserver_remote_storage.prefix_in_bucket
+ str(local_path)[len(str(artifact_pageserver_path)) :]
)
log.info(f"Uploading {local_path} -> {remote_key}")
env.pageserver_remote_storage.client.upload_file(
local_path, env.pageserver_remote_storage.bucket_name, remote_key
)
# Check the scrubber handles this old data correctly (can read it and doesn't consider it corrupt)
#
# Do this _before_ importing to the pageserver, as that import may start writing immediately
metadata_summary = env.storage_scrubber.scan_metadata()
assert metadata_summary["tenant_count"] >= 1
assert metadata_summary["timeline_count"] >= 1
assert not metadata_summary["with_errors"]
assert not metadata_summary["with_warnings"]
env.neon_cli.import_tenant(dataset.tenant_id)
# Discover timelines
timelines = env.pageserver.http_client().timeline_list(dataset.tenant_id)
# All our artifacts should contain at least one timeline
assert len(timelines) > 0
# TODO: ensure that the snapshots we're importing contain a sensible variety of content, at the very
# least they should include a mixture of deltas and image layers. Preferably they should also
# contain some "exotic" stuff like aux files from logical replication.
# Check we can start an endpoint and read the SQL that the artifact is meant to contain
reference_sql_dump = artifact_unpack_path / Path("dump.sql")
ep = env.endpoints.create_start("main", tenant_id=dataset.tenant_id)
pg_bin = PgBin(test_output_dir, env.pg_distrib_dir, env.pg_version)
pg_bin.run_capture(
["pg_dumpall", f"--dbname={ep.connstr()}", f"--file={test_output_dir / 'dump.sql'}"]
)
assert not dump_differs(
reference_sql_dump,
test_output_dir / "dump.sql",
test_output_dir / "dump.filediff",
)
ep.stop()
# Check we can also do writes to the database
existing_timeline_id = TimelineId(timelines[0]["timeline_id"])
workload = Workload(env, dataset.tenant_id, existing_timeline_id)
workload.init()
workload.write_rows(100)
# Check that compaction works
env.pageserver.http_client().timeline_compact(
dataset.tenant_id, existing_timeline_id, force_image_layer_creation=True
)

View File

@@ -12,7 +12,6 @@ from fixtures.neon_fixtures import (
NeonEnvBuilder,
wait_for_last_flush_lsn,
)
from fixtures.pageserver.common_types import parse_layer_file_name
from fixtures.pageserver.http import PageserverApiException, PageserverHttpClient
from fixtures.pageserver.utils import (
timeline_delete_wait_completed,
@@ -313,6 +312,7 @@ def test_remote_storage_upload_queue_retries(
def churn_while_failpoints_active(result):
overwrite_data_and_wait_for_it_to_arrive_at_pageserver("c")
# this call will wait for the failpoints to be turned off
client.timeline_checkpoint(tenant_id, timeline_id)
client.timeline_compact(tenant_id, timeline_id)
overwrite_data_and_wait_for_it_to_arrive_at_pageserver("d")
@@ -332,8 +332,8 @@ def test_remote_storage_upload_queue_retries(
# Exponential back-off in upload queue, so, gracious timeouts.
wait_until(30, 1, lambda: assert_gt(get_queued_count(file_kind="layer", op_kind="upload"), 0))
wait_until(30, 1, lambda: assert_ge(get_queued_count(file_kind="index", op_kind="upload"), 2))
wait_until(30, 1, lambda: assert_gt(get_queued_count(file_kind="layer", op_kind="delete"), 0))
wait_until(30, 1, lambda: assert_ge(get_queued_count(file_kind="index", op_kind="upload"), 1))
wait_until(30, 1, lambda: assert_eq(get_queued_count(file_kind="layer", op_kind="delete"), 0))
# unblock churn operations
configure_storage_sync_failpoints("off")
@@ -769,11 +769,11 @@ def test_empty_branch_remote_storage_upload_on_restart(neon_env_builder: NeonEnv
create_thread.join()
def test_compaction_waits_for_upload(
def test_paused_upload_stalls_checkpoint(
neon_env_builder: NeonEnvBuilder,
):
"""
This test forces a race between upload and compaction.
This test checks that checkpoints block on uploads to remote storage.
"""
neon_env_builder.enable_pageserver_remote_storage(RemoteStorageKind.LOCAL_FS)
@@ -788,6 +788,10 @@ def test_compaction_waits_for_upload(
}
)
env.pageserver.allowed_errors.append(
f".*PUT.* path=/v1/tenant/{env.initial_tenant}/timeline.* request was dropped before completing"
)
tenant_id = env.initial_tenant
timeline_id = env.initial_timeline
@@ -808,76 +812,9 @@ def test_compaction_waits_for_upload(
endpoint.safe_psql("CREATE TABLE foo AS SELECT x FROM generate_series(1, 10000) g(x)")
wait_for_last_flush_lsn(env, endpoint, tenant_id, timeline_id)
client.timeline_checkpoint(tenant_id, timeline_id)
deltas_at_first = len(client.layer_map_info(tenant_id, timeline_id).delta_layers())
assert (
deltas_at_first == 2
), "are you fixing #5863? just add one more checkpoint after 'CREATE TABLE bar ...' statement."
endpoint.safe_psql("CREATE TABLE bar AS SELECT x FROM generate_series(1, 10000) g(x)")
endpoint.safe_psql("UPDATE foo SET x = 0 WHERE x = 1")
wait_for_last_flush_lsn(env, endpoint, tenant_id, timeline_id)
layers_before_last_checkpoint = client.layer_map_info(tenant_id, timeline_id).historic_by_name()
upload_stuck_layers = layers_before_last_checkpoint - layers_at_creation.historic_by_name()
assert len(upload_stuck_layers) > 0
for name in upload_stuck_layers:
assert env.pageserver.layer_exists(
tenant_id, timeline_id, parse_layer_file_name(name)
), "while uploads are stuck the layers should be present on disk"
# now this will do the L0 => L1 compaction and want to remove
# upload_stuck_layers and the original initdb L0
client.timeline_checkpoint(tenant_id, timeline_id)
# as uploads are paused, the upload_stuck_layers should still be with us
for name in upload_stuck_layers:
assert env.pageserver.layer_exists(
tenant_id, timeline_id, parse_layer_file_name(name)
), "uploads are stuck still over compaction"
compacted_layers = client.layer_map_info(tenant_id, timeline_id).historic_by_name()
overlap = compacted_layers.intersection(upload_stuck_layers)
assert len(overlap) == 0, "none of the L0's should remain after L0 => L1 compaction"
assert (
len(compacted_layers) == 1
), "there should be one L1 after L0 => L1 compaction (without #5863 being fixed)"
def layer_deletes_completed():
m = client.get_metric_value("pageserver_layer_completed_deletes_total")
if m is None:
return 0
return int(m)
# if initdb created an initial delta layer, it might already be gc'd
# because it was uploaded before the failpoint was enabled. however, the
# deletion is not guaranteed to be complete.
assert layer_deletes_completed() <= 1
client.configure_failpoints(("before-upload-layer-pausable", "off"))
# Ensure that this actually terminates
wait_upload_queue_empty(client, tenant_id, timeline_id)
def until_layer_deletes_completed():
deletes = layer_deletes_completed()
log.info(f"layer_deletes: {deletes}")
# ensure that initdb delta layer AND the previously stuck are now deleted
assert deletes >= len(upload_stuck_layers) + 1
wait_until(10, 1, until_layer_deletes_completed)
for name in upload_stuck_layers:
assert not env.pageserver.layer_exists(
tenant_id, timeline_id, parse_layer_file_name(name)
), "l0 should now be removed because of L0 => L1 compaction and completed uploads"
# We should not have hit the error handling path in uploads where a uploaded file is gone
assert not env.pageserver.log_contains(
"File to upload doesn't exist. Likely the file has been deleted and an upload is not required any more."
)
with pytest.raises(ReadTimeout):
client.timeline_checkpoint(tenant_id, timeline_id, timeout=5)
client.configure_failpoints(("before-upload-layer-pausable", "off"))
def wait_upload_queue_empty(

View File

@@ -3,7 +3,7 @@ import threading
import time
from collections import defaultdict
from datetime import datetime, timezone
from typing import Any, Dict, List, Union
from typing import Any, Dict, List, Optional, Set, Tuple, Union
import pytest
from fixtures.common_types import TenantId, TenantShardId, TimelineId
@@ -1785,6 +1785,126 @@ def test_storage_controller_node_deletion(
env.storage_controller.consistency_check()
@pytest.mark.parametrize("shard_count", [None, 2])
def test_storage_controller_metadata_health(
neon_env_builder: NeonEnvBuilder,
shard_count: Optional[int],
):
"""
Create three tenants A, B, C.
Phase 1:
- A: Post healthy status.
- B: Post unhealthy status.
- C: No updates.
Phase 2:
- B: Post healthy status.
- C: Post healthy status.
Phase 3:
- A: Post unhealthy status.
Phase 4:
- Delete tenant A, metadata health status should be deleted as well.
"""
def update_and_query_metadata_health(
env: NeonEnv,
healthy: List[TenantShardId],
unhealthy: List[TenantShardId],
outdated_duration: str = "1h",
) -> Tuple[Set[str], Set[str]]:
"""
Update metadata health. Then list tenant shards with unhealthy and
outdated metadata health status.
"""
if healthy or unhealthy:
env.storage_controller.metadata_health_update(healthy, unhealthy)
result = env.storage_controller.metadata_health_list_unhealthy()
unhealthy_res = set(result["unhealthy_tenant_shards"])
result = env.storage_controller.metadata_health_list_outdated(outdated_duration)
outdated_res = set(record["tenant_shard_id"] for record in result["health_records"])
return unhealthy_res, outdated_res
neon_env_builder.enable_pageserver_remote_storage(s3_storage())
neon_env_builder.num_pageservers = 2
env = neon_env_builder.init_start()
# Mock tenant (`initial_tenant``) with healthy scrubber scan result
tenant_a_shard_ids = (
env.storage_controller.tenant_shard_split(env.initial_tenant, shard_count=shard_count)
if shard_count is not None
else [TenantShardId(env.initial_tenant, 0, 0)]
)
# Mock tenant with unhealthy scrubber scan result
tenant_b, _ = env.neon_cli.create_tenant(shard_count=shard_count)
tenant_b_shard_ids = (
env.storage_controller.tenant_shard_split(tenant_b, shard_count=shard_count)
if shard_count is not None
else [TenantShardId(tenant_b, 0, 0)]
)
# Mock tenant that never gets a health update from scrubber
tenant_c, _ = env.neon_cli.create_tenant(shard_count=shard_count)
tenant_c_shard_ids = (
env.storage_controller.tenant_shard_split(tenant_c, shard_count=shard_count)
if shard_count is not None
else [TenantShardId(tenant_c, 0, 0)]
)
# Metadata health table also updated as tenant shards are created.
assert env.storage_controller.metadata_health_is_healthy()
# post "fake" updates to storage controller db
unhealthy, outdated = update_and_query_metadata_health(
env, healthy=tenant_a_shard_ids, unhealthy=tenant_b_shard_ids
)
log.info(f"After Phase 1: {unhealthy=}, {outdated=}")
assert len(unhealthy) == len(tenant_b_shard_ids)
for t in tenant_b_shard_ids:
assert str(t) in unhealthy
assert len(outdated) == 0
unhealthy, outdated = update_and_query_metadata_health(
env, healthy=tenant_b_shard_ids + tenant_c_shard_ids, unhealthy=[]
)
log.info(f"After Phase 2: {unhealthy=}, {outdated=}")
assert len(unhealthy) == 0
assert len(outdated) == 0
unhealthy, outdated = update_and_query_metadata_health(
env, healthy=[], unhealthy=tenant_a_shard_ids
)
log.info(f"After Phase 3: {unhealthy=}, {outdated=}")
assert len(unhealthy) == len(tenant_a_shard_ids)
for t in tenant_a_shard_ids:
assert str(t) in unhealthy
assert len(outdated) == 0
# Phase 4: Delete A
env.storage_controller.pageserver_api().tenant_delete(env.initial_tenant)
# A's unhealthy metadata health status should be deleted as well.
assert env.storage_controller.metadata_health_is_healthy()
# All shards from B and C are not fresh if set outdated duration to 0 seconds.
unhealthy, outdated = update_and_query_metadata_health(
env, healthy=[], unhealthy=tenant_a_shard_ids, outdated_duration="0s"
)
assert len(unhealthy) == 0
for t in tenant_b_shard_ids + tenant_c_shard_ids:
assert str(t) in outdated
def test_storage_controller_step_down(neon_env_builder: NeonEnvBuilder):
"""
Test the `/control/v1/step_down` storage controller API. Upon receiving such

View File

@@ -13,6 +13,7 @@ from fixtures.neon_fixtures import (
NeonEnv,
NeonEnvBuilder,
)
from fixtures.pg_version import PgVersion
from fixtures.remote_storage import S3Storage, s3_storage
from fixtures.utils import wait_until
from fixtures.workload import Workload
@@ -265,10 +266,85 @@ def test_scrubber_physical_gc_ancestors(
# attach it, to drop any local state, then check it's still readable.
workload.stop()
drop_local_state(env, tenant_id)
workload.validate()
def test_scrubber_physical_gc_timeline_deletion(neon_env_builder: NeonEnvBuilder):
"""
When we delete a timeline after a shard split, the child shards do not directly delete the
layers in the ancestor shards. They rely on the scrubber to clean up.
"""
neon_env_builder.enable_pageserver_remote_storage(s3_storage())
neon_env_builder.num_pageservers = 2
env = neon_env_builder.init_configs()
env.start()
tenant_id = TenantId.generate()
timeline_id = TimelineId.generate()
env.neon_cli.create_tenant(
tenant_id,
timeline_id,
shard_count=None,
conf={
# Small layers and low compaction thresholds, so that when we split we can expect some to
# be dropped by child shards
"checkpoint_distance": f"{1024 * 1024}",
"compaction_threshold": "1",
"compaction_target_size": f"{1024 * 1024}",
"image_creation_threshold": "2",
"image_layer_creation_check_threshold": "0",
# Disable background compaction, we will do it explicitly
"compaction_period": "0s",
# No PITR, so that as soon as child shards generate an image layer, it covers ancestor deltas
# and makes them GC'able
"pitr_interval": "0s",
},
)
# Make sure the original shard has some layers
workload = Workload(env, tenant_id, timeline_id)
workload.init()
workload.write_rows(100)
new_shard_count = 4
shards = env.storage_controller.tenant_shard_split(tenant_id, shard_count=new_shard_count)
# Create a second timeline so that when we delete the first one, child shards still have some content in S3.
#
# This is a limitation of the scrubber: if a shard isn't in S3 (because it has no timelines), then the scrubber
# doesn't know about it, and won't perceive its ancestors as ancestors.
other_timeline_id = TimelineId.generate()
env.storage_controller.pageserver_api().timeline_create(
PgVersion.NOT_SET, tenant_id, other_timeline_id
)
# Write after split so that child shards have some indices in S3
workload.write_rows(100, upload=False)
for shard in shards:
ps = env.get_tenant_pageserver(shard)
log.info(f"Waiting for shard {shard} on pageserver {ps.id}")
ps.http_client().timeline_checkpoint(
shard, timeline_id, compact=False, wait_until_uploaded=True
)
# The timeline still exists in child shards and they reference its layers, so scrubbing
# now shouldn't delete anything.
gc_summary = env.storage_scrubber.pageserver_physical_gc(min_age_secs=0, mode="full")
assert gc_summary["remote_storage_errors"] == 0
assert gc_summary["indices_deleted"] == 0
assert gc_summary["ancestor_layers_deleted"] == 0
# Delete the timeline
env.storage_controller.pageserver_api().timeline_delete(tenant_id, timeline_id)
# Subsequently doing physical GC should clean up the ancestor layers
gc_summary = env.storage_scrubber.pageserver_physical_gc(min_age_secs=0, mode="full")
assert gc_summary["remote_storage_errors"] == 0
assert gc_summary["indices_deleted"] == 0
assert gc_summary["ancestor_layers_deleted"] > 0
def test_scrubber_physical_gc_ancestors_split(neon_env_builder: NeonEnvBuilder):
"""
Exercise ancestor GC while a tenant is partly split: this test ensures that if we have some child shards
@@ -440,10 +516,12 @@ def test_scrubber_scan_pageserver_metadata(
assert len(index.layer_metadata) > 0
it = iter(index.layer_metadata.items())
scan_summary = env.storage_scrubber.scan_metadata()
scan_summary = env.storage_scrubber.scan_metadata(post_to_storage_controller=True)
assert not scan_summary["with_warnings"]
assert not scan_summary["with_errors"]
assert env.storage_controller.metadata_health_is_healthy()
# Delete a layer file that is listed in the index.
layer, metadata = next(it)
log.info(f"Deleting {timeline_path}/{layer.to_str()}")
@@ -453,7 +531,17 @@ def test_scrubber_scan_pageserver_metadata(
)
log.info(f"delete response: {delete_response}")
# Check scan summary. Expect it to be a L0 layer so only emit warnings.
# Check scan summary without posting to storage controller. Expect it to be a L0 layer so only emit warnings.
scan_summary = env.storage_scrubber.scan_metadata()
log.info(f"{pprint.pformat(scan_summary)}")
assert len(scan_summary["with_warnings"]) > 0
assert env.storage_controller.metadata_health_is_healthy()
# Now post to storage controller, expect seeing one unhealthy health record
scan_summary = env.storage_scrubber.scan_metadata(post_to_storage_controller=True)
log.info(f"{pprint.pformat(scan_summary)}")
assert len(scan_summary["with_warnings"]) > 0
unhealthy = env.storage_controller.metadata_health_list_unhealthy()["unhealthy_tenant_shards"]
assert len(unhealthy) == 1 and unhealthy[0] == str(tenant_shard_id)

View File

@@ -277,8 +277,12 @@ files:
help: 'Bytes between received and replayed LSN'
key_labels:
values: [replication_delay_bytes]
# We use a GREATEST call here because this calculation can be negative.
# The calculation is not atomic, meaning after we've gotten the receive
# LSN, the replay LSN may have advanced past the receive LSN we
# are using for the calculation.
query: |
SELECT pg_wal_lsn_diff(pg_last_wal_receive_lsn(), pg_last_wal_replay_lsn()) AS replication_delay_bytes;
SELECT GREATEST(0, pg_wal_lsn_diff(pg_last_wal_receive_lsn(), pg_last_wal_replay_lsn())) AS replication_delay_bytes;
- metric_name: replication_delay_seconds
type: gauge