mirror of
https://github.com/neondatabase/neon.git
synced 2026-05-19 06:00:38 +00:00
Compare commits
19 Commits
proxy-leak
...
fcdm/testc
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
f8ce169e2a | ||
|
|
d21246c8bd | ||
|
|
4825b0fec3 | ||
|
|
a4df3c8488 | ||
|
|
d95b46f3f3 | ||
|
|
85bef9f05d | ||
|
|
e374d6778e | ||
|
|
9ceaf9a986 | ||
|
|
f72fe68626 | ||
|
|
9fabdda2dc | ||
|
|
1c7b06c988 | ||
|
|
52b02d95c8 | ||
|
|
4be58522fb | ||
|
|
d09dad0ea2 | ||
|
|
5775662276 | ||
|
|
bdfc9ca7e9 | ||
|
|
1d8cf5b3a9 | ||
|
|
859f019185 | ||
|
|
da6bdff893 |
1
.github/actionlint.yml
vendored
1
.github/actionlint.yml
vendored
@@ -9,5 +9,6 @@ self-hosted-runner:
|
||||
- us-east-2
|
||||
config-variables:
|
||||
- REMOTE_STORAGE_AZURE_CONTAINER
|
||||
- REMOTE_STORAGE_AZURE_CONTAINER_NEW
|
||||
- REMOTE_STORAGE_AZURE_REGION
|
||||
- SLACK_UPCOMING_RELEASE_CHANNEL_ID
|
||||
|
||||
12
.github/actions/neon-project-create/action.yml
vendored
12
.github/actions/neon-project-create/action.yml
vendored
@@ -14,11 +14,8 @@ inputs:
|
||||
api_host:
|
||||
description: 'Neon API host'
|
||||
default: console-stage.neon.build
|
||||
provisioner:
|
||||
description: 'k8s-pod or k8s-neonvm'
|
||||
default: 'k8s-pod'
|
||||
compute_units:
|
||||
description: '[Min, Max] compute units; Min and Max are used for k8s-neonvm with autoscaling, for k8s-pod values Min and Max should be equal'
|
||||
description: '[Min, Max] compute units'
|
||||
default: '[1, 1]'
|
||||
|
||||
outputs:
|
||||
@@ -37,10 +34,6 @@ runs:
|
||||
# A shell without `set -x` to not to expose password/dsn in logs
|
||||
shell: bash -euo pipefail {0}
|
||||
run: |
|
||||
if [ "${PROVISIONER}" == "k8s-pod" ] && [ "${MIN_CU}" != "${MAX_CU}" ]; then
|
||||
echo >&2 "For k8s-pod provisioner MIN_CU should be equal to MAX_CU"
|
||||
fi
|
||||
|
||||
project=$(curl \
|
||||
"https://${API_HOST}/api/v2/projects" \
|
||||
--fail \
|
||||
@@ -52,7 +45,7 @@ runs:
|
||||
\"name\": \"Created by actions/neon-project-create; GITHUB_RUN_ID=${GITHUB_RUN_ID}\",
|
||||
\"pg_version\": ${POSTGRES_VERSION},
|
||||
\"region_id\": \"${REGION_ID}\",
|
||||
\"provisioner\": \"${PROVISIONER}\",
|
||||
\"provisioner\": \"k8s-neonvm\",
|
||||
\"autoscaling_limit_min_cu\": ${MIN_CU},
|
||||
\"autoscaling_limit_max_cu\": ${MAX_CU},
|
||||
\"settings\": { }
|
||||
@@ -75,6 +68,5 @@ runs:
|
||||
API_KEY: ${{ inputs.api_key }}
|
||||
REGION_ID: ${{ inputs.region_id }}
|
||||
POSTGRES_VERSION: ${{ inputs.postgres_version }}
|
||||
PROVISIONER: ${{ inputs.provisioner }}
|
||||
MIN_CU: ${{ fromJSON(inputs.compute_units)[0] }}
|
||||
MAX_CU: ${{ fromJSON(inputs.compute_units)[1] }}
|
||||
|
||||
14
.github/workflows/_build-and-test-locally.yml
vendored
14
.github/workflows/_build-and-test-locally.yml
vendored
@@ -19,6 +19,10 @@ on:
|
||||
description: 'debug or release'
|
||||
required: true
|
||||
type: string
|
||||
pg-versions:
|
||||
description: 'a json array of postgres versions to run regression tests on'
|
||||
required: true
|
||||
type: string
|
||||
|
||||
defaults:
|
||||
run:
|
||||
@@ -219,9 +223,9 @@ jobs:
|
||||
# Run separate tests for real Azure Blob Storage
|
||||
# XXX: replace region with `eu-central-1`-like region
|
||||
export ENABLE_REAL_AZURE_REMOTE_STORAGE=y
|
||||
export AZURE_STORAGE_ACCOUNT="${{ secrets.AZURE_STORAGE_ACCOUNT_DEV }}"
|
||||
export AZURE_STORAGE_ACCESS_KEY="${{ secrets.AZURE_STORAGE_ACCESS_KEY_DEV }}"
|
||||
export REMOTE_STORAGE_AZURE_CONTAINER="${{ vars.REMOTE_STORAGE_AZURE_CONTAINER }}"
|
||||
export AZURE_STORAGE_ACCOUNT="${{ secrets.AZURE_STORAGE_ACCOUNT_DEV_NEW }}"
|
||||
export AZURE_STORAGE_ACCESS_KEY="${{ secrets.AZURE_STORAGE_ACCESS_KEY_DEV_NEW }}"
|
||||
export REMOTE_STORAGE_AZURE_CONTAINER="${{ vars.REMOTE_STORAGE_AZURE_CONTAINER_NEW }}"
|
||||
export REMOTE_STORAGE_AZURE_REGION="${{ vars.REMOTE_STORAGE_AZURE_REGION }}"
|
||||
${cov_prefix} cargo nextest run $CARGO_FLAGS $CARGO_FEATURES -E 'package(remote_storage)' -E 'test(test_real_azure)'
|
||||
|
||||
@@ -254,7 +258,7 @@ jobs:
|
||||
strategy:
|
||||
fail-fast: false
|
||||
matrix:
|
||||
pg_version: [ v14, v15, v16 ]
|
||||
pg_version: ${{ fromJson(inputs.pg-versions) }}
|
||||
steps:
|
||||
- uses: actions/checkout@v4
|
||||
with:
|
||||
@@ -284,5 +288,5 @@ jobs:
|
||||
- name: Merge and upload coverage data
|
||||
if: |
|
||||
false &&
|
||||
inputs.build-type == 'debug' && matrix.pg_version == 'v14'
|
||||
inputs.build-type == 'debug' && matrix.pg_version == 'v16'
|
||||
uses: ./.github/actions/save-coverage-data
|
||||
|
||||
52
.github/workflows/benchmarking.yml
vendored
52
.github/workflows/benchmarking.yml
vendored
@@ -63,11 +63,9 @@ jobs:
|
||||
- DEFAULT_PG_VERSION: 16
|
||||
PLATFORM: "neon-staging"
|
||||
region_id: ${{ github.event.inputs.region_id || 'aws-us-east-2' }}
|
||||
provisioner: 'k8s-pod'
|
||||
- DEFAULT_PG_VERSION: 16
|
||||
PLATFORM: "azure-staging"
|
||||
region_id: 'azure-eastus2'
|
||||
provisioner: 'k8s-neonvm'
|
||||
env:
|
||||
TEST_PG_BENCH_DURATIONS_MATRIX: "300"
|
||||
TEST_PG_BENCH_SCALES_MATRIX: "10,100"
|
||||
@@ -100,7 +98,6 @@ jobs:
|
||||
region_id: ${{ matrix.region_id }}
|
||||
postgres_version: ${{ env.DEFAULT_PG_VERSION }}
|
||||
api_key: ${{ secrets.NEON_STAGING_API_KEY }}
|
||||
provisioner: ${{ matrix.provisioner }}
|
||||
|
||||
- name: Run benchmark
|
||||
uses: ./.github/actions/run-python-test-set
|
||||
@@ -216,11 +213,11 @@ jobs:
|
||||
# Create matrices for the benchmarking jobs, so we run benchmarks on rds only once a week (on Saturday)
|
||||
#
|
||||
# Available platforms:
|
||||
# - neon-captest-new: Freshly created project (1 CU)
|
||||
# - neon-captest-freetier: Use freetier-sized compute (0.25 CU)
|
||||
# - neonvm-captest-new: Freshly created project (1 CU)
|
||||
# - neonvm-captest-freetier: Use freetier-sized compute (0.25 CU)
|
||||
# - neonvm-captest-azure-new: Freshly created project (1 CU) in azure region
|
||||
# - neonvm-captest-azure-freetier: Use freetier-sized compute (0.25 CU) in azure region
|
||||
# - neon-captest-reuse: Reusing existing project
|
||||
# - neonvm-captest-reuse: Reusing existing project
|
||||
# - rds-aurora: Aurora Postgres Serverless v2 with autoscaling from 0.5 to 2 ACUs
|
||||
# - rds-postgres: RDS Postgres db.m5.large instance (2 vCPU, 8 GiB) with gp3 EBS storage
|
||||
env:
|
||||
@@ -245,18 +242,16 @@ jobs:
|
||||
"'"$region_id_default"'"
|
||||
],
|
||||
"platform": [
|
||||
"neon-captest-new",
|
||||
"neon-captest-reuse",
|
||||
"neonvm-captest-new",
|
||||
"neonvm-captest-reuse",
|
||||
"neonvm-captest-new"
|
||||
],
|
||||
"db_size": [ "10gb" ],
|
||||
"include": [{ "pg_version": 16, "region_id": "'"$region_id_default"'", "platform": "neon-captest-freetier", "db_size": "3gb" },
|
||||
{ "pg_version": 16, "region_id": "'"$region_id_default"'", "platform": "neon-captest-new", "db_size": "50gb" },
|
||||
{ "pg_version": 16, "region_id": "'"$region_id_default"'", "platform": "neonvm-captest-freetier", "db_size": "3gb" },
|
||||
"include": [{ "pg_version": 16, "region_id": "'"$region_id_default"'", "platform": "neonvm-captest-freetier", "db_size": "3gb" },
|
||||
{ "pg_version": 16, "region_id": "'"$region_id_default"'", "platform": "neonvm-captest-new", "db_size": "50gb" },
|
||||
{ "pg_version": 16, "region_id": "azure-eastus2", "platform": "neonvm-azure-captest-freetier", "db_size": "3gb" },
|
||||
{ "pg_version": 16, "region_id": "azure-eastus2", "platform": "neonvm-azure-captest-new", "db_size": "10gb" },
|
||||
{ "pg_version": 16, "region_id": "azure-eastus2", "platform": "neonvm-azure-captest-new", "db_size": "50gb" },
|
||||
{ "pg_version": 16, "region_id": "azure-eastus2", "platform": "neonvm-azure-captest-freetier", "db_size": "3gb" },
|
||||
{ "pg_version": 16, "region_id": "azure-eastus2", "platform": "neonvm-azure-captest-new", "db_size": "10gb" },
|
||||
{ "pg_version": 16, "region_id": "azure-eastus2", "platform": "neonvm-azure-captest-new", "db_size": "50gb" },
|
||||
{ "pg_version": 16, "region_id": "'"$region_id_default"'", "platform": "neonvm-captest-sharding-reuse", "db_size": "50gb" }]
|
||||
}'
|
||||
|
||||
@@ -271,7 +266,7 @@ jobs:
|
||||
run: |
|
||||
matrix='{
|
||||
"platform": [
|
||||
"neon-captest-reuse"
|
||||
"neonvm-captest-reuse"
|
||||
]
|
||||
}'
|
||||
|
||||
@@ -287,7 +282,7 @@ jobs:
|
||||
run: |
|
||||
matrix='{
|
||||
"platform": [
|
||||
"neon-captest-reuse"
|
||||
"neonvm-captest-reuse"
|
||||
],
|
||||
"scale": [
|
||||
"10"
|
||||
@@ -338,7 +333,7 @@ jobs:
|
||||
prefix: latest
|
||||
|
||||
- name: Create Neon Project
|
||||
if: contains(fromJson('["neon-captest-new", "neon-captest-freetier", "neonvm-captest-new", "neonvm-captest-freetier", "neonvm-azure-captest-freetier", "neonvm-azure-captest-new"]'), matrix.platform)
|
||||
if: contains(fromJson('["neonvm-captest-new", "neonvm-captest-freetier", "neonvm-azure-captest-freetier", "neonvm-azure-captest-new"]'), matrix.platform)
|
||||
id: create-neon-project
|
||||
uses: ./.github/actions/neon-project-create
|
||||
with:
|
||||
@@ -346,19 +341,18 @@ jobs:
|
||||
postgres_version: ${{ env.DEFAULT_PG_VERSION }}
|
||||
api_key: ${{ secrets.NEON_STAGING_API_KEY }}
|
||||
compute_units: ${{ (contains(matrix.platform, 'captest-freetier') && '[0.25, 0.25]') || '[1, 1]' }}
|
||||
provisioner: ${{ (contains(matrix.platform, 'neonvm-') && 'k8s-neonvm') || 'k8s-pod' }}
|
||||
|
||||
- name: Set up Connection String
|
||||
id: set-up-connstr
|
||||
run: |
|
||||
case "${PLATFORM}" in
|
||||
neon-captest-reuse)
|
||||
neonvm-captest-reuse)
|
||||
CONNSTR=${{ secrets.BENCHMARK_CAPTEST_CONNSTR }}
|
||||
;;
|
||||
neonvm-captest-sharding-reuse)
|
||||
CONNSTR=${{ secrets.BENCHMARK_CAPTEST_SHARDING_CONNSTR }}
|
||||
;;
|
||||
neon-captest-new | neon-captest-freetier | neonvm-captest-new | neonvm-captest-freetier | neonvm-azure-captest-new | neonvm-azure-captest-freetier)
|
||||
neonvm-captest-new | neonvm-captest-freetier | neonvm-azure-captest-new | neonvm-azure-captest-freetier)
|
||||
CONNSTR=${{ steps.create-neon-project.outputs.dsn }}
|
||||
;;
|
||||
rds-aurora)
|
||||
@@ -442,9 +436,9 @@ jobs:
|
||||
fail-fast: false
|
||||
matrix:
|
||||
include:
|
||||
- PLATFORM: "neon-captest-pgvector"
|
||||
- PLATFORM: "neonvm-captest-pgvector"
|
||||
- PLATFORM: "azure-captest-pgvector"
|
||||
|
||||
|
||||
env:
|
||||
TEST_PG_BENCH_DURATIONS_MATRIX: "15m"
|
||||
TEST_PG_BENCH_SCALES_MATRIX: "1"
|
||||
@@ -486,7 +480,7 @@ jobs:
|
||||
id: set-up-connstr
|
||||
run: |
|
||||
case "${PLATFORM}" in
|
||||
neon-captest-pgvector)
|
||||
neonvm-captest-pgvector)
|
||||
CONNSTR=${{ secrets.BENCHMARK_PGVECTOR_CONNSTR }}
|
||||
;;
|
||||
azure-captest-pgvector)
|
||||
@@ -585,7 +579,7 @@ jobs:
|
||||
id: set-up-connstr
|
||||
run: |
|
||||
case "${PLATFORM}" in
|
||||
neon-captest-reuse)
|
||||
neonvm-captest-reuse)
|
||||
CONNSTR=${{ secrets.BENCHMARK_CAPTEST_CLICKBENCH_10M_CONNSTR }}
|
||||
;;
|
||||
rds-aurora)
|
||||
@@ -595,7 +589,7 @@ jobs:
|
||||
CONNSTR=${{ secrets.BENCHMARK_RDS_POSTGRES_CLICKBENCH_10M_CONNSTR }}
|
||||
;;
|
||||
*)
|
||||
echo >&2 "Unknown PLATFORM=${PLATFORM}. Allowed only 'neon-captest-reuse', 'rds-aurora', or 'rds-postgres'"
|
||||
echo >&2 "Unknown PLATFORM=${PLATFORM}. Allowed only 'neonvm-captest-reuse', 'rds-aurora', or 'rds-postgres'"
|
||||
exit 1
|
||||
;;
|
||||
esac
|
||||
@@ -672,7 +666,7 @@ jobs:
|
||||
- name: Get Connstring Secret Name
|
||||
run: |
|
||||
case "${PLATFORM}" in
|
||||
neon-captest-reuse)
|
||||
neonvm-captest-reuse)
|
||||
ENV_PLATFORM=CAPTEST_TPCH
|
||||
;;
|
||||
rds-aurora)
|
||||
@@ -682,7 +676,7 @@ jobs:
|
||||
ENV_PLATFORM=RDS_AURORA_TPCH
|
||||
;;
|
||||
*)
|
||||
echo >&2 "Unknown PLATFORM=${PLATFORM}. Allowed only 'neon-captest-reuse', 'rds-aurora', or 'rds-postgres'"
|
||||
echo >&2 "Unknown PLATFORM=${PLATFORM}. Allowed only 'neonvm-captest-reuse', 'rds-aurora', or 'rds-postgres'"
|
||||
exit 1
|
||||
;;
|
||||
esac
|
||||
@@ -759,7 +753,7 @@ jobs:
|
||||
id: set-up-connstr
|
||||
run: |
|
||||
case "${PLATFORM}" in
|
||||
neon-captest-reuse)
|
||||
neonvm-captest-reuse)
|
||||
CONNSTR=${{ secrets.BENCHMARK_USER_EXAMPLE_CAPTEST_CONNSTR }}
|
||||
;;
|
||||
rds-aurora)
|
||||
@@ -769,7 +763,7 @@ jobs:
|
||||
CONNSTR=${{ secrets.BENCHMARK_USER_EXAMPLE_RDS_POSTGRES_CONNSTR }}
|
||||
;;
|
||||
*)
|
||||
echo >&2 "Unknown PLATFORM=${PLATFORM}. Allowed only 'neon-captest-reuse', 'rds-aurora', or 'rds-postgres'"
|
||||
echo >&2 "Unknown PLATFORM=${PLATFORM}. Allowed only 'neonvm-captest-reuse', 'rds-aurora', or 'rds-postgres'"
|
||||
exit 1
|
||||
;;
|
||||
esac
|
||||
|
||||
30
.github/workflows/build_and_test.yml
vendored
30
.github/workflows/build_and_test.yml
vendored
@@ -203,7 +203,8 @@ jobs:
|
||||
fail-fast: false
|
||||
matrix:
|
||||
arch: [ x64 ]
|
||||
build-type: [ debug, release ]
|
||||
# Do not build or run tests in debug for release branches
|
||||
build-type: ${{ fromJson((startsWith(github.ref_name, 'release' && github.event_name == 'push')) && '["release"]' || '["debug", "release"]') }}
|
||||
include:
|
||||
- build-type: release
|
||||
arch: arm64
|
||||
@@ -213,6 +214,8 @@ jobs:
|
||||
build-tools-image: ${{ needs.build-build-tools-image.outputs.image }}
|
||||
build-tag: ${{ needs.tag.outputs.build-tag }}
|
||||
build-type: ${{ matrix.build-type }}
|
||||
# Run tests on all Postgres versions in release builds and only on the latest version in debug builds
|
||||
pg-versions: ${{ matrix.build-type == 'release' && '["v14", "v15", "v16"]' || '["v16"]' }}
|
||||
secrets: inherit
|
||||
|
||||
# Keep `benchmarks` job outside of `build-and-test-locally` workflow to make job failures non-blocking
|
||||
@@ -833,6 +836,9 @@ jobs:
|
||||
rm -rf .docker-custom
|
||||
|
||||
promote-images:
|
||||
permissions:
|
||||
contents: read # This is required for actions/checkout
|
||||
id-token: write # This is required for Azure Login to work.
|
||||
needs: [ check-permissions, tag, test-images, vm-compute-node-image ]
|
||||
runs-on: ubuntu-22.04
|
||||
|
||||
@@ -859,6 +865,28 @@ jobs:
|
||||
neondatabase/vm-compute-node-${version}:${{ needs.tag.outputs.build-tag }}
|
||||
done
|
||||
|
||||
- name: Azure login
|
||||
if: github.ref_name == 'main'
|
||||
uses: azure/login@6c251865b4e6290e7b78be643ea2d005bc51f69a # @v2.1.1
|
||||
with:
|
||||
client-id: ${{ secrets.AZURE_DEV_CLIENT_ID }}
|
||||
tenant-id: ${{ secrets.AZURE_TENANT_ID }}
|
||||
subscription-id: ${{ secrets.AZURE_SUBSCRIPTION_ID }}
|
||||
|
||||
- name: Login to ACR
|
||||
if: github.ref_name == 'main'
|
||||
run: |
|
||||
az acr login --name=neoneastus2
|
||||
|
||||
- name: Copy docker images to ACR-dev
|
||||
if: github.ref_name == 'main'
|
||||
run: |
|
||||
for image in neon compute-tools {vm-,}compute-node-{v14,v15,v16}; do
|
||||
docker buildx imagetools create \
|
||||
-t neoneastus2.azurecr.io/neondatabase/${image}:${{ needs.tag.outputs.build-tag }} \
|
||||
neondatabase/${image}:${{ needs.tag.outputs.build-tag }}
|
||||
done
|
||||
|
||||
- name: Add latest tag to images
|
||||
if: github.ref_name == 'main'
|
||||
run: |
|
||||
|
||||
72
.github/workflows/pg-clients.yml
vendored
72
.github/workflows/pg-clients.yml
vendored
@@ -13,6 +13,7 @@ on:
|
||||
paths:
|
||||
- '.github/workflows/pg-clients.yml'
|
||||
- 'test_runner/pg_clients/**'
|
||||
- 'test_runner/logical_repl/**'
|
||||
- 'poetry.lock'
|
||||
workflow_dispatch:
|
||||
|
||||
@@ -49,6 +50,77 @@ jobs:
|
||||
image-tag: ${{ needs.check-build-tools-image.outputs.image-tag }}
|
||||
secrets: inherit
|
||||
|
||||
test-logical-replication:
|
||||
needs: [ build-build-tools-image ]
|
||||
runs-on: ubuntu-22.04
|
||||
|
||||
container:
|
||||
image: ${{ needs.build-build-tools-image.outputs.image }}
|
||||
credentials:
|
||||
username: ${{ secrets.NEON_DOCKERHUB_USERNAME }}
|
||||
password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }}
|
||||
options: --init --user root
|
||||
services:
|
||||
clickhouse:
|
||||
image: clickhouse/clickhouse-server:24.6.3.64
|
||||
ports:
|
||||
- 9000:9000
|
||||
- 8123:8123
|
||||
|
||||
steps:
|
||||
- uses: actions/checkout@v4
|
||||
|
||||
- name: Download Neon artifact
|
||||
uses: ./.github/actions/download
|
||||
with:
|
||||
name: neon-${{ runner.os }}-${{ runner.arch }}-release-artifact
|
||||
path: /tmp/neon/
|
||||
prefix: latest
|
||||
|
||||
- name: Create Neon Project
|
||||
id: create-neon-project
|
||||
uses: ./.github/actions/neon-project-create
|
||||
with:
|
||||
api_key: ${{ secrets.NEON_STAGING_API_KEY }}
|
||||
postgres_version: ${{ env.DEFAULT_PG_VERSION }}
|
||||
|
||||
- name: Run tests
|
||||
uses: ./.github/actions/run-python-test-set
|
||||
with:
|
||||
build_type: remote
|
||||
test_selection: logical_repl
|
||||
run_in_parallel: false
|
||||
extra_params: -m remote_cluster
|
||||
pg_version: ${{ env.DEFAULT_PG_VERSION }}
|
||||
env:
|
||||
BENCHMARK_CONNSTR: ${{ steps.create-neon-project.outputs.dsn }}
|
||||
|
||||
- name: Delete Neon Project
|
||||
if: always()
|
||||
uses: ./.github/actions/neon-project-delete
|
||||
with:
|
||||
project_id: ${{ steps.create-neon-project.outputs.project_id }}
|
||||
api_key: ${{ secrets.NEON_STAGING_API_KEY }}
|
||||
|
||||
- name: Create Allure report
|
||||
if: ${{ !cancelled() }}
|
||||
id: create-allure-report
|
||||
uses: ./.github/actions/allure-report-generate
|
||||
with:
|
||||
store-test-results-into-db: true
|
||||
env:
|
||||
REGRESS_TEST_RESULT_CONNSTR_NEW: ${{ secrets.REGRESS_TEST_RESULT_CONNSTR_NEW }}
|
||||
|
||||
- name: Post to a Slack channel
|
||||
if: github.event.schedule && failure()
|
||||
uses: slackapi/slack-github-action@v1
|
||||
with:
|
||||
channel-id: "C06KHQVQ7U3" # on-call-qa-staging-stream
|
||||
slack-message: |
|
||||
Testing the logical replication: <${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}|${{ job.status }}> (<${{ steps.create-allure-report.outputs.report-url }}|test report>)
|
||||
env:
|
||||
SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }}
|
||||
|
||||
test-postgres-client-libs:
|
||||
needs: [ build-build-tools-image ]
|
||||
runs-on: ubuntu-22.04
|
||||
|
||||
2
Cargo.lock
generated
2
Cargo.lock
generated
@@ -1672,6 +1672,7 @@ checksum = "62d6dcd069e7b5fe49a302411f759d4cf1cf2c27fe798ef46fb8baefc053dd2b"
|
||||
dependencies = [
|
||||
"bitflags 2.4.1",
|
||||
"byteorder",
|
||||
"chrono",
|
||||
"diesel_derives",
|
||||
"itoa",
|
||||
"pq-sys",
|
||||
@@ -5718,6 +5719,7 @@ dependencies = [
|
||||
"aws-config",
|
||||
"bytes",
|
||||
"camino",
|
||||
"chrono",
|
||||
"clap",
|
||||
"control_plane",
|
||||
"diesel",
|
||||
|
||||
@@ -313,3 +313,5 @@ To get more familiar with this aspect, refer to:
|
||||
- Read [CONTRIBUTING.md](/CONTRIBUTING.md) to learn about project code style and practices.
|
||||
- To get familiar with a source tree layout, use [sourcetree.md](/docs/sourcetree.md).
|
||||
- To learn more about PostgreSQL internals, check http://www.interdb.jp/pg/index.html
|
||||
|
||||
.
|
||||
|
||||
@@ -514,7 +514,6 @@ impl LocalEnv {
|
||||
#[derive(serde::Serialize, serde::Deserialize)]
|
||||
// (allow unknown fields, unlike PageServerConf)
|
||||
struct PageserverConfigTomlSubset {
|
||||
id: NodeId,
|
||||
listen_pg_addr: String,
|
||||
listen_http_addr: String,
|
||||
pg_auth_type: AuthType,
|
||||
@@ -526,18 +525,30 @@ impl LocalEnv {
|
||||
.with_context(|| format!("read {:?}", config_toml_path))?,
|
||||
)
|
||||
.context("parse pageserver.toml")?;
|
||||
let identity_toml_path = dentry.path().join("identity.toml");
|
||||
#[derive(serde::Serialize, serde::Deserialize)]
|
||||
struct IdentityTomlSubset {
|
||||
id: NodeId,
|
||||
}
|
||||
let identity_toml: IdentityTomlSubset = toml_edit::de::from_str(
|
||||
&std::fs::read_to_string(&identity_toml_path)
|
||||
.with_context(|| format!("read {:?}", identity_toml_path))?,
|
||||
)
|
||||
.context("parse identity.toml")?;
|
||||
let PageserverConfigTomlSubset {
|
||||
id: config_toml_id,
|
||||
listen_pg_addr,
|
||||
listen_http_addr,
|
||||
pg_auth_type,
|
||||
http_auth_type,
|
||||
} = config_toml;
|
||||
let IdentityTomlSubset {
|
||||
id: identity_toml_id,
|
||||
} = identity_toml;
|
||||
let conf = PageServerConf {
|
||||
id: {
|
||||
anyhow::ensure!(
|
||||
config_toml_id == id,
|
||||
"id mismatch: config_toml.id={config_toml_id} id={id}",
|
||||
identity_toml_id == id,
|
||||
"id mismatch: identity.toml:id={identity_toml_id} pageserver_(.*) id={id}",
|
||||
);
|
||||
id
|
||||
},
|
||||
|
||||
@@ -127,10 +127,13 @@ impl PageServerNode {
|
||||
}
|
||||
|
||||
// Apply the user-provided overrides
|
||||
overrides.push(
|
||||
toml_edit::ser::to_string_pretty(&conf)
|
||||
.expect("we deserialized this from toml earlier"),
|
||||
);
|
||||
overrides.push({
|
||||
let mut doc =
|
||||
toml_edit::ser::to_document(&conf).expect("we deserialized this from toml earlier");
|
||||
// `id` is written out to `identity.toml` instead of `pageserver.toml`
|
||||
doc.remove("id").expect("it's part of the struct");
|
||||
doc.to_string()
|
||||
});
|
||||
|
||||
// Turn `overrides` into a toml document.
|
||||
// TODO: above code is legacy code, it should be refactored to use toml_edit directly.
|
||||
|
||||
@@ -1,5 +1,6 @@
|
||||
use std::collections::HashSet;
|
||||
use std::str::FromStr;
|
||||
use std::time::Instant;
|
||||
use std::time::{Duration, Instant};
|
||||
|
||||
/// Request/response types for the storage controller
|
||||
/// API (`/control/v1` prefix). Implemented by the server
|
||||
@@ -294,6 +295,42 @@ pub enum PlacementPolicy {
|
||||
#[derive(Serialize, Deserialize, Debug)]
|
||||
pub struct TenantShardMigrateResponse {}
|
||||
|
||||
/// Metadata health record posted from scrubber.
|
||||
#[derive(Serialize, Deserialize, Debug)]
|
||||
pub struct MetadataHealthRecord {
|
||||
pub tenant_shard_id: TenantShardId,
|
||||
pub healthy: bool,
|
||||
pub last_scrubbed_at: chrono::DateTime<chrono::Utc>,
|
||||
}
|
||||
|
||||
#[derive(Serialize, Deserialize, Debug)]
|
||||
pub struct MetadataHealthUpdateRequest {
|
||||
pub healthy_tenant_shards: HashSet<TenantShardId>,
|
||||
pub unhealthy_tenant_shards: HashSet<TenantShardId>,
|
||||
}
|
||||
|
||||
#[derive(Serialize, Deserialize, Debug)]
|
||||
pub struct MetadataHealthUpdateResponse {}
|
||||
|
||||
#[derive(Serialize, Deserialize, Debug)]
|
||||
|
||||
pub struct MetadataHealthListUnhealthyResponse {
|
||||
pub unhealthy_tenant_shards: Vec<TenantShardId>,
|
||||
}
|
||||
|
||||
#[derive(Serialize, Deserialize, Debug)]
|
||||
|
||||
pub struct MetadataHealthListOutdatedRequest {
|
||||
#[serde(with = "humantime_serde")]
|
||||
pub not_scrubbed_for: Duration,
|
||||
}
|
||||
|
||||
#[derive(Serialize, Deserialize, Debug)]
|
||||
|
||||
pub struct MetadataHealthListOutdatedResponse {
|
||||
pub health_records: Vec<MetadataHealthRecord>,
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod test {
|
||||
use super::*;
|
||||
|
||||
@@ -355,7 +355,8 @@ impl RemoteStorage for AzureBlobStorage {
|
||||
.blobs()
|
||||
.map(|k| ListingObject{
|
||||
key: self.name_to_relative_path(&k.name),
|
||||
last_modified: k.properties.last_modified.into()
|
||||
last_modified: k.properties.last_modified.into(),
|
||||
size: k.properties.content_length,
|
||||
}
|
||||
);
|
||||
|
||||
|
||||
@@ -153,6 +153,7 @@ pub enum ListingMode {
|
||||
pub struct ListingObject {
|
||||
pub key: RemotePath,
|
||||
pub last_modified: SystemTime,
|
||||
pub size: u64,
|
||||
}
|
||||
|
||||
#[derive(Default)]
|
||||
@@ -194,7 +195,7 @@ pub trait RemoteStorage: Send + Sync + 'static {
|
||||
mode: ListingMode,
|
||||
max_keys: Option<NonZeroU32>,
|
||||
cancel: &CancellationToken,
|
||||
) -> impl Stream<Item = Result<Listing, DownloadError>>;
|
||||
) -> impl Stream<Item = Result<Listing, DownloadError>> + Send;
|
||||
|
||||
async fn list(
|
||||
&self,
|
||||
@@ -351,10 +352,10 @@ impl<Other: RemoteStorage> GenericRemoteStorage<Arc<Other>> {
|
||||
mode: ListingMode,
|
||||
max_keys: Option<NonZeroU32>,
|
||||
cancel: &'a CancellationToken,
|
||||
) -> impl Stream<Item = Result<Listing, DownloadError>> + 'a {
|
||||
) -> impl Stream<Item = Result<Listing, DownloadError>> + 'a + Send {
|
||||
match self {
|
||||
Self::LocalFs(s) => Box::pin(s.list_streaming(prefix, mode, max_keys, cancel))
|
||||
as Pin<Box<dyn Stream<Item = Result<Listing, DownloadError>>>>,
|
||||
as Pin<Box<dyn Stream<Item = Result<Listing, DownloadError>> + Send>>,
|
||||
Self::AwsS3(s) => Box::pin(s.list_streaming(prefix, mode, max_keys, cancel)),
|
||||
Self::AzureBlob(s) => Box::pin(s.list_streaming(prefix, mode, max_keys, cancel)),
|
||||
Self::Unreliable(s) => Box::pin(s.list_streaming(prefix, mode, max_keys, cancel)),
|
||||
|
||||
@@ -368,6 +368,7 @@ impl RemoteStorage for LocalFs {
|
||||
key: k.clone(),
|
||||
// LocalFs is just for testing, so just specify a dummy time
|
||||
last_modified: SystemTime::now(),
|
||||
size: 0,
|
||||
})
|
||||
}
|
||||
})
|
||||
@@ -411,6 +412,7 @@ impl RemoteStorage for LocalFs {
|
||||
key: RemotePath::from_string(&relative_key).unwrap(),
|
||||
// LocalFs is just for testing
|
||||
last_modified: SystemTime::now(),
|
||||
size: 0,
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
@@ -565,9 +565,12 @@ impl RemoteStorage for S3Bucket {
|
||||
}
|
||||
};
|
||||
|
||||
let size = object.size.unwrap_or(0) as u64;
|
||||
|
||||
result.keys.push(ListingObject{
|
||||
key,
|
||||
last_modified
|
||||
last_modified,
|
||||
size,
|
||||
});
|
||||
if let Some(mut mk) = max_keys {
|
||||
assert!(mk > 0);
|
||||
|
||||
@@ -114,7 +114,7 @@ impl RemoteStorage for UnreliableWrapper {
|
||||
mode: ListingMode,
|
||||
max_keys: Option<NonZeroU32>,
|
||||
cancel: &CancellationToken,
|
||||
) -> impl Stream<Item = Result<Listing, DownloadError>> {
|
||||
) -> impl Stream<Item = Result<Listing, DownloadError>> + Send {
|
||||
async_stream::stream! {
|
||||
self.attempt(RemoteOp::ListPrefixes(prefix.cloned()))
|
||||
.map_err(DownloadError::Other)?;
|
||||
|
||||
@@ -18,20 +18,20 @@ const STORAGE_TOKEN_ALGORITHM: Algorithm = Algorithm::EdDSA;
|
||||
#[derive(Debug, Serialize, Deserialize, Clone, Copy, PartialEq)]
|
||||
#[serde(rename_all = "lowercase")]
|
||||
pub enum Scope {
|
||||
// Provides access to all data for a specific tenant (specified in `struct Claims` below)
|
||||
/// Provides access to all data for a specific tenant (specified in `struct Claims` below)
|
||||
// TODO: join these two?
|
||||
Tenant,
|
||||
// Provides blanket access to all tenants on the pageserver plus pageserver-wide APIs.
|
||||
// Should only be used e.g. for status check/tenant creation/list.
|
||||
/// Provides blanket access to all tenants on the pageserver plus pageserver-wide APIs.
|
||||
/// Should only be used e.g. for status check/tenant creation/list.
|
||||
PageServerApi,
|
||||
// Provides blanket access to all data on the safekeeper plus safekeeper-wide APIs.
|
||||
// Should only be used e.g. for status check.
|
||||
// Currently also used for connection from any pageserver to any safekeeper.
|
||||
/// Provides blanket access to all data on the safekeeper plus safekeeper-wide APIs.
|
||||
/// Should only be used e.g. for status check.
|
||||
/// Currently also used for connection from any pageserver to any safekeeper.
|
||||
SafekeeperData,
|
||||
// The scope used by pageservers in upcalls to storage controller and cloud control plane
|
||||
/// The scope used by pageservers in upcalls to storage controller and cloud control plane
|
||||
#[serde(rename = "generations_api")]
|
||||
GenerationsApi,
|
||||
// Allows access to control plane managment API and some storage controller endpoints.
|
||||
/// Allows access to control plane managment API and some storage controller endpoints.
|
||||
Admin,
|
||||
|
||||
/// Allows access to storage controller APIs used by the scrubber, to interrogate the state
|
||||
|
||||
@@ -129,6 +129,7 @@ fn main() -> anyhow::Result<()> {
|
||||
info!(?conf.virtual_file_io_engine, "starting with virtual_file IO engine");
|
||||
info!(?conf.get_impl, "starting with get page implementation");
|
||||
info!(?conf.get_vectored_impl, "starting with vectored get page implementation");
|
||||
info!(?conf.compact_level0_phase1_value_access, "starting with setting for compact_level0_phase1_value_access");
|
||||
|
||||
let tenants_path = conf.tenants_path();
|
||||
if !tenants_path.exists() {
|
||||
|
||||
@@ -29,6 +29,7 @@ use utils::{
|
||||
logging::LogFormat,
|
||||
};
|
||||
|
||||
use crate::tenant::timeline::compaction::CompactL0Phase1ValueAccess;
|
||||
use crate::tenant::vectored_blob_io::MaxVectoredReadBytes;
|
||||
use crate::tenant::{config::TenantConfOpt, timeline::GetImpl};
|
||||
use crate::tenant::{TENANTS_SEGMENT_NAME, TIMELINES_SEGMENT_NAME};
|
||||
@@ -295,6 +296,10 @@ pub struct PageServerConf {
|
||||
pub ephemeral_bytes_per_memory_kb: usize,
|
||||
|
||||
pub l0_flush: L0FlushConfig,
|
||||
|
||||
/// This flag is temporary and will be removed after gradual rollout.
|
||||
/// See <https://github.com/neondatabase/neon/issues/8184>.
|
||||
pub compact_level0_phase1_value_access: CompactL0Phase1ValueAccess,
|
||||
}
|
||||
|
||||
/// We do not want to store this in a PageServerConf because the latter may be logged
|
||||
@@ -356,8 +361,6 @@ struct PageServerConfigBuilder {
|
||||
auth_validation_public_key_path: BuilderValue<Option<Utf8PathBuf>>,
|
||||
remote_storage_config: BuilderValue<Option<RemoteStorageConfig>>,
|
||||
|
||||
id: BuilderValue<NodeId>,
|
||||
|
||||
broker_endpoint: BuilderValue<Uri>,
|
||||
broker_keepalive_interval: BuilderValue<Duration>,
|
||||
|
||||
@@ -403,14 +406,13 @@ struct PageServerConfigBuilder {
|
||||
ephemeral_bytes_per_memory_kb: BuilderValue<usize>,
|
||||
|
||||
l0_flush: BuilderValue<L0FlushConfig>,
|
||||
|
||||
compact_level0_phase1_value_access: BuilderValue<CompactL0Phase1ValueAccess>,
|
||||
}
|
||||
|
||||
impl PageServerConfigBuilder {
|
||||
fn new(node_id: NodeId) -> Self {
|
||||
let mut this = Self::default();
|
||||
this.id(node_id);
|
||||
|
||||
this
|
||||
fn new() -> Self {
|
||||
Self::default()
|
||||
}
|
||||
|
||||
#[inline(always)]
|
||||
@@ -438,7 +440,6 @@ impl PageServerConfigBuilder {
|
||||
pg_auth_type: Set(AuthType::Trust),
|
||||
auth_validation_public_key_path: Set(None),
|
||||
remote_storage_config: Set(None),
|
||||
id: NotSet,
|
||||
broker_endpoint: Set(storage_broker::DEFAULT_ENDPOINT
|
||||
.parse()
|
||||
.expect("failed to parse default broker endpoint")),
|
||||
@@ -496,6 +497,7 @@ impl PageServerConfigBuilder {
|
||||
validate_vectored_get: Set(DEFAULT_VALIDATE_VECTORED_GET),
|
||||
ephemeral_bytes_per_memory_kb: Set(DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB),
|
||||
l0_flush: Set(L0FlushConfig::default()),
|
||||
compact_level0_phase1_value_access: Set(CompactL0Phase1ValueAccess::default()),
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -568,10 +570,6 @@ impl PageServerConfigBuilder {
|
||||
self.broker_keepalive_interval = BuilderValue::Set(broker_keepalive_interval)
|
||||
}
|
||||
|
||||
pub fn id(&mut self, node_id: NodeId) {
|
||||
self.id = BuilderValue::Set(node_id)
|
||||
}
|
||||
|
||||
pub fn log_format(&mut self, log_format: LogFormat) {
|
||||
self.log_format = BuilderValue::Set(log_format)
|
||||
}
|
||||
@@ -683,7 +681,11 @@ impl PageServerConfigBuilder {
|
||||
self.l0_flush = BuilderValue::Set(value);
|
||||
}
|
||||
|
||||
pub fn build(self) -> anyhow::Result<PageServerConf> {
|
||||
pub fn compact_level0_phase1_value_access(&mut self, value: CompactL0Phase1ValueAccess) {
|
||||
self.compact_level0_phase1_value_access = BuilderValue::Set(value);
|
||||
}
|
||||
|
||||
pub fn build(self, id: NodeId) -> anyhow::Result<PageServerConf> {
|
||||
let default = Self::default_values();
|
||||
|
||||
macro_rules! conf {
|
||||
@@ -716,7 +718,6 @@ impl PageServerConfigBuilder {
|
||||
pg_auth_type,
|
||||
auth_validation_public_key_path,
|
||||
remote_storage_config,
|
||||
id,
|
||||
broker_endpoint,
|
||||
broker_keepalive_interval,
|
||||
log_format,
|
||||
@@ -741,9 +742,11 @@ impl PageServerConfigBuilder {
|
||||
image_compression,
|
||||
ephemeral_bytes_per_memory_kb,
|
||||
l0_flush,
|
||||
compact_level0_phase1_value_access,
|
||||
}
|
||||
CUSTOM LOGIC
|
||||
{
|
||||
id: id,
|
||||
// TenantConf is handled separately
|
||||
default_tenant_conf: TenantConf::default(),
|
||||
concurrent_tenant_warmup: ConfigurableSemaphore::new({
|
||||
@@ -893,7 +896,7 @@ impl PageServerConf {
|
||||
toml: &Document,
|
||||
workdir: &Utf8Path,
|
||||
) -> anyhow::Result<Self> {
|
||||
let mut builder = PageServerConfigBuilder::new(node_id);
|
||||
let mut builder = PageServerConfigBuilder::new();
|
||||
builder.workdir(workdir.to_owned());
|
||||
|
||||
let mut t_conf = TenantConfOpt::default();
|
||||
@@ -924,8 +927,6 @@ impl PageServerConf {
|
||||
"tenant_config" => {
|
||||
t_conf = TenantConfOpt::try_from(item.to_owned()).context(format!("failed to parse: '{key}'"))?;
|
||||
}
|
||||
"id" => {}, // Ignoring `id` field in pageserver.toml - using identity.toml as the source of truth
|
||||
// Logging is not set up yet, so we can't do it.
|
||||
"broker_endpoint" => builder.broker_endpoint(parse_toml_string(key, item)?.parse().context("failed to parse broker endpoint")?),
|
||||
"broker_keepalive_interval" => builder.broker_keepalive_interval(parse_toml_duration(key, item)?),
|
||||
"log_format" => builder.log_format(
|
||||
@@ -1014,11 +1015,14 @@ impl PageServerConf {
|
||||
"l0_flush" => {
|
||||
builder.l0_flush(utils::toml_edit_ext::deserialize_item(item).context("l0_flush")?)
|
||||
}
|
||||
"compact_level0_phase1_value_access" => {
|
||||
builder.compact_level0_phase1_value_access(utils::toml_edit_ext::deserialize_item(item).context("compact_level0_phase1_value_access")?)
|
||||
}
|
||||
_ => bail!("unrecognized pageserver option '{key}'"),
|
||||
}
|
||||
}
|
||||
|
||||
let mut conf = builder.build().context("invalid config")?;
|
||||
let mut conf = builder.build(node_id).context("invalid config")?;
|
||||
|
||||
if conf.http_auth_type == AuthType::NeonJWT || conf.pg_auth_type == AuthType::NeonJWT {
|
||||
let auth_validation_public_key_path = conf
|
||||
@@ -1098,6 +1102,7 @@ impl PageServerConf {
|
||||
validate_vectored_get: defaults::DEFAULT_VALIDATE_VECTORED_GET,
|
||||
ephemeral_bytes_per_memory_kb: defaults::DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB,
|
||||
l0_flush: L0FlushConfig::default(),
|
||||
compact_level0_phase1_value_access: CompactL0Phase1ValueAccess::default(),
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -1255,7 +1260,6 @@ max_file_descriptors = 333
|
||||
|
||||
# initial superuser role name to use when creating a new tenant
|
||||
initial_superuser_name = 'zzzz'
|
||||
id = 10
|
||||
|
||||
metric_collection_interval = '222 s'
|
||||
metric_collection_endpoint = 'http://localhost:80/metrics'
|
||||
@@ -1272,9 +1276,8 @@ background_task_maximum_delay = '334 s'
|
||||
let (workdir, pg_distrib_dir) = prepare_fs(&tempdir)?;
|
||||
let broker_endpoint = storage_broker::DEFAULT_ENDPOINT;
|
||||
// we have to create dummy values to overcome the validation errors
|
||||
let config_string = format!(
|
||||
"pg_distrib_dir='{pg_distrib_dir}'\nid=10\nbroker_endpoint = '{broker_endpoint}'",
|
||||
);
|
||||
let config_string =
|
||||
format!("pg_distrib_dir='{pg_distrib_dir}'\nbroker_endpoint = '{broker_endpoint}'",);
|
||||
let toml = config_string.parse()?;
|
||||
|
||||
let parsed_config = PageServerConf::parse_and_validate(NodeId(10), &toml, &workdir)
|
||||
@@ -1341,6 +1344,7 @@ background_task_maximum_delay = '334 s'
|
||||
image_compression: defaults::DEFAULT_IMAGE_COMPRESSION,
|
||||
ephemeral_bytes_per_memory_kb: defaults::DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB,
|
||||
l0_flush: L0FlushConfig::default(),
|
||||
compact_level0_phase1_value_access: CompactL0Phase1ValueAccess::default(),
|
||||
},
|
||||
"Correct defaults should be used when no config values are provided"
|
||||
);
|
||||
@@ -1415,6 +1419,7 @@ background_task_maximum_delay = '334 s'
|
||||
image_compression: defaults::DEFAULT_IMAGE_COMPRESSION,
|
||||
ephemeral_bytes_per_memory_kb: defaults::DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB,
|
||||
l0_flush: L0FlushConfig::default(),
|
||||
compact_level0_phase1_value_access: CompactL0Phase1ValueAccess::default(),
|
||||
},
|
||||
"Should be able to parse all basic config values correctly"
|
||||
);
|
||||
@@ -1579,7 +1584,6 @@ broker_endpoint = '{broker_endpoint}'
|
||||
r#"pg_distrib_dir = "{pg_distrib_dir}"
|
||||
metric_collection_endpoint = "http://sample.url"
|
||||
metric_collection_interval = "10min"
|
||||
id = 222
|
||||
|
||||
[disk_usage_based_eviction]
|
||||
max_usage_pct = 80
|
||||
@@ -1649,7 +1653,6 @@ threshold = "20m"
|
||||
r#"pg_distrib_dir = "{pg_distrib_dir}"
|
||||
metric_collection_endpoint = "http://sample.url"
|
||||
metric_collection_interval = "10min"
|
||||
id = 222
|
||||
|
||||
[tenant_config]
|
||||
evictions_low_residence_duration_metric_threshold = "20m"
|
||||
|
||||
@@ -2129,14 +2129,24 @@ async fn secondary_download_handler(
|
||||
|
||||
let timeout = wait.unwrap_or(Duration::MAX);
|
||||
|
||||
let status = match tokio::time::timeout(
|
||||
let result = tokio::time::timeout(
|
||||
timeout,
|
||||
state.secondary_controller.download_tenant(tenant_shard_id),
|
||||
)
|
||||
.await
|
||||
{
|
||||
// Download job ran to completion.
|
||||
Ok(Ok(())) => StatusCode::OK,
|
||||
.await;
|
||||
|
||||
let progress = secondary_tenant.progress.lock().unwrap().clone();
|
||||
|
||||
let status = match result {
|
||||
Ok(Ok(())) => {
|
||||
if progress.layers_downloaded >= progress.layers_total {
|
||||
// Download job ran to completion
|
||||
StatusCode::OK
|
||||
} else {
|
||||
// Download dropped out without errors because it ran out of time budget
|
||||
StatusCode::ACCEPTED
|
||||
}
|
||||
}
|
||||
// Edge case: downloads aren't usually fallible: things like a missing heatmap are considered
|
||||
// okay. We could get an error here in the unlikely edge case that the tenant
|
||||
// was detached between our check above and executing the download job.
|
||||
@@ -2146,8 +2156,6 @@ async fn secondary_download_handler(
|
||||
Err(_) => StatusCode::ACCEPTED,
|
||||
};
|
||||
|
||||
let progress = secondary_tenant.progress.lock().unwrap().clone();
|
||||
|
||||
json_response(status, progress)
|
||||
}
|
||||
|
||||
|
||||
@@ -2,13 +2,23 @@ use std::{num::NonZeroUsize, sync::Arc};
|
||||
|
||||
use crate::tenant::ephemeral_file;
|
||||
|
||||
#[derive(Default, Debug, PartialEq, Eq, Clone, serde::Deserialize)]
|
||||
#[derive(Debug, PartialEq, Eq, Clone, serde::Deserialize)]
|
||||
#[serde(tag = "mode", rename_all = "kebab-case", deny_unknown_fields)]
|
||||
pub enum L0FlushConfig {
|
||||
#[default]
|
||||
PageCached,
|
||||
#[serde(rename_all = "snake_case")]
|
||||
Direct { max_concurrency: NonZeroUsize },
|
||||
Direct {
|
||||
max_concurrency: NonZeroUsize,
|
||||
},
|
||||
}
|
||||
|
||||
impl Default for L0FlushConfig {
|
||||
fn default() -> Self {
|
||||
Self::Direct {
|
||||
// TODO: using num_cpus results in different peak memory usage on different instance types.
|
||||
max_concurrency: NonZeroUsize::new(usize::max(1, num_cpus::get())).unwrap(),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Clone)]
|
||||
|
||||
@@ -613,7 +613,23 @@ pub(crate) static CIRCUIT_BREAKERS_UNBROKEN: Lazy<IntCounter> = Lazy::new(|| {
|
||||
pub(crate) static COMPRESSION_IMAGE_INPUT_BYTES: Lazy<IntCounter> = Lazy::new(|| {
|
||||
register_int_counter!(
|
||||
"pageserver_compression_image_in_bytes_total",
|
||||
"Size of uncompressed data written into image layers"
|
||||
"Size of data written into image layers before compression"
|
||||
)
|
||||
.expect("failed to define a metric")
|
||||
});
|
||||
|
||||
pub(crate) static COMPRESSION_IMAGE_INPUT_BYTES_CONSIDERED: Lazy<IntCounter> = Lazy::new(|| {
|
||||
register_int_counter!(
|
||||
"pageserver_compression_image_in_bytes_considered",
|
||||
"Size of potentially compressible data written into image layers before compression"
|
||||
)
|
||||
.expect("failed to define a metric")
|
||||
});
|
||||
|
||||
pub(crate) static COMPRESSION_IMAGE_INPUT_BYTES_CHOSEN: Lazy<IntCounter> = Lazy::new(|| {
|
||||
register_int_counter!(
|
||||
"pageserver_compression_image_in_bytes_chosen",
|
||||
"Size of data whose compressed form was written into image layers"
|
||||
)
|
||||
.expect("failed to define a metric")
|
||||
});
|
||||
|
||||
@@ -8,8 +8,7 @@ use std::time::Duration;
|
||||
pub use pageserver_api::key::{Key, KEY_SIZE};
|
||||
|
||||
/// A 'value' stored for a one Key.
|
||||
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||
#[cfg_attr(test, derive(PartialEq))]
|
||||
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
|
||||
pub enum Value {
|
||||
/// An Image value contains a full copy of the value
|
||||
Image(Bytes),
|
||||
|
||||
@@ -28,6 +28,12 @@ use crate::virtual_file::VirtualFile;
|
||||
use std::cmp::min;
|
||||
use std::io::{Error, ErrorKind};
|
||||
|
||||
#[derive(Copy, Clone, Debug)]
|
||||
pub struct CompressionInfo {
|
||||
pub written_compressed: bool,
|
||||
pub compressed_size: Option<usize>,
|
||||
}
|
||||
|
||||
impl<'a> BlockCursor<'a> {
|
||||
/// Read a blob into a new buffer.
|
||||
pub async fn read_blob(
|
||||
@@ -273,8 +279,10 @@ impl<const BUFFERED: bool> BlobWriter<BUFFERED> {
|
||||
srcbuf: B,
|
||||
ctx: &RequestContext,
|
||||
) -> (B::Buf, Result<u64, Error>) {
|
||||
self.write_blob_maybe_compressed(srcbuf, ctx, ImageCompressionAlgorithm::Disabled)
|
||||
.await
|
||||
let (buf, res) = self
|
||||
.write_blob_maybe_compressed(srcbuf, ctx, ImageCompressionAlgorithm::Disabled)
|
||||
.await;
|
||||
(buf, res.map(|(off, _compression_info)| off))
|
||||
}
|
||||
|
||||
/// Write a blob of data. Returns the offset that it was written to,
|
||||
@@ -284,8 +292,12 @@ impl<const BUFFERED: bool> BlobWriter<BUFFERED> {
|
||||
srcbuf: B,
|
||||
ctx: &RequestContext,
|
||||
algorithm: ImageCompressionAlgorithm,
|
||||
) -> (B::Buf, Result<u64, Error>) {
|
||||
) -> (B::Buf, Result<(u64, CompressionInfo), Error>) {
|
||||
let offset = self.offset;
|
||||
let mut compression_info = CompressionInfo {
|
||||
written_compressed: false,
|
||||
compressed_size: None,
|
||||
};
|
||||
|
||||
let len = srcbuf.bytes_init();
|
||||
|
||||
@@ -328,7 +340,9 @@ impl<const BUFFERED: bool> BlobWriter<BUFFERED> {
|
||||
encoder.write_all(&slice[..]).await.unwrap();
|
||||
encoder.shutdown().await.unwrap();
|
||||
let compressed = encoder.into_inner();
|
||||
compression_info.compressed_size = Some(compressed.len());
|
||||
if compressed.len() < len {
|
||||
compression_info.written_compressed = true;
|
||||
let compressed_len = compressed.len();
|
||||
compressed_buf = Some(compressed);
|
||||
(BYTE_ZSTD, compressed_len, slice.into_inner())
|
||||
@@ -359,7 +373,7 @@ impl<const BUFFERED: bool> BlobWriter<BUFFERED> {
|
||||
} else {
|
||||
self.write_all(srcbuf, ctx).await
|
||||
};
|
||||
(srcbuf, res.map(|_| offset))
|
||||
(srcbuf, res.map(|_| (offset, compression_info)))
|
||||
}
|
||||
}
|
||||
|
||||
@@ -416,12 +430,14 @@ pub(crate) mod tests {
|
||||
let mut wtr = BlobWriter::<BUFFERED>::new(file, 0);
|
||||
for blob in blobs.iter() {
|
||||
let (_, res) = if compression {
|
||||
wtr.write_blob_maybe_compressed(
|
||||
blob.clone(),
|
||||
ctx,
|
||||
ImageCompressionAlgorithm::Zstd { level: Some(1) },
|
||||
)
|
||||
.await
|
||||
let res = wtr
|
||||
.write_blob_maybe_compressed(
|
||||
blob.clone(),
|
||||
ctx,
|
||||
ImageCompressionAlgorithm::Zstd { level: Some(1) },
|
||||
)
|
||||
.await;
|
||||
(res.0, res.1.map(|(off, _)| off))
|
||||
} else {
|
||||
wtr.write_blob(blob.clone(), ctx).await
|
||||
};
|
||||
|
||||
@@ -296,13 +296,19 @@ where
|
||||
let mut stack = Vec::new();
|
||||
stack.push((self.root_blk, None));
|
||||
let block_cursor = self.reader.block_cursor();
|
||||
let mut node_buf = [0_u8; PAGE_SZ];
|
||||
while let Some((node_blknum, opt_iter)) = stack.pop() {
|
||||
// Locate the node.
|
||||
let node_buf = block_cursor
|
||||
// Read the node, through the PS PageCache, into local variable `node_buf`.
|
||||
// We could keep the page cache read guard alive, but, at the time of writing,
|
||||
// we run quite small PS PageCache s => can't risk running out of
|
||||
// PageCache space because this stream isn't consumed fast enough.
|
||||
let page_read_guard = block_cursor
|
||||
.read_blk(self.start_blk + node_blknum, ctx)
|
||||
.await?;
|
||||
node_buf.copy_from_slice(page_read_guard.as_ref());
|
||||
drop(page_read_guard); // drop page cache read guard early
|
||||
|
||||
let node = OnDiskNode::deparse(node_buf.as_ref())?;
|
||||
let node = OnDiskNode::deparse(&node_buf)?;
|
||||
let prefix_len = node.prefix_len as usize;
|
||||
let suffix_len = node.suffix_len as usize;
|
||||
|
||||
@@ -345,6 +351,7 @@ where
|
||||
Either::Left(idx..node.num_children.into())
|
||||
};
|
||||
|
||||
|
||||
// idx points to the first match now. Keep going from there
|
||||
while let Some(idx) = iter.next() {
|
||||
let key_off = idx * suffix_len;
|
||||
|
||||
@@ -1384,34 +1384,32 @@ impl TenantManager {
|
||||
tenant_shard_id: TenantShardId,
|
||||
) -> Result<(), DeleteTenantError> {
|
||||
let remote_path = remote_tenant_path(&tenant_shard_id);
|
||||
let keys = match self
|
||||
.resources
|
||||
.remote_storage
|
||||
.list(
|
||||
Some(&remote_path),
|
||||
remote_storage::ListingMode::NoDelimiter,
|
||||
None,
|
||||
&self.cancel,
|
||||
)
|
||||
.await
|
||||
{
|
||||
Ok(listing) => listing.keys,
|
||||
Err(remote_storage::DownloadError::Cancelled) => {
|
||||
return Err(DeleteTenantError::Cancelled)
|
||||
}
|
||||
Err(remote_storage::DownloadError::NotFound) => return Ok(()),
|
||||
Err(other) => return Err(DeleteTenantError::Other(anyhow::anyhow!(other))),
|
||||
};
|
||||
let mut keys_stream = self.resources.remote_storage.list_streaming(
|
||||
Some(&remote_path),
|
||||
remote_storage::ListingMode::NoDelimiter,
|
||||
None,
|
||||
&self.cancel,
|
||||
);
|
||||
while let Some(chunk) = keys_stream.next().await {
|
||||
let keys = match chunk {
|
||||
Ok(listing) => listing.keys,
|
||||
Err(remote_storage::DownloadError::Cancelled) => {
|
||||
return Err(DeleteTenantError::Cancelled)
|
||||
}
|
||||
Err(remote_storage::DownloadError::NotFound) => return Ok(()),
|
||||
Err(other) => return Err(DeleteTenantError::Other(anyhow::anyhow!(other))),
|
||||
};
|
||||
|
||||
if keys.is_empty() {
|
||||
tracing::info!("Remote storage already deleted");
|
||||
} else {
|
||||
tracing::info!("Deleting {} keys from remote storage", keys.len());
|
||||
let keys = keys.into_iter().map(|o| o.key).collect::<Vec<_>>();
|
||||
self.resources
|
||||
.remote_storage
|
||||
.delete_objects(&keys, &self.cancel)
|
||||
.await?;
|
||||
if keys.is_empty() {
|
||||
tracing::info!("Remote storage already deleted");
|
||||
} else {
|
||||
tracing::info!("Deleting {} keys from remote storage", keys.len());
|
||||
let keys = keys.into_iter().map(|o| o.key).collect::<Vec<_>>();
|
||||
self.resources
|
||||
.remote_storage
|
||||
.delete_objects(&keys, &self.cancel)
|
||||
.await?;
|
||||
}
|
||||
}
|
||||
|
||||
Ok(())
|
||||
|
||||
@@ -467,7 +467,7 @@ impl DeltaLayerWriterInner {
|
||||
.write_blob_maybe_compressed(val, ctx, compression)
|
||||
.await;
|
||||
let off = match res {
|
||||
Ok(off) => off,
|
||||
Ok((off, _)) => off,
|
||||
Err(e) => return (val, Err(anyhow::anyhow!(e))),
|
||||
};
|
||||
|
||||
|
||||
@@ -734,6 +734,14 @@ struct ImageLayerWriterInner {
|
||||
// Total uncompressed bytes passed into put_image
|
||||
uncompressed_bytes: u64,
|
||||
|
||||
// Like `uncompressed_bytes`,
|
||||
// but only of images we might consider for compression
|
||||
uncompressed_bytes_eligible: u64,
|
||||
|
||||
// Like `uncompressed_bytes`, but only of images
|
||||
// where we have chosen their compressed form
|
||||
uncompressed_bytes_chosen: u64,
|
||||
|
||||
blob_writer: BlobWriter<false>,
|
||||
tree: DiskBtreeBuilder<BlockBuf, KEY_SIZE>,
|
||||
}
|
||||
@@ -790,6 +798,8 @@ impl ImageLayerWriterInner {
|
||||
tree: tree_builder,
|
||||
blob_writer,
|
||||
uncompressed_bytes: 0,
|
||||
uncompressed_bytes_eligible: 0,
|
||||
uncompressed_bytes_chosen: 0,
|
||||
};
|
||||
|
||||
Ok(writer)
|
||||
@@ -808,13 +818,22 @@ impl ImageLayerWriterInner {
|
||||
) -> anyhow::Result<()> {
|
||||
ensure!(self.key_range.contains(&key));
|
||||
let compression = self.conf.image_compression;
|
||||
self.uncompressed_bytes += img.len() as u64;
|
||||
let uncompressed_len = img.len() as u64;
|
||||
self.uncompressed_bytes += uncompressed_len;
|
||||
let (_img, res) = self
|
||||
.blob_writer
|
||||
.write_blob_maybe_compressed(img, ctx, compression)
|
||||
.await;
|
||||
// TODO: re-use the buffer for `img` further upstack
|
||||
let off = res?;
|
||||
let (off, compression_info) = res?;
|
||||
if compression_info.compressed_size.is_some() {
|
||||
// The image has been considered for compression at least
|
||||
self.uncompressed_bytes_eligible += uncompressed_len;
|
||||
}
|
||||
if compression_info.written_compressed {
|
||||
// The image has been compressed
|
||||
self.uncompressed_bytes_chosen += uncompressed_len;
|
||||
}
|
||||
|
||||
let mut keybuf: [u8; KEY_SIZE] = [0u8; KEY_SIZE];
|
||||
key.write_to_byte_slice(&mut keybuf);
|
||||
@@ -837,6 +856,9 @@ impl ImageLayerWriterInner {
|
||||
// Calculate compression ratio
|
||||
let compressed_size = self.blob_writer.size() - PAGE_SZ as u64; // Subtract PAGE_SZ for header
|
||||
crate::metrics::COMPRESSION_IMAGE_INPUT_BYTES.inc_by(self.uncompressed_bytes);
|
||||
crate::metrics::COMPRESSION_IMAGE_INPUT_BYTES_CONSIDERED
|
||||
.inc_by(self.uncompressed_bytes_eligible);
|
||||
crate::metrics::COMPRESSION_IMAGE_INPUT_BYTES_CHOSEN.inc_by(self.uncompressed_bytes_chosen);
|
||||
crate::metrics::COMPRESSION_IMAGE_OUTPUT_BYTES.inc_by(compressed_size);
|
||||
|
||||
let mut file = self.blob_writer.into_inner();
|
||||
|
||||
@@ -58,7 +58,7 @@ use std::{
|
||||
sync::atomic::AtomicU64,
|
||||
};
|
||||
use std::{
|
||||
cmp::{max, min, Ordering},
|
||||
cmp::{max, min},
|
||||
ops::ControlFlow,
|
||||
};
|
||||
use std::{
|
||||
@@ -177,25 +177,6 @@ impl std::fmt::Display for ImageLayerCreationMode {
|
||||
}
|
||||
}
|
||||
|
||||
/// Wrapper for key range to provide reverse ordering by range length for BinaryHeap
|
||||
#[derive(Debug, Clone, PartialEq, Eq)]
|
||||
pub(crate) struct Hole {
|
||||
key_range: Range<Key>,
|
||||
coverage_size: usize,
|
||||
}
|
||||
|
||||
impl Ord for Hole {
|
||||
fn cmp(&self, other: &Self) -> Ordering {
|
||||
other.coverage_size.cmp(&self.coverage_size) // inverse order
|
||||
}
|
||||
}
|
||||
|
||||
impl PartialOrd for Hole {
|
||||
fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
|
||||
Some(self.cmp(other))
|
||||
}
|
||||
}
|
||||
|
||||
/// Temporary function for immutable storage state refactor, ensures we are dropping mutex guard instead of other things.
|
||||
/// Can be removed after all refactors are done.
|
||||
fn drop_rlock<T>(rlock: tokio::sync::RwLockReadGuard<T>) {
|
||||
|
||||
@@ -30,8 +30,8 @@ use crate::tenant::config::defaults::{DEFAULT_CHECKPOINT_DISTANCE, DEFAULT_COMPA
|
||||
use crate::tenant::remote_timeline_client::WaitCompletionError;
|
||||
use crate::tenant::storage_layer::merge_iterator::MergeIterator;
|
||||
use crate::tenant::storage_layer::{AsLayerDesc, PersistentLayerDesc, ValueReconstructState};
|
||||
use crate::tenant::timeline::ImageLayerCreationOutcome;
|
||||
use crate::tenant::timeline::{drop_rlock, DeltaLayerWriter, ImageLayerWriter};
|
||||
use crate::tenant::timeline::{Hole, ImageLayerCreationOutcome};
|
||||
use crate::tenant::timeline::{Layer, ResidentLayer};
|
||||
use crate::tenant::DeltaLayer;
|
||||
use crate::virtual_file::{MaybeFatalIo, VirtualFile};
|
||||
@@ -608,66 +608,230 @@ impl Timeline {
|
||||
.read_lock_held_spawn_blocking_startup_micros
|
||||
.till_now();
|
||||
|
||||
// Determine N largest holes where N is number of compacted layers.
|
||||
let max_holes = deltas_to_compact.len();
|
||||
let last_record_lsn = self.get_last_record_lsn();
|
||||
let min_hole_range = (target_file_size / page_cache::PAGE_SZ as u64) as i128;
|
||||
let min_hole_coverage_size = 3; // TODO: something more flexible?
|
||||
|
||||
// min-heap (reserve space for one more element added before eviction)
|
||||
let mut heap: BinaryHeap<Hole> = BinaryHeap::with_capacity(max_holes + 1);
|
||||
let mut prev: Option<Key> = None;
|
||||
|
||||
let mut all_keys = Vec::new();
|
||||
|
||||
for l in deltas_to_compact.iter() {
|
||||
all_keys.extend(l.load_keys(ctx).await.map_err(CompactionError::Other)?);
|
||||
}
|
||||
|
||||
// FIXME: should spawn_blocking the rest of this function
|
||||
|
||||
// The current stdlib sorting implementation is designed in a way where it is
|
||||
// particularly fast where the slice is made up of sorted sub-ranges.
|
||||
all_keys.sort_by_key(|DeltaEntry { key, lsn, .. }| (*key, *lsn));
|
||||
// TODO: replace with streaming k-merge
|
||||
let all_keys = {
|
||||
let mut all_keys = Vec::new();
|
||||
for l in deltas_to_compact.iter() {
|
||||
all_keys.extend(l.load_keys(ctx).await.map_err(CompactionError::Other)?);
|
||||
}
|
||||
// The current stdlib sorting implementation is designed in a way where it is
|
||||
// particularly fast where the slice is made up of sorted sub-ranges.
|
||||
all_keys.sort_by_key(|DeltaEntry { key, lsn, .. }| (*key, *lsn));
|
||||
all_keys
|
||||
};
|
||||
|
||||
stats.read_lock_held_key_sort_micros = stats.read_lock_held_prerequisites_micros.till_now();
|
||||
|
||||
for &DeltaEntry { key: next_key, .. } in all_keys.iter() {
|
||||
if let Some(prev_key) = prev {
|
||||
// just first fast filter, do not create hole entries for metadata keys. The last hole in the
|
||||
// compaction is the gap between data key and metadata keys.
|
||||
if next_key.to_i128() - prev_key.to_i128() >= min_hole_range
|
||||
&& !Key::is_metadata_key(&prev_key)
|
||||
{
|
||||
let key_range = prev_key..next_key;
|
||||
// Measuring hole by just subtraction of i128 representation of key range boundaries
|
||||
// has not so much sense, because largest holes will corresponds field1/field2 changes.
|
||||
// But we are mostly interested to eliminate holes which cause generation of excessive image layers.
|
||||
// That is why it is better to measure size of hole as number of covering image layers.
|
||||
let coverage_size = layers.image_coverage(&key_range, last_record_lsn).len();
|
||||
if coverage_size >= min_hole_coverage_size {
|
||||
heap.push(Hole {
|
||||
key_range,
|
||||
coverage_size,
|
||||
});
|
||||
if heap.len() > max_holes {
|
||||
heap.pop(); // remove smallest hole
|
||||
// Determine N largest holes where N is number of compacted layers. The vec is sorted by key range start.
|
||||
//
|
||||
// A hole is a key range for which this compaction doesn't have any WAL records.
|
||||
// Our goal in this compaction iteration is to avoid creating L1s that, in terms of their key range,
|
||||
// cover the hole, but actually don't contain any WAL records for that key range.
|
||||
// The reason is that the mere stack of L1s (`count_deltas`) triggers image layer creation (`create_image_layers`).
|
||||
// That image layer creation would be useless for a hole range covered by L1s that don't contain any WAL records.
|
||||
//
|
||||
// The algorithm chooses holes as follows.
|
||||
// - Slide a 2-window over the keys in key orde to get the hole range (=distance between two keys).
|
||||
// - Filter: min threshold on range length
|
||||
// - Rank: by coverage size (=number of image layers required to reconstruct each key in the range for which we have any data)
|
||||
//
|
||||
// For more details, intuition, and some ASCII art see https://github.com/neondatabase/neon/pull/3597#discussion_r1112704451
|
||||
#[derive(PartialEq, Eq)]
|
||||
struct Hole {
|
||||
key_range: Range<Key>,
|
||||
coverage_size: usize,
|
||||
}
|
||||
let holes: Vec<Hole> = {
|
||||
use std::cmp::Ordering;
|
||||
impl Ord for Hole {
|
||||
fn cmp(&self, other: &Self) -> Ordering {
|
||||
self.coverage_size.cmp(&other.coverage_size).reverse()
|
||||
}
|
||||
}
|
||||
impl PartialOrd for Hole {
|
||||
fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
|
||||
Some(self.cmp(other))
|
||||
}
|
||||
}
|
||||
let max_holes = deltas_to_compact.len();
|
||||
let last_record_lsn = self.get_last_record_lsn();
|
||||
let min_hole_range = (target_file_size / page_cache::PAGE_SZ as u64) as i128;
|
||||
let min_hole_coverage_size = 3; // TODO: something more flexible?
|
||||
// min-heap (reserve space for one more element added before eviction)
|
||||
let mut heap: BinaryHeap<Hole> = BinaryHeap::with_capacity(max_holes + 1);
|
||||
let mut prev: Option<Key> = None;
|
||||
|
||||
for &DeltaEntry { key: next_key, .. } in all_keys.iter() {
|
||||
if let Some(prev_key) = prev {
|
||||
// just first fast filter, do not create hole entries for metadata keys. The last hole in the
|
||||
// compaction is the gap between data key and metadata keys.
|
||||
if next_key.to_i128() - prev_key.to_i128() >= min_hole_range
|
||||
&& !Key::is_metadata_key(&prev_key)
|
||||
{
|
||||
let key_range = prev_key..next_key;
|
||||
// Measuring hole by just subtraction of i128 representation of key range boundaries
|
||||
// has not so much sense, because largest holes will corresponds field1/field2 changes.
|
||||
// But we are mostly interested to eliminate holes which cause generation of excessive image layers.
|
||||
// That is why it is better to measure size of hole as number of covering image layers.
|
||||
let coverage_size =
|
||||
layers.image_coverage(&key_range, last_record_lsn).len();
|
||||
if coverage_size >= min_hole_coverage_size {
|
||||
heap.push(Hole {
|
||||
key_range,
|
||||
coverage_size,
|
||||
});
|
||||
if heap.len() > max_holes {
|
||||
heap.pop(); // remove smallest hole
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
prev = Some(next_key.next());
|
||||
}
|
||||
prev = Some(next_key.next());
|
||||
}
|
||||
let mut holes = heap.into_vec();
|
||||
holes.sort_unstable_by_key(|hole| hole.key_range.start);
|
||||
holes
|
||||
};
|
||||
stats.read_lock_held_compute_holes_micros = stats.read_lock_held_key_sort_micros.till_now();
|
||||
drop_rlock(guard);
|
||||
stats.read_lock_drop_micros = stats.read_lock_held_compute_holes_micros.till_now();
|
||||
let mut holes = heap.into_vec();
|
||||
holes.sort_unstable_by_key(|hole| hole.key_range.start);
|
||||
let mut next_hole = 0; // index of next hole in holes vector
|
||||
|
||||
// This iterator walks through all key-value pairs from all the layers
|
||||
// we're compacting, in key, LSN order.
|
||||
let all_values_iter = all_keys.iter();
|
||||
// If there's both a Value::Image and Value::WalRecord for the same (key,lsn),
|
||||
// then the Value::Image is ordered before Value::WalRecord.
|
||||
//
|
||||
// TODO(https://github.com/neondatabase/neon/issues/8184): remove the page cached blob_io
|
||||
// option and validation code once we've reached confidence.
|
||||
enum AllValuesIter<'a> {
|
||||
PageCachedBlobIo {
|
||||
all_keys_iter: VecIter<'a>,
|
||||
},
|
||||
StreamingKmergeBypassingPageCache {
|
||||
merge_iter: MergeIterator<'a>,
|
||||
},
|
||||
ValidatingStreamingKmergeBypassingPageCache {
|
||||
mode: CompactL0BypassPageCacheValidation,
|
||||
merge_iter: MergeIterator<'a>,
|
||||
all_keys_iter: VecIter<'a>,
|
||||
},
|
||||
}
|
||||
type VecIter<'a> = std::slice::Iter<'a, DeltaEntry<'a>>; // TODO: distinguished lifetimes
|
||||
impl AllValuesIter<'_> {
|
||||
async fn next_all_keys_iter(
|
||||
iter: &mut VecIter<'_>,
|
||||
ctx: &RequestContext,
|
||||
) -> anyhow::Result<Option<(Key, Lsn, Value)>> {
|
||||
let Some(DeltaEntry {
|
||||
key,
|
||||
lsn,
|
||||
val: value_ref,
|
||||
..
|
||||
}) = iter.next()
|
||||
else {
|
||||
return Ok(None);
|
||||
};
|
||||
let value = value_ref.load(ctx).await?;
|
||||
Ok(Some((*key, *lsn, value)))
|
||||
}
|
||||
async fn next(
|
||||
&mut self,
|
||||
ctx: &RequestContext,
|
||||
) -> anyhow::Result<Option<(Key, Lsn, Value)>> {
|
||||
match self {
|
||||
AllValuesIter::PageCachedBlobIo { all_keys_iter: iter } => {
|
||||
Self::next_all_keys_iter(iter, ctx).await
|
||||
}
|
||||
AllValuesIter::StreamingKmergeBypassingPageCache { merge_iter } => merge_iter.next().await,
|
||||
AllValuesIter::ValidatingStreamingKmergeBypassingPageCache { mode, merge_iter, all_keys_iter } => async {
|
||||
// advance both iterators
|
||||
let all_keys_iter_item = Self::next_all_keys_iter(all_keys_iter, ctx).await;
|
||||
let merge_iter_item = merge_iter.next().await;
|
||||
// compare results & log warnings as needed
|
||||
macro_rules! rate_limited_warn {
|
||||
($($arg:tt)*) => {{
|
||||
if cfg!(debug_assertions) || cfg!(feature = "testing") {
|
||||
warn!($($arg)*);
|
||||
panic!("CompactL0BypassPageCacheValidation failure, check logs");
|
||||
}
|
||||
use once_cell::sync::Lazy;
|
||||
use utils::rate_limit::RateLimit;
|
||||
use std::sync::Mutex;
|
||||
use std::time::Duration;
|
||||
static LOGGED: Lazy<Mutex<RateLimit>> =
|
||||
Lazy::new(|| Mutex::new(RateLimit::new(Duration::from_secs(10))));
|
||||
let mut rate_limit = LOGGED.lock().unwrap();
|
||||
rate_limit.call(|| {
|
||||
warn!($($arg)*);
|
||||
});
|
||||
}}
|
||||
}
|
||||
match (&all_keys_iter_item, &merge_iter_item) {
|
||||
(Err(_), Err(_)) => {
|
||||
// don't bother asserting equivality of the errors
|
||||
}
|
||||
(Err(all_keys), Ok(merge)) => {
|
||||
rate_limited_warn!(?merge, "all_keys_iter returned an error where merge did not: {all_keys:?}");
|
||||
},
|
||||
(Ok(all_keys), Err(merge)) => {
|
||||
rate_limited_warn!(?all_keys, "merge returned an error where all_keys_iter did not: {merge:?}");
|
||||
},
|
||||
(Ok(None), Ok(None)) => { }
|
||||
(Ok(Some(all_keys)), Ok(None)) => {
|
||||
rate_limited_warn!(?all_keys, "merge returned None where all_keys_iter returned Some");
|
||||
}
|
||||
(Ok(None), Ok(Some(merge))) => {
|
||||
rate_limited_warn!(?merge, "all_keys_iter returned None where merge returned Some");
|
||||
}
|
||||
(Ok(Some((all_keys_key, all_keys_lsn, all_keys_value))), Ok(Some((merge_key, merge_lsn, merge_value)))) => {
|
||||
match mode {
|
||||
// TODO: in this mode, we still load the value from disk for both iterators, even though we only need the all_keys_iter one
|
||||
CompactL0BypassPageCacheValidation::KeyLsn => {
|
||||
let all_keys = (all_keys_key, all_keys_lsn);
|
||||
let merge = (merge_key, merge_lsn);
|
||||
if all_keys != merge {
|
||||
rate_limited_warn!(?all_keys, ?merge, "merge returned a different (Key,LSN) than all_keys_iter");
|
||||
}
|
||||
}
|
||||
CompactL0BypassPageCacheValidation::KeyLsnValue => {
|
||||
let all_keys = (all_keys_key, all_keys_lsn, all_keys_value);
|
||||
let merge = (merge_key, merge_lsn, merge_value);
|
||||
if all_keys != merge {
|
||||
rate_limited_warn!(?all_keys, ?merge, "merge returned a different (Key,LSN,Value) than all_keys_iter");
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
// in case of mismatch, trust the legacy all_keys_iter_item
|
||||
all_keys_iter_item
|
||||
}.instrument(info_span!("next")).await
|
||||
}
|
||||
}
|
||||
}
|
||||
let mut all_values_iter = match &self.conf.compact_level0_phase1_value_access {
|
||||
CompactL0Phase1ValueAccess::PageCachedBlobIo => AllValuesIter::PageCachedBlobIo {
|
||||
all_keys_iter: all_keys.iter(),
|
||||
},
|
||||
CompactL0Phase1ValueAccess::StreamingKmerge { validate } => {
|
||||
let merge_iter = {
|
||||
let mut deltas = Vec::with_capacity(deltas_to_compact.len());
|
||||
for l in deltas_to_compact.iter() {
|
||||
let l = l.get_as_delta(ctx).await.map_err(CompactionError::Other)?;
|
||||
deltas.push(l);
|
||||
}
|
||||
MergeIterator::create(&deltas, &[], ctx)
|
||||
};
|
||||
match validate {
|
||||
None => AllValuesIter::StreamingKmergeBypassingPageCache { merge_iter },
|
||||
Some(validate) => AllValuesIter::ValidatingStreamingKmergeBypassingPageCache {
|
||||
mode: validate.clone(),
|
||||
merge_iter,
|
||||
all_keys_iter: all_keys.iter(),
|
||||
},
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
// This iterator walks through all keys and is needed to calculate size used by each key
|
||||
let mut all_keys_iter = all_keys
|
||||
@@ -738,12 +902,13 @@ impl Timeline {
|
||||
let mut key_values_total_size = 0u64;
|
||||
let mut dup_start_lsn: Lsn = Lsn::INVALID; // start LSN of layer containing values of the single key
|
||||
let mut dup_end_lsn: Lsn = Lsn::INVALID; // end LSN of layer containing values of the single key
|
||||
let mut next_hole = 0; // index of next hole in holes vector
|
||||
|
||||
for &DeltaEntry {
|
||||
key, lsn, ref val, ..
|
||||
} in all_values_iter
|
||||
while let Some((key, lsn, value)) = all_values_iter
|
||||
.next(ctx)
|
||||
.await
|
||||
.map_err(CompactionError::Other)?
|
||||
{
|
||||
let value = val.load(ctx).await.map_err(CompactionError::Other)?;
|
||||
let same_key = prev_key.map_or(false, |prev_key| prev_key == key);
|
||||
// We need to check key boundaries once we reach next key or end of layer with the same key
|
||||
if !same_key || lsn == dup_end_lsn {
|
||||
@@ -928,6 +1093,10 @@ impl Timeline {
|
||||
}
|
||||
}
|
||||
|
||||
// Without this, rustc complains about deltas_to_compact still
|
||||
// being borrowed when we `.into_iter()` below.
|
||||
drop(all_values_iter);
|
||||
|
||||
Ok(CompactLevel0Phase1Result {
|
||||
new_layers,
|
||||
deltas_to_compact: deltas_to_compact
|
||||
@@ -1035,6 +1204,43 @@ impl TryFrom<CompactLevel0Phase1StatsBuilder> for CompactLevel0Phase1Stats {
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Debug, PartialEq, Eq, Clone, serde::Deserialize, serde::Serialize)]
|
||||
#[serde(tag = "mode", rename_all = "kebab-case", deny_unknown_fields)]
|
||||
pub enum CompactL0Phase1ValueAccess {
|
||||
/// The old way.
|
||||
PageCachedBlobIo,
|
||||
/// The new way.
|
||||
StreamingKmerge {
|
||||
/// If set, we run both the old way and the new way, validate that
|
||||
/// they are identical (=> [`CompactL0BypassPageCacheValidation`]),
|
||||
/// and if the validation fails,
|
||||
/// - in tests: fail them with a panic or
|
||||
/// - in prod, log a rate-limited warning and use the old way's results.
|
||||
///
|
||||
/// If not set, we only run the new way and trust its results.
|
||||
validate: Option<CompactL0BypassPageCacheValidation>,
|
||||
},
|
||||
}
|
||||
|
||||
/// See [`CompactL0Phase1ValueAccess::StreamingKmerge`].
|
||||
#[derive(Debug, PartialEq, Eq, Clone, serde::Deserialize, serde::Serialize)]
|
||||
#[serde(rename_all = "kebab-case")]
|
||||
pub enum CompactL0BypassPageCacheValidation {
|
||||
/// Validate that the series of (key, lsn) pairs are the same.
|
||||
KeyLsn,
|
||||
/// Validate that the entire output of old and new way is identical.
|
||||
KeyLsnValue,
|
||||
}
|
||||
|
||||
impl Default for CompactL0Phase1ValueAccess {
|
||||
fn default() -> Self {
|
||||
CompactL0Phase1ValueAccess::StreamingKmerge {
|
||||
// TODO(https://github.com/neondatabase/neon/issues/8184): change to None once confident
|
||||
validate: Some(CompactL0BypassPageCacheValidation::KeyLsnValue),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl Timeline {
|
||||
/// Entry point for new tiered compaction algorithm.
|
||||
///
|
||||
|
||||
153
poetry.lock
generated
153
poetry.lock
generated
@@ -870,6 +870,96 @@ files = [
|
||||
[package.dependencies]
|
||||
colorama = {version = "*", markers = "platform_system == \"Windows\""}
|
||||
|
||||
[[package]]
|
||||
name = "clickhouse-connect"
|
||||
version = "0.7.17"
|
||||
description = "ClickHouse Database Core Driver for Python, Pandas, and Superset"
|
||||
optional = false
|
||||
python-versions = "~=3.8"
|
||||
files = [
|
||||
{file = "clickhouse-connect-0.7.17.tar.gz", hash = "sha256:854f1f9f3e024e7f89ae5d57cd3289d7a4c3dc91a9f24c4d233014f0ea19cb2d"},
|
||||
{file = "clickhouse_connect-0.7.17-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:aca36f5f28be1ada2981fce87724bbf451f267c918015baec59e527de3c9c882"},
|
||||
{file = "clickhouse_connect-0.7.17-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:66209e4634f457604c263bea176336079d26c284e251e68a8435b0b80c1a25ff"},
|
||||
{file = "clickhouse_connect-0.7.17-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:e4d86c5a561a2a99321c8b4af22257461b8e67142f34cfea6e70f39b45b1f406"},
|
||||
{file = "clickhouse_connect-0.7.17-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d200c9afa2725a96f9f3718221f641276b80c11bf504d8a2fbaafb5a05b2f0d3"},
|
||||
{file = "clickhouse_connect-0.7.17-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:004d867b1005445a46e6742db1054bf2a717a451372663b46e09b5e9e90a31e3"},
|
||||
{file = "clickhouse_connect-0.7.17-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:4ef94a4a8e008882259151833c3c47cfbb9c8f08de0f100aaf3b95c366dcfb24"},
|
||||
{file = "clickhouse_connect-0.7.17-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:ee732c3df50c8b07d16b5836ff85e6b84569922455c03837c3add5cf1388fe1f"},
|
||||
{file = "clickhouse_connect-0.7.17-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:d9dbe1235465bb946e24b90b0ca5b8800b5d645acb2d7d6ee819448c3e2fd959"},
|
||||
{file = "clickhouse_connect-0.7.17-cp310-cp310-win32.whl", hash = "sha256:e5db0d68dfb63db0297d44dc91406bcfd7d333708d7cd55086c8550fbf870b78"},
|
||||
{file = "clickhouse_connect-0.7.17-cp310-cp310-win_amd64.whl", hash = "sha256:800750f568c097ea312887785025006d6098bffd8ed2dd6a57048fb3ced6d778"},
|
||||
{file = "clickhouse_connect-0.7.17-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:4eb390623b3d15dc9cda78f5c68f83ef9ad11743797e70af8fabc384b015a73c"},
|
||||
{file = "clickhouse_connect-0.7.17-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:35f172ca950f218f63072024c81d5b4ff6e5399620c255506c321ccc7b17c9a5"},
|
||||
{file = "clickhouse_connect-0.7.17-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:ae7918f060f7576fc931c692e0122b1b07576fabd81444af22e1f8582300d200"},
|
||||
{file = "clickhouse_connect-0.7.17-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ff2881b93c7a1afb9c99fb59ad5fd666850421325d0931e2b77f3f4ba872303d"},
|
||||
{file = "clickhouse_connect-0.7.17-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:8a4d9b4f97271addf66aadbaf7f154f19a0ad6c22026d575a995c55ebd8576db"},
|
||||
{file = "clickhouse_connect-0.7.17-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:e431469b1ff2d5c3e4c406d55c6afdf7102f5d2524c2ceb5481b94ac24412aa3"},
|
||||
{file = "clickhouse_connect-0.7.17-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:2b6f80115176559f181a6b3ecad11aa3d70ef6014c3d2905b90fcef3f27d25c2"},
|
||||
{file = "clickhouse_connect-0.7.17-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:d8ac694f40dfafc8a3cc877116b4bc73e8877ebf66d4d96ee092484ee4c0b481"},
|
||||
{file = "clickhouse_connect-0.7.17-cp311-cp311-win32.whl", hash = "sha256:78b7a3f6b0fad4eaf8afb5f9a2e855bde53e82ea5804960e9cf779538f4606a1"},
|
||||
{file = "clickhouse_connect-0.7.17-cp311-cp311-win_amd64.whl", hash = "sha256:efd390cc045334ecc3f2a9c18cc07c041d0288b145967805fdcab65abeefa75f"},
|
||||
{file = "clickhouse_connect-0.7.17-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:9228334a17dc0a7842222f54ba5b89fc563532424aad4f66be799df70ab37e9f"},
|
||||
{file = "clickhouse_connect-0.7.17-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:e432a42bb788bda77e88eda2774392a60fbbb5ee2a79cb2881d182d26c45fe49"},
|
||||
{file = "clickhouse_connect-0.7.17-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:c85152ed2879965ee1fa2bd5e31fb27d281fd5f50d6e86a401efd95cd85b29ef"},
|
||||
{file = "clickhouse_connect-0.7.17-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:29a126104aa5e11df570cbd89fca4988784084602ba77d17b2396b334c54fd75"},
|
||||
{file = "clickhouse_connect-0.7.17-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:882d8f9570549258e6eb6a97915fbf64ed29fe395d5e360866ea8d42c8283a35"},
|
||||
{file = "clickhouse_connect-0.7.17-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:06ebf99111171442f462fb8b357364c3e276da3e8f8557b2e8fee9eb55ab37d1"},
|
||||
{file = "clickhouse_connect-0.7.17-cp312-cp312-musllinux_1_1_i686.whl", hash = "sha256:e0cf6f99b2777b0d164bf8b65ec39104cdc0789a56bcb52d98289bbd6f5cc70e"},
|
||||
{file = "clickhouse_connect-0.7.17-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:ee46c508fddfff3b7ac52326788e0c6dd8dfb416b6d7e02e5d30e8110749dac2"},
|
||||
{file = "clickhouse_connect-0.7.17-cp312-cp312-win32.whl", hash = "sha256:eb708b590a37d56b069a6088254ffa55d73b8cb65527339df81ef03fe67ffdec"},
|
||||
{file = "clickhouse_connect-0.7.17-cp312-cp312-win_amd64.whl", hash = "sha256:17f00dccddaeaf43733faa1fa21f7d24641454a73669fda862545ba7c88627f5"},
|
||||
{file = "clickhouse_connect-0.7.17-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:ab5d4b37a6dcc39e94c63beac0f22d9dda914f5eb865d166c64cf04dfadb7d16"},
|
||||
{file = "clickhouse_connect-0.7.17-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:32aa90387f45f34cbc5a984789ed4c12760a3c0056c190ab0123ceafc36b1002"},
|
||||
{file = "clickhouse_connect-0.7.17-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:21277b6bdd6c8ff14170bfcd52125c5c39f442ec4bafbb643ad7d0ca915f0029"},
|
||||
{file = "clickhouse_connect-0.7.17-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ca68d8b7dee3fb4e7229e06152f5b0faaccafb4c87d9c2d48fa5bd117a3cc1c0"},
|
||||
{file = "clickhouse_connect-0.7.17-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:841c56282102b2fba1e0b332bb1c7a0c50992fbc321746af8d3e0e6ca2450e8b"},
|
||||
{file = "clickhouse_connect-0.7.17-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:8d7ffde5a4b95d8fe9ed38e08e504e497310e3d7a17691bd40bf65734648fdfc"},
|
||||
{file = "clickhouse_connect-0.7.17-cp38-cp38-musllinux_1_1_i686.whl", hash = "sha256:055960086b6b92b6e44f5ba04c81c40c10b038588e4b3908b033c99f66125332"},
|
||||
{file = "clickhouse_connect-0.7.17-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:36491fec63ceb8503b6344c23477647030139f346b749dc5ee672c505939dbbe"},
|
||||
{file = "clickhouse_connect-0.7.17-cp38-cp38-win32.whl", hash = "sha256:8779a907e026db32e6bc0bc0c8d5de0e2e3afd166afc2d4adcc0603399af5539"},
|
||||
{file = "clickhouse_connect-0.7.17-cp38-cp38-win_amd64.whl", hash = "sha256:309854fa197885c6278438ddd032ab52e6fec56f162074e343c3635ca7266078"},
|
||||
{file = "clickhouse_connect-0.7.17-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:e8009f94550178dc971aeb4f8787ba7a5b473c22647490428b7229f540a51d2b"},
|
||||
{file = "clickhouse_connect-0.7.17-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:70f8422f407b13a404b3670fd097855abd5adaf890c710d6678d2b46ab61ac48"},
|
||||
{file = "clickhouse_connect-0.7.17-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:082783eb1e8baf7b3465dd045132dc5cb5a91432c899dc4e19891c5f782d8d23"},
|
||||
{file = "clickhouse_connect-0.7.17-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:c1c30aad2a9c7584c4ee19e646a087b3bbd2d4daab3d88a2afeeae1a7f6febf9"},
|
||||
{file = "clickhouse_connect-0.7.17-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:fc8e245a9f4f0dce39f155e626405f60f1d3cf4d1e52dd2c793ea6b603ca111b"},
|
||||
{file = "clickhouse_connect-0.7.17-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:802372cb8a69c9ffdf4260e9f01616c8601ba531825ed6f08834827e0b880cd1"},
|
||||
{file = "clickhouse_connect-0.7.17-cp39-cp39-musllinux_1_1_i686.whl", hash = "sha256:193a60271a3b105cdbde96fb20b40eab8a50fca3bb1f397546f7a18b53d9aa9c"},
|
||||
{file = "clickhouse_connect-0.7.17-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:59d58932916792fdbd09cb961a245a0c2d87b07b8296f9138915b998f4522941"},
|
||||
{file = "clickhouse_connect-0.7.17-cp39-cp39-win32.whl", hash = "sha256:3cfd0edabb589f640636a97ffc38d1b3d760faef208d44e50829cc1ad3f0d3e5"},
|
||||
{file = "clickhouse_connect-0.7.17-cp39-cp39-win_amd64.whl", hash = "sha256:5661b4629aac228481219abf2e149119af1a71d897f191665e182d9d192d7033"},
|
||||
{file = "clickhouse_connect-0.7.17-pp310-pypy310_pp73-macosx_10_9_x86_64.whl", hash = "sha256:7429d309109e7e4a70fd867d69fcfea9ddcb1a1e910caa6b0e2c3776b71f4613"},
|
||||
{file = "clickhouse_connect-0.7.17-pp310-pypy310_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:e5ae619151006da84a0b1585a9bcc81be32459d8061aeb2e116bad5bbaa7d108"},
|
||||
{file = "clickhouse_connect-0.7.17-pp310-pypy310_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ec0c84a0880621cb2389656a89886ef3133f0b3f8dc016eee6f25bbb49ff6f70"},
|
||||
{file = "clickhouse_connect-0.7.17-pp310-pypy310_pp73-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:705464c23f821666b76f8f619cf2870225156276562756b3933aaa24708e0ff8"},
|
||||
{file = "clickhouse_connect-0.7.17-pp310-pypy310_pp73-win_amd64.whl", hash = "sha256:1822016f4b769e89264fe26cefe0bc5e50e4c3ca0747d89bb52d57dc4f1e5ffb"},
|
||||
{file = "clickhouse_connect-0.7.17-pp38-pypy38_pp73-macosx_10_9_x86_64.whl", hash = "sha256:6c92b0c342c1fbfa666010e8175e05026dc570a7ef91d8fa81ce503180f318aa"},
|
||||
{file = "clickhouse_connect-0.7.17-pp38-pypy38_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d2e106536540e906c3c866f8615fcf870a9a77c1bfab9ef4b042febfd2fdb953"},
|
||||
{file = "clickhouse_connect-0.7.17-pp38-pypy38_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:bac9a32e62384b4341ba51a451084eb3b00c6e59aaac1499145dd8b897cb585c"},
|
||||
{file = "clickhouse_connect-0.7.17-pp38-pypy38_pp73-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:0feed93b9912b7862a8c41be1febcd44b68a824a5c1059b19d5c567afdaa6273"},
|
||||
{file = "clickhouse_connect-0.7.17-pp38-pypy38_pp73-win_amd64.whl", hash = "sha256:2e2dd6db52e799f065fd565143fde5a872cfe903de1bee7775bc3a349856a790"},
|
||||
{file = "clickhouse_connect-0.7.17-pp39-pypy39_pp73-macosx_10_9_x86_64.whl", hash = "sha256:ed13add5d579a5960155f3000420544368501c9703d2fb94f103b4a6126081f6"},
|
||||
{file = "clickhouse_connect-0.7.17-pp39-pypy39_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:c257a23ed3bf1858593fb03927d9d073fbbdfa24dc2afee537c3314bd66b4e24"},
|
||||
{file = "clickhouse_connect-0.7.17-pp39-pypy39_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d47866f64cbdc2d5cc4f8a7a8c49e3ee90c9e487091b9eda7c3a3576418e1cbe"},
|
||||
{file = "clickhouse_connect-0.7.17-pp39-pypy39_pp73-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:9b850e2f17e0a0b5a37d996d3fb728050227489d64d271d678d166abea94f26e"},
|
||||
{file = "clickhouse_connect-0.7.17-pp39-pypy39_pp73-win_amd64.whl", hash = "sha256:349682288987dc84ac7695f7cd6b510be8d0ec0eee7c1b72dbf2146b4e9efdb8"},
|
||||
]
|
||||
|
||||
[package.dependencies]
|
||||
certifi = "*"
|
||||
lz4 = "*"
|
||||
pytz = "*"
|
||||
urllib3 = ">=1.26"
|
||||
zstandard = "*"
|
||||
|
||||
[package.extras]
|
||||
arrow = ["pyarrow"]
|
||||
numpy = ["numpy"]
|
||||
orjson = ["orjson"]
|
||||
pandas = ["pandas"]
|
||||
sqlalchemy = ["sqlalchemy (>1.3.21,<2.0)"]
|
||||
tzlocal = ["tzlocal (>=4.0)"]
|
||||
|
||||
[[package]]
|
||||
name = "colorama"
|
||||
version = "0.4.5"
|
||||
@@ -1470,6 +1560,56 @@ files = [
|
||||
{file = "lazy_object_proxy-1.10.0-pp310.pp311.pp312.pp38.pp39-none-any.whl", hash = "sha256:80fa48bd89c8f2f456fc0765c11c23bf5af827febacd2f523ca5bc1893fcc09d"},
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "lz4"
|
||||
version = "4.3.3"
|
||||
description = "LZ4 Bindings for Python"
|
||||
optional = false
|
||||
python-versions = ">=3.8"
|
||||
files = [
|
||||
{file = "lz4-4.3.3-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:b891880c187e96339474af2a3b2bfb11a8e4732ff5034be919aa9029484cd201"},
|
||||
{file = "lz4-4.3.3-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:222a7e35137d7539c9c33bb53fcbb26510c5748779364014235afc62b0ec797f"},
|
||||
{file = "lz4-4.3.3-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f76176492ff082657ada0d0f10c794b6da5800249ef1692b35cf49b1e93e8ef7"},
|
||||
{file = "lz4-4.3.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f1d18718f9d78182c6b60f568c9a9cec8a7204d7cb6fad4e511a2ef279e4cb05"},
|
||||
{file = "lz4-4.3.3-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:6cdc60e21ec70266947a48839b437d46025076eb4b12c76bd47f8e5eb8a75dcc"},
|
||||
{file = "lz4-4.3.3-cp310-cp310-win32.whl", hash = "sha256:c81703b12475da73a5d66618856d04b1307e43428a7e59d98cfe5a5d608a74c6"},
|
||||
{file = "lz4-4.3.3-cp310-cp310-win_amd64.whl", hash = "sha256:43cf03059c0f941b772c8aeb42a0813d68d7081c009542301637e5782f8a33e2"},
|
||||
{file = "lz4-4.3.3-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:30e8c20b8857adef7be045c65f47ab1e2c4fabba86a9fa9a997d7674a31ea6b6"},
|
||||
{file = "lz4-4.3.3-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:2f7b1839f795315e480fb87d9bc60b186a98e3e5d17203c6e757611ef7dcef61"},
|
||||
{file = "lz4-4.3.3-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:edfd858985c23523f4e5a7526ca6ee65ff930207a7ec8a8f57a01eae506aaee7"},
|
||||
{file = "lz4-4.3.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0e9c410b11a31dbdc94c05ac3c480cb4b222460faf9231f12538d0074e56c563"},
|
||||
{file = "lz4-4.3.3-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:d2507ee9c99dbddd191c86f0e0c8b724c76d26b0602db9ea23232304382e1f21"},
|
||||
{file = "lz4-4.3.3-cp311-cp311-win32.whl", hash = "sha256:f180904f33bdd1e92967923a43c22899e303906d19b2cf8bb547db6653ea6e7d"},
|
||||
{file = "lz4-4.3.3-cp311-cp311-win_amd64.whl", hash = "sha256:b14d948e6dce389f9a7afc666d60dd1e35fa2138a8ec5306d30cd2e30d36b40c"},
|
||||
{file = "lz4-4.3.3-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:e36cd7b9d4d920d3bfc2369840da506fa68258f7bb176b8743189793c055e43d"},
|
||||
{file = "lz4-4.3.3-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:31ea4be9d0059c00b2572d700bf2c1bc82f241f2c3282034a759c9a4d6ca4dc2"},
|
||||
{file = "lz4-4.3.3-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:33c9a6fd20767ccaf70649982f8f3eeb0884035c150c0b818ea660152cf3c809"},
|
||||
{file = "lz4-4.3.3-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:bca8fccc15e3add173da91be8f34121578dc777711ffd98d399be35487c934bf"},
|
||||
{file = "lz4-4.3.3-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:e7d84b479ddf39fe3ea05387f10b779155fc0990125f4fb35d636114e1c63a2e"},
|
||||
{file = "lz4-4.3.3-cp312-cp312-win32.whl", hash = "sha256:337cb94488a1b060ef1685187d6ad4ba8bc61d26d631d7ba909ee984ea736be1"},
|
||||
{file = "lz4-4.3.3-cp312-cp312-win_amd64.whl", hash = "sha256:5d35533bf2cee56f38ced91f766cd0038b6abf46f438a80d50c52750088be93f"},
|
||||
{file = "lz4-4.3.3-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:363ab65bf31338eb364062a15f302fc0fab0a49426051429866d71c793c23394"},
|
||||
{file = "lz4-4.3.3-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:0a136e44a16fc98b1abc404fbabf7f1fada2bdab6a7e970974fb81cf55b636d0"},
|
||||
{file = "lz4-4.3.3-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:abc197e4aca8b63f5ae200af03eb95fb4b5055a8f990079b5bdf042f568469dd"},
|
||||
{file = "lz4-4.3.3-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:56f4fe9c6327adb97406f27a66420b22ce02d71a5c365c48d6b656b4aaeb7775"},
|
||||
{file = "lz4-4.3.3-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:f0e822cd7644995d9ba248cb4b67859701748a93e2ab7fc9bc18c599a52e4604"},
|
||||
{file = "lz4-4.3.3-cp38-cp38-win32.whl", hash = "sha256:24b3206de56b7a537eda3a8123c644a2b7bf111f0af53bc14bed90ce5562d1aa"},
|
||||
{file = "lz4-4.3.3-cp38-cp38-win_amd64.whl", hash = "sha256:b47839b53956e2737229d70714f1d75f33e8ac26e52c267f0197b3189ca6de24"},
|
||||
{file = "lz4-4.3.3-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:6756212507405f270b66b3ff7f564618de0606395c0fe10a7ae2ffcbbe0b1fba"},
|
||||
{file = "lz4-4.3.3-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:ee9ff50557a942d187ec85462bb0960207e7ec5b19b3b48949263993771c6205"},
|
||||
{file = "lz4-4.3.3-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:2b901c7784caac9a1ded4555258207d9e9697e746cc8532129f150ffe1f6ba0d"},
|
||||
{file = "lz4-4.3.3-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:b6d9ec061b9eca86e4dcc003d93334b95d53909afd5a32c6e4f222157b50c071"},
|
||||
{file = "lz4-4.3.3-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:f4c7bf687303ca47d69f9f0133274958fd672efaa33fb5bcde467862d6c621f0"},
|
||||
{file = "lz4-4.3.3-cp39-cp39-win32.whl", hash = "sha256:054b4631a355606e99a42396f5db4d22046a3397ffc3269a348ec41eaebd69d2"},
|
||||
{file = "lz4-4.3.3-cp39-cp39-win_amd64.whl", hash = "sha256:eac9af361e0d98335a02ff12fb56caeb7ea1196cf1a49dbf6f17828a131da807"},
|
||||
{file = "lz4-4.3.3.tar.gz", hash = "sha256:01fe674ef2889dbb9899d8a67361e0c4a2c833af5aeb37dd505727cf5d2a131e"},
|
||||
]
|
||||
|
||||
[package.extras]
|
||||
docs = ["sphinx (>=1.6.0)", "sphinx-bootstrap-theme"]
|
||||
flake8 = ["flake8"]
|
||||
tests = ["psutil", "pytest (!=3.3.0)", "pytest-cov"]
|
||||
|
||||
[[package]]
|
||||
name = "markupsafe"
|
||||
version = "2.1.1"
|
||||
@@ -2361,6 +2501,17 @@ files = [
|
||||
[package.dependencies]
|
||||
six = ">=1.5"
|
||||
|
||||
[[package]]
|
||||
name = "pytz"
|
||||
version = "2024.1"
|
||||
description = "World timezone definitions, modern and historical"
|
||||
optional = false
|
||||
python-versions = "*"
|
||||
files = [
|
||||
{file = "pytz-2024.1-py2.py3-none-any.whl", hash = "sha256:328171f4e3623139da4983451950b28e95ac706e13f3f2630a879749e7a8b319"},
|
||||
{file = "pytz-2024.1.tar.gz", hash = "sha256:2a29735ea9c18baf14b448846bde5a48030ed267578472d8955cd0e7443a9812"},
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "pywin32"
|
||||
version = "301"
|
||||
@@ -3206,4 +3357,4 @@ cffi = ["cffi (>=1.11)"]
|
||||
[metadata]
|
||||
lock-version = "2.0"
|
||||
python-versions = "^3.9"
|
||||
content-hash = "16ebd6a46768be7f67dbdb4ee5903b167d94edc9965f29252f038c67e9e907b0"
|
||||
content-hash = "7cee6a8c30bc7f4bfb0a87c6bad3952dfb4da127fad853d2710a93ac3eab8a00"
|
||||
|
||||
@@ -5,4 +5,6 @@ pub use limit_algorithm::{
|
||||
};
|
||||
pub use limiter::{BucketRateLimiter, GlobalRateLimiter, RateBucketInfo, WakeComputeRateLimiter};
|
||||
mod leaky_bucket;
|
||||
pub use leaky_bucket::{EndpointRateLimiter, LeakyBucketConfig, LeakyBucketRateLimiter};
|
||||
pub use leaky_bucket::{
|
||||
EndpointRateLimiter, LeakyBucketConfig, LeakyBucketRateLimiter, LeakyBucketState,
|
||||
};
|
||||
|
||||
@@ -1,7 +1,6 @@
|
||||
use std::{
|
||||
hash::Hash,
|
||||
sync::atomic::{AtomicUsize, Ordering},
|
||||
time::Duration,
|
||||
};
|
||||
|
||||
use ahash::RandomState;
|
||||
@@ -17,7 +16,7 @@ pub type EndpointRateLimiter = LeakyBucketRateLimiter<EndpointIdInt>;
|
||||
|
||||
pub struct LeakyBucketRateLimiter<Key> {
|
||||
map: DashMap<Key, LeakyBucketState, RandomState>,
|
||||
config: LeakyBucketConfigInner,
|
||||
config: LeakyBucketConfig,
|
||||
access_count: AtomicUsize,
|
||||
}
|
||||
|
||||
@@ -30,7 +29,7 @@ impl<K: Hash + Eq> LeakyBucketRateLimiter<K> {
|
||||
pub fn new_with_shards(config: LeakyBucketConfig, shards: usize) -> Self {
|
||||
Self {
|
||||
map: DashMap::with_hasher_and_shard_amount(RandomState::new(), shards),
|
||||
config: config.into(),
|
||||
config,
|
||||
access_count: AtomicUsize::new(0),
|
||||
}
|
||||
}
|
||||
@@ -43,10 +42,10 @@ impl<K: Hash + Eq> LeakyBucketRateLimiter<K> {
|
||||
self.do_gc(now);
|
||||
}
|
||||
|
||||
let mut entry = self
|
||||
.map
|
||||
.entry(key)
|
||||
.or_insert_with(|| LeakyBucketState::new(now));
|
||||
let mut entry = self.map.entry(key).or_insert_with(|| LeakyBucketState {
|
||||
time: now,
|
||||
filled: 0.0,
|
||||
});
|
||||
|
||||
entry.check(&self.config, now, n as f64)
|
||||
}
|
||||
@@ -60,7 +59,7 @@ impl<K: Hash + Eq> LeakyBucketRateLimiter<K> {
|
||||
let shard = thread_rng().gen_range(0..n);
|
||||
self.map.shards()[shard]
|
||||
.write()
|
||||
.retain(|_, value| value.get().should_retain(now));
|
||||
.retain(|_, value| !value.get_mut().update(&self.config, now));
|
||||
}
|
||||
}
|
||||
|
||||
@@ -69,6 +68,11 @@ pub struct LeakyBucketConfig {
|
||||
pub max: f64,
|
||||
}
|
||||
|
||||
pub struct LeakyBucketState {
|
||||
filled: f64,
|
||||
time: Instant,
|
||||
}
|
||||
|
||||
impl LeakyBucketConfig {
|
||||
pub fn new(rps: f64, max: f64) -> Self {
|
||||
assert!(rps > 0.0, "rps must be positive");
|
||||
@@ -77,77 +81,41 @@ impl LeakyBucketConfig {
|
||||
}
|
||||
}
|
||||
|
||||
struct LeakyBucketConfigInner {
|
||||
/// "time cost" of a single request unit.
|
||||
/// loosely represents how long it takes to handle a request unit in active CPU time.
|
||||
time_cost: Duration,
|
||||
bucket_width: Duration,
|
||||
}
|
||||
|
||||
impl From<LeakyBucketConfig> for LeakyBucketConfigInner {
|
||||
fn from(config: LeakyBucketConfig) -> Self {
|
||||
// seconds_per_request = 1/(request_per_second)
|
||||
let spr = config.rps.recip();
|
||||
Self {
|
||||
time_cost: Duration::from_secs_f64(spr),
|
||||
bucket_width: Duration::from_secs_f64(config.max * spr),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
struct LeakyBucketState {
|
||||
/// Bucket is represented by `start..end` where `start = end - config.bucket_width`.
|
||||
///
|
||||
/// At any given time, `end - now` represents the number of tokens in the bucket, multiplied by the "time_cost".
|
||||
/// Adding `n` tokens to the bucket is done by moving `end` forward by `n * config.time_cost`.
|
||||
/// If `now < start`, the bucket is considered filled and cannot accept any more tokens.
|
||||
/// Draining the bucket will happen naturally as `now` moves forward.
|
||||
///
|
||||
/// Let `n` be some "time cost" for the request,
|
||||
/// If now is after end, the bucket is empty and the end is reset to now,
|
||||
/// If now is within the `bucket window + n`, we are within time budget.
|
||||
/// If now is before the `bucket window + n`, we have run out of budget.
|
||||
///
|
||||
/// This is inspired by the generic cell rate algorithm (GCRA) and works
|
||||
/// exactly the same as a leaky-bucket.
|
||||
end: Instant,
|
||||
}
|
||||
|
||||
impl LeakyBucketState {
|
||||
fn new(now: Instant) -> Self {
|
||||
Self { end: now }
|
||||
}
|
||||
|
||||
fn should_retain(&self, now: Instant) -> bool {
|
||||
// if self.end is after now, the bucket is not empty
|
||||
now < self.end
|
||||
}
|
||||
|
||||
fn check(&mut self, config: &LeakyBucketConfigInner, now: Instant, n: f64) -> bool {
|
||||
let start = self.end - config.bucket_width;
|
||||
|
||||
let n = config.time_cost.mul_f64(n);
|
||||
|
||||
// start end
|
||||
// | start+n | end+n
|
||||
// | / | /
|
||||
// ------{o-[---------o-}--]----o----
|
||||
// now1 ^ now2 ^ ^ now3
|
||||
//
|
||||
// at now1, the bucket would be completely filled if we add n tokens.
|
||||
// at now2, the bucket would be partially filled if we add n tokens.
|
||||
// at now3, the bucket would start completely empty before we add n tokens.
|
||||
|
||||
if self.end + n <= now {
|
||||
self.end = now + n;
|
||||
true
|
||||
} else if start + n <= now {
|
||||
self.end += n;
|
||||
true
|
||||
} else {
|
||||
false
|
||||
pub fn new() -> Self {
|
||||
Self {
|
||||
filled: 0.0,
|
||||
time: Instant::now(),
|
||||
}
|
||||
}
|
||||
|
||||
/// updates the timer and returns true if the bucket is empty
|
||||
fn update(&mut self, info: &LeakyBucketConfig, now: Instant) -> bool {
|
||||
let drain = now.duration_since(self.time);
|
||||
let drain = drain.as_secs_f64() * info.rps;
|
||||
|
||||
self.filled = (self.filled - drain).clamp(0.0, info.max);
|
||||
self.time = now;
|
||||
|
||||
self.filled == 0.0
|
||||
}
|
||||
|
||||
pub fn check(&mut self, info: &LeakyBucketConfig, now: Instant, n: f64) -> bool {
|
||||
self.update(info, now);
|
||||
|
||||
if self.filled + n > info.max {
|
||||
return false;
|
||||
}
|
||||
self.filled += n;
|
||||
|
||||
true
|
||||
}
|
||||
}
|
||||
|
||||
impl Default for LeakyBucketState {
|
||||
fn default() -> Self {
|
||||
Self::new()
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
@@ -156,50 +124,47 @@ mod tests {
|
||||
|
||||
use tokio::time::Instant;
|
||||
|
||||
use super::{LeakyBucketConfig, LeakyBucketConfigInner, LeakyBucketState};
|
||||
use super::{LeakyBucketConfig, LeakyBucketState};
|
||||
|
||||
#[tokio::test(start_paused = true)]
|
||||
async fn check() {
|
||||
let config: LeakyBucketConfigInner = LeakyBucketConfig::new(500.0, 2000.0).into();
|
||||
assert_eq!(config.time_cost, Duration::from_millis(2));
|
||||
assert_eq!(config.bucket_width, Duration::from_secs(4));
|
||||
|
||||
let mut bucket = LeakyBucketState::new(Instant::now());
|
||||
let info = LeakyBucketConfig::new(500.0, 2000.0);
|
||||
let mut bucket = LeakyBucketState::new();
|
||||
|
||||
// should work for 2000 requests this second
|
||||
for _ in 0..2000 {
|
||||
assert!(bucket.check(&config, Instant::now(), 1.0));
|
||||
assert!(bucket.check(&info, Instant::now(), 1.0));
|
||||
}
|
||||
assert!(!bucket.check(&config, Instant::now(), 1.0));
|
||||
assert_eq!(bucket.end - Instant::now(), config.bucket_width);
|
||||
assert!(!bucket.check(&info, Instant::now(), 1.0));
|
||||
assert_eq!(bucket.filled, 2000.0);
|
||||
|
||||
// in 1ms we should drain 0.5 tokens.
|
||||
// make sure we don't lose any tokens
|
||||
tokio::time::advance(Duration::from_millis(1)).await;
|
||||
assert!(!bucket.check(&config, Instant::now(), 1.0));
|
||||
assert!(!bucket.check(&info, Instant::now(), 1.0));
|
||||
tokio::time::advance(Duration::from_millis(1)).await;
|
||||
assert!(bucket.check(&config, Instant::now(), 1.0));
|
||||
assert!(bucket.check(&info, Instant::now(), 1.0));
|
||||
|
||||
// in 10ms we should drain 5 tokens
|
||||
tokio::time::advance(Duration::from_millis(10)).await;
|
||||
for _ in 0..5 {
|
||||
assert!(bucket.check(&config, Instant::now(), 1.0));
|
||||
assert!(bucket.check(&info, Instant::now(), 1.0));
|
||||
}
|
||||
assert!(!bucket.check(&config, Instant::now(), 1.0));
|
||||
assert!(!bucket.check(&info, Instant::now(), 1.0));
|
||||
|
||||
// in 10s we should drain 5000 tokens
|
||||
// but cap is only 2000
|
||||
tokio::time::advance(Duration::from_secs(10)).await;
|
||||
for _ in 0..2000 {
|
||||
assert!(bucket.check(&config, Instant::now(), 1.0));
|
||||
assert!(bucket.check(&info, Instant::now(), 1.0));
|
||||
}
|
||||
assert!(!bucket.check(&config, Instant::now(), 1.0));
|
||||
assert!(!bucket.check(&info, Instant::now(), 1.0));
|
||||
|
||||
// should sustain 500rps
|
||||
for _ in 0..2000 {
|
||||
tokio::time::advance(Duration::from_millis(10)).await;
|
||||
for _ in 0..5 {
|
||||
assert!(bucket.check(&config, Instant::now(), 1.0));
|
||||
assert!(bucket.check(&info, Instant::now(), 1.0));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -41,6 +41,7 @@ zstandard = "^0.21.0"
|
||||
httpx = {extras = ["http2"], version = "^0.26.0"}
|
||||
pytest-repeat = "^0.9.3"
|
||||
websockets = "^12.0"
|
||||
clickhouse-connect = "^0.7.16"
|
||||
|
||||
[tool.poetry.group.dev.dependencies]
|
||||
mypy = "==1.3.0"
|
||||
|
||||
@@ -18,6 +18,7 @@ anyhow.workspace = true
|
||||
aws-config.workspace = true
|
||||
bytes.workspace = true
|
||||
camino.workspace = true
|
||||
chrono.workspace = true
|
||||
clap.workspace = true
|
||||
fail.workspace = true
|
||||
futures.workspace = true
|
||||
@@ -44,7 +45,12 @@ scopeguard.workspace = true
|
||||
strum.workspace = true
|
||||
strum_macros.workspace = true
|
||||
|
||||
diesel = { version = "2.1.4", features = ["serde_json", "postgres", "r2d2"] }
|
||||
diesel = { version = "2.1.4", features = [
|
||||
"serde_json",
|
||||
"postgres",
|
||||
"r2d2",
|
||||
"chrono",
|
||||
] }
|
||||
diesel_migrations = { version = "2.1.0" }
|
||||
r2d2 = { version = "0.8.10" }
|
||||
|
||||
@@ -52,4 +58,3 @@ utils = { path = "../libs/utils/" }
|
||||
metrics = { path = "../libs/metrics/" }
|
||||
control_plane = { path = "../control_plane" }
|
||||
workspace_hack = { version = "0.1", path = "../workspace_hack" }
|
||||
|
||||
|
||||
@@ -0,0 +1 @@
|
||||
DROP TABLE metadata_health;
|
||||
@@ -0,0 +1,14 @@
|
||||
CREATE TABLE metadata_health (
|
||||
tenant_id VARCHAR NOT NULL,
|
||||
shard_number INTEGER NOT NULL,
|
||||
shard_count INTEGER NOT NULL,
|
||||
PRIMARY KEY(tenant_id, shard_number, shard_count),
|
||||
-- Rely on cascade behavior for delete
|
||||
FOREIGN KEY(tenant_id, shard_number, shard_count) REFERENCES tenant_shards ON DELETE CASCADE,
|
||||
healthy BOOLEAN NOT NULL DEFAULT TRUE,
|
||||
last_scrubbed_at TIMESTAMPTZ NOT NULL DEFAULT NOW()
|
||||
);
|
||||
|
||||
|
||||
INSERT INTO metadata_health(tenant_id, shard_number, shard_count)
|
||||
SELECT tenant_id, shard_number, shard_count FROM tenant_shards;
|
||||
@@ -10,7 +10,11 @@ use hyper::header::CONTENT_TYPE;
|
||||
use hyper::{Body, Request, Response};
|
||||
use hyper::{StatusCode, Uri};
|
||||
use metrics::{BuildInfo, NeonMetrics};
|
||||
use pageserver_api::controller_api::TenantCreateRequest;
|
||||
use pageserver_api::controller_api::{
|
||||
MetadataHealthListOutdatedRequest, MetadataHealthListOutdatedResponse,
|
||||
MetadataHealthListUnhealthyResponse, MetadataHealthUpdateRequest, MetadataHealthUpdateResponse,
|
||||
TenantCreateRequest,
|
||||
};
|
||||
use pageserver_api::models::{
|
||||
TenantConfigRequest, TenantLocationConfigRequest, TenantShardSplitRequest,
|
||||
TenantTimeTravelRequest, TimelineCreateRequest,
|
||||
@@ -560,6 +564,51 @@ async fn handle_cancel_node_fill(req: Request<Body>) -> Result<Response<Body>, A
|
||||
json_response(StatusCode::ACCEPTED, ())
|
||||
}
|
||||
|
||||
async fn handle_metadata_health_update(mut req: Request<Body>) -> Result<Response<Body>, ApiError> {
|
||||
check_permissions(&req, Scope::Scrubber)?;
|
||||
|
||||
let update_req = json_request::<MetadataHealthUpdateRequest>(&mut req).await?;
|
||||
let state = get_state(&req);
|
||||
|
||||
state.service.metadata_health_update(update_req).await?;
|
||||
|
||||
json_response(StatusCode::OK, MetadataHealthUpdateResponse {})
|
||||
}
|
||||
|
||||
async fn handle_metadata_health_list_unhealthy(
|
||||
req: Request<Body>,
|
||||
) -> Result<Response<Body>, ApiError> {
|
||||
check_permissions(&req, Scope::Admin)?;
|
||||
|
||||
let state = get_state(&req);
|
||||
let unhealthy_tenant_shards = state.service.metadata_health_list_unhealthy().await?;
|
||||
|
||||
json_response(
|
||||
StatusCode::OK,
|
||||
MetadataHealthListUnhealthyResponse {
|
||||
unhealthy_tenant_shards,
|
||||
},
|
||||
)
|
||||
}
|
||||
|
||||
async fn handle_metadata_health_list_outdated(
|
||||
mut req: Request<Body>,
|
||||
) -> Result<Response<Body>, ApiError> {
|
||||
check_permissions(&req, Scope::Admin)?;
|
||||
|
||||
let list_outdated_req = json_request::<MetadataHealthListOutdatedRequest>(&mut req).await?;
|
||||
let state = get_state(&req);
|
||||
let health_records = state
|
||||
.service
|
||||
.metadata_health_list_outdated(list_outdated_req.not_scrubbed_for)
|
||||
.await?;
|
||||
|
||||
json_response(
|
||||
StatusCode::OK,
|
||||
MetadataHealthListOutdatedResponse { health_records },
|
||||
)
|
||||
}
|
||||
|
||||
async fn handle_tenant_shard_split(
|
||||
service: Arc<Service>,
|
||||
mut req: Request<Body>,
|
||||
@@ -987,6 +1036,28 @@ pub fn make_router(
|
||||
RequestName("control_v1_cancel_node_fill"),
|
||||
)
|
||||
})
|
||||
// Metadata health operations
|
||||
.post("/control/v1/metadata_health/update", |r| {
|
||||
named_request_span(
|
||||
r,
|
||||
handle_metadata_health_update,
|
||||
RequestName("control_v1_metadata_health_update"),
|
||||
)
|
||||
})
|
||||
.get("/control/v1/metadata_health/unhealthy", |r| {
|
||||
named_request_span(
|
||||
r,
|
||||
handle_metadata_health_list_unhealthy,
|
||||
RequestName("control_v1_metadata_health_list_unhealthy"),
|
||||
)
|
||||
})
|
||||
.post("/control/v1/metadata_health/outdated", |r| {
|
||||
named_request_span(
|
||||
r,
|
||||
handle_metadata_health_list_outdated,
|
||||
RequestName("control_v1_metadata_health_list_outdated"),
|
||||
)
|
||||
})
|
||||
// TODO(vlad): endpoint for cancelling drain and fill
|
||||
// Tenant Shard operations
|
||||
.put("/control/v1/tenant/:tenant_shard_id/migrate", |r| {
|
||||
|
||||
@@ -8,6 +8,7 @@ use self::split_state::SplitState;
|
||||
use diesel::pg::PgConnection;
|
||||
use diesel::prelude::*;
|
||||
use diesel::Connection;
|
||||
use pageserver_api::controller_api::MetadataHealthRecord;
|
||||
use pageserver_api::controller_api::ShardSchedulingPolicy;
|
||||
use pageserver_api::controller_api::{NodeSchedulingPolicy, PlacementPolicy};
|
||||
use pageserver_api::models::TenantConfig;
|
||||
@@ -90,6 +91,10 @@ pub(crate) enum DatabaseOperation {
|
||||
UpdateTenantShard,
|
||||
DeleteTenant,
|
||||
UpdateTenantConfig,
|
||||
UpdateMetadataHealth,
|
||||
ListMetadataHealth,
|
||||
ListMetadataHealthUnhealthy,
|
||||
ListMetadataHealthOutdated,
|
||||
}
|
||||
|
||||
#[must_use]
|
||||
@@ -307,15 +312,32 @@ impl Persistence {
|
||||
&self,
|
||||
shards: Vec<TenantShardPersistence>,
|
||||
) -> DatabaseResult<()> {
|
||||
use crate::schema::tenant_shards::dsl::*;
|
||||
use crate::schema::metadata_health;
|
||||
use crate::schema::tenant_shards;
|
||||
|
||||
let now = chrono::Utc::now();
|
||||
|
||||
let metadata_health_records = shards
|
||||
.iter()
|
||||
.map(|t| MetadataHealthPersistence {
|
||||
tenant_id: t.tenant_id.clone(),
|
||||
shard_number: t.shard_number,
|
||||
shard_count: t.shard_count,
|
||||
healthy: true,
|
||||
last_scrubbed_at: now,
|
||||
})
|
||||
.collect::<Vec<_>>();
|
||||
|
||||
self.with_measured_conn(
|
||||
DatabaseOperation::InsertTenantShards,
|
||||
move |conn| -> DatabaseResult<()> {
|
||||
for tenant in &shards {
|
||||
diesel::insert_into(tenant_shards)
|
||||
.values(tenant)
|
||||
.execute(conn)?;
|
||||
}
|
||||
diesel::insert_into(tenant_shards::table)
|
||||
.values(&shards)
|
||||
.execute(conn)?;
|
||||
|
||||
diesel::insert_into(metadata_health::table)
|
||||
.values(&metadata_health_records)
|
||||
.execute(conn)?;
|
||||
Ok(())
|
||||
},
|
||||
)
|
||||
@@ -329,10 +351,10 @@ impl Persistence {
|
||||
self.with_measured_conn(
|
||||
DatabaseOperation::DeleteTenant,
|
||||
move |conn| -> DatabaseResult<()> {
|
||||
// `metadata_health` status (if exists) is also deleted based on the cascade behavior.
|
||||
diesel::delete(tenant_shards)
|
||||
.filter(tenant_id.eq(del_tenant_id.to_string()))
|
||||
.execute(conn)?;
|
||||
|
||||
Ok(())
|
||||
},
|
||||
)
|
||||
@@ -675,6 +697,94 @@ impl Persistence {
|
||||
)
|
||||
.await
|
||||
}
|
||||
|
||||
/// Stores all the latest metadata health updates durably. Updates existing entry on conflict.
|
||||
///
|
||||
/// **Correctness:** `metadata_health_updates` should all belong the tenant shards managed by the storage controller.
|
||||
#[allow(dead_code)]
|
||||
pub(crate) async fn update_metadata_health_records(
|
||||
&self,
|
||||
healthy_records: Vec<MetadataHealthPersistence>,
|
||||
unhealthy_records: Vec<MetadataHealthPersistence>,
|
||||
now: chrono::DateTime<chrono::Utc>,
|
||||
) -> DatabaseResult<()> {
|
||||
use crate::schema::metadata_health::dsl::*;
|
||||
|
||||
self.with_measured_conn(
|
||||
DatabaseOperation::UpdateMetadataHealth,
|
||||
move |conn| -> DatabaseResult<_> {
|
||||
diesel::insert_into(metadata_health)
|
||||
.values(&healthy_records)
|
||||
.on_conflict((tenant_id, shard_number, shard_count))
|
||||
.do_update()
|
||||
.set((healthy.eq(true), last_scrubbed_at.eq(now)))
|
||||
.execute(conn)?;
|
||||
|
||||
diesel::insert_into(metadata_health)
|
||||
.values(&unhealthy_records)
|
||||
.on_conflict((tenant_id, shard_number, shard_count))
|
||||
.do_update()
|
||||
.set((healthy.eq(false), last_scrubbed_at.eq(now)))
|
||||
.execute(conn)?;
|
||||
Ok(())
|
||||
},
|
||||
)
|
||||
.await
|
||||
}
|
||||
|
||||
/// Lists all the metadata health records.
|
||||
#[allow(dead_code)]
|
||||
pub(crate) async fn list_metadata_health_records(
|
||||
&self,
|
||||
) -> DatabaseResult<Vec<MetadataHealthPersistence>> {
|
||||
self.with_measured_conn(
|
||||
DatabaseOperation::ListMetadataHealth,
|
||||
move |conn| -> DatabaseResult<_> {
|
||||
Ok(
|
||||
crate::schema::metadata_health::table
|
||||
.load::<MetadataHealthPersistence>(conn)?,
|
||||
)
|
||||
},
|
||||
)
|
||||
.await
|
||||
}
|
||||
|
||||
/// Lists all the metadata health records that is unhealthy.
|
||||
#[allow(dead_code)]
|
||||
pub(crate) async fn list_unhealthy_metadata_health_records(
|
||||
&self,
|
||||
) -> DatabaseResult<Vec<MetadataHealthPersistence>> {
|
||||
use crate::schema::metadata_health::dsl::*;
|
||||
self.with_measured_conn(
|
||||
DatabaseOperation::ListMetadataHealthUnhealthy,
|
||||
move |conn| -> DatabaseResult<_> {
|
||||
Ok(crate::schema::metadata_health::table
|
||||
.filter(healthy.eq(false))
|
||||
.load::<MetadataHealthPersistence>(conn)?)
|
||||
},
|
||||
)
|
||||
.await
|
||||
}
|
||||
|
||||
/// Lists all the metadata health records that have not been updated since an `earlier` time.
|
||||
#[allow(dead_code)]
|
||||
pub(crate) async fn list_outdated_metadata_health_records(
|
||||
&self,
|
||||
earlier: chrono::DateTime<chrono::Utc>,
|
||||
) -> DatabaseResult<Vec<MetadataHealthPersistence>> {
|
||||
use crate::schema::metadata_health::dsl::*;
|
||||
|
||||
self.with_measured_conn(
|
||||
DatabaseOperation::ListMetadataHealthOutdated,
|
||||
move |conn| -> DatabaseResult<_> {
|
||||
let query = metadata_health.filter(last_scrubbed_at.lt(earlier));
|
||||
let res = query.load::<MetadataHealthPersistence>(conn)?;
|
||||
|
||||
Ok(res)
|
||||
},
|
||||
)
|
||||
.await
|
||||
}
|
||||
}
|
||||
|
||||
/// Parts of [`crate::tenant_shard::TenantShard`] that are stored durably
|
||||
@@ -744,3 +854,59 @@ pub(crate) struct NodePersistence {
|
||||
pub(crate) listen_pg_addr: String,
|
||||
pub(crate) listen_pg_port: i32,
|
||||
}
|
||||
|
||||
/// Tenant metadata health status that are stored durably.
|
||||
#[derive(Queryable, Selectable, Insertable, Serialize, Deserialize, Clone, Eq, PartialEq)]
|
||||
#[diesel(table_name = crate::schema::metadata_health)]
|
||||
pub(crate) struct MetadataHealthPersistence {
|
||||
#[serde(default)]
|
||||
pub(crate) tenant_id: String,
|
||||
#[serde(default)]
|
||||
pub(crate) shard_number: i32,
|
||||
#[serde(default)]
|
||||
pub(crate) shard_count: i32,
|
||||
|
||||
pub(crate) healthy: bool,
|
||||
pub(crate) last_scrubbed_at: chrono::DateTime<chrono::Utc>,
|
||||
}
|
||||
|
||||
impl MetadataHealthPersistence {
|
||||
pub fn new(
|
||||
tenant_shard_id: TenantShardId,
|
||||
healthy: bool,
|
||||
last_scrubbed_at: chrono::DateTime<chrono::Utc>,
|
||||
) -> Self {
|
||||
let tenant_id = tenant_shard_id.tenant_id.to_string();
|
||||
let shard_number = tenant_shard_id.shard_number.0 as i32;
|
||||
let shard_count = tenant_shard_id.shard_count.literal() as i32;
|
||||
|
||||
MetadataHealthPersistence {
|
||||
tenant_id,
|
||||
shard_number,
|
||||
shard_count,
|
||||
healthy,
|
||||
last_scrubbed_at,
|
||||
}
|
||||
}
|
||||
|
||||
#[allow(dead_code)]
|
||||
pub(crate) fn get_tenant_shard_id(&self) -> Result<TenantShardId, hex::FromHexError> {
|
||||
Ok(TenantShardId {
|
||||
tenant_id: TenantId::from_str(self.tenant_id.as_str())?,
|
||||
shard_number: ShardNumber(self.shard_number as u8),
|
||||
shard_count: ShardCount::new(self.shard_count as u8),
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
impl From<MetadataHealthPersistence> for MetadataHealthRecord {
|
||||
fn from(value: MetadataHealthPersistence) -> Self {
|
||||
MetadataHealthRecord {
|
||||
tenant_shard_id: value
|
||||
.get_tenant_shard_id()
|
||||
.expect("stored tenant id should be valid"),
|
||||
healthy: value.healthy,
|
||||
last_scrubbed_at: value.last_scrubbed_at,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1,5 +1,15 @@
|
||||
// @generated automatically by Diesel CLI.
|
||||
|
||||
diesel::table! {
|
||||
metadata_health (tenant_id, shard_number, shard_count) {
|
||||
tenant_id -> Varchar,
|
||||
shard_number -> Int4,
|
||||
shard_count -> Int4,
|
||||
healthy -> Bool,
|
||||
last_scrubbed_at -> Timestamptz,
|
||||
}
|
||||
}
|
||||
|
||||
diesel::table! {
|
||||
nodes (node_id) {
|
||||
node_id -> Int8,
|
||||
@@ -26,4 +36,4 @@ diesel::table! {
|
||||
}
|
||||
}
|
||||
|
||||
diesel::allow_tables_to_appear_in_same_query!(nodes, tenant_shards,);
|
||||
diesel::allow_tables_to_appear_in_same_query!(metadata_health, nodes, tenant_shards,);
|
||||
|
||||
@@ -16,7 +16,7 @@ use crate::{
|
||||
compute_hook::NotifyError,
|
||||
id_lock_map::{trace_exclusive_lock, trace_shared_lock, IdLockMap, TracingExclusiveGuard},
|
||||
metrics::LeadershipStatusGroup,
|
||||
persistence::{AbortShardSplitStatus, TenantFilter},
|
||||
persistence::{AbortShardSplitStatus, MetadataHealthPersistence, TenantFilter},
|
||||
reconciler::{ReconcileError, ReconcileUnits},
|
||||
scheduler::{MaySchedule, ScheduleContext, ScheduleMode},
|
||||
tenant_shard::{
|
||||
@@ -33,11 +33,11 @@ use futures::{stream::FuturesUnordered, StreamExt};
|
||||
use itertools::Itertools;
|
||||
use pageserver_api::{
|
||||
controller_api::{
|
||||
NodeAvailability, NodeRegisterRequest, NodeSchedulingPolicy, PlacementPolicy,
|
||||
ShardSchedulingPolicy, TenantCreateRequest, TenantCreateResponse,
|
||||
TenantCreateResponseShard, TenantDescribeResponse, TenantDescribeResponseShard,
|
||||
TenantLocateResponse, TenantPolicyRequest, TenantShardMigrateRequest,
|
||||
TenantShardMigrateResponse, UtilizationScore,
|
||||
MetadataHealthRecord, MetadataHealthUpdateRequest, NodeAvailability, NodeRegisterRequest,
|
||||
NodeSchedulingPolicy, PlacementPolicy, ShardSchedulingPolicy, TenantCreateRequest,
|
||||
TenantCreateResponse, TenantCreateResponseShard, TenantDescribeResponse,
|
||||
TenantDescribeResponseShard, TenantLocateResponse, TenantPolicyRequest,
|
||||
TenantShardMigrateRequest, TenantShardMigrateResponse, UtilizationScore,
|
||||
},
|
||||
models::{SecondaryProgress, TenantConfigRequest, TopTenantShardsRequest},
|
||||
};
|
||||
@@ -6095,6 +6095,68 @@ impl Service {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Updates scrubber metadata health check results.
|
||||
pub(crate) async fn metadata_health_update(
|
||||
&self,
|
||||
update_req: MetadataHealthUpdateRequest,
|
||||
) -> Result<(), ApiError> {
|
||||
let now = chrono::offset::Utc::now();
|
||||
let (healthy_records, unhealthy_records) = {
|
||||
let locked = self.inner.read().unwrap();
|
||||
let healthy_records = update_req
|
||||
.healthy_tenant_shards
|
||||
.into_iter()
|
||||
// Retain only health records associated with tenant shards managed by storage controller.
|
||||
.filter(|tenant_shard_id| locked.tenants.contains_key(tenant_shard_id))
|
||||
.map(|tenant_shard_id| MetadataHealthPersistence::new(tenant_shard_id, true, now))
|
||||
.collect();
|
||||
let unhealthy_records = update_req
|
||||
.unhealthy_tenant_shards
|
||||
.into_iter()
|
||||
.filter(|tenant_shard_id| locked.tenants.contains_key(tenant_shard_id))
|
||||
.map(|tenant_shard_id| MetadataHealthPersistence::new(tenant_shard_id, false, now))
|
||||
.collect();
|
||||
|
||||
(healthy_records, unhealthy_records)
|
||||
};
|
||||
|
||||
self.persistence
|
||||
.update_metadata_health_records(healthy_records, unhealthy_records, now)
|
||||
.await?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Lists the tenant shards that has unhealthy metadata status.
|
||||
pub(crate) async fn metadata_health_list_unhealthy(
|
||||
&self,
|
||||
) -> Result<Vec<TenantShardId>, ApiError> {
|
||||
let result = self
|
||||
.persistence
|
||||
.list_unhealthy_metadata_health_records()
|
||||
.await?
|
||||
.iter()
|
||||
.map(|p| p.get_tenant_shard_id().unwrap())
|
||||
.collect();
|
||||
|
||||
Ok(result)
|
||||
}
|
||||
|
||||
/// Lists the tenant shards that have not been scrubbed for some duration.
|
||||
pub(crate) async fn metadata_health_list_outdated(
|
||||
&self,
|
||||
not_scrubbed_for: Duration,
|
||||
) -> Result<Vec<MetadataHealthRecord>, ApiError> {
|
||||
let earlier = chrono::offset::Utc::now() - not_scrubbed_for;
|
||||
let result = self
|
||||
.persistence
|
||||
.list_outdated_metadata_health_records(earlier)
|
||||
.await?
|
||||
.into_iter()
|
||||
.map(|record| record.into())
|
||||
.collect();
|
||||
Ok(result)
|
||||
}
|
||||
|
||||
pub(crate) fn get_leadership_status(&self) -> LeadershipStatus {
|
||||
self.inner.read().unwrap().get_leadership_status()
|
||||
}
|
||||
|
||||
@@ -40,6 +40,11 @@ impl TimelineAnalysis {
|
||||
garbage_keys: Vec::new(),
|
||||
}
|
||||
}
|
||||
|
||||
/// Whether a timeline is healthy.
|
||||
pub(crate) fn is_healthy(&self) -> bool {
|
||||
self.errors.is_empty() && self.warnings.is_empty()
|
||||
}
|
||||
}
|
||||
|
||||
pub(crate) async fn branch_cleanup_and_check_errors(
|
||||
|
||||
@@ -1,10 +1,13 @@
|
||||
use std::pin::pin;
|
||||
|
||||
use futures::{StreamExt, TryStreamExt};
|
||||
use pageserver::tenant::storage_layer::LayerName;
|
||||
use remote_storage::ListingMode;
|
||||
use serde::{Deserialize, Serialize};
|
||||
|
||||
use crate::{
|
||||
checks::parse_layer_object_name, init_remote, list_objects_with_retries,
|
||||
metadata_stream::stream_tenants, BucketConfig, NodeKind,
|
||||
checks::parse_layer_object_name, init_remote_generic, metadata_stream::stream_tenants_generic,
|
||||
stream_objects_with_retries, BucketConfig, NodeKind,
|
||||
};
|
||||
|
||||
#[derive(Serialize, Deserialize, Clone, Copy, PartialEq, Eq)]
|
||||
@@ -47,45 +50,38 @@ pub async fn find_large_objects(
|
||||
ignore_deltas: bool,
|
||||
concurrency: usize,
|
||||
) -> anyhow::Result<LargeObjectListing> {
|
||||
let (s3_client, target) = init_remote(bucket_config.clone(), NodeKind::Pageserver).await?;
|
||||
let tenants = std::pin::pin!(stream_tenants(&s3_client, &target));
|
||||
let (remote_client, target) =
|
||||
init_remote_generic(bucket_config.clone(), NodeKind::Pageserver).await?;
|
||||
let tenants = pin!(stream_tenants_generic(&remote_client, &target));
|
||||
|
||||
let objects_stream = tenants.map_ok(|tenant_shard_id| {
|
||||
let mut tenant_root = target.tenant_root(&tenant_shard_id);
|
||||
let s3_client = s3_client.clone();
|
||||
let remote_client = remote_client.clone();
|
||||
async move {
|
||||
let mut objects = Vec::new();
|
||||
let mut total_objects_ctr = 0u64;
|
||||
// We want the objects and not just common prefixes
|
||||
tenant_root.delimiter.clear();
|
||||
let mut continuation_token = None;
|
||||
loop {
|
||||
let fetch_response =
|
||||
list_objects_with_retries(&s3_client, &tenant_root, continuation_token.clone())
|
||||
.await?;
|
||||
for obj in fetch_response.contents().iter().filter(|o| {
|
||||
if let Some(obj_size) = o.size {
|
||||
min_size as i64 <= obj_size
|
||||
} else {
|
||||
false
|
||||
}
|
||||
}) {
|
||||
let key = obj.key().expect("couldn't get key").to_owned();
|
||||
let mut objects_stream = pin!(stream_objects_with_retries(
|
||||
&remote_client,
|
||||
ListingMode::NoDelimiter,
|
||||
&tenant_root
|
||||
));
|
||||
while let Some(listing) = objects_stream.next().await {
|
||||
let listing = listing?;
|
||||
for obj in listing.keys.iter().filter(|obj| min_size <= obj.size) {
|
||||
let key = obj.key.to_string();
|
||||
let kind = LargeObjectKind::from_key(&key);
|
||||
if ignore_deltas && kind == LargeObjectKind::DeltaLayer {
|
||||
continue;
|
||||
}
|
||||
objects.push(LargeObject {
|
||||
key,
|
||||
size: obj.size.unwrap() as u64,
|
||||
size: obj.size,
|
||||
kind,
|
||||
})
|
||||
}
|
||||
total_objects_ctr += fetch_response.contents().len() as u64;
|
||||
match fetch_response.next_continuation_token {
|
||||
Some(new_token) => continuation_token = Some(new_token),
|
||||
None => break,
|
||||
}
|
||||
total_objects_ctr += listing.keys.len() as u64;
|
||||
}
|
||||
|
||||
Ok((tenant_shard_id, objects, total_objects_ctr))
|
||||
|
||||
@@ -5,6 +5,7 @@
|
||||
use std::{
|
||||
collections::{HashMap, HashSet},
|
||||
sync::Arc,
|
||||
time::Duration,
|
||||
};
|
||||
|
||||
use anyhow::Context;
|
||||
@@ -18,7 +19,7 @@ use utils::id::TenantId;
|
||||
|
||||
use crate::{
|
||||
cloud_admin_api::{CloudAdminApiClient, MaybeDeleted, ProjectData},
|
||||
init_remote, init_remote_generic,
|
||||
init_remote, init_remote_generic, list_objects_with_retries,
|
||||
metadata_stream::{stream_tenant_timelines, stream_tenants},
|
||||
BucketConfig, ConsoleConfig, NodeKind, TenantShardTimelineId, TraversingDepth,
|
||||
};
|
||||
@@ -27,6 +28,11 @@ use crate::{
|
||||
enum GarbageReason {
|
||||
DeletedInConsole,
|
||||
MissingInConsole,
|
||||
|
||||
// The remaining data relates to a known deletion issue, and we're sure that purging this
|
||||
// will not delete any real data, for example https://github.com/neondatabase/neon/pull/7928 where
|
||||
// there is nothing in a tenant path apart from a heatmap file.
|
||||
KnownBug,
|
||||
}
|
||||
|
||||
#[derive(Serialize, Deserialize, Debug)]
|
||||
@@ -72,6 +78,15 @@ impl GarbageList {
|
||||
}
|
||||
}
|
||||
|
||||
/// If an entity has been identified as requiring purge due to a known bug, e.g.
|
||||
/// a particular type of object left behind after an incomplete deletion.
|
||||
fn append_buggy(&mut self, entity: GarbageEntity) {
|
||||
self.items.push(GarbageItem {
|
||||
entity,
|
||||
reason: GarbageReason::KnownBug,
|
||||
});
|
||||
}
|
||||
|
||||
/// Return true if appended, false if not. False means the result was not garbage.
|
||||
fn maybe_append<T>(&mut self, entity: GarbageEntity, result: Option<T>) -> bool
|
||||
where
|
||||
@@ -219,6 +234,71 @@ async fn find_garbage_inner(
|
||||
assert!(project.tenant == tenant_shard_id.tenant_id);
|
||||
}
|
||||
|
||||
// Special case: If it's missing in console, check for known bugs that would enable us to conclusively
|
||||
// identify it as purge-able anyway
|
||||
if console_result.is_none() {
|
||||
let timelines = stream_tenant_timelines(&s3_client, &target, tenant_shard_id)
|
||||
.await?
|
||||
.collect::<Vec<_>>()
|
||||
.await;
|
||||
if timelines.is_empty() {
|
||||
// No timelines, but a heatmap: the deletion bug where we deleted everything but heatmaps
|
||||
let tenant_objects = list_objects_with_retries(
|
||||
&s3_client,
|
||||
&target.tenant_root(&tenant_shard_id),
|
||||
None,
|
||||
)
|
||||
.await?;
|
||||
let object = tenant_objects.contents.as_ref().unwrap().first().unwrap();
|
||||
if object.key.as_ref().unwrap().ends_with("heatmap-v1.json") {
|
||||
tracing::info!("Tenant {tenant_shard_id}: is missing in console and is only a heatmap (known historic deletion bug)");
|
||||
garbage.append_buggy(GarbageEntity::Tenant(tenant_shard_id));
|
||||
continue;
|
||||
} else {
|
||||
tracing::info!("Tenant {tenant_shard_id} is missing in console and contains one object: {}", object.key.as_ref().unwrap());
|
||||
}
|
||||
} else {
|
||||
// A console-unknown tenant with timelines: check if these timelines only contain initdb.tar.zst, from the initial
|
||||
// rollout of WAL DR in which we never deleted these.
|
||||
let mut any_non_initdb = false;
|
||||
|
||||
for timeline_r in timelines {
|
||||
let timeline = timeline_r?;
|
||||
let timeline_objects = list_objects_with_retries(
|
||||
&s3_client,
|
||||
&target.timeline_root(&timeline),
|
||||
None,
|
||||
)
|
||||
.await?;
|
||||
if timeline_objects
|
||||
.common_prefixes
|
||||
.as_ref()
|
||||
.map(|v| v.len())
|
||||
.unwrap_or(0)
|
||||
> 0
|
||||
{
|
||||
// Sub-paths? Unexpected
|
||||
any_non_initdb = true;
|
||||
} else {
|
||||
let object = timeline_objects.contents.as_ref().unwrap().first().unwrap();
|
||||
if object.key.as_ref().unwrap().ends_with("initdb.tar.zst") {
|
||||
tracing::info!("Timeline {timeline} contains only initdb.tar.zst");
|
||||
} else {
|
||||
any_non_initdb = true;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if any_non_initdb {
|
||||
tracing::info!("Tenant {tenant_shard_id}: is missing in console and contains timelines, one or more of which are more than just initdb");
|
||||
} else {
|
||||
tracing::info!("Tenant {tenant_shard_id}: is missing in console and contains only timelines that only contain initdb");
|
||||
garbage.append_buggy(GarbageEntity::Tenant(tenant_shard_id));
|
||||
continue;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if garbage.maybe_append(GarbageEntity::Tenant(tenant_shard_id), console_result) {
|
||||
tracing::debug!("Tenant {tenant_shard_id} is garbage");
|
||||
} else {
|
||||
@@ -349,9 +429,6 @@ pub async fn get_timeline_objects(
|
||||
tracing::debug!("Listing objects in timeline {ttid}");
|
||||
let timeline_root = super::remote_timeline_path_id(&ttid);
|
||||
|
||||
// TODO: apply extra validation based on object modification time. Don't purge
|
||||
// timelines whose index_part.json has been touched recently.
|
||||
|
||||
let list = s3_client
|
||||
.list(
|
||||
Some(&timeline_root),
|
||||
@@ -422,6 +499,7 @@ impl DeletionProgressTracker {
|
||||
pub async fn purge_garbage(
|
||||
input_path: String,
|
||||
mode: PurgeMode,
|
||||
min_age: Duration,
|
||||
dry_run: bool,
|
||||
) -> anyhow::Result<()> {
|
||||
let list_bytes = tokio::fs::read(&input_path).await?;
|
||||
@@ -432,7 +510,7 @@ pub async fn purge_garbage(
|
||||
input_path
|
||||
);
|
||||
|
||||
let remote_client =
|
||||
let (remote_client, _target) =
|
||||
init_remote_generic(garbage_list.bucket_config.clone(), garbage_list.node_kind).await?;
|
||||
|
||||
assert_eq!(
|
||||
@@ -459,6 +537,7 @@ pub async fn purge_garbage(
|
||||
.filter(|i| match (&mode, &i.reason) {
|
||||
(PurgeMode::DeletedAndMissing, _) => true,
|
||||
(PurgeMode::DeletedOnly, GarbageReason::DeletedInConsole) => true,
|
||||
(PurgeMode::DeletedOnly, GarbageReason::KnownBug) => true,
|
||||
(PurgeMode::DeletedOnly, GarbageReason::MissingInConsole) => false,
|
||||
});
|
||||
|
||||
@@ -487,6 +566,37 @@ pub async fn purge_garbage(
|
||||
let mut progress_tracker = DeletionProgressTracker::default();
|
||||
while let Some(result) = get_objects_results.next().await {
|
||||
let mut object_list = result?;
|
||||
|
||||
// Extra safety check: even if a collection of objects is garbage, check max() of modification
|
||||
// times before purging, so that if we incorrectly marked a live tenant as garbage then we would
|
||||
// notice that its index has been written recently and would omit deleting it.
|
||||
if object_list.is_empty() {
|
||||
// Simplify subsequent code by ensuring list always has at least one item
|
||||
// Usually, this only occurs if there is parallel deletions racing us, as there is no empty prefixes
|
||||
continue;
|
||||
}
|
||||
let max_mtime = object_list.iter().map(|o| o.last_modified).max().unwrap();
|
||||
let age = max_mtime.elapsed();
|
||||
match age {
|
||||
Err(_) => {
|
||||
tracing::warn!("Bad last_modified time");
|
||||
continue;
|
||||
}
|
||||
Ok(a) if a < min_age => {
|
||||
// Failed age check. This doesn't mean we did something wrong: a tenant might really be garbage and recently
|
||||
// written, but out of an abundance of caution we still don't purge it.
|
||||
tracing::info!(
|
||||
"Skipping tenant with young objects {}..{}",
|
||||
object_list.first().as_ref().unwrap().key,
|
||||
object_list.last().as_ref().unwrap().key
|
||||
);
|
||||
continue;
|
||||
}
|
||||
Ok(_) => {
|
||||
// Passed age check
|
||||
}
|
||||
}
|
||||
|
||||
objects_to_delete.append(&mut object_list);
|
||||
if objects_to_delete.len() >= MAX_KEYS_PER_DELETE {
|
||||
do_delete(
|
||||
|
||||
@@ -22,16 +22,19 @@ use aws_sdk_s3::Client;
|
||||
|
||||
use camino::{Utf8Path, Utf8PathBuf};
|
||||
use clap::ValueEnum;
|
||||
use futures::{Stream, StreamExt};
|
||||
use pageserver::tenant::remote_timeline_client::{remote_tenant_path, remote_timeline_path};
|
||||
use pageserver::tenant::TENANTS_SEGMENT_NAME;
|
||||
use pageserver_api::shard::TenantShardId;
|
||||
use remote_storage::{
|
||||
GenericRemoteStorage, RemotePath, RemoteStorageConfig, RemoteStorageKind, S3Config,
|
||||
DEFAULT_MAX_KEYS_PER_LIST_RESPONSE, DEFAULT_REMOTE_STORAGE_S3_CONCURRENCY_LIMIT,
|
||||
GenericRemoteStorage, Listing, ListingMode, RemotePath, RemoteStorageConfig, RemoteStorageKind,
|
||||
S3Config, DEFAULT_MAX_KEYS_PER_LIST_RESPONSE, DEFAULT_REMOTE_STORAGE_S3_CONCURRENCY_LIMIT,
|
||||
};
|
||||
use reqwest::Url;
|
||||
use serde::{Deserialize, Serialize};
|
||||
use storage_controller_client::control_api;
|
||||
use tokio::io::AsyncReadExt;
|
||||
use tokio_util::sync::CancellationToken;
|
||||
use tracing::error;
|
||||
use tracing_appender::non_blocking::WorkerGuard;
|
||||
use tracing_subscriber::{fmt, prelude::*, EnvFilter};
|
||||
@@ -253,6 +256,12 @@ pub struct ControllerClientConfig {
|
||||
pub controller_jwt: String,
|
||||
}
|
||||
|
||||
impl ControllerClientConfig {
|
||||
pub fn build_client(self) -> control_api::Client {
|
||||
control_api::Client::new(self.controller_api, Some(self.controller_jwt))
|
||||
}
|
||||
}
|
||||
|
||||
pub struct ConsoleConfig {
|
||||
pub token: String,
|
||||
pub base_url: Url,
|
||||
@@ -319,27 +328,35 @@ fn default_prefix_in_bucket(node_kind: NodeKind) -> &'static str {
|
||||
}
|
||||
}
|
||||
|
||||
fn make_root_target(
|
||||
bucket_name: String,
|
||||
prefix_in_bucket: String,
|
||||
node_kind: NodeKind,
|
||||
) -> RootTarget {
|
||||
let s3_target = S3Target {
|
||||
bucket_name,
|
||||
prefix_in_bucket,
|
||||
delimiter: "/".to_string(),
|
||||
};
|
||||
match node_kind {
|
||||
NodeKind::Pageserver => RootTarget::Pageserver(s3_target),
|
||||
NodeKind::Safekeeper => RootTarget::Safekeeper(s3_target),
|
||||
}
|
||||
}
|
||||
|
||||
async fn init_remote(
|
||||
bucket_config: BucketConfig,
|
||||
node_kind: NodeKind,
|
||||
) -> anyhow::Result<(Arc<Client>, RootTarget)> {
|
||||
let bucket_region = Region::new(bucket_config.region);
|
||||
let delimiter = "/".to_string();
|
||||
let s3_client = Arc::new(init_s3_client(bucket_region).await);
|
||||
let default_prefix = default_prefix_in_bucket(node_kind).to_string();
|
||||
|
||||
let s3_root = match node_kind {
|
||||
NodeKind::Pageserver => RootTarget::Pageserver(S3Target {
|
||||
bucket_name: bucket_config.bucket,
|
||||
prefix_in_bucket: bucket_config.prefix_in_bucket.unwrap_or(default_prefix),
|
||||
delimiter,
|
||||
}),
|
||||
NodeKind::Safekeeper => RootTarget::Safekeeper(S3Target {
|
||||
bucket_name: bucket_config.bucket,
|
||||
prefix_in_bucket: bucket_config.prefix_in_bucket.unwrap_or(default_prefix),
|
||||
delimiter,
|
||||
}),
|
||||
};
|
||||
let s3_root = make_root_target(
|
||||
bucket_config.bucket,
|
||||
bucket_config.prefix_in_bucket.unwrap_or(default_prefix),
|
||||
node_kind,
|
||||
);
|
||||
|
||||
Ok((s3_client, s3_root))
|
||||
}
|
||||
@@ -347,12 +364,12 @@ async fn init_remote(
|
||||
async fn init_remote_generic(
|
||||
bucket_config: BucketConfig,
|
||||
node_kind: NodeKind,
|
||||
) -> anyhow::Result<GenericRemoteStorage> {
|
||||
) -> anyhow::Result<(GenericRemoteStorage, RootTarget)> {
|
||||
let endpoint = env::var("AWS_ENDPOINT_URL").ok();
|
||||
let default_prefix = default_prefix_in_bucket(node_kind).to_string();
|
||||
let prefix_in_bucket = Some(bucket_config.prefix_in_bucket.unwrap_or(default_prefix));
|
||||
let storage = S3Config {
|
||||
bucket_name: bucket_config.bucket,
|
||||
bucket_name: bucket_config.bucket.clone(),
|
||||
bucket_region: bucket_config.region,
|
||||
prefix_in_bucket,
|
||||
endpoint,
|
||||
@@ -366,7 +383,13 @@ async fn init_remote_generic(
|
||||
storage: RemoteStorageKind::AwsS3(storage),
|
||||
timeout: RemoteStorageConfig::DEFAULT_TIMEOUT,
|
||||
};
|
||||
GenericRemoteStorage::from_config(&storage_config).await
|
||||
|
||||
// We already pass the prefix to the remote client above
|
||||
let prefix_in_root_target = String::new();
|
||||
let s3_root = make_root_target(bucket_config.bucket, prefix_in_root_target, node_kind);
|
||||
|
||||
let client = GenericRemoteStorage::from_config(&storage_config).await?;
|
||||
Ok((client, s3_root))
|
||||
}
|
||||
|
||||
async fn list_objects_with_retries(
|
||||
@@ -404,6 +427,44 @@ async fn list_objects_with_retries(
|
||||
Err(anyhow!("unreachable unless MAX_RETRIES==0"))
|
||||
}
|
||||
|
||||
fn stream_objects_with_retries<'a>(
|
||||
storage_client: &'a GenericRemoteStorage,
|
||||
listing_mode: ListingMode,
|
||||
s3_target: &'a S3Target,
|
||||
) -> impl Stream<Item = Result<Listing, anyhow::Error>> + 'a {
|
||||
async_stream::stream! {
|
||||
let mut trial = 0;
|
||||
let cancel = CancellationToken::new();
|
||||
let prefix_str = &s3_target
|
||||
.prefix_in_bucket
|
||||
.strip_prefix("/")
|
||||
.unwrap_or(&s3_target.prefix_in_bucket);
|
||||
let prefix = RemotePath::from_string(prefix_str)?;
|
||||
let mut list_stream =
|
||||
storage_client.list_streaming(Some(&prefix), listing_mode, None, &cancel);
|
||||
while let Some(res) = list_stream.next().await {
|
||||
if let Err(err) = res {
|
||||
let yield_err = if err.is_permanent() {
|
||||
true
|
||||
} else {
|
||||
let backoff_time = 1 << trial.max(5);
|
||||
tokio::time::sleep(Duration::from_secs(backoff_time)).await;
|
||||
trial += 1;
|
||||
trial == MAX_RETRIES - 1
|
||||
};
|
||||
if yield_err {
|
||||
yield Err(err)
|
||||
.with_context(|| format!("Failed to list objects {MAX_RETRIES} times"));
|
||||
break;
|
||||
}
|
||||
} else {
|
||||
trial = 0;
|
||||
yield res.map_err(anyhow::Error::from);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
async fn download_object_with_retries(
|
||||
s3_client: &Client,
|
||||
bucket_name: &str,
|
||||
|
||||
@@ -1,7 +1,8 @@
|
||||
use anyhow::{anyhow, bail};
|
||||
use camino::Utf8PathBuf;
|
||||
use pageserver_api::controller_api::{MetadataHealthUpdateRequest, MetadataHealthUpdateResponse};
|
||||
use pageserver_api::shard::TenantShardId;
|
||||
use reqwest::Url;
|
||||
use reqwest::{Method, Url};
|
||||
use storage_scrubber::garbage::{find_garbage, purge_garbage, PurgeMode};
|
||||
use storage_scrubber::pageserver_physical_gc::GcMode;
|
||||
use storage_scrubber::scan_pageserver_metadata::scan_metadata;
|
||||
@@ -50,6 +51,8 @@ enum Command {
|
||||
input_path: String,
|
||||
#[arg(short, long, default_value_t = PurgeMode::DeletedOnly)]
|
||||
mode: PurgeMode,
|
||||
#[arg(long = "min-age")]
|
||||
min_age: humantime::Duration,
|
||||
},
|
||||
#[command(verbatim_doc_comment)]
|
||||
ScanMetadata {
|
||||
@@ -59,6 +62,8 @@ enum Command {
|
||||
json: bool,
|
||||
#[arg(long = "tenant-id", num_args = 0..)]
|
||||
tenant_ids: Vec<TenantShardId>,
|
||||
#[arg(long = "post", default_value_t = false)]
|
||||
post_to_storage_controller: bool,
|
||||
#[arg(long, default_value = None)]
|
||||
/// For safekeeper node_kind only, points to db with debug dump
|
||||
dump_db_connstr: Option<String>,
|
||||
@@ -114,11 +119,20 @@ async fn main() -> anyhow::Result<()> {
|
||||
chrono::Utc::now().format("%Y_%m_%d__%H_%M_%S")
|
||||
));
|
||||
|
||||
let controller_client_conf = cli.controller_api.map(|controller_api| {
|
||||
ControllerClientConfig {
|
||||
controller_api,
|
||||
// Default to no key: this is a convenience when working in a development environment
|
||||
controller_jwt: cli.controller_jwt.unwrap_or("".to_owned()),
|
||||
}
|
||||
});
|
||||
|
||||
match cli.command {
|
||||
Command::ScanMetadata {
|
||||
json,
|
||||
tenant_ids,
|
||||
node_kind,
|
||||
post_to_storage_controller,
|
||||
dump_db_connstr,
|
||||
dump_db_table,
|
||||
} => {
|
||||
@@ -157,6 +171,9 @@ async fn main() -> anyhow::Result<()> {
|
||||
}
|
||||
Ok(())
|
||||
} else {
|
||||
if controller_client_conf.is_none() && post_to_storage_controller {
|
||||
return Err(anyhow!("Posting pageserver scan health status to storage controller requires `--controller-api` and `--controller-jwt` to run"));
|
||||
}
|
||||
match scan_metadata(bucket_config.clone(), tenant_ids).await {
|
||||
Err(e) => {
|
||||
tracing::error!("Failed: {e}");
|
||||
@@ -168,6 +185,21 @@ async fn main() -> anyhow::Result<()> {
|
||||
} else {
|
||||
println!("{}", summary.summary_string());
|
||||
}
|
||||
|
||||
if post_to_storage_controller {
|
||||
if let Some(conf) = controller_client_conf {
|
||||
let controller_client = conf.build_client();
|
||||
let body = summary.build_health_update_request();
|
||||
controller_client
|
||||
.dispatch::<MetadataHealthUpdateRequest, MetadataHealthUpdateResponse>(
|
||||
Method::POST,
|
||||
"control/v1/metadata_health/update".to_string(),
|
||||
Some(body),
|
||||
)
|
||||
.await?;
|
||||
}
|
||||
}
|
||||
|
||||
if summary.is_fatal() {
|
||||
Err(anyhow::anyhow!("Fatal scrub errors detected"))
|
||||
} else if summary.is_empty() {
|
||||
@@ -196,9 +228,11 @@ async fn main() -> anyhow::Result<()> {
|
||||
let console_config = ConsoleConfig::from_env()?;
|
||||
find_garbage(bucket_config, console_config, depth, node_kind, output_path).await
|
||||
}
|
||||
Command::PurgeGarbage { input_path, mode } => {
|
||||
purge_garbage(input_path, mode, !cli.delete).await
|
||||
}
|
||||
Command::PurgeGarbage {
|
||||
input_path,
|
||||
mode,
|
||||
min_age,
|
||||
} => purge_garbage(input_path, mode, min_age.into(), !cli.delete).await,
|
||||
Command::TenantSnapshot {
|
||||
tenant_id,
|
||||
output_path,
|
||||
@@ -213,14 +247,6 @@ async fn main() -> anyhow::Result<()> {
|
||||
min_age,
|
||||
mode,
|
||||
} => {
|
||||
let controller_client_conf = cli.controller_api.map(|controller_api| {
|
||||
ControllerClientConfig {
|
||||
controller_api,
|
||||
// Default to no key: this is a convenience when working in a development environment
|
||||
controller_jwt: cli.controller_jwt.unwrap_or("".to_owned()),
|
||||
}
|
||||
});
|
||||
|
||||
match (&controller_client_conf, mode) {
|
||||
(Some(_), _) => {
|
||||
// Any mode may run when controller API is set
|
||||
|
||||
@@ -1,12 +1,41 @@
|
||||
use anyhow::Context;
|
||||
use std::str::FromStr;
|
||||
|
||||
use anyhow::{anyhow, Context};
|
||||
use async_stream::{stream, try_stream};
|
||||
use aws_sdk_s3::{types::ObjectIdentifier, Client};
|
||||
use futures::StreamExt;
|
||||
use remote_storage::{GenericRemoteStorage, ListingMode};
|
||||
use tokio_stream::Stream;
|
||||
|
||||
use crate::{list_objects_with_retries, RootTarget, S3Target, TenantShardTimelineId};
|
||||
use crate::{
|
||||
list_objects_with_retries, stream_objects_with_retries, RootTarget, S3Target,
|
||||
TenantShardTimelineId,
|
||||
};
|
||||
use pageserver_api::shard::TenantShardId;
|
||||
use utils::id::{TenantId, TimelineId};
|
||||
|
||||
/// Given a remote storage and a target, output a stream of TenantIds discovered via listing prefixes
|
||||
pub fn stream_tenants_generic<'a>(
|
||||
remote_client: &'a GenericRemoteStorage,
|
||||
target: &'a RootTarget,
|
||||
) -> impl Stream<Item = anyhow::Result<TenantShardId>> + 'a {
|
||||
try_stream! {
|
||||
let tenants_target = target.tenants_root();
|
||||
let mut tenants_stream =
|
||||
std::pin::pin!(stream_objects_with_retries(remote_client, ListingMode::WithDelimiter, &tenants_target));
|
||||
while let Some(chunk) = tenants_stream.next().await {
|
||||
let chunk = chunk?;
|
||||
let entry_ids = chunk.prefixes.iter()
|
||||
.map(|prefix| prefix.get_path().file_name().ok_or_else(|| anyhow!("no final component in path '{prefix}'")));
|
||||
for dir_name_res in entry_ids {
|
||||
let dir_name = dir_name_res?;
|
||||
let id = TenantShardId::from_str(dir_name)?;
|
||||
yield id;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Given an S3 bucket, output a stream of TenantIds discovered via ListObjectsv2
|
||||
pub fn stream_tenants<'a>(
|
||||
s3_client: &'a Client,
|
||||
|
||||
@@ -567,13 +567,7 @@ pub async fn pageserver_physical_gc(
|
||||
}
|
||||
|
||||
// Execute cross-shard GC, using the accumulator's full view of all the shards built in the per-shard GC
|
||||
let Some(controller_client) = controller_client_conf.as_ref().map(|c| {
|
||||
let ControllerClientConfig {
|
||||
controller_api,
|
||||
controller_jwt,
|
||||
} = c;
|
||||
control_api::Client::new(controller_api.clone(), Some(controller_jwt.clone()))
|
||||
}) else {
|
||||
let Some(controller_client) = controller_client_conf.map(|c| c.build_client()) else {
|
||||
tracing::info!("Skipping ancestor layer GC, because no `--controller-api` was specified");
|
||||
return Ok(summary);
|
||||
};
|
||||
|
||||
@@ -9,12 +9,13 @@ use crate::{init_remote, BucketConfig, NodeKind, RootTarget, TenantShardTimeline
|
||||
use aws_sdk_s3::Client;
|
||||
use futures_util::{StreamExt, TryStreamExt};
|
||||
use pageserver::tenant::remote_timeline_client::remote_layer_path;
|
||||
use pageserver_api::controller_api::MetadataHealthUpdateRequest;
|
||||
use pageserver_api::shard::TenantShardId;
|
||||
use serde::Serialize;
|
||||
use utils::id::TenantId;
|
||||
use utils::shard::ShardCount;
|
||||
|
||||
#[derive(Serialize)]
|
||||
#[derive(Serialize, Default)]
|
||||
pub struct MetadataSummary {
|
||||
tenant_count: usize,
|
||||
timeline_count: usize,
|
||||
@@ -23,19 +24,16 @@ pub struct MetadataSummary {
|
||||
with_warnings: HashSet<TenantShardTimelineId>,
|
||||
with_orphans: HashSet<TenantShardTimelineId>,
|
||||
indices_by_version: HashMap<usize, usize>,
|
||||
|
||||
#[serde(skip)]
|
||||
pub(crate) healthy_tenant_shards: HashSet<TenantShardId>,
|
||||
#[serde(skip)]
|
||||
pub(crate) unhealthy_tenant_shards: HashSet<TenantShardId>,
|
||||
}
|
||||
|
||||
impl MetadataSummary {
|
||||
fn new() -> Self {
|
||||
Self {
|
||||
tenant_count: 0,
|
||||
timeline_count: 0,
|
||||
timeline_shard_count: 0,
|
||||
with_errors: HashSet::new(),
|
||||
with_warnings: HashSet::new(),
|
||||
with_orphans: HashSet::new(),
|
||||
indices_by_version: HashMap::new(),
|
||||
}
|
||||
Self::default()
|
||||
}
|
||||
|
||||
fn update_data(&mut self, data: &S3TimelineBlobData) {
|
||||
@@ -54,6 +52,13 @@ impl MetadataSummary {
|
||||
}
|
||||
|
||||
fn update_analysis(&mut self, id: &TenantShardTimelineId, analysis: &TimelineAnalysis) {
|
||||
if analysis.is_healthy() {
|
||||
self.healthy_tenant_shards.insert(id.tenant_shard_id);
|
||||
} else {
|
||||
self.healthy_tenant_shards.remove(&id.tenant_shard_id);
|
||||
self.unhealthy_tenant_shards.insert(id.tenant_shard_id);
|
||||
}
|
||||
|
||||
if !analysis.errors.is_empty() {
|
||||
self.with_errors.insert(*id);
|
||||
}
|
||||
@@ -101,6 +106,13 @@ Index versions: {version_summary}
|
||||
pub fn is_empty(&self) -> bool {
|
||||
self.timeline_shard_count == 0
|
||||
}
|
||||
|
||||
pub fn build_health_update_request(&self) -> MetadataHealthUpdateRequest {
|
||||
MetadataHealthUpdateRequest {
|
||||
healthy_tenant_shards: self.healthy_tenant_shards.clone(),
|
||||
unhealthy_tenant_shards: self.unhealthy_tenant_shards.clone(),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Scan the pageserver metadata in an S3 bucket, reporting errors and statistics.
|
||||
|
||||
@@ -222,6 +222,8 @@ class NeonBenchmarker:
|
||||
function by the zenbenchmark fixture
|
||||
"""
|
||||
|
||||
PROPERTY_PREFIX = "neon_benchmarker_"
|
||||
|
||||
def __init__(self, property_recorder: Callable[[str, object], None]):
|
||||
# property recorder here is a pytest fixture provided by junitxml module
|
||||
# https://docs.pytest.org/en/6.2.x/reference.html#pytest.junitxml.record_property
|
||||
@@ -238,7 +240,7 @@ class NeonBenchmarker:
|
||||
Record a benchmark result.
|
||||
"""
|
||||
# just to namespace the value
|
||||
name = f"neon_benchmarker_{metric_name}"
|
||||
name = f"{self.PROPERTY_PREFIX}_{metric_name}"
|
||||
self.property_recorder(
|
||||
name,
|
||||
{
|
||||
@@ -249,6 +251,18 @@ class NeonBenchmarker:
|
||||
},
|
||||
)
|
||||
|
||||
@classmethod
|
||||
def records(
|
||||
cls, user_properties: list[tuple[str, object]]
|
||||
) -> Iterator[tuple[str, dict[str, object]]]:
|
||||
"""
|
||||
Yield all records related to benchmarks
|
||||
"""
|
||||
for property_name, recorded_property in user_properties:
|
||||
if property_name.startswith(cls.PROPERTY_PREFIX):
|
||||
assert isinstance(recorded_property, dict)
|
||||
yield recorded_property["name"], recorded_property
|
||||
|
||||
@contextmanager
|
||||
def record_duration(self, metric_name: str) -> Iterator[None]:
|
||||
"""
|
||||
@@ -425,10 +439,11 @@ def zenbenchmark(
|
||||
yield benchmarker
|
||||
|
||||
results = {}
|
||||
for _, recorded_property in request.node.user_properties:
|
||||
for _, recorded_property in NeonBenchmarker.records(request.node.user_properties):
|
||||
name = recorded_property["name"]
|
||||
value = str(recorded_property["value"])
|
||||
if (unit := recorded_property["unit"].strip()) != "":
|
||||
unit = str(recorded_property["unit"]).strip()
|
||||
if unit != "":
|
||||
value += f" {unit}"
|
||||
results[name] = value
|
||||
|
||||
@@ -477,7 +492,7 @@ def pytest_terminal_summary(
|
||||
for test_report in terminalreporter.stats.get("passed", []):
|
||||
result_entry = []
|
||||
|
||||
for _, recorded_property in test_report.user_properties:
|
||||
for _, recorded_property in NeonBenchmarker.records(test_report.user_properties):
|
||||
if not is_header_printed:
|
||||
terminalreporter.section("Benchmark results", "-")
|
||||
is_header_printed = True
|
||||
|
||||
@@ -449,6 +449,7 @@ class TokenScope(str, Enum):
|
||||
GENERATIONS_API = "generations_api"
|
||||
SAFEKEEPER_DATA = "safekeeperdata"
|
||||
TENANT = "tenant"
|
||||
SCRUBBER = "scrubber"
|
||||
|
||||
|
||||
class NeonEnvBuilder:
|
||||
@@ -1441,6 +1442,7 @@ def neon_env_builder(
|
||||
pageserver_virtual_file_io_engine: str,
|
||||
pageserver_default_tenant_config_compaction_algorithm: Optional[Dict[str, Any]],
|
||||
pageserver_aux_file_policy: Optional[AuxFileStore],
|
||||
record_property: Callable[[str, object], None],
|
||||
) -> Iterator[NeonEnvBuilder]:
|
||||
"""
|
||||
Fixture to create a Neon environment for test.
|
||||
@@ -1480,9 +1482,7 @@ def neon_env_builder(
|
||||
yield builder
|
||||
# Propogate `preserve_database_files` to make it possible to use in other fixtures,
|
||||
# like `test_output_dir` fixture for attaching all database files to Allure report.
|
||||
request.node.user_properties.append(
|
||||
("preserve_database_files", builder.preserve_database_files)
|
||||
)
|
||||
record_property("preserve_database_files", builder.preserve_database_files)
|
||||
|
||||
|
||||
@dataclass
|
||||
@@ -2587,6 +2587,51 @@ class NeonStorageController(MetricsGetter, LogUtils):
|
||||
|
||||
time.sleep(backoff)
|
||||
|
||||
def metadata_health_update(self, healthy: List[TenantShardId], unhealthy: List[TenantShardId]):
|
||||
body: Dict[str, Any] = {
|
||||
"healthy_tenant_shards": [str(t) for t in healthy],
|
||||
"unhealthy_tenant_shards": [str(t) for t in unhealthy],
|
||||
}
|
||||
|
||||
self.request(
|
||||
"POST",
|
||||
f"{self.env.storage_controller_api}/control/v1/metadata_health/update",
|
||||
json=body,
|
||||
headers=self.headers(TokenScope.SCRUBBER),
|
||||
)
|
||||
|
||||
def metadata_health_list_unhealthy(self):
|
||||
response = self.request(
|
||||
"GET",
|
||||
f"{self.env.storage_controller_api}/control/v1/metadata_health/unhealthy",
|
||||
headers=self.headers(TokenScope.ADMIN),
|
||||
)
|
||||
return response.json()
|
||||
|
||||
def metadata_health_list_outdated(self, duration: str):
|
||||
body: Dict[str, Any] = {"not_scrubbed_for": duration}
|
||||
|
||||
response = self.request(
|
||||
"POST",
|
||||
f"{self.env.storage_controller_api}/control/v1/metadata_health/outdated",
|
||||
json=body,
|
||||
headers=self.headers(TokenScope.ADMIN),
|
||||
)
|
||||
return response.json()
|
||||
|
||||
def metadata_health_is_healthy(self, outdated_duration: str = "1h") -> bool:
|
||||
"""Metadata is healthy if there is no unhealthy or outdated health records."""
|
||||
|
||||
unhealthy = self.metadata_health_list_unhealthy()
|
||||
outdated = self.metadata_health_list_outdated(outdated_duration)
|
||||
|
||||
healthy = (
|
||||
len(unhealthy["unhealthy_tenant_shards"]) == 0 and len(outdated["health_records"]) == 0
|
||||
)
|
||||
if not healthy:
|
||||
log.info(f"{unhealthy=}, {outdated=}")
|
||||
return healthy
|
||||
|
||||
def step_down(self):
|
||||
log.info("Asking storage controller to step down")
|
||||
response = self.request(
|
||||
@@ -4356,10 +4401,11 @@ class StorageScrubber:
|
||||
assert stdout is not None
|
||||
return stdout
|
||||
|
||||
def scan_metadata(self) -> Any:
|
||||
stdout = self.scrubber_cli(
|
||||
["scan-metadata", "--node-kind", "pageserver", "--json"], timeout=30
|
||||
)
|
||||
def scan_metadata(self, post_to_storage_controller: bool = False) -> Any:
|
||||
args = ["scan-metadata", "--node-kind", "pageserver", "--json"]
|
||||
if post_to_storage_controller:
|
||||
args.append("--post")
|
||||
stdout = self.scrubber_cli(args, timeout=30)
|
||||
|
||||
try:
|
||||
return json.loads(stdout)
|
||||
|
||||
88
test_runner/logical_repl/test_log_repl.py
Normal file
88
test_runner/logical_repl/test_log_repl.py
Normal file
@@ -0,0 +1,88 @@
|
||||
"""
|
||||
Test the logical replication in Neon with the different consumers
|
||||
"""
|
||||
|
||||
import hashlib
|
||||
import time
|
||||
|
||||
import clickhouse_connect
|
||||
import psycopg2
|
||||
import pytest
|
||||
from fixtures.log_helper import log
|
||||
from fixtures.neon_fixtures import RemotePostgres
|
||||
from fixtures.utils import wait_until
|
||||
|
||||
|
||||
def query_clickhouse(
|
||||
client,
|
||||
query: str,
|
||||
digest: str,
|
||||
) -> None:
|
||||
"""
|
||||
Run the query on the client
|
||||
return answer if successful, raise an exception otherwise
|
||||
"""
|
||||
log.debug("Query: %s", query)
|
||||
res = client.query(query)
|
||||
log.debug(res.result_rows)
|
||||
m = hashlib.sha1()
|
||||
m.update(repr(tuple(res.result_rows)).encode())
|
||||
hash_res = m.hexdigest()
|
||||
log.debug("Hash: %s", hash_res)
|
||||
if hash_res == digest:
|
||||
return
|
||||
raise ValueError("Hash mismatch")
|
||||
|
||||
|
||||
@pytest.mark.remote_cluster
|
||||
def test_clickhouse(remote_pg: RemotePostgres):
|
||||
"""
|
||||
Test the logical replication having ClickHouse as a client
|
||||
"""
|
||||
conn_options = remote_pg.conn_options()
|
||||
for _ in range(5):
|
||||
try:
|
||||
conn = psycopg2.connect(remote_pg.connstr())
|
||||
except psycopg2.OperationalError as perr:
|
||||
log.debug(perr)
|
||||
time.sleep(1)
|
||||
else:
|
||||
break
|
||||
raise TimeoutError
|
||||
cur = conn.cursor()
|
||||
cur.execute("DROP TABLE IF EXISTS table1")
|
||||
cur.execute("CREATE TABLE table1 (id integer primary key, column1 varchar(10));")
|
||||
cur.execute("INSERT INTO table1 (id, column1) VALUES (1, 'abc'), (2, 'def');")
|
||||
conn.commit()
|
||||
client = clickhouse_connect.get_client(host="clickhouse")
|
||||
client.command("SET allow_experimental_database_materialized_postgresql=1")
|
||||
client.command(
|
||||
"CREATE DATABASE db1_postgres ENGINE = "
|
||||
f"MaterializedPostgreSQL('{conn_options['host']}', "
|
||||
f"'{conn_options['dbname']}', "
|
||||
f"'{conn_options['user']}', '{conn_options['password']}') "
|
||||
"SETTINGS materialized_postgresql_tables_list = 'table1';"
|
||||
)
|
||||
wait_until(
|
||||
120,
|
||||
0.5,
|
||||
lambda: query_clickhouse(
|
||||
client,
|
||||
"select * from db1_postgres.table1 order by 1",
|
||||
"ee600d8f7cd05bd0b169fa81f44300a9dd10085a",
|
||||
),
|
||||
)
|
||||
cur.execute("INSERT INTO table1 (id, column1) VALUES (3, 'ghi'), (4, 'jkl');")
|
||||
conn.commit()
|
||||
wait_until(
|
||||
120,
|
||||
0.5,
|
||||
lambda: query_clickhouse(
|
||||
client,
|
||||
"select * from db1_postgres.table1 order by 1",
|
||||
"9eba2daaf7e4d7d27ac849525f68b562ab53947d",
|
||||
),
|
||||
)
|
||||
log.debug("Sleeping before final checking if Neon is still alive")
|
||||
time.sleep(3)
|
||||
cur.execute("SELECT 1")
|
||||
@@ -389,6 +389,11 @@ def test_duplicate_creation(neon_env_builder: NeonEnvBuilder):
|
||||
repeat_result = ps_http.timeline_create(
|
||||
env.pg_version, env.initial_tenant, success_timeline, timeout=60
|
||||
)
|
||||
# remote_consistent_lsn_visible will be published only after we've
|
||||
# confirmed the generation, which is not part of what we await during
|
||||
# timeline creation (uploads). mask it out here to avoid flakyness.
|
||||
del success_result["remote_consistent_lsn_visible"]
|
||||
del repeat_result["remote_consistent_lsn_visible"]
|
||||
assert repeat_result == success_result
|
||||
finally:
|
||||
env.pageserver.stop(immediate=True)
|
||||
|
||||
@@ -3,7 +3,7 @@ import threading
|
||||
import time
|
||||
from collections import defaultdict
|
||||
from datetime import datetime, timezone
|
||||
from typing import Any, Dict, List, Union
|
||||
from typing import Any, Dict, List, Optional, Set, Tuple, Union
|
||||
|
||||
import pytest
|
||||
from fixtures.common_types import TenantId, TenantShardId, TimelineId
|
||||
@@ -1785,6 +1785,126 @@ def test_storage_controller_node_deletion(
|
||||
env.storage_controller.consistency_check()
|
||||
|
||||
|
||||
@pytest.mark.parametrize("shard_count", [None, 2])
|
||||
def test_storage_controller_metadata_health(
|
||||
neon_env_builder: NeonEnvBuilder,
|
||||
shard_count: Optional[int],
|
||||
):
|
||||
"""
|
||||
Create three tenants A, B, C.
|
||||
|
||||
Phase 1:
|
||||
- A: Post healthy status.
|
||||
- B: Post unhealthy status.
|
||||
- C: No updates.
|
||||
|
||||
Phase 2:
|
||||
- B: Post healthy status.
|
||||
- C: Post healthy status.
|
||||
|
||||
Phase 3:
|
||||
- A: Post unhealthy status.
|
||||
|
||||
Phase 4:
|
||||
- Delete tenant A, metadata health status should be deleted as well.
|
||||
"""
|
||||
|
||||
def update_and_query_metadata_health(
|
||||
env: NeonEnv,
|
||||
healthy: List[TenantShardId],
|
||||
unhealthy: List[TenantShardId],
|
||||
outdated_duration: str = "1h",
|
||||
) -> Tuple[Set[str], Set[str]]:
|
||||
"""
|
||||
Update metadata health. Then list tenant shards with unhealthy and
|
||||
outdated metadata health status.
|
||||
"""
|
||||
if healthy or unhealthy:
|
||||
env.storage_controller.metadata_health_update(healthy, unhealthy)
|
||||
result = env.storage_controller.metadata_health_list_unhealthy()
|
||||
unhealthy_res = set(result["unhealthy_tenant_shards"])
|
||||
result = env.storage_controller.metadata_health_list_outdated(outdated_duration)
|
||||
outdated_res = set(record["tenant_shard_id"] for record in result["health_records"])
|
||||
|
||||
return unhealthy_res, outdated_res
|
||||
|
||||
neon_env_builder.enable_pageserver_remote_storage(s3_storage())
|
||||
|
||||
neon_env_builder.num_pageservers = 2
|
||||
env = neon_env_builder.init_start()
|
||||
|
||||
# Mock tenant (`initial_tenant``) with healthy scrubber scan result
|
||||
tenant_a_shard_ids = (
|
||||
env.storage_controller.tenant_shard_split(env.initial_tenant, shard_count=shard_count)
|
||||
if shard_count is not None
|
||||
else [TenantShardId(env.initial_tenant, 0, 0)]
|
||||
)
|
||||
|
||||
# Mock tenant with unhealthy scrubber scan result
|
||||
tenant_b, _ = env.neon_cli.create_tenant(shard_count=shard_count)
|
||||
tenant_b_shard_ids = (
|
||||
env.storage_controller.tenant_shard_split(tenant_b, shard_count=shard_count)
|
||||
if shard_count is not None
|
||||
else [TenantShardId(tenant_b, 0, 0)]
|
||||
)
|
||||
|
||||
# Mock tenant that never gets a health update from scrubber
|
||||
tenant_c, _ = env.neon_cli.create_tenant(shard_count=shard_count)
|
||||
|
||||
tenant_c_shard_ids = (
|
||||
env.storage_controller.tenant_shard_split(tenant_c, shard_count=shard_count)
|
||||
if shard_count is not None
|
||||
else [TenantShardId(tenant_c, 0, 0)]
|
||||
)
|
||||
|
||||
# Metadata health table also updated as tenant shards are created.
|
||||
assert env.storage_controller.metadata_health_is_healthy()
|
||||
|
||||
# post "fake" updates to storage controller db
|
||||
|
||||
unhealthy, outdated = update_and_query_metadata_health(
|
||||
env, healthy=tenant_a_shard_ids, unhealthy=tenant_b_shard_ids
|
||||
)
|
||||
|
||||
log.info(f"After Phase 1: {unhealthy=}, {outdated=}")
|
||||
assert len(unhealthy) == len(tenant_b_shard_ids)
|
||||
for t in tenant_b_shard_ids:
|
||||
assert str(t) in unhealthy
|
||||
assert len(outdated) == 0
|
||||
|
||||
unhealthy, outdated = update_and_query_metadata_health(
|
||||
env, healthy=tenant_b_shard_ids + tenant_c_shard_ids, unhealthy=[]
|
||||
)
|
||||
|
||||
log.info(f"After Phase 2: {unhealthy=}, {outdated=}")
|
||||
assert len(unhealthy) == 0
|
||||
assert len(outdated) == 0
|
||||
|
||||
unhealthy, outdated = update_and_query_metadata_health(
|
||||
env, healthy=[], unhealthy=tenant_a_shard_ids
|
||||
)
|
||||
|
||||
log.info(f"After Phase 3: {unhealthy=}, {outdated=}")
|
||||
assert len(unhealthy) == len(tenant_a_shard_ids)
|
||||
for t in tenant_a_shard_ids:
|
||||
assert str(t) in unhealthy
|
||||
assert len(outdated) == 0
|
||||
|
||||
# Phase 4: Delete A
|
||||
env.storage_controller.pageserver_api().tenant_delete(env.initial_tenant)
|
||||
|
||||
# A's unhealthy metadata health status should be deleted as well.
|
||||
assert env.storage_controller.metadata_health_is_healthy()
|
||||
|
||||
# All shards from B and C are not fresh if set outdated duration to 0 seconds.
|
||||
unhealthy, outdated = update_and_query_metadata_health(
|
||||
env, healthy=[], unhealthy=tenant_a_shard_ids, outdated_duration="0s"
|
||||
)
|
||||
assert len(unhealthy) == 0
|
||||
for t in tenant_b_shard_ids + tenant_c_shard_ids:
|
||||
assert str(t) in outdated
|
||||
|
||||
|
||||
def test_storage_controller_step_down(neon_env_builder: NeonEnvBuilder):
|
||||
"""
|
||||
Test the `/control/v1/step_down` storage controller API. Upon receiving such
|
||||
|
||||
@@ -440,10 +440,12 @@ def test_scrubber_scan_pageserver_metadata(
|
||||
assert len(index.layer_metadata) > 0
|
||||
it = iter(index.layer_metadata.items())
|
||||
|
||||
scan_summary = env.storage_scrubber.scan_metadata()
|
||||
scan_summary = env.storage_scrubber.scan_metadata(post_to_storage_controller=True)
|
||||
assert not scan_summary["with_warnings"]
|
||||
assert not scan_summary["with_errors"]
|
||||
|
||||
assert env.storage_controller.metadata_health_is_healthy()
|
||||
|
||||
# Delete a layer file that is listed in the index.
|
||||
layer, metadata = next(it)
|
||||
log.info(f"Deleting {timeline_path}/{layer.to_str()}")
|
||||
@@ -453,7 +455,17 @@ def test_scrubber_scan_pageserver_metadata(
|
||||
)
|
||||
log.info(f"delete response: {delete_response}")
|
||||
|
||||
# Check scan summary. Expect it to be a L0 layer so only emit warnings.
|
||||
# Check scan summary without posting to storage controller. Expect it to be a L0 layer so only emit warnings.
|
||||
scan_summary = env.storage_scrubber.scan_metadata()
|
||||
log.info(f"{pprint.pformat(scan_summary)}")
|
||||
assert len(scan_summary["with_warnings"]) > 0
|
||||
|
||||
assert env.storage_controller.metadata_health_is_healthy()
|
||||
|
||||
# Now post to storage controller, expect seeing one unhealthy health record
|
||||
scan_summary = env.storage_scrubber.scan_metadata(post_to_storage_controller=True)
|
||||
log.info(f"{pprint.pformat(scan_summary)}")
|
||||
assert len(scan_summary["with_warnings"]) > 0
|
||||
|
||||
unhealthy = env.storage_controller.metadata_health_list_unhealthy()["unhealthy_tenant_shards"]
|
||||
assert len(unhealthy) == 1 and unhealthy[0] == str(tenant_shard_id)
|
||||
|
||||
Reference in New Issue
Block a user