Compare commits

..

25 Commits

Author SHA1 Message Date
Joonas Koivunen
409e2eff9e fix: run upload_rewritten_layer in a span
there was a weird failure observed with CI tests: https://neon-github-public-dev.s3.amazonaws.com/reports/pr-8430/10108870590/index.html#suites/a1c2be32556270764423c495fad75d47/94a4686382b96297
2024-07-26 14:39:31 +00:00
Joonas Koivunen
e6e3b9a716 doc: remove on_gc_task_start fixme 2024-07-26 08:52:55 +00:00
Joonas Koivunen
7f31a3f671 forgotten rename, maybe 2024-07-26 08:52:55 +00:00
Joonas Koivunen
9971ae3d24 rename is_detached_from_{original_,}ancestor (just the rename) 2024-07-26 08:52:55 +00:00
Joonas Koivunen
48a2a20de3 chore: derive default 2024-07-26 08:52:55 +00:00
Joonas Koivunen
29ef8f15ce chore: unused variable 2024-07-26 08:52:55 +00:00
Joonas Koivunen
5e45dd3f86 rename SharedState::notify
to continue_existing_attempt
2024-07-26 08:52:55 +00:00
Joonas Koivunen
5fced442d7 warning caused by removed body 2024-07-26 08:52:55 +00:00
Joonas Koivunen
4222610233 cleanup index part dependent 2024-07-26 08:52:55 +00:00
Joonas Koivunen
92deb0dfd7 plumbing: collect timelines index parts 2024-07-26 08:52:55 +00:00
Joonas Koivunen
46ca6f17c5 plumbing: notify shared state of existing attempt 2024-07-26 08:52:55 +00:00
Joonas Koivunen
14869abb77 complete the plumbing with non-notifying attempt_blocks_gc impl 2024-07-26 08:52:55 +00:00
Joonas Koivunen
5330fd9366 doc(fixme): shared state 2024-07-26 08:52:55 +00:00
Joonas Koivunen
6c5b3b7812 doc: more sketched api comments 2024-07-26 08:52:55 +00:00
Joonas Koivunen
849fe0f191 plumb the shared state through
the api for the gc pausing is quite awkward.
2024-07-26 08:52:55 +00:00
Joonas Koivunen
f564b66f21 shared state sketch 2024-07-26 08:52:55 +00:00
Joonas Koivunen
2e58ccee78 temp: planning 2024-07-26 08:52:55 +00:00
Joonas Koivunen
f398ab0264 completion: Debug and query for barrier connection 2024-07-26 08:52:55 +00:00
Joonas Koivunen
f23ee2ccdb doc(test): be more accurate 2024-07-26 08:52:55 +00:00
Joonas Koivunen
0ad31bb7fb doc: remove obsolete FIXME
this was cleared with partial metadata updates.
2024-07-26 08:52:55 +00:00
Joonas Koivunen
86f26d0918 chore: minor rename FIXME in IndexPart 2024-07-26 08:52:55 +00:00
Joonas Koivunen
4a562dff2e doc: more 2024-07-26 08:52:55 +00:00
Joonas Koivunen
f9185b42a9 doc: minor enhancements 2024-07-26 08:52:55 +00:00
Joonas Koivunen
d4f30daa81 chore: minor indentation problem 2024-07-26 08:52:55 +00:00
Joonas Koivunen
97ab53e826 chore: add std::fmt::Debug for Barrier 2024-07-26 08:52:55 +00:00
106 changed files with 1572 additions and 5764 deletions

View File

@@ -14,8 +14,11 @@ inputs:
api_host:
description: 'Neon API host'
default: console-stage.neon.build
provisioner:
description: 'k8s-pod or k8s-neonvm'
default: 'k8s-pod'
compute_units:
description: '[Min, Max] compute units'
description: '[Min, Max] compute units; Min and Max are used for k8s-neonvm with autoscaling, for k8s-pod values Min and Max should be equal'
default: '[1, 1]'
outputs:
@@ -34,6 +37,10 @@ runs:
# A shell without `set -x` to not to expose password/dsn in logs
shell: bash -euo pipefail {0}
run: |
if [ "${PROVISIONER}" == "k8s-pod" ] && [ "${MIN_CU}" != "${MAX_CU}" ]; then
echo >&2 "For k8s-pod provisioner MIN_CU should be equal to MAX_CU"
fi
project=$(curl \
"https://${API_HOST}/api/v2/projects" \
--fail \
@@ -45,7 +52,7 @@ runs:
\"name\": \"Created by actions/neon-project-create; GITHUB_RUN_ID=${GITHUB_RUN_ID}\",
\"pg_version\": ${POSTGRES_VERSION},
\"region_id\": \"${REGION_ID}\",
\"provisioner\": \"k8s-neonvm\",
\"provisioner\": \"${PROVISIONER}\",
\"autoscaling_limit_min_cu\": ${MIN_CU},
\"autoscaling_limit_max_cu\": ${MAX_CU},
\"settings\": { }
@@ -68,5 +75,6 @@ runs:
API_KEY: ${{ inputs.api_key }}
REGION_ID: ${{ inputs.region_id }}
POSTGRES_VERSION: ${{ inputs.postgres_version }}
PROVISIONER: ${{ inputs.provisioner }}
MIN_CU: ${{ fromJSON(inputs.compute_units)[0] }}
MAX_CU: ${{ fromJSON(inputs.compute_units)[1] }}

View File

@@ -131,8 +131,8 @@ runs:
exit 1
fi
if [[ "${{ inputs.run_in_parallel }}" == "true" ]]; then
# -n sets the number of parallel processes that pytest-xdist will run
EXTRA_PARAMS="-n12 $EXTRA_PARAMS"
# -n16 uses sixteen processes to run tests via pytest-xdist
EXTRA_PARAMS="-n16 $EXTRA_PARAMS"
# --dist=loadgroup points tests marked with @pytest.mark.xdist_group
# to the same worker to make @pytest.mark.order work with xdist

View File

@@ -19,10 +19,6 @@ on:
description: 'debug or release'
required: true
type: string
pg-versions:
description: 'a json array of postgres versions to run regression tests on'
required: true
type: string
defaults:
run:
@@ -258,7 +254,7 @@ jobs:
strategy:
fail-fast: false
matrix:
pg_version: ${{ fromJson(inputs.pg-versions) }}
pg_version: [ v14, v15, v16 ]
steps:
- uses: actions/checkout@v4
with:
@@ -282,11 +278,14 @@ jobs:
CHECK_ONDISK_DATA_COMPATIBILITY: nonempty
BUILD_TAG: ${{ inputs.build-tag }}
PAGESERVER_VIRTUAL_FILE_IO_ENGINE: tokio-epoll-uring
PAGESERVER_GET_VECTORED_IMPL: vectored
PAGESERVER_GET_IMPL: vectored
PAGESERVER_VALIDATE_VEC_GET: true
# Temporary disable this step until we figure out why it's so flaky
# Ref https://github.com/neondatabase/neon/issues/4540
- name: Merge and upload coverage data
if: |
false &&
inputs.build-type == 'debug' && matrix.pg_version == 'v16'
inputs.build-type == 'debug' && matrix.pg_version == 'v14'
uses: ./.github/actions/save-coverage-data

View File

@@ -63,9 +63,11 @@ jobs:
- DEFAULT_PG_VERSION: 16
PLATFORM: "neon-staging"
region_id: ${{ github.event.inputs.region_id || 'aws-us-east-2' }}
provisioner: 'k8s-pod'
- DEFAULT_PG_VERSION: 16
PLATFORM: "azure-staging"
region_id: 'azure-eastus2'
provisioner: 'k8s-neonvm'
env:
TEST_PG_BENCH_DURATIONS_MATRIX: "300"
TEST_PG_BENCH_SCALES_MATRIX: "10,100"
@@ -98,6 +100,7 @@ jobs:
region_id: ${{ matrix.region_id }}
postgres_version: ${{ env.DEFAULT_PG_VERSION }}
api_key: ${{ secrets.NEON_STAGING_API_KEY }}
provisioner: ${{ matrix.provisioner }}
- name: Run benchmark
uses: ./.github/actions/run-python-test-set
@@ -213,11 +216,11 @@ jobs:
# Create matrices for the benchmarking jobs, so we run benchmarks on rds only once a week (on Saturday)
#
# Available platforms:
# - neonvm-captest-new: Freshly created project (1 CU)
# - neonvm-captest-freetier: Use freetier-sized compute (0.25 CU)
# - neon-captest-new: Freshly created project (1 CU)
# - neon-captest-freetier: Use freetier-sized compute (0.25 CU)
# - neonvm-captest-azure-new: Freshly created project (1 CU) in azure region
# - neonvm-captest-azure-freetier: Use freetier-sized compute (0.25 CU) in azure region
# - neonvm-captest-reuse: Reusing existing project
# - neon-captest-reuse: Reusing existing project
# - rds-aurora: Aurora Postgres Serverless v2 with autoscaling from 0.5 to 2 ACUs
# - rds-postgres: RDS Postgres db.m5.large instance (2 vCPU, 8 GiB) with gp3 EBS storage
env:
@@ -242,16 +245,18 @@ jobs:
"'"$region_id_default"'"
],
"platform": [
"neonvm-captest-new",
"neonvm-captest-reuse",
"neon-captest-new",
"neon-captest-reuse",
"neonvm-captest-new"
],
"db_size": [ "10gb" ],
"include": [{ "pg_version": 16, "region_id": "'"$region_id_default"'", "platform": "neonvm-captest-freetier", "db_size": "3gb" },
"include": [{ "pg_version": 16, "region_id": "'"$region_id_default"'", "platform": "neon-captest-freetier", "db_size": "3gb" },
{ "pg_version": 16, "region_id": "'"$region_id_default"'", "platform": "neon-captest-new", "db_size": "50gb" },
{ "pg_version": 16, "region_id": "'"$region_id_default"'", "platform": "neonvm-captest-freetier", "db_size": "3gb" },
{ "pg_version": 16, "region_id": "'"$region_id_default"'", "platform": "neonvm-captest-new", "db_size": "50gb" },
{ "pg_version": 16, "region_id": "azure-eastus2", "platform": "neonvm-azure-captest-freetier", "db_size": "3gb" },
{ "pg_version": 16, "region_id": "azure-eastus2", "platform": "neonvm-azure-captest-new", "db_size": "10gb" },
{ "pg_version": 16, "region_id": "azure-eastus2", "platform": "neonvm-azure-captest-new", "db_size": "50gb" },
{ "pg_version": 16, "region_id": "azure-eastus2", "platform": "neonvm-azure-captest-freetier", "db_size": "3gb" },
{ "pg_version": 16, "region_id": "azure-eastus2", "platform": "neonvm-azure-captest-new", "db_size": "10gb" },
{ "pg_version": 16, "region_id": "azure-eastus2", "platform": "neonvm-azure-captest-new", "db_size": "50gb" },
{ "pg_version": 16, "region_id": "'"$region_id_default"'", "platform": "neonvm-captest-sharding-reuse", "db_size": "50gb" }]
}'
@@ -266,7 +271,7 @@ jobs:
run: |
matrix='{
"platform": [
"neonvm-captest-reuse"
"neon-captest-reuse"
]
}'
@@ -282,7 +287,7 @@ jobs:
run: |
matrix='{
"platform": [
"neonvm-captest-reuse"
"neon-captest-reuse"
],
"scale": [
"10"
@@ -333,7 +338,7 @@ jobs:
prefix: latest
- name: Create Neon Project
if: contains(fromJson('["neonvm-captest-new", "neonvm-captest-freetier", "neonvm-azure-captest-freetier", "neonvm-azure-captest-new"]'), matrix.platform)
if: contains(fromJson('["neon-captest-new", "neon-captest-freetier", "neonvm-captest-new", "neonvm-captest-freetier", "neonvm-azure-captest-freetier", "neonvm-azure-captest-new"]'), matrix.platform)
id: create-neon-project
uses: ./.github/actions/neon-project-create
with:
@@ -341,18 +346,19 @@ jobs:
postgres_version: ${{ env.DEFAULT_PG_VERSION }}
api_key: ${{ secrets.NEON_STAGING_API_KEY }}
compute_units: ${{ (contains(matrix.platform, 'captest-freetier') && '[0.25, 0.25]') || '[1, 1]' }}
provisioner: ${{ (contains(matrix.platform, 'neonvm-') && 'k8s-neonvm') || 'k8s-pod' }}
- name: Set up Connection String
id: set-up-connstr
run: |
case "${PLATFORM}" in
neonvm-captest-reuse)
neon-captest-reuse)
CONNSTR=${{ secrets.BENCHMARK_CAPTEST_CONNSTR }}
;;
neonvm-captest-sharding-reuse)
CONNSTR=${{ secrets.BENCHMARK_CAPTEST_SHARDING_CONNSTR }}
;;
neonvm-captest-new | neonvm-captest-freetier | neonvm-azure-captest-new | neonvm-azure-captest-freetier)
neon-captest-new | neon-captest-freetier | neonvm-captest-new | neonvm-captest-freetier | neonvm-azure-captest-new | neonvm-azure-captest-freetier)
CONNSTR=${{ steps.create-neon-project.outputs.dsn }}
;;
rds-aurora)
@@ -436,9 +442,9 @@ jobs:
fail-fast: false
matrix:
include:
- PLATFORM: "neonvm-captest-pgvector"
- PLATFORM: "neon-captest-pgvector"
- PLATFORM: "azure-captest-pgvector"
env:
TEST_PG_BENCH_DURATIONS_MATRIX: "15m"
TEST_PG_BENCH_SCALES_MATRIX: "1"
@@ -480,7 +486,7 @@ jobs:
id: set-up-connstr
run: |
case "${PLATFORM}" in
neonvm-captest-pgvector)
neon-captest-pgvector)
CONNSTR=${{ secrets.BENCHMARK_PGVECTOR_CONNSTR }}
;;
azure-captest-pgvector)
@@ -579,7 +585,7 @@ jobs:
id: set-up-connstr
run: |
case "${PLATFORM}" in
neonvm-captest-reuse)
neon-captest-reuse)
CONNSTR=${{ secrets.BENCHMARK_CAPTEST_CLICKBENCH_10M_CONNSTR }}
;;
rds-aurora)
@@ -589,7 +595,7 @@ jobs:
CONNSTR=${{ secrets.BENCHMARK_RDS_POSTGRES_CLICKBENCH_10M_CONNSTR }}
;;
*)
echo >&2 "Unknown PLATFORM=${PLATFORM}. Allowed only 'neonvm-captest-reuse', 'rds-aurora', or 'rds-postgres'"
echo >&2 "Unknown PLATFORM=${PLATFORM}. Allowed only 'neon-captest-reuse', 'rds-aurora', or 'rds-postgres'"
exit 1
;;
esac
@@ -666,7 +672,7 @@ jobs:
- name: Get Connstring Secret Name
run: |
case "${PLATFORM}" in
neonvm-captest-reuse)
neon-captest-reuse)
ENV_PLATFORM=CAPTEST_TPCH
;;
rds-aurora)
@@ -676,7 +682,7 @@ jobs:
ENV_PLATFORM=RDS_AURORA_TPCH
;;
*)
echo >&2 "Unknown PLATFORM=${PLATFORM}. Allowed only 'neonvm-captest-reuse', 'rds-aurora', or 'rds-postgres'"
echo >&2 "Unknown PLATFORM=${PLATFORM}. Allowed only 'neon-captest-reuse', 'rds-aurora', or 'rds-postgres'"
exit 1
;;
esac
@@ -753,7 +759,7 @@ jobs:
id: set-up-connstr
run: |
case "${PLATFORM}" in
neonvm-captest-reuse)
neon-captest-reuse)
CONNSTR=${{ secrets.BENCHMARK_USER_EXAMPLE_CAPTEST_CONNSTR }}
;;
rds-aurora)
@@ -763,7 +769,7 @@ jobs:
CONNSTR=${{ secrets.BENCHMARK_USER_EXAMPLE_RDS_POSTGRES_CONNSTR }}
;;
*)
echo >&2 "Unknown PLATFORM=${PLATFORM}. Allowed only 'neonvm-captest-reuse', 'rds-aurora', or 'rds-postgres'"
echo >&2 "Unknown PLATFORM=${PLATFORM}. Allowed only 'neon-captest-reuse', 'rds-aurora', or 'rds-postgres'"
exit 1
;;
esac

View File

@@ -203,8 +203,7 @@ jobs:
fail-fast: false
matrix:
arch: [ x64 ]
# Do not build or run tests in debug for release branches
build-type: ${{ fromJson((startsWith(github.ref_name, 'release' && github.event_name == 'push')) && '["release"]' || '["debug", "release"]') }}
build-type: [ debug, release ]
include:
- build-type: release
arch: arm64
@@ -214,8 +213,6 @@ jobs:
build-tools-image: ${{ needs.build-build-tools-image.outputs.image }}
build-tag: ${{ needs.tag.outputs.build-tag }}
build-type: ${{ matrix.build-type }}
# Run tests on all Postgres versions in release builds and only on the latest version in debug builds
pg-versions: ${{ matrix.build-type == 'release' && '["v14", "v15", "v16"]' || '["v16"]' }}
secrets: inherit
# Keep `benchmarks` job outside of `build-and-test-locally` workflow to make job failures non-blocking
@@ -289,6 +286,9 @@ jobs:
PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}"
TEST_RESULT_CONNSTR: "${{ secrets.REGRESS_TEST_RESULT_CONNSTR_NEW }}"
PAGESERVER_VIRTUAL_FILE_IO_ENGINE: tokio-epoll-uring
PAGESERVER_GET_VECTORED_IMPL: vectored
PAGESERVER_GET_IMPL: vectored
PAGESERVER_VALIDATE_VEC_GET: false
# XXX: no coverage data handling here, since benchmarks are run on release builds,
# while coverage is currently collected for the debug ones
@@ -309,7 +309,7 @@ jobs:
SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }}
create-test-report:
needs: [ check-permissions, build-and-test-locally, coverage-report, build-build-tools-image, benchmarks ]
needs: [ check-permissions, build-and-test-locally, coverage-report, build-build-tools-image ]
if: ${{ !cancelled() && contains(fromJSON('["skipped", "success"]'), needs.check-permissions.result) }}
outputs:
report-url: ${{ steps.create-allure-report.outputs.report-url }}
@@ -836,9 +836,6 @@ jobs:
rm -rf .docker-custom
promote-images:
permissions:
contents: read # This is required for actions/checkout
id-token: write # This is required for Azure Login to work.
needs: [ check-permissions, tag, test-images, vm-compute-node-image ]
runs-on: ubuntu-22.04
@@ -865,28 +862,6 @@ jobs:
neondatabase/vm-compute-node-${version}:${{ needs.tag.outputs.build-tag }}
done
- name: Azure login
if: github.ref_name == 'main'
uses: azure/login@6c251865b4e6290e7b78be643ea2d005bc51f69a # @v2.1.1
with:
client-id: ${{ secrets.AZURE_DEV_CLIENT_ID }}
tenant-id: ${{ secrets.AZURE_TENANT_ID }}
subscription-id: ${{ secrets.AZURE_DEV_SUBSCRIPTION_ID }}
- name: Login to ACR
if: github.ref_name == 'main'
run: |
az acr login --name=neoneastus2
- name: Copy docker images to ACR-dev
if: github.ref_name == 'main'
run: |
for image in neon compute-tools {vm-,}compute-node-{v14,v15,v16}; do
docker buildx imagetools create \
-t neoneastus2.azurecr.io/neondatabase/${image}:${{ needs.tag.outputs.build-tag }} \
neondatabase/${image}:${{ needs.tag.outputs.build-tag }}
done
- name: Add latest tag to images
if: github.ref_name == 'main'
run: |

View File

@@ -13,7 +13,6 @@ on:
paths:
- '.github/workflows/pg-clients.yml'
- 'test_runner/pg_clients/**'
- 'test_runner/logical_repl/**'
- 'poetry.lock'
workflow_dispatch:
@@ -50,77 +49,6 @@ jobs:
image-tag: ${{ needs.check-build-tools-image.outputs.image-tag }}
secrets: inherit
test-logical-replication:
needs: [ build-build-tools-image ]
runs-on: ubuntu-22.04
container:
image: ${{ needs.build-build-tools-image.outputs.image }}
credentials:
username: ${{ secrets.NEON_DOCKERHUB_USERNAME }}
password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }}
options: --init --user root
services:
clickhouse:
image: clickhouse/clickhouse-server:24.6.3.64
ports:
- 9000:9000
- 8123:8123
steps:
- uses: actions/checkout@v4
- name: Download Neon artifact
uses: ./.github/actions/download
with:
name: neon-${{ runner.os }}-${{ runner.arch }}-release-artifact
path: /tmp/neon/
prefix: latest
- name: Create Neon Project
id: create-neon-project
uses: ./.github/actions/neon-project-create
with:
api_key: ${{ secrets.NEON_STAGING_API_KEY }}
postgres_version: ${{ env.DEFAULT_PG_VERSION }}
- name: Run tests
uses: ./.github/actions/run-python-test-set
with:
build_type: remote
test_selection: logical_repl
run_in_parallel: false
extra_params: -m remote_cluster
pg_version: ${{ env.DEFAULT_PG_VERSION }}
env:
BENCHMARK_CONNSTR: ${{ steps.create-neon-project.outputs.dsn }}
- name: Delete Neon Project
if: always()
uses: ./.github/actions/neon-project-delete
with:
project_id: ${{ steps.create-neon-project.outputs.project_id }}
api_key: ${{ secrets.NEON_STAGING_API_KEY }}
- name: Create Allure report
if: ${{ !cancelled() }}
id: create-allure-report
uses: ./.github/actions/allure-report-generate
with:
store-test-results-into-db: true
env:
REGRESS_TEST_RESULT_CONNSTR_NEW: ${{ secrets.REGRESS_TEST_RESULT_CONNSTR_NEW }}
- name: Post to a Slack channel
if: github.event.schedule && failure()
uses: slackapi/slack-github-action@v1
with:
channel-id: "C06KHQVQ7U3" # on-call-qa-staging-stream
slack-message: |
Testing the logical replication: <${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}|${{ job.status }}> (<${{ steps.create-allure-report.outputs.report-url }}|test report>)
env:
SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }}
test-postgres-client-libs:
needs: [ build-build-tools-image ]
runs-on: ubuntu-22.04

View File

@@ -1,13 +1,13 @@
/compute_tools/ @neondatabase/control-plane @neondatabase/compute
/storage_controller @neondatabase/storage
/libs/pageserver_api/ @neondatabase/storage
/libs/postgres_ffi/ @neondatabase/compute @neondatabase/storage
/libs/postgres_ffi/ @neondatabase/compute @neondatabase/safekeepers
/libs/remote_storage/ @neondatabase/storage
/libs/safekeeper_api/ @neondatabase/storage
/libs/safekeeper_api/ @neondatabase/safekeepers
/libs/vm_monitor/ @neondatabase/autoscaling
/pageserver/ @neondatabase/storage
/pgxn/ @neondatabase/compute
/pgxn/neon/ @neondatabase/compute @neondatabase/storage
/pgxn/neon/ @neondatabase/compute @neondatabase/safekeepers
/proxy/ @neondatabase/proxy
/safekeeper/ @neondatabase/storage
/safekeeper/ @neondatabase/safekeepers
/vendor/ @neondatabase/compute

187
Cargo.lock generated
View File

@@ -1418,7 +1418,7 @@ dependencies = [
"clap",
"criterion-plot",
"is-terminal",
"itertools 0.10.5",
"itertools",
"num-traits",
"once_cell",
"oorandom",
@@ -1439,7 +1439,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "6b50826342786a51a89e2da3a28f1c32b06e387201bc2d19791f622c673706b1"
dependencies = [
"cast",
"itertools 0.10.5",
"itertools",
]
[[package]]
@@ -1672,7 +1672,6 @@ checksum = "62d6dcd069e7b5fe49a302411f759d4cf1cf2c27fe798ef46fb8baefc053dd2b"
dependencies = [
"bitflags 2.4.1",
"byteorder",
"chrono",
"diesel_derives",
"itoa",
"pq-sys",
@@ -2134,12 +2133,6 @@ dependencies = [
"slab",
]
[[package]]
name = "gen_ops"
version = "0.4.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "304de19db7028420975a296ab0fcbbc8e69438c4ed254a1e41e2a7f37d5f0e0a"
[[package]]
name = "generic-array"
version = "0.14.7"
@@ -2716,6 +2709,17 @@ version = "3.0.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "8bb03732005da905c88227371639bf1ad885cc712789c011c31c5fb3ab3ccf02"
[[package]]
name = "io-lifetimes"
version = "1.0.11"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "eae7b9aee968036d54dce06cebaefd919e4472e753296daccd6d344e3e2df0c2"
dependencies = [
"hermit-abi",
"libc",
"windows-sys 0.48.0",
]
[[package]]
name = "io-uring"
version = "0.6.2"
@@ -2734,13 +2738,14 @@ checksum = "8f518f335dce6725a761382244631d86cf0ccb2863413590b31338feb467f9c3"
[[package]]
name = "is-terminal"
version = "0.4.12"
version = "0.4.7"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f23ff5ef2b80d608d61efee834934d862cd92461afc0560dedf493e4c033738b"
checksum = "adcf93614601c8129ddf72e2d5633df827ba6551541c6d8c59520a371475be1f"
dependencies = [
"hermit-abi",
"libc",
"windows-sys 0.52.0",
"io-lifetimes",
"rustix 0.37.25",
"windows-sys 0.48.0",
]
[[package]]
@@ -2752,15 +2757,6 @@ dependencies = [
"either",
]
[[package]]
name = "itertools"
version = "0.12.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ba291022dbbd398a455acf126c1e341954079855bc60dfdda641363bd6922569"
dependencies = [
"either",
]
[[package]]
name = "itoa"
version = "1.0.6"
@@ -2875,6 +2871,18 @@ version = "0.2.8"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "4ec2a862134d2a7d32d7983ddcdd1c4923530833c9f2ea1a44fc5fa473989058"
[[package]]
name = "linux-raw-sys"
version = "0.1.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f051f77a7c8e6957c0696eac88f26b0117e54f52d3fc682ab19397a8812846a4"
[[package]]
name = "linux-raw-sys"
version = "0.3.8"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ef53942eb7bf7ff43a617b3e2c1c4a5ecf5944a7c1bc12d7ee39bbb15e5c1519"
[[package]]
name = "linux-raw-sys"
version = "0.4.13"
@@ -2992,7 +3000,7 @@ checksum = "7c4b80445aeb08e832d87bf1830049a924cdc1d6b7ef40b6b9b365bff17bf8ec"
dependencies = [
"libc",
"measured",
"procfs",
"procfs 0.16.0",
]
[[package]]
@@ -3037,7 +3045,7 @@ dependencies = [
"measured",
"measured-process",
"once_cell",
"procfs",
"procfs 0.14.2",
"prometheus",
"rand 0.8.5",
"rand_distr",
@@ -3566,7 +3574,7 @@ dependencies = [
"humantime",
"humantime-serde",
"hyper 0.14.26",
"itertools 0.10.5",
"itertools",
"leaky-bucket",
"md5",
"metrics",
@@ -3584,9 +3592,8 @@ dependencies = [
"postgres_connection",
"postgres_ffi",
"pq_proto",
"procfs",
"procfs 0.14.2",
"rand 0.8.5",
"range-set-blaze",
"regex",
"remote_storage",
"reqwest 0.12.4",
@@ -3637,7 +3644,7 @@ dependencies = [
"hex",
"humantime",
"humantime-serde",
"itertools 0.10.5",
"itertools",
"postgres_ffi",
"rand 0.8.5",
"serde",
@@ -3695,7 +3702,7 @@ dependencies = [
"hex-literal",
"humantime",
"humantime-serde",
"itertools 0.10.5",
"itertools",
"metrics",
"once_cell",
"pageserver_api",
@@ -4027,7 +4034,7 @@ name = "postgres_connection"
version = "0.1.0"
dependencies = [
"anyhow",
"itertools 0.10.5",
"itertools",
"once_cell",
"postgres",
"tokio-postgres",
@@ -4085,7 +4092,7 @@ version = "0.1.0"
dependencies = [
"byteorder",
"bytes",
"itertools 0.10.5",
"itertools",
"pin-project-lite",
"postgres-protocol",
"rand 0.8.5",
@@ -4131,6 +4138,21 @@ dependencies = [
"unicode-ident",
]
[[package]]
name = "procfs"
version = "0.14.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b1de8dacb0873f77e6aefc6d71e044761fcc68060290f5b1089fcdf84626bb69"
dependencies = [
"bitflags 1.3.2",
"byteorder",
"chrono",
"flate2",
"hex",
"lazy_static",
"rustix 0.36.16",
]
[[package]]
name = "procfs"
version = "0.16.0"
@@ -4138,12 +4160,10 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "731e0d9356b0c25f16f33b5be79b1c57b562f141ebfcdb0ad8ac2c13a24293b4"
dependencies = [
"bitflags 2.4.1",
"chrono",
"flate2",
"hex",
"lazy_static",
"procfs-core",
"rustix",
"rustix 0.38.28",
]
[[package]]
@@ -4153,15 +4173,14 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "2d3554923a69f4ce04c4a754260c338f505ce22642d3830e049a399fc2059a29"
dependencies = [
"bitflags 2.4.1",
"chrono",
"hex",
]
[[package]]
name = "prometheus"
version = "0.13.4"
version = "0.13.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "3d33c28a30771f7f96db69893f78b857f7450d7e0237e9c8fc6427a81bae7ed1"
checksum = "449811d15fbdf5ceb5c1144416066429cf82316e2ec8ce0c1f6f8a02e7bbcf8c"
dependencies = [
"cfg-if",
"fnv",
@@ -4169,7 +4188,7 @@ dependencies = [
"libc",
"memchr",
"parking_lot 0.12.1",
"procfs",
"procfs 0.14.2",
"thiserror",
]
@@ -4191,7 +4210,7 @@ checksum = "119533552c9a7ffacc21e099c24a0ac8bb19c2a2a3f363de84cd9b844feab270"
dependencies = [
"bytes",
"heck 0.4.1",
"itertools 0.10.5",
"itertools",
"lazy_static",
"log",
"multimap",
@@ -4212,7 +4231,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e5d2d8d10f3c6ded6da8b05b5fb3b8a5082514344d56c9f871412d29b4e075b4"
dependencies = [
"anyhow",
"itertools 0.10.5",
"itertools",
"proc-macro2",
"quote",
"syn 1.0.109",
@@ -4269,7 +4288,7 @@ dependencies = [
"hyper-util",
"indexmap 2.0.1",
"ipnet",
"itertools 0.10.5",
"itertools",
"lasso",
"md5",
"measured",
@@ -4445,18 +4464,6 @@ dependencies = [
"rand_core 0.5.1",
]
[[package]]
name = "range-set-blaze"
version = "0.1.16"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "8421b5d459262eabbe49048d362897ff3e3830b44eac6cfe341d6acb2f0f13d2"
dependencies = [
"gen_ops",
"itertools 0.12.1",
"num-integer",
"num-traits",
]
[[package]]
name = "rayon"
version = "1.7.0"
@@ -4625,7 +4632,7 @@ dependencies = [
"humantime",
"humantime-serde",
"hyper 0.14.26",
"itertools 0.10.5",
"itertools",
"metrics",
"once_cell",
"pin-project-lite",
@@ -4935,6 +4942,34 @@ dependencies = [
"nom",
]
[[package]]
name = "rustix"
version = "0.36.16"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "6da3636faa25820d8648e0e31c5d519bbb01f72fdf57131f0f5f7da5fed36eab"
dependencies = [
"bitflags 1.3.2",
"errno",
"io-lifetimes",
"libc",
"linux-raw-sys 0.1.4",
"windows-sys 0.45.0",
]
[[package]]
name = "rustix"
version = "0.37.25"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d4eb579851244c2c03e7c24f501c3432bed80b8f720af1d6e5b0e0f01555a035"
dependencies = [
"bitflags 1.3.2",
"errno",
"io-lifetimes",
"libc",
"linux-raw-sys 0.3.8",
"windows-sys 0.48.0",
]
[[package]]
name = "rustix"
version = "0.38.28"
@@ -5683,7 +5718,6 @@ dependencies = [
"aws-config",
"bytes",
"camino",
"chrono",
"clap",
"control_plane",
"diesel",
@@ -5694,7 +5728,7 @@ dependencies = [
"hex",
"humantime",
"hyper 0.14.26",
"itertools 0.10.5",
"itertools",
"lasso",
"measured",
"metrics",
@@ -5758,10 +5792,9 @@ dependencies = [
"either",
"futures",
"futures-util",
"git-version",
"hex",
"humantime",
"itertools 0.10.5",
"itertools",
"once_cell",
"pageserver",
"pageserver_api",
@@ -5938,15 +5971,15 @@ dependencies = [
[[package]]
name = "tempfile"
version = "3.9.0"
version = "3.5.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "01ce4141aa927a6d1bd34a041795abd0db1cccba5d5f24b009f694bdf3a1f3fa"
checksum = "b9fbec84f381d5795b08656e4912bec604d162bff9291d6189a78f4c8ab87998"
dependencies = [
"cfg-if",
"fastrand 2.0.0",
"redox_syscall 0.4.1",
"rustix",
"windows-sys 0.52.0",
"fastrand 1.9.0",
"redox_syscall 0.3.5",
"rustix 0.37.25",
"windows-sys 0.45.0",
]
[[package]]
@@ -7143,6 +7176,15 @@ dependencies = [
"windows_x86_64_msvc 0.42.2",
]
[[package]]
name = "windows-sys"
version = "0.45.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "75283be5efb2831d37ea142365f009c02ec203cd29a3ebecbc093d52315b66d0"
dependencies = [
"windows-targets 0.42.2",
]
[[package]]
name = "windows-sys"
version = "0.48.0"
@@ -7161,6 +7203,21 @@ dependencies = [
"windows-targets 0.52.4",
]
[[package]]
name = "windows-targets"
version = "0.42.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "8e5180c00cd44c9b1c88adb3693291f1cd93605ded80c250a75d472756b4d071"
dependencies = [
"windows_aarch64_gnullvm 0.42.2",
"windows_aarch64_msvc 0.42.2",
"windows_i686_gnu 0.42.2",
"windows_i686_msvc 0.42.2",
"windows_x86_64_gnu 0.42.2",
"windows_x86_64_gnullvm 0.42.2",
"windows_x86_64_msvc 0.42.2",
]
[[package]]
name = "windows-targets"
version = "0.48.0"
@@ -7390,7 +7447,7 @@ dependencies = [
"hmac",
"hyper 0.14.26",
"indexmap 1.9.3",
"itertools 0.10.5",
"itertools",
"libc",
"log",
"memchr",

View File

@@ -126,7 +126,7 @@ parquet = { version = "51.0.0", default-features = false, features = ["zstd"] }
parquet_derive = "51.0.0"
pbkdf2 = { version = "0.12.1", features = ["simple", "std"] }
pin-project-lite = "0.2"
procfs = "0.16"
procfs = "0.14"
prometheus = {version = "0.13", default-features=false, features = ["process"]} # removes protobuf dependency
prost = "0.11"
rand = "0.8"

View File

@@ -192,7 +192,7 @@ WORKDIR /home/nonroot
# Rust
# Please keep the version of llvm (installed above) in sync with rust llvm (`rustc --version --verbose | grep LLVM`)
ENV RUSTC_VERSION=1.80.0
ENV RUSTC_VERSION=1.79.0
ENV RUSTUP_HOME="/home/nonroot/.rustup"
ENV PATH="/home/nonroot/.cargo/bin:${PATH}"
RUN curl -sSO https://static.rust-lang.org/rustup/dist/$(uname -m)-unknown-linux-gnu/rustup-init && whoami && \

View File

@@ -313,5 +313,3 @@ To get more familiar with this aspect, refer to:
- Read [CONTRIBUTING.md](/CONTRIBUTING.md) to learn about project code style and practices.
- To get familiar with a source tree layout, use [sourcetree.md](/docs/sourcetree.md).
- To learn more about PostgreSQL internals, check http://www.interdb.jp/pg/index.html
.

View File

@@ -514,6 +514,7 @@ impl LocalEnv {
#[derive(serde::Serialize, serde::Deserialize)]
// (allow unknown fields, unlike PageServerConf)
struct PageserverConfigTomlSubset {
id: NodeId,
listen_pg_addr: String,
listen_http_addr: String,
pg_auth_type: AuthType,
@@ -525,30 +526,18 @@ impl LocalEnv {
.with_context(|| format!("read {:?}", config_toml_path))?,
)
.context("parse pageserver.toml")?;
let identity_toml_path = dentry.path().join("identity.toml");
#[derive(serde::Serialize, serde::Deserialize)]
struct IdentityTomlSubset {
id: NodeId,
}
let identity_toml: IdentityTomlSubset = toml_edit::de::from_str(
&std::fs::read_to_string(&identity_toml_path)
.with_context(|| format!("read {:?}", identity_toml_path))?,
)
.context("parse identity.toml")?;
let PageserverConfigTomlSubset {
id: config_toml_id,
listen_pg_addr,
listen_http_addr,
pg_auth_type,
http_auth_type,
} = config_toml;
let IdentityTomlSubset {
id: identity_toml_id,
} = identity_toml;
let conf = PageServerConf {
id: {
anyhow::ensure!(
identity_toml_id == id,
"id mismatch: identity.toml:id={identity_toml_id} pageserver_(.*) id={id}",
config_toml_id == id,
"id mismatch: config_toml.id={config_toml_id} id={id}",
);
id
},

View File

@@ -127,13 +127,10 @@ impl PageServerNode {
}
// Apply the user-provided overrides
overrides.push({
let mut doc =
toml_edit::ser::to_document(&conf).expect("we deserialized this from toml earlier");
// `id` is written out to `identity.toml` instead of `pageserver.toml`
doc.remove("id").expect("it's part of the struct");
doc.to_string()
});
overrides.push(
toml_edit::ser::to_string_pretty(&conf)
.expect("we deserialized this from toml earlier"),
);
// Turn `overrides` into a toml document.
// TODO: above code is legacy code, it should be refactored to use toml_edit directly.

View File

@@ -1,6 +1,5 @@
use std::collections::HashSet;
use std::str::FromStr;
use std::time::{Duration, Instant};
use std::time::Instant;
/// Request/response types for the storage controller
/// API (`/control/v1` prefix). Implemented by the server
@@ -295,42 +294,6 @@ pub enum PlacementPolicy {
#[derive(Serialize, Deserialize, Debug)]
pub struct TenantShardMigrateResponse {}
/// Metadata health record posted from scrubber.
#[derive(Serialize, Deserialize, Debug)]
pub struct MetadataHealthRecord {
pub tenant_shard_id: TenantShardId,
pub healthy: bool,
pub last_scrubbed_at: chrono::DateTime<chrono::Utc>,
}
#[derive(Serialize, Deserialize, Debug)]
pub struct MetadataHealthUpdateRequest {
pub healthy_tenant_shards: HashSet<TenantShardId>,
pub unhealthy_tenant_shards: HashSet<TenantShardId>,
}
#[derive(Serialize, Deserialize, Debug)]
pub struct MetadataHealthUpdateResponse {}
#[derive(Serialize, Deserialize, Debug)]
pub struct MetadataHealthListUnhealthyResponse {
pub unhealthy_tenant_shards: Vec<TenantShardId>,
}
#[derive(Serialize, Deserialize, Debug)]
pub struct MetadataHealthListOutdatedRequest {
#[serde(with = "humantime_serde")]
pub not_scrubbed_for: Duration,
}
#[derive(Serialize, Deserialize, Debug)]
pub struct MetadataHealthListOutdatedResponse {
pub health_records: Vec<MetadataHealthRecord>,
}
#[cfg(test)]
mod test {
use super::*;

View File

@@ -29,7 +29,7 @@ use anyhow::{bail, Result};
use bytes::{Bytes, BytesMut};
/// Equivalent to sizeof(ControlFileData) in C
const SIZEOF_CONTROLDATA: usize = size_of::<ControlFileData>();
const SIZEOF_CONTROLDATA: usize = std::mem::size_of::<ControlFileData>();
impl ControlFileData {
/// Compute the offset of the `crc` field within the `ControlFileData` struct.

View File

@@ -31,7 +31,7 @@ pub const SMGR_TRUNCATE_FSM: u32 = 0x0004;
//
// Assumes 8 byte alignment
const SIZEOF_PAGE_HEADER_DATA: usize = size_of::<PageHeaderData>();
const SIZEOF_PAGE_HEADER_DATA: usize = std::mem::size_of::<PageHeaderData>();
pub const MAXALIGN_SIZE_OF_PAGE_HEADER_DATA: usize = (SIZEOF_PAGE_HEADER_DATA + 7) & !7;
//
@@ -191,7 +191,7 @@ pub const XLR_RMGR_INFO_MASK: u8 = 0xF0;
pub const XLOG_TBLSPC_CREATE: u8 = 0x00;
pub const XLOG_TBLSPC_DROP: u8 = 0x10;
pub const SIZEOF_XLOGRECORD: u32 = size_of::<XLogRecord>() as u32;
pub const SIZEOF_XLOGRECORD: u32 = std::mem::size_of::<XLogRecord>() as u32;
//
// from xlogrecord.h

View File

@@ -42,9 +42,9 @@ pub const XLP_FIRST_IS_CONTRECORD: u16 = 0x0001;
pub const XLP_REM_LEN_OFFS: usize = 2 + 2 + 4 + 8;
pub const XLOG_RECORD_CRC_OFFS: usize = 4 + 4 + 8 + 1 + 1 + 2;
pub const XLOG_SIZE_OF_XLOG_SHORT_PHD: usize = size_of::<XLogPageHeaderData>();
pub const XLOG_SIZE_OF_XLOG_LONG_PHD: usize = size_of::<XLogLongPageHeaderData>();
pub const XLOG_SIZE_OF_XLOG_RECORD: usize = size_of::<XLogRecord>();
pub const XLOG_SIZE_OF_XLOG_SHORT_PHD: usize = std::mem::size_of::<XLogPageHeaderData>();
pub const XLOG_SIZE_OF_XLOG_LONG_PHD: usize = std::mem::size_of::<XLogLongPageHeaderData>();
pub const XLOG_SIZE_OF_XLOG_RECORD: usize = std::mem::size_of::<XLogRecord>();
#[allow(clippy::identity_op)]
pub const SIZE_OF_XLOG_RECORD_DATA_HEADER_SHORT: usize = 1 * 2;
@@ -311,7 +311,7 @@ impl XLogLongPageHeaderData {
}
}
pub const SIZEOF_CHECKPOINT: usize = size_of::<CheckPoint>();
pub const SIZEOF_CHECKPOINT: usize = std::mem::size_of::<CheckPoint>();
impl CheckPoint {
pub fn encode(&self) -> Result<Bytes, SerializeError> {

View File

@@ -178,7 +178,7 @@ pub fn test_find_end_of_wal_last_crossing_segment() {
/// currently 1024.
#[test]
pub fn test_update_next_xid() {
let checkpoint_buf = [0u8; size_of::<CheckPoint>()];
let checkpoint_buf = [0u8; std::mem::size_of::<CheckPoint>()];
let mut checkpoint = CheckPoint::decode(&checkpoint_buf).unwrap();
checkpoint.nextXid = FullTransactionId { value: 10 };
@@ -204,7 +204,7 @@ pub fn test_update_next_xid() {
#[test]
pub fn test_update_next_multixid() {
let checkpoint_buf = [0u8; size_of::<CheckPoint>()];
let checkpoint_buf = [0u8; std::mem::size_of::<CheckPoint>()];
let mut checkpoint = CheckPoint::decode(&checkpoint_buf).unwrap();
// simple case

View File

@@ -355,8 +355,7 @@ impl RemoteStorage for AzureBlobStorage {
.blobs()
.map(|k| ListingObject{
key: self.name_to_relative_path(&k.name),
last_modified: k.properties.last_modified.into(),
size: k.properties.content_length,
last_modified: k.properties.last_modified.into()
}
);

View File

@@ -144,7 +144,6 @@ impl RemotePath {
///
/// The WithDelimiter mode will populate `prefixes` and `keys` in the result. The
/// NoDelimiter mode will only populate `keys`.
#[derive(Copy, Clone)]
pub enum ListingMode {
WithDelimiter,
NoDelimiter,
@@ -154,7 +153,6 @@ pub enum ListingMode {
pub struct ListingObject {
pub key: RemotePath,
pub last_modified: SystemTime,
pub size: u64,
}
#[derive(Default)]
@@ -196,7 +194,7 @@ pub trait RemoteStorage: Send + Sync + 'static {
mode: ListingMode,
max_keys: Option<NonZeroU32>,
cancel: &CancellationToken,
) -> impl Stream<Item = Result<Listing, DownloadError>> + Send;
) -> impl Stream<Item = Result<Listing, DownloadError>>;
async fn list(
&self,
@@ -353,10 +351,10 @@ impl<Other: RemoteStorage> GenericRemoteStorage<Arc<Other>> {
mode: ListingMode,
max_keys: Option<NonZeroU32>,
cancel: &'a CancellationToken,
) -> impl Stream<Item = Result<Listing, DownloadError>> + 'a + Send {
) -> impl Stream<Item = Result<Listing, DownloadError>> + 'a {
match self {
Self::LocalFs(s) => Box::pin(s.list_streaming(prefix, mode, max_keys, cancel))
as Pin<Box<dyn Stream<Item = Result<Listing, DownloadError>> + Send>>,
as Pin<Box<dyn Stream<Item = Result<Listing, DownloadError>>>>,
Self::AwsS3(s) => Box::pin(s.list_streaming(prefix, mode, max_keys, cancel)),
Self::AzureBlob(s) => Box::pin(s.list_streaming(prefix, mode, max_keys, cancel)),
Self::Unreliable(s) => Box::pin(s.list_streaming(prefix, mode, max_keys, cancel)),

View File

@@ -368,7 +368,6 @@ impl RemoteStorage for LocalFs {
key: k.clone(),
// LocalFs is just for testing, so just specify a dummy time
last_modified: SystemTime::now(),
size: 0,
})
}
})
@@ -412,7 +411,6 @@ impl RemoteStorage for LocalFs {
key: RemotePath::from_string(&relative_key).unwrap(),
// LocalFs is just for testing
last_modified: SystemTime::now(),
size: 0,
});
}
}

View File

@@ -565,12 +565,9 @@ impl RemoteStorage for S3Bucket {
}
};
let size = object.size.unwrap_or(0) as u64;
result.keys.push(ListingObject{
key,
last_modified,
size,
last_modified
});
if let Some(mut mk) = max_keys {
assert!(mk > 0);

View File

@@ -114,7 +114,7 @@ impl RemoteStorage for UnreliableWrapper {
mode: ListingMode,
max_keys: Option<NonZeroU32>,
cancel: &CancellationToken,
) -> impl Stream<Item = Result<Listing, DownloadError>> + Send {
) -> impl Stream<Item = Result<Listing, DownloadError>> {
async_stream::stream! {
self.attempt(RemoteOp::ListPrefixes(prefix.cloned()))
.map_err(DownloadError::Other)?;

View File

@@ -18,20 +18,20 @@ const STORAGE_TOKEN_ALGORITHM: Algorithm = Algorithm::EdDSA;
#[derive(Debug, Serialize, Deserialize, Clone, Copy, PartialEq)]
#[serde(rename_all = "lowercase")]
pub enum Scope {
/// Provides access to all data for a specific tenant (specified in `struct Claims` below)
// Provides access to all data for a specific tenant (specified in `struct Claims` below)
// TODO: join these two?
Tenant,
/// Provides blanket access to all tenants on the pageserver plus pageserver-wide APIs.
/// Should only be used e.g. for status check/tenant creation/list.
// Provides blanket access to all tenants on the pageserver plus pageserver-wide APIs.
// Should only be used e.g. for status check/tenant creation/list.
PageServerApi,
/// Provides blanket access to all data on the safekeeper plus safekeeper-wide APIs.
/// Should only be used e.g. for status check.
/// Currently also used for connection from any pageserver to any safekeeper.
// Provides blanket access to all data on the safekeeper plus safekeeper-wide APIs.
// Should only be used e.g. for status check.
// Currently also used for connection from any pageserver to any safekeeper.
SafekeeperData,
/// The scope used by pageservers in upcalls to storage controller and cloud control plane
// The scope used by pageservers in upcalls to storage controller and cloud control plane
#[serde(rename = "generations_api")]
GenerationsApi,
/// Allows access to control plane managment API and some storage controller endpoints.
// Allows access to control plane managment API and some storage controller endpoints.
Admin,
/// Allows access to storage controller APIs used by the scrubber, to interrogate the state

View File

@@ -8,10 +8,33 @@ pub struct Completion {
_token: TaskTrackerToken,
}
impl std::fmt::Debug for Completion {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
f.debug_struct("Completion")
.field("siblings", &self._token.task_tracker().len())
.finish()
}
}
impl Completion {
/// Returns true if this completion is associated with the given barrier.
pub fn blocks(&self, barrier: &Barrier) -> bool {
TaskTracker::ptr_eq(self._token.task_tracker(), &barrier.0)
}
}
/// Barrier will wait until all clones of [`Completion`] have been dropped.
#[derive(Clone)]
pub struct Barrier(TaskTracker);
impl std::fmt::Debug for Barrier {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
f.debug_struct("Barrier")
.field("remaining", &self.0.len())
.finish()
}
}
impl Default for Barrier {
fn default() -> Self {
let (_, rx) = channel();

View File

@@ -49,7 +49,6 @@ postgres_backend.workspace = true
postgres-protocol.workspace = true
postgres-types.workspace = true
rand.workspace = true
range-set-blaze = { version = "0.1.16", features = ["alloc"] }
regex.workspace = true
scopeguard.workspace = true
serde.workspace = true

View File

@@ -1,4 +1,3 @@
use criterion::measurement::WallTime;
use pageserver::keyspace::{KeyPartitioning, KeySpace};
use pageserver::repository::Key;
use pageserver::tenant::layer_map::LayerMap;
@@ -16,11 +15,7 @@ use utils::id::{TenantId, TimelineId};
use utils::lsn::Lsn;
use criterion::{black_box, criterion_group, criterion_main, BenchmarkGroup, Criterion};
fn fixture_path(relative: &str) -> PathBuf {
PathBuf::from(env!("CARGO_MANIFEST_DIR")).join(relative)
}
use criterion::{black_box, criterion_group, criterion_main, Criterion};
fn build_layer_map(filename_dump: PathBuf) -> LayerMap {
let mut layer_map = LayerMap::default();
@@ -114,7 +109,7 @@ fn uniform_key_partitioning(layer_map: &LayerMap, _lsn: Lsn) -> KeyPartitioning
// between each test run.
fn bench_from_captest_env(c: &mut Criterion) {
// TODO consider compressing this file
let layer_map = build_layer_map(fixture_path("benches/odd-brook-layernames.txt"));
let layer_map = build_layer_map(PathBuf::from("benches/odd-brook-layernames.txt"));
let queries: Vec<(Key, Lsn)> = uniform_query_pattern(&layer_map);
// Test with uniform query pattern
@@ -144,7 +139,7 @@ fn bench_from_captest_env(c: &mut Criterion) {
fn bench_from_real_project(c: &mut Criterion) {
// Init layer map
let now = Instant::now();
let layer_map = build_layer_map(fixture_path("benches/odd-brook-layernames.txt"));
let layer_map = build_layer_map(PathBuf::from("benches/odd-brook-layernames.txt"));
println!("Finished layer map init in {:?}", now.elapsed());
// Choose uniformly distributed queries
@@ -247,72 +242,7 @@ fn bench_sequential(c: &mut Criterion) {
group.finish();
}
fn bench_visibility_with_map(
group: &mut BenchmarkGroup<WallTime>,
layer_map: LayerMap,
read_points: Vec<Lsn>,
bench_name: &str,
) {
group.bench_function(bench_name, |b| {
b.iter(|| black_box(layer_map.get_visibility(read_points.clone())));
});
}
// Benchmark using synthetic data. Arrange image layers on stacked diagonal lines.
fn bench_visibility(c: &mut Criterion) {
let mut group = c.benchmark_group("visibility");
{
// Init layer map. Create 100_000 layers arranged in 1000 diagonal lines.
let now = Instant::now();
let mut layer_map = LayerMap::default();
let mut updates = layer_map.batch_update();
for i in 0..100_000 {
let i32 = (i as u32) % 100;
let zero = Key::from_hex("000000000000000000000000000000000000").unwrap();
let layer = PersistentLayerDesc::new_img(
TenantShardId::unsharded(TenantId::generate()),
TimelineId::generate(),
zero.add(10 * i32)..zero.add(10 * i32 + 1),
Lsn(i),
0,
);
updates.insert_historic(layer);
}
updates.flush();
println!("Finished layer map init in {:?}", now.elapsed());
let mut read_points = Vec::new();
for i in (0..100_000).step_by(1000) {
read_points.push(Lsn(i));
}
bench_visibility_with_map(&mut group, layer_map, read_points, "sequential");
}
{
let layer_map = build_layer_map(fixture_path("benches/odd-brook-layernames.txt"));
let read_points = vec![Lsn(0x1C760FA190)];
bench_visibility_with_map(&mut group, layer_map, read_points, "real_map");
let layer_map = build_layer_map(fixture_path("benches/odd-brook-layernames.txt"));
let read_points = vec![
Lsn(0x1C760FA190),
Lsn(0x000000931BEAD539),
Lsn(0x000000931BF63011),
Lsn(0x000000931B33AE68),
Lsn(0x00000038E67ABFA0),
Lsn(0x000000931B33AE68),
Lsn(0x000000914E3F38F0),
Lsn(0x000000931B33AE68),
];
bench_visibility_with_map(&mut group, layer_map, read_points, "real_map_many_branches");
}
group.finish();
}
criterion_group!(group_1, bench_from_captest_env);
criterion_group!(group_2, bench_from_real_project);
criterion_group!(group_3, bench_sequential);
criterion_group!(group_4, bench_visibility);
criterion_main!(group_1, group_2, group_3, group_4);
criterion_main!(group_1, group_2, group_3);

View File

@@ -17,9 +17,11 @@ use pageserver::config::PageserverIdentity;
use pageserver::control_plane_client::ControlPlaneClient;
use pageserver::disk_usage_eviction_task::{self, launch_disk_usage_global_eviction_task};
use pageserver::metrics::{STARTUP_DURATION, STARTUP_IS_LOADING};
use pageserver::task_mgr::{COMPUTE_REQUEST_RUNTIME, WALRECEIVER_RUNTIME};
use pageserver::task_mgr::WALRECEIVER_RUNTIME;
use pageserver::tenant::{secondary, TenantSharedResources};
use pageserver::{CancellableTask, ConsumptionMetricsTasks, HttpEndpointListener};
use pageserver::{
CancellableTask, ConsumptionMetricsTasks, HttpEndpointListener, LibpqEndpointListener,
};
use remote_storage::GenericRemoteStorage;
use tokio::signal::unix::SignalKind;
use tokio::time::Instant;
@@ -29,9 +31,11 @@ use tracing::*;
use metrics::set_build_info_metric;
use pageserver::{
config::PageServerConf,
context::{DownloadBehavior, RequestContext},
deletion_queue::DeletionQueue,
http, page_cache, page_service, task_mgr,
task_mgr::{BACKGROUND_RUNTIME, MGMT_REQUEST_RUNTIME},
task_mgr::TaskKind,
task_mgr::{BACKGROUND_RUNTIME, COMPUTE_REQUEST_RUNTIME, MGMT_REQUEST_RUNTIME},
tenant::mgr,
virtual_file,
};
@@ -125,7 +129,6 @@ fn main() -> anyhow::Result<()> {
info!(?conf.virtual_file_io_engine, "starting with virtual_file IO engine");
info!(?conf.get_impl, "starting with get page implementation");
info!(?conf.get_vectored_impl, "starting with vectored get page implementation");
info!(?conf.compact_level0_phase1_value_access, "starting with setting for compact_level0_phase1_value_access");
let tenants_path = conf.tenants_path();
if !tenants_path.exists() {
@@ -590,13 +593,30 @@ fn start_pageserver(
// Spawn a task to listen for libpq connections. It will spawn further tasks
// for each connection. We created the listener earlier already.
let page_service = page_service::spawn(conf, tenant_manager.clone(), pg_auth, {
let _entered = COMPUTE_REQUEST_RUNTIME.enter(); // TcpListener::from_std requires it
pageserver_listener
.set_nonblocking(true)
.context("set listener to nonblocking")?;
tokio::net::TcpListener::from_std(pageserver_listener).context("create tokio listener")?
});
let libpq_listener = {
let cancel = CancellationToken::new();
let libpq_ctx = RequestContext::todo_child(
TaskKind::LibpqEndpointListener,
// listener task shouldn't need to download anything. (We will
// create a separate sub-contexts for each connection, with their
// own download behavior. This context is used only to listen and
// accept connections.)
DownloadBehavior::Error,
);
let task = COMPUTE_REQUEST_RUNTIME.spawn(task_mgr::exit_on_panic_or_error(
"libpq listener",
page_service::libpq_listener_main(
tenant_manager.clone(),
pg_auth,
pageserver_listener,
conf.pg_auth_type,
libpq_ctx,
cancel.clone(),
),
));
LibpqEndpointListener(CancellableTask { task, cancel })
};
let mut shutdown_pageserver = Some(shutdown_pageserver.drop_guard());
@@ -624,7 +644,7 @@ fn start_pageserver(
shutdown_pageserver.take();
pageserver::shutdown_pageserver(
http_endpoint_listener,
page_service,
libpq_listener,
consumption_metrics_tasks,
disk_usage_eviction_task,
&tenant_manager,

View File

@@ -29,7 +29,6 @@ use utils::{
logging::LogFormat,
};
use crate::tenant::timeline::compaction::CompactL0Phase1ValueAccess;
use crate::tenant::vectored_blob_io::MaxVectoredReadBytes;
use crate::tenant::{config::TenantConfOpt, timeline::GetImpl};
use crate::tenant::{TENANTS_SEGMENT_NAME, TIMELINES_SEGMENT_NAME};
@@ -53,7 +52,7 @@ pub mod defaults {
use pageserver_api::models::ImageCompressionAlgorithm;
pub use storage_broker::DEFAULT_ENDPOINT as BROKER_DEFAULT_ENDPOINT;
pub const DEFAULT_WAIT_LSN_TIMEOUT: &str = "300 s";
pub const DEFAULT_WAIT_LSN_TIMEOUT: &str = "60 s";
pub const DEFAULT_WAL_REDO_TIMEOUT: &str = "60 s";
pub const DEFAULT_SUPERUSER: &str = "cloud_admin";
@@ -84,16 +83,16 @@ pub mod defaults {
#[cfg(not(target_os = "linux"))]
pub const DEFAULT_VIRTUAL_FILE_IO_ENGINE: &str = "std-fs";
pub const DEFAULT_GET_VECTORED_IMPL: &str = "vectored";
pub const DEFAULT_GET_VECTORED_IMPL: &str = "sequential";
pub const DEFAULT_GET_IMPL: &str = "vectored";
pub const DEFAULT_GET_IMPL: &str = "legacy";
pub const DEFAULT_MAX_VECTORED_READ_BYTES: usize = 128 * 1024; // 128 KiB
pub const DEFAULT_IMAGE_COMPRESSION: ImageCompressionAlgorithm =
ImageCompressionAlgorithm::Disabled;
pub const DEFAULT_VALIDATE_VECTORED_GET: bool = false;
pub const DEFAULT_VALIDATE_VECTORED_GET: bool = true;
pub const DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB: usize = 0;
@@ -296,10 +295,6 @@ pub struct PageServerConf {
pub ephemeral_bytes_per_memory_kb: usize,
pub l0_flush: L0FlushConfig,
/// This flag is temporary and will be removed after gradual rollout.
/// See <https://github.com/neondatabase/neon/issues/8184>.
pub compact_level0_phase1_value_access: CompactL0Phase1ValueAccess,
}
/// We do not want to store this in a PageServerConf because the latter may be logged
@@ -361,6 +356,8 @@ struct PageServerConfigBuilder {
auth_validation_public_key_path: BuilderValue<Option<Utf8PathBuf>>,
remote_storage_config: BuilderValue<Option<RemoteStorageConfig>>,
id: BuilderValue<NodeId>,
broker_endpoint: BuilderValue<Uri>,
broker_keepalive_interval: BuilderValue<Duration>,
@@ -406,13 +403,14 @@ struct PageServerConfigBuilder {
ephemeral_bytes_per_memory_kb: BuilderValue<usize>,
l0_flush: BuilderValue<L0FlushConfig>,
compact_level0_phase1_value_access: BuilderValue<CompactL0Phase1ValueAccess>,
}
impl PageServerConfigBuilder {
fn new() -> Self {
Self::default()
fn new(node_id: NodeId) -> Self {
let mut this = Self::default();
this.id(node_id);
this
}
#[inline(always)]
@@ -440,6 +438,7 @@ impl PageServerConfigBuilder {
pg_auth_type: Set(AuthType::Trust),
auth_validation_public_key_path: Set(None),
remote_storage_config: Set(None),
id: NotSet,
broker_endpoint: Set(storage_broker::DEFAULT_ENDPOINT
.parse()
.expect("failed to parse default broker endpoint")),
@@ -497,7 +496,6 @@ impl PageServerConfigBuilder {
validate_vectored_get: Set(DEFAULT_VALIDATE_VECTORED_GET),
ephemeral_bytes_per_memory_kb: Set(DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB),
l0_flush: Set(L0FlushConfig::default()),
compact_level0_phase1_value_access: Set(CompactL0Phase1ValueAccess::default()),
}
}
}
@@ -570,6 +568,10 @@ impl PageServerConfigBuilder {
self.broker_keepalive_interval = BuilderValue::Set(broker_keepalive_interval)
}
pub fn id(&mut self, node_id: NodeId) {
self.id = BuilderValue::Set(node_id)
}
pub fn log_format(&mut self, log_format: LogFormat) {
self.log_format = BuilderValue::Set(log_format)
}
@@ -681,11 +683,7 @@ impl PageServerConfigBuilder {
self.l0_flush = BuilderValue::Set(value);
}
pub fn compact_level0_phase1_value_access(&mut self, value: CompactL0Phase1ValueAccess) {
self.compact_level0_phase1_value_access = BuilderValue::Set(value);
}
pub fn build(self, id: NodeId) -> anyhow::Result<PageServerConf> {
pub fn build(self) -> anyhow::Result<PageServerConf> {
let default = Self::default_values();
macro_rules! conf {
@@ -718,6 +716,7 @@ impl PageServerConfigBuilder {
pg_auth_type,
auth_validation_public_key_path,
remote_storage_config,
id,
broker_endpoint,
broker_keepalive_interval,
log_format,
@@ -742,11 +741,9 @@ impl PageServerConfigBuilder {
image_compression,
ephemeral_bytes_per_memory_kb,
l0_flush,
compact_level0_phase1_value_access,
}
CUSTOM LOGIC
{
id: id,
// TenantConf is handled separately
default_tenant_conf: TenantConf::default(),
concurrent_tenant_warmup: ConfigurableSemaphore::new({
@@ -896,7 +893,7 @@ impl PageServerConf {
toml: &Document,
workdir: &Utf8Path,
) -> anyhow::Result<Self> {
let mut builder = PageServerConfigBuilder::new();
let mut builder = PageServerConfigBuilder::new(node_id);
builder.workdir(workdir.to_owned());
let mut t_conf = TenantConfOpt::default();
@@ -927,6 +924,8 @@ impl PageServerConf {
"tenant_config" => {
t_conf = TenantConfOpt::try_from(item.to_owned()).context(format!("failed to parse: '{key}'"))?;
}
"id" => {}, // Ignoring `id` field in pageserver.toml - using identity.toml as the source of truth
// Logging is not set up yet, so we can't do it.
"broker_endpoint" => builder.broker_endpoint(parse_toml_string(key, item)?.parse().context("failed to parse broker endpoint")?),
"broker_keepalive_interval" => builder.broker_keepalive_interval(parse_toml_duration(key, item)?),
"log_format" => builder.log_format(
@@ -1015,14 +1014,11 @@ impl PageServerConf {
"l0_flush" => {
builder.l0_flush(utils::toml_edit_ext::deserialize_item(item).context("l0_flush")?)
}
"compact_level0_phase1_value_access" => {
builder.compact_level0_phase1_value_access(utils::toml_edit_ext::deserialize_item(item).context("compact_level0_phase1_value_access")?)
}
_ => bail!("unrecognized pageserver option '{key}'"),
}
}
let mut conf = builder.build(node_id).context("invalid config")?;
let mut conf = builder.build().context("invalid config")?;
if conf.http_auth_type == AuthType::NeonJWT || conf.pg_auth_type == AuthType::NeonJWT {
let auth_validation_public_key_path = conf
@@ -1102,7 +1098,6 @@ impl PageServerConf {
validate_vectored_get: defaults::DEFAULT_VALIDATE_VECTORED_GET,
ephemeral_bytes_per_memory_kb: defaults::DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB,
l0_flush: L0FlushConfig::default(),
compact_level0_phase1_value_access: CompactL0Phase1ValueAccess::default(),
}
}
}
@@ -1260,6 +1255,7 @@ max_file_descriptors = 333
# initial superuser role name to use when creating a new tenant
initial_superuser_name = 'zzzz'
id = 10
metric_collection_interval = '222 s'
metric_collection_endpoint = 'http://localhost:80/metrics'
@@ -1276,8 +1272,9 @@ background_task_maximum_delay = '334 s'
let (workdir, pg_distrib_dir) = prepare_fs(&tempdir)?;
let broker_endpoint = storage_broker::DEFAULT_ENDPOINT;
// we have to create dummy values to overcome the validation errors
let config_string =
format!("pg_distrib_dir='{pg_distrib_dir}'\nbroker_endpoint = '{broker_endpoint}'",);
let config_string = format!(
"pg_distrib_dir='{pg_distrib_dir}'\nid=10\nbroker_endpoint = '{broker_endpoint}'",
);
let toml = config_string.parse()?;
let parsed_config = PageServerConf::parse_and_validate(NodeId(10), &toml, &workdir)
@@ -1344,7 +1341,6 @@ background_task_maximum_delay = '334 s'
image_compression: defaults::DEFAULT_IMAGE_COMPRESSION,
ephemeral_bytes_per_memory_kb: defaults::DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB,
l0_flush: L0FlushConfig::default(),
compact_level0_phase1_value_access: CompactL0Phase1ValueAccess::default(),
},
"Correct defaults should be used when no config values are provided"
);
@@ -1419,7 +1415,6 @@ background_task_maximum_delay = '334 s'
image_compression: defaults::DEFAULT_IMAGE_COMPRESSION,
ephemeral_bytes_per_memory_kb: defaults::DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB,
l0_flush: L0FlushConfig::default(),
compact_level0_phase1_value_access: CompactL0Phase1ValueAccess::default(),
},
"Should be able to parse all basic config values correctly"
);
@@ -1584,6 +1579,7 @@ broker_endpoint = '{broker_endpoint}'
r#"pg_distrib_dir = "{pg_distrib_dir}"
metric_collection_endpoint = "http://sample.url"
metric_collection_interval = "10min"
id = 222
[disk_usage_based_eviction]
max_usage_pct = 80
@@ -1653,6 +1649,7 @@ threshold = "20m"
r#"pg_distrib_dir = "{pg_distrib_dir}"
metric_collection_endpoint = "http://sample.url"
metric_collection_interval = "10min"
id = 222
[tenant_config]
evictions_low_residence_duration_metric_threshold = "20m"

View File

@@ -296,11 +296,6 @@ impl From<GetActiveTenantError> for ApiError {
GetActiveTenantError::WaitForActiveTimeout { .. } => {
ApiError::ResourceUnavailable(format!("{}", e).into())
}
GetActiveTenantError::SwitchedTenant => {
// in our HTTP handlers, this error doesn't happen
// TODO: separate error types
ApiError::ResourceUnavailable("switched tenant".into())
}
}
}
}
@@ -1816,7 +1811,7 @@ async fn timeline_detach_ancestor_handler(
// drop(tenant);
let resp = match progress {
detach_ancestor::Progress::Prepared(_guard, prepared) => {
detach_ancestor::Progress::Prepared(attempt, prepared) => {
// it would be great to tag the guard on to the tenant activation future
let reparented_timelines = state
.tenant_manager
@@ -1824,6 +1819,7 @@ async fn timeline_detach_ancestor_handler(
tenant_shard_id,
timeline_id,
prepared,
attempt,
ctx,
)
.await
@@ -1837,6 +1833,7 @@ async fn timeline_detach_ancestor_handler(
detach_ancestor::Progress::Done(resp) => resp,
};
// FIXME: if the ordering is really needed and not a hashset, move it here?
json_response(StatusCode::OK, resp)
}
.instrument(span)
@@ -2134,24 +2131,14 @@ async fn secondary_download_handler(
let timeout = wait.unwrap_or(Duration::MAX);
let result = tokio::time::timeout(
let status = match tokio::time::timeout(
timeout,
state.secondary_controller.download_tenant(tenant_shard_id),
)
.await;
let progress = secondary_tenant.progress.lock().unwrap().clone();
let status = match result {
Ok(Ok(())) => {
if progress.layers_downloaded >= progress.layers_total {
// Download job ran to completion
StatusCode::OK
} else {
// Download dropped out without errors because it ran out of time budget
StatusCode::ACCEPTED
}
}
.await
{
// Download job ran to completion.
Ok(Ok(())) => StatusCode::OK,
// Edge case: downloads aren't usually fallible: things like a missing heatmap are considered
// okay. We could get an error here in the unlikely edge case that the tenant
// was detached between our check above and executing the download job.
@@ -2161,6 +2148,8 @@ async fn secondary_download_handler(
Err(_) => StatusCode::ACCEPTED,
};
let progress = secondary_tenant.progress.lock().unwrap().clone();
json_response(status, progress)
}

View File

@@ -2,23 +2,13 @@ use std::{num::NonZeroUsize, sync::Arc};
use crate::tenant::ephemeral_file;
#[derive(Debug, PartialEq, Eq, Clone, serde::Deserialize)]
#[derive(Default, Debug, PartialEq, Eq, Clone, serde::Deserialize)]
#[serde(tag = "mode", rename_all = "kebab-case", deny_unknown_fields)]
pub enum L0FlushConfig {
#[default]
PageCached,
#[serde(rename_all = "snake_case")]
Direct {
max_concurrency: NonZeroUsize,
},
}
impl Default for L0FlushConfig {
fn default() -> Self {
Self::Direct {
// TODO: using num_cpus results in different peak memory usage on different instance types.
max_concurrency: NonZeroUsize::new(usize::max(1, num_cpus::get())).unwrap(),
}
}
Direct { max_concurrency: NonZeroUsize },
}
#[derive(Clone)]

View File

@@ -12,8 +12,6 @@ pub mod disk_usage_eviction_task;
pub mod http;
pub mod import_datadir;
pub mod l0_flush;
use futures::{stream::FuturesUnordered, StreamExt};
pub use pageserver_api::keyspace;
use tokio_util::sync::CancellationToken;
pub mod aux_file;
@@ -32,13 +30,14 @@ pub mod walingest;
pub mod walrecord;
pub mod walredo;
use crate::task_mgr::TaskKind;
use camino::Utf8Path;
use deletion_queue::DeletionQueue;
use tenant::{
mgr::{BackgroundPurges, TenantManager},
secondary,
};
use tracing::{info, info_span};
use tracing::info;
/// Current storage format version
///
@@ -64,6 +63,7 @@ pub struct CancellableTask {
pub cancel: CancellationToken,
}
pub struct HttpEndpointListener(pub CancellableTask);
pub struct LibpqEndpointListener(pub CancellableTask);
pub struct ConsumptionMetricsTasks(pub CancellableTask);
pub struct DiskUsageEvictionTask(pub CancellableTask);
impl CancellableTask {
@@ -77,7 +77,7 @@ impl CancellableTask {
#[allow(clippy::too_many_arguments)]
pub async fn shutdown_pageserver(
http_listener: HttpEndpointListener,
page_service: page_service::Listener,
libpq_listener: LibpqEndpointListener,
consumption_metrics_worker: ConsumptionMetricsTasks,
disk_usage_eviction_task: Option<DiskUsageEvictionTask>,
tenant_manager: &TenantManager,
@@ -87,83 +87,10 @@ pub async fn shutdown_pageserver(
exit_code: i32,
) {
use std::time::Duration;
// If the orderly shutdown below takes too long, we still want to make
// sure that all walredo processes are killed and wait()ed on by us, not systemd.
//
// (Leftover walredo processes are the hypothesized trigger for the systemd freezes
// that we keep seeing in prod => https://github.com/neondatabase/cloud/issues/11387.
//
// We use a thread instead of a tokio task because the background runtime is likely busy
// with the final flushing / uploads. This activity here has priority, and due to lack
// of scheduling priority feature sin the tokio scheduler, using a separate thread is
// an effective priority booster.
let walredo_extraordinary_shutdown_thread_span = {
let span = info_span!(parent: None, "walredo_extraordinary_shutdown_thread");
span.follows_from(tracing::Span::current());
span
};
let walredo_extraordinary_shutdown_thread_cancel = CancellationToken::new();
let walredo_extraordinary_shutdown_thread = std::thread::spawn({
let walredo_extraordinary_shutdown_thread_cancel =
walredo_extraordinary_shutdown_thread_cancel.clone();
move || {
let rt = tokio::runtime::Builder::new_current_thread()
.enable_all()
.build()
.unwrap();
let _entered = rt.enter();
let _entered = walredo_extraordinary_shutdown_thread_span.enter();
if let Ok(()) = rt.block_on(tokio::time::timeout(
Duration::from_secs(8),
walredo_extraordinary_shutdown_thread_cancel.cancelled(),
)) {
info!("cancellation requested");
return;
}
let managers = tenant::WALREDO_MANAGERS
.lock()
.unwrap()
// prevents new walredo managers from being inserted
.take()
.expect("only we take()");
// Use FuturesUnordered to get in queue early for each manager's
// heavier_once_cell semaphore wait list.
// Also, for idle tenants that for some reason haven't
// shut down yet, it's quite likely that we're not going
// to get Poll::Pending once.
let mut futs: FuturesUnordered<_> = managers
.into_iter()
.filter_map(|(_, mgr)| mgr.upgrade())
.map(|mgr| async move { tokio::task::unconstrained(mgr.shutdown()).await })
.collect();
info!(count=%futs.len(), "built FuturesUnordered");
let mut last_log_at = std::time::Instant::now();
#[derive(Debug, Default)]
struct Results {
initiated: u64,
already: u64,
}
let mut results = Results::default();
while let Some(we_initiated) = rt.block_on(futs.next()) {
if we_initiated {
results.initiated += 1;
} else {
results.already += 1;
}
if last_log_at.elapsed() > Duration::from_millis(100) {
info!(remaining=%futs.len(), ?results, "progress");
last_log_at = std::time::Instant::now();
}
}
info!(?results, "done");
}
});
// Shut down the libpq endpoint task. This prevents new connections from
// being accepted.
let remaining_connections = timed(
page_service.stop_accepting(),
timed(
libpq_listener.0.shutdown(),
"shutdown LibpqEndpointListener",
Duration::from_secs(1),
)
@@ -181,7 +108,7 @@ pub async fn shutdown_pageserver(
// Shut down any page service tasks: any in-progress work for particular timelines or tenants
// should already have been canclled via mgr::shutdown_all_tenants
timed(
remaining_connections.shutdown(),
task_mgr::shutdown_tasks(Some(TaskKind::PageRequestHandler), None, None),
"shutdown PageRequestHandlers",
Duration::from_secs(1),
)
@@ -235,12 +162,6 @@ pub async fn shutdown_pageserver(
Duration::from_secs(1),
)
.await;
info!("cancel & join walredo_extraordinary_shutdown_thread");
walredo_extraordinary_shutdown_thread_cancel.cancel();
walredo_extraordinary_shutdown_thread.join().unwrap();
info!("walredo_extraordinary_shutdown_thread done");
info!("Shut down successfully completed");
std::process::exit(exit_code);
}

View File

@@ -525,15 +525,6 @@ static RESIDENT_PHYSICAL_SIZE: Lazy<UIntGaugeVec> = Lazy::new(|| {
.expect("failed to define a metric")
});
static VISIBLE_PHYSICAL_SIZE: Lazy<UIntGaugeVec> = Lazy::new(|| {
register_uint_gauge_vec!(
"pageserver_visible_physical_size",
"The size of the layer files present in the pageserver's filesystem.",
&["tenant_id", "shard_id", "timeline_id"]
)
.expect("failed to define a metric")
});
pub(crate) static RESIDENT_PHYSICAL_SIZE_GLOBAL: Lazy<UIntGauge> = Lazy::new(|| {
register_uint_gauge!(
"pageserver_resident_physical_size_global",
@@ -622,23 +613,7 @@ pub(crate) static CIRCUIT_BREAKERS_UNBROKEN: Lazy<IntCounter> = Lazy::new(|| {
pub(crate) static COMPRESSION_IMAGE_INPUT_BYTES: Lazy<IntCounter> = Lazy::new(|| {
register_int_counter!(
"pageserver_compression_image_in_bytes_total",
"Size of data written into image layers before compression"
)
.expect("failed to define a metric")
});
pub(crate) static COMPRESSION_IMAGE_INPUT_BYTES_CONSIDERED: Lazy<IntCounter> = Lazy::new(|| {
register_int_counter!(
"pageserver_compression_image_in_bytes_considered",
"Size of potentially compressible data written into image layers before compression"
)
.expect("failed to define a metric")
});
pub(crate) static COMPRESSION_IMAGE_INPUT_BYTES_CHOSEN: Lazy<IntCounter> = Lazy::new(|| {
register_int_counter!(
"pageserver_compression_image_in_bytes_chosen",
"Size of data whose compressed form was written into image layers"
"Size of uncompressed data written into image layers"
)
.expect("failed to define a metric")
});
@@ -2213,7 +2188,6 @@ pub(crate) struct TimelineMetrics {
pub(crate) layer_count_delta: UIntGauge,
pub standby_horizon_gauge: IntGauge,
pub resident_physical_size_gauge: UIntGauge,
pub visible_physical_size_gauge: UIntGauge,
/// copy of LayeredTimeline.current_logical_size
pub current_logical_size_gauge: UIntGauge,
pub aux_file_size_gauge: IntGauge,
@@ -2336,9 +2310,6 @@ impl TimelineMetrics {
let resident_physical_size_gauge = RESIDENT_PHYSICAL_SIZE
.get_metric_with_label_values(&[&tenant_id, &shard_id, &timeline_id])
.unwrap();
let visible_physical_size_gauge = VISIBLE_PHYSICAL_SIZE
.get_metric_with_label_values(&[&tenant_id, &shard_id, &timeline_id])
.unwrap();
// TODO: we shouldn't expose this metric
let current_logical_size_gauge = CURRENT_LOGICAL_SIZE
.get_metric_with_label_values(&[&tenant_id, &shard_id, &timeline_id])
@@ -2393,7 +2364,6 @@ impl TimelineMetrics {
layer_count_delta,
standby_horizon_gauge,
resident_physical_size_gauge,
visible_physical_size_gauge,
current_logical_size_gauge,
aux_file_size_gauge,
directory_entries_count_gauge,
@@ -2445,7 +2415,6 @@ impl TimelineMetrics {
RESIDENT_PHYSICAL_SIZE_GLOBAL.sub(self.resident_physical_size_get());
let _ = RESIDENT_PHYSICAL_SIZE.remove_label_values(&[tenant_id, shard_id, timeline_id]);
}
let _ = VISIBLE_PHYSICAL_SIZE.remove_label_values(&[tenant_id, shard_id, timeline_id]);
let _ = CURRENT_LOGICAL_SIZE.remove_label_values(&[tenant_id, shard_id, timeline_id]);
if let Some(metric) = Lazy::get(&DIRECTORY_ENTRIES_COUNT) {
let _ = metric.remove_label_values(&[tenant_id, shard_id, timeline_id]);

File diff suppressed because it is too large Load Diff

View File

@@ -8,7 +8,8 @@ use std::time::Duration;
pub use pageserver_api::key::{Key, KEY_SIZE};
/// A 'value' stored for a one Key.
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
#[derive(Debug, Clone, Serialize, Deserialize)]
#[cfg_attr(test, derive(PartialEq))]
pub enum Value {
/// An Image value contains a full copy of the value
Image(Bytes),

View File

@@ -33,9 +33,9 @@ use remote_storage::GenericRemoteStorage;
use remote_storage::TimeoutOrCancel;
use std::collections::BTreeMap;
use std::fmt;
use std::sync::Weak;
use std::time::SystemTime;
use storage_broker::BrokerClientChannel;
use timeline::detach_ancestor;
use tokio::io::BufReader;
use tokio::sync::watch;
use tokio::task::JoinSet;
@@ -103,7 +103,8 @@ use std::fmt::Debug;
use std::fmt::Display;
use std::fs;
use std::fs::File;
use std::sync::atomic::{AtomicU64, Ordering};
use std::sync::atomic::AtomicU64;
use std::sync::atomic::Ordering;
use std::sync::Arc;
use std::sync::Mutex;
use std::time::{Duration, Instant};
@@ -300,8 +301,11 @@ pub struct Tenant {
pub(crate) timeline_get_throttle:
Arc<throttle::Throttle<&'static crate::metrics::tenant_throttling::TimelineGet>>,
/// An ongoing timeline detach must be checked during attempts to GC or compact a timeline.
ongoing_timeline_detach: std::sync::Mutex<Option<(TimelineId, utils::completion::Barrier)>>,
/// An ongoing timeline detach must be checked during attempts to GC a timeline.
///
/// After starting the timeline detach ancestor, blocking GC until it completes allows retrying
/// the ancestor detach, until we can be certain that all reparentings have been done.
ongoing_timeline_detach: timeline::detach_ancestor::SharedState,
l0_flush_global_state: L0FlushGlobalState,
}
@@ -313,66 +317,14 @@ impl std::fmt::Debug for Tenant {
}
pub(crate) enum WalRedoManager {
Prod(WalredoManagerId, PostgresRedoManager),
Prod(PostgresRedoManager),
#[cfg(test)]
Test(harness::TestRedoManager),
}
#[derive(thiserror::Error, Debug)]
#[error("pageserver is shutting down")]
pub(crate) struct GlobalShutDown;
impl WalRedoManager {
pub(crate) fn new(mgr: PostgresRedoManager) -> Result<Arc<Self>, GlobalShutDown> {
let id = WalredoManagerId::next();
let arc = Arc::new(Self::Prod(id, mgr));
let mut guard = WALREDO_MANAGERS.lock().unwrap();
match &mut *guard {
Some(map) => {
map.insert(id, Arc::downgrade(&arc));
Ok(arc)
}
None => Err(GlobalShutDown),
}
}
}
impl Drop for WalRedoManager {
fn drop(&mut self) {
match self {
Self::Prod(id, _) => {
let mut guard = WALREDO_MANAGERS.lock().unwrap();
if let Some(map) = &mut *guard {
map.remove(id).expect("new() registers, drop() unregisters");
}
}
#[cfg(test)]
Self::Test(_) => {
// Not applicable to test redo manager
}
}
}
}
/// Global registry of all walredo managers so that [`crate::shutdown_pageserver`] can shut down
/// the walredo processes outside of the regular order.
///
/// This is necessary to work around a systemd bug where it freezes if there are
/// walredo processes left => <https://github.com/neondatabase/cloud/issues/11387>
#[allow(clippy::type_complexity)]
pub(crate) static WALREDO_MANAGERS: once_cell::sync::Lazy<
Mutex<Option<HashMap<WalredoManagerId, Weak<WalRedoManager>>>>,
> = once_cell::sync::Lazy::new(|| Mutex::new(Some(HashMap::new())));
#[derive(PartialEq, Eq, Hash, Clone, Copy, Debug)]
pub(crate) struct WalredoManagerId(u64);
impl WalredoManagerId {
pub fn next() -> Self {
static NEXT: std::sync::atomic::AtomicU64 = std::sync::atomic::AtomicU64::new(1);
let id = NEXT.fetch_add(1, std::sync::atomic::Ordering::Relaxed);
if id == 0 {
panic!("WalredoManagerId::new() returned 0, indicating wraparound, risking it's no longer unique");
}
Self(id)
impl From<PostgresRedoManager> for WalRedoManager {
fn from(mgr: PostgresRedoManager) -> Self {
Self::Prod(mgr)
}
}
@@ -384,20 +336,19 @@ impl From<harness::TestRedoManager> for WalRedoManager {
}
impl WalRedoManager {
pub(crate) async fn shutdown(&self) -> bool {
pub(crate) async fn shutdown(&self) {
match self {
Self::Prod(_, mgr) => mgr.shutdown().await,
Self::Prod(mgr) => mgr.shutdown().await,
#[cfg(test)]
Self::Test(_) => {
// Not applicable to test redo manager
true
}
}
}
pub(crate) fn maybe_quiesce(&self, idle_timeout: Duration) {
match self {
Self::Prod(_, mgr) => mgr.maybe_quiesce(idle_timeout),
Self::Prod(mgr) => mgr.maybe_quiesce(idle_timeout),
#[cfg(test)]
Self::Test(_) => {
// Not applicable to test redo manager
@@ -417,7 +368,7 @@ impl WalRedoManager {
pg_version: u32,
) -> Result<bytes::Bytes, walredo::Error> {
match self {
Self::Prod(_, mgr) => {
Self::Prod(mgr) => {
mgr.request_redo(key, lsn, base_img, records, pg_version)
.await
}
@@ -431,7 +382,7 @@ impl WalRedoManager {
pub(crate) fn status(&self) -> Option<WalRedoManagerStatus> {
match self {
WalRedoManager::Prod(_, m) => Some(m.status()),
WalRedoManager::Prod(m) => Some(m.status()),
#[cfg(test)]
WalRedoManager::Test(_) => None,
}
@@ -440,8 +391,6 @@ impl WalRedoManager {
#[derive(Debug, thiserror::Error, PartialEq, Eq)]
pub enum GetTimelineError {
#[error("Timeline is shutting down")]
ShuttingDown,
#[error("Timeline {tenant_id}/{timeline_id} is not active, state: {state:?}")]
NotActive {
tenant_id: TenantShardId,
@@ -730,10 +679,13 @@ impl Tenant {
shard_identity: ShardIdentity,
init_order: Option<InitializationOrder>,
mode: SpawnMode,
existing_detach_attempt: Option<&detach_ancestor::Attempt>,
ctx: &RequestContext,
) -> Result<Arc<Tenant>, GlobalShutDown> {
let wal_redo_manager =
WalRedoManager::new(PostgresRedoManager::new(conf, tenant_shard_id))?;
) -> Arc<Tenant> {
let wal_redo_manager = Arc::new(WalRedoManager::from(PostgresRedoManager::new(
conf,
tenant_shard_id,
)));
let TenantSharedResources {
broker_client,
@@ -757,6 +709,12 @@ impl Tenant {
l0_flush_global_state,
));
if let Some(attempt) = existing_detach_attempt {
tenant
.ongoing_timeline_detach
.continue_existing_attempt(attempt);
}
// The attach task will carry a GateGuard, so that shutdown() reliably waits for it to drop out if
// we shut down while attaching.
let attach_gate_guard = tenant
@@ -809,9 +767,9 @@ impl Tenant {
// The Stopping case is for when we have passed control on to DeleteTenantFlow:
// if it errors, we will call make_broken when tenant is already in Stopping.
assert!(
matches!(*state, TenantState::Attaching | TenantState::Stopping { .. }),
"the attach task owns the tenant state until activation is complete"
);
matches!(*state, TenantState::Attaching | TenantState::Stopping { .. }),
"the attach task owns the tenant state until activation is complete"
);
*state = TenantState::broken_from_reason(err.to_string());
});
@@ -932,7 +890,7 @@ impl Tenant {
}
.instrument(tracing::info_span!(parent: None, "attach", tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(), gen=?generation)),
);
Ok(tenant)
tenant
}
#[instrument(skip_all)]
@@ -1036,6 +994,8 @@ impl Tenant {
}
}
let mut shared_state_builder = timeline::detach_ancestor::SharedStateBuilder::default();
// For every timeline, download the metadata file, scan the local directory,
// and build a layer map that contains an entry for each remote and local
// layer file.
@@ -1045,6 +1005,8 @@ impl Tenant {
.remove(&timeline_id)
.expect("just put it in above");
shared_state_builder.record_loading_timeline(&timeline_id, &index_part);
// TODO again handle early failure
self.load_remote_timeline(
timeline_id,
@@ -1089,6 +1051,8 @@ impl Tenant {
// IndexPart is the source of truth.
self.clean_up_timelines(&existent_timelines)?;
shared_state_builder.build(&self.ongoing_timeline_detach);
fail::fail_point!("attach-before-activate", |_| {
anyhow::bail!("attach-before-activate");
});
@@ -1280,29 +1244,11 @@ impl Tenant {
Ok(timeline_preloads)
}
pub(crate) async fn apply_timeline_archival_config(
pub async fn apply_timeline_archival_config(
&self,
timeline_id: TimelineId,
state: TimelineArchivalState,
_timeline_id: TimelineId,
_config: TimelineArchivalState,
) -> anyhow::Result<()> {
let timeline = self
.get_timeline(timeline_id, false)
.context("Cannot apply timeline archival config to inexistent timeline")?;
let upload_needed = timeline
.remote_client
.schedule_index_upload_for_timeline_archival_state(state)?;
if upload_needed {
const MAX_WAIT: Duration = Duration::from_secs(10);
let Ok(v) =
tokio::time::timeout(MAX_WAIT, timeline.remote_client.wait_completion()).await
else {
tracing::warn!("reached timeout for waiting on upload queue");
bail!("reached timeout for upload queue flush");
};
v?;
}
Ok(())
}
@@ -1634,7 +1580,7 @@ impl Tenant {
self: Arc<Self>,
timeline_id: TimelineId,
) -> Result<(), DeleteTimelineError> {
DeleteTimelineFlow::run(&self, timeline_id).await?;
DeleteTimelineFlow::run(&self, timeline_id, false).await?;
Ok(())
}
@@ -1679,6 +1625,11 @@ impl Tenant {
}
}
if self.ongoing_timeline_detach.attempt_blocks_gc() {
info!("Skipping GC while there is an ongoing detach_ancestor attempt");
return Ok(GcResult::default());
}
self.gc_iteration_internal(target_timeline_id, horizon, pitr, cancel, ctx)
.await
}
@@ -1687,23 +1638,21 @@ impl Tenant {
/// This function is periodically called by compactor task.
/// Also it can be explicitly requested per timeline through page server
/// api's 'compact' command.
///
/// Returns whether we have pending compaction task.
async fn compaction_iteration(
&self,
cancel: &CancellationToken,
ctx: &RequestContext,
) -> Result<bool, timeline::CompactionError> {
) -> Result<(), timeline::CompactionError> {
// Don't start doing work during shutdown, or when broken, we do not need those in the logs
if !self.is_active() {
return Ok(false);
return Ok(());
}
{
let conf = self.tenant_conf.load();
if !conf.location.may_delete_layers_hint() || !conf.location.may_upload_layers_hint() {
info!("Skipping compaction in location state {:?}", conf.location);
return Ok(false);
return Ok(());
}
}
@@ -1730,13 +1679,11 @@ impl Tenant {
// Before doing any I/O work, check our circuit breaker
if self.compaction_circuit_breaker.lock().unwrap().is_broken() {
info!("Skipping compaction due to previous failures");
return Ok(false);
return Ok(());
}
let mut has_pending_task = false;
for (timeline_id, timeline) in &timelines_to_compact {
has_pending_task |= timeline
timeline
.compact(cancel, EnumSet::empty(), ctx)
.instrument(info_span!("compact_timeline", %timeline_id))
.await
@@ -1756,7 +1703,7 @@ impl Tenant {
.unwrap()
.success(&CIRCUIT_BREAKERS_UNBROKEN);
Ok(has_pending_task)
Ok(())
}
// Call through to all timelines to freeze ephemeral layers if needed. Usually
@@ -2690,7 +2637,7 @@ impl Tenant {
&crate::metrics::tenant_throttling::TIMELINE_GET,
)),
tenant_conf: Arc::new(ArcSwap::from_pointee(attached_conf)),
ongoing_timeline_detach: std::sync::Mutex::default(),
ongoing_timeline_detach: Default::default(),
l0_flush_global_state,
}
}
@@ -6963,11 +6910,7 @@ mod tests {
vec![
// Image layer at GC horizon
PersistentLayerKey {
key_range: {
let mut key = Key::MAX;
key.field6 -= 1;
Key::MIN..key
},
key_range: Key::MIN..Key::MAX,
lsn_range: Lsn(0x30)..Lsn(0x31),
is_delta: false
},
@@ -6986,15 +6929,6 @@ mod tests {
]
);
// increase GC horizon and compact again
{
// Update GC info
let mut guard = tline.gc_info.write().unwrap();
guard.cutoffs.time = Lsn(0x40);
guard.cutoffs.space = Lsn(0x40);
}
tline.compact_with_gc(&cancel, &ctx).await.unwrap();
Ok(())
}
@@ -7346,15 +7280,6 @@ mod tests {
);
}
// increase GC horizon and compact again
{
// Update GC info
let mut guard = tline.gc_info.write().unwrap();
guard.cutoffs.time = Lsn(0x40);
guard.cutoffs.space = Lsn(0x40);
}
tline.compact_with_gc(&cancel, &ctx).await.unwrap();
Ok(())
}
@@ -7423,7 +7348,6 @@ mod tests {
Lsn(0x60),
&[Lsn(0x20), Lsn(0x40), Lsn(0x50)],
3,
None,
)
.await
.unwrap();
@@ -7548,7 +7472,7 @@ mod tests {
),
];
let res = tline
.generate_key_retention(key, &history, Lsn(0x60), &[Lsn(0x40), Lsn(0x50)], 3, None)
.generate_key_retention(key, &history, Lsn(0x60), &[Lsn(0x40), Lsn(0x50)], 3)
.await
.unwrap();
let expected_res = KeyHistoryRetention {
@@ -7594,114 +7518,6 @@ mod tests {
};
assert_eq!(res, expected_res);
// In case of branch compaction, the branch itself does not have the full history, and we need to provide
// the ancestor image in the test case.
let history = vec![
(
key,
Lsn(0x20),
Value::WalRecord(NeonWalRecord::wal_append(";0x20")),
),
(
key,
Lsn(0x30),
Value::WalRecord(NeonWalRecord::wal_append(";0x30")),
),
(
key,
Lsn(0x40),
Value::WalRecord(NeonWalRecord::wal_append(";0x40")),
),
(
key,
Lsn(0x70),
Value::WalRecord(NeonWalRecord::wal_append(";0x70")),
),
];
let res = tline
.generate_key_retention(
key,
&history,
Lsn(0x60),
&[],
3,
Some((key, Lsn(0x10), Bytes::copy_from_slice(b"0x10"))),
)
.await
.unwrap();
let expected_res = KeyHistoryRetention {
below_horizon: vec![(
Lsn(0x60),
KeyLogAtLsn(vec![(
Lsn(0x60),
Value::Image(Bytes::copy_from_slice(b"0x10;0x20;0x30;0x40")), // use the ancestor image to reconstruct the page
)]),
)],
above_horizon: KeyLogAtLsn(vec![(
Lsn(0x70),
Value::WalRecord(NeonWalRecord::wal_append(";0x70")),
)]),
};
assert_eq!(res, expected_res);
let history = vec![
(
key,
Lsn(0x20),
Value::WalRecord(NeonWalRecord::wal_append(";0x20")),
),
(
key,
Lsn(0x40),
Value::WalRecord(NeonWalRecord::wal_append(";0x40")),
),
(
key,
Lsn(0x60),
Value::WalRecord(NeonWalRecord::wal_append(";0x60")),
),
(
key,
Lsn(0x70),
Value::WalRecord(NeonWalRecord::wal_append(";0x70")),
),
];
let res = tline
.generate_key_retention(
key,
&history,
Lsn(0x60),
&[Lsn(0x30)],
3,
Some((key, Lsn(0x10), Bytes::copy_from_slice(b"0x10"))),
)
.await
.unwrap();
let expected_res = KeyHistoryRetention {
below_horizon: vec![
(
Lsn(0x30),
KeyLogAtLsn(vec![(
Lsn(0x20),
Value::WalRecord(NeonWalRecord::wal_append(";0x20")),
)]),
),
(
Lsn(0x60),
KeyLogAtLsn(vec![(
Lsn(0x60),
Value::Image(Bytes::copy_from_slice(b"0x10;0x20;0x40;0x60")),
)]),
),
],
above_horizon: KeyLogAtLsn(vec![(
Lsn(0x70),
Value::WalRecord(NeonWalRecord::wal_append(";0x70")),
)]),
};
assert_eq!(res, expected_res);
Ok(())
}
@@ -7859,10 +7675,6 @@ mod tests {
];
let verify_result = || async {
let gc_horizon = {
let gc_info = tline.gc_info.read().unwrap();
gc_info.cutoffs.time
};
for idx in 0..10 {
assert_eq!(
tline
@@ -7873,7 +7685,7 @@ mod tests {
);
assert_eq!(
tline
.get(get_key(idx as u32), gc_horizon, &ctx)
.get(get_key(idx as u32), Lsn(0x30), &ctx)
.await
.unwrap(),
&expected_result_at_gc_horizon[idx]
@@ -7899,205 +7711,6 @@ mod tests {
let cancel = CancellationToken::new();
tline.compact_with_gc(&cancel, &ctx).await.unwrap();
verify_result().await;
// compact again
tline.compact_with_gc(&cancel, &ctx).await.unwrap();
verify_result().await;
// increase GC horizon and compact again
{
// Update GC info
let mut guard = tline.gc_info.write().unwrap();
guard.cutoffs.time = Lsn(0x38);
guard.cutoffs.space = Lsn(0x38);
}
tline.compact_with_gc(&cancel, &ctx).await.unwrap();
verify_result().await; // no wals between 0x30 and 0x38, so we should obtain the same result
// not increasing the GC horizon and compact again
tline.compact_with_gc(&cancel, &ctx).await.unwrap();
verify_result().await;
Ok(())
}
#[tokio::test]
async fn test_simple_bottom_most_compaction_on_branch() -> anyhow::Result<()> {
let harness = TenantHarness::create("test_simple_bottom_most_compaction_on_branch").await?;
let (tenant, ctx) = harness.load().await;
fn get_key(id: u32) -> Key {
let mut key = Key::from_hex("000000000033333333444444445500000000").unwrap();
key.field6 = id;
key
}
let img_layer = (0..10)
.map(|id| (get_key(id), Bytes::from(format!("value {id}@0x10"))))
.collect_vec();
let delta1 = vec![
(
get_key(1),
Lsn(0x20),
Value::WalRecord(NeonWalRecord::wal_append("@0x20")),
),
(
get_key(2),
Lsn(0x30),
Value::WalRecord(NeonWalRecord::wal_append("@0x30")),
),
(
get_key(3),
Lsn(0x28),
Value::WalRecord(NeonWalRecord::wal_append("@0x28")),
),
(
get_key(3),
Lsn(0x30),
Value::WalRecord(NeonWalRecord::wal_append("@0x30")),
),
(
get_key(3),
Lsn(0x40),
Value::WalRecord(NeonWalRecord::wal_append("@0x40")),
),
];
let delta2 = vec![
(
get_key(5),
Lsn(0x20),
Value::WalRecord(NeonWalRecord::wal_append("@0x20")),
),
(
get_key(6),
Lsn(0x20),
Value::WalRecord(NeonWalRecord::wal_append("@0x20")),
),
];
let delta3 = vec![
(
get_key(8),
Lsn(0x48),
Value::WalRecord(NeonWalRecord::wal_append("@0x48")),
),
(
get_key(9),
Lsn(0x48),
Value::WalRecord(NeonWalRecord::wal_append("@0x48")),
),
];
let parent_tline = tenant
.create_test_timeline_with_layers(
TIMELINE_ID,
Lsn(0x10),
DEFAULT_PG_VERSION,
&ctx,
vec![], // delta layers
vec![(Lsn(0x18), img_layer)], // image layers
Lsn(0x18),
)
.await?;
parent_tline.add_extra_test_dense_keyspace(KeySpace::single(get_key(0)..get_key(10)));
let branch_tline = tenant
.branch_timeline_test_with_layers(
&parent_tline,
NEW_TIMELINE_ID,
Some(Lsn(0x18)),
&ctx,
vec![
DeltaLayerTestDesc::new_with_inferred_key_range(Lsn(0x20)..Lsn(0x48), delta1),
DeltaLayerTestDesc::new_with_inferred_key_range(Lsn(0x20)..Lsn(0x48), delta2),
DeltaLayerTestDesc::new_with_inferred_key_range(Lsn(0x48)..Lsn(0x50), delta3),
], // delta layers
vec![], // image layers
Lsn(0x50),
)
.await?;
branch_tline.add_extra_test_dense_keyspace(KeySpace::single(get_key(0)..get_key(10)));
{
// Update GC info
let mut guard = parent_tline.gc_info.write().unwrap();
*guard = GcInfo {
retain_lsns: vec![(Lsn(0x18), branch_tline.timeline_id)],
cutoffs: GcCutoffs {
time: Lsn(0x10),
space: Lsn(0x10),
},
leases: Default::default(),
within_ancestor_pitr: false,
};
}
{
// Update GC info
let mut guard = branch_tline.gc_info.write().unwrap();
*guard = GcInfo {
retain_lsns: vec![(Lsn(0x40), branch_tline.timeline_id)],
cutoffs: GcCutoffs {
time: Lsn(0x50),
space: Lsn(0x50),
},
leases: Default::default(),
within_ancestor_pitr: false,
};
}
let expected_result_at_gc_horizon = [
Bytes::from_static(b"value 0@0x10"),
Bytes::from_static(b"value 1@0x10@0x20"),
Bytes::from_static(b"value 2@0x10@0x30"),
Bytes::from_static(b"value 3@0x10@0x28@0x30@0x40"),
Bytes::from_static(b"value 4@0x10"),
Bytes::from_static(b"value 5@0x10@0x20"),
Bytes::from_static(b"value 6@0x10@0x20"),
Bytes::from_static(b"value 7@0x10"),
Bytes::from_static(b"value 8@0x10@0x48"),
Bytes::from_static(b"value 9@0x10@0x48"),
];
let expected_result_at_lsn_40 = [
Bytes::from_static(b"value 0@0x10"),
Bytes::from_static(b"value 1@0x10@0x20"),
Bytes::from_static(b"value 2@0x10@0x30"),
Bytes::from_static(b"value 3@0x10@0x28@0x30@0x40"),
Bytes::from_static(b"value 4@0x10"),
Bytes::from_static(b"value 5@0x10@0x20"),
Bytes::from_static(b"value 6@0x10@0x20"),
Bytes::from_static(b"value 7@0x10"),
Bytes::from_static(b"value 8@0x10"),
Bytes::from_static(b"value 9@0x10"),
];
let verify_result = || async {
for idx in 0..10 {
assert_eq!(
branch_tline
.get(get_key(idx as u32), Lsn(0x50), &ctx)
.await
.unwrap(),
&expected_result_at_gc_horizon[idx]
);
assert_eq!(
branch_tline
.get(get_key(idx as u32), Lsn(0x40), &ctx)
.await
.unwrap(),
&expected_result_at_lsn_40[idx]
);
}
};
verify_result().await;
let cancel = CancellationToken::new();
branch_tline.compact_with_gc(&cancel, &ctx).await.unwrap();
verify_result().await;

View File

@@ -28,12 +28,6 @@ use crate::virtual_file::VirtualFile;
use std::cmp::min;
use std::io::{Error, ErrorKind};
#[derive(Copy, Clone, Debug)]
pub struct CompressionInfo {
pub written_compressed: bool,
pub compressed_size: Option<usize>,
}
impl<'a> BlockCursor<'a> {
/// Read a blob into a new buffer.
pub async fn read_blob(
@@ -279,10 +273,8 @@ impl<const BUFFERED: bool> BlobWriter<BUFFERED> {
srcbuf: B,
ctx: &RequestContext,
) -> (B::Buf, Result<u64, Error>) {
let (buf, res) = self
.write_blob_maybe_compressed(srcbuf, ctx, ImageCompressionAlgorithm::Disabled)
.await;
(buf, res.map(|(off, _compression_info)| off))
self.write_blob_maybe_compressed(srcbuf, ctx, ImageCompressionAlgorithm::Disabled)
.await
}
/// Write a blob of data. Returns the offset that it was written to,
@@ -292,12 +284,8 @@ impl<const BUFFERED: bool> BlobWriter<BUFFERED> {
srcbuf: B,
ctx: &RequestContext,
algorithm: ImageCompressionAlgorithm,
) -> (B::Buf, Result<(u64, CompressionInfo), Error>) {
) -> (B::Buf, Result<u64, Error>) {
let offset = self.offset;
let mut compression_info = CompressionInfo {
written_compressed: false,
compressed_size: None,
};
let len = srcbuf.bytes_init();
@@ -340,9 +328,7 @@ impl<const BUFFERED: bool> BlobWriter<BUFFERED> {
encoder.write_all(&slice[..]).await.unwrap();
encoder.shutdown().await.unwrap();
let compressed = encoder.into_inner();
compression_info.compressed_size = Some(compressed.len());
if compressed.len() < len {
compression_info.written_compressed = true;
let compressed_len = compressed.len();
compressed_buf = Some(compressed);
(BYTE_ZSTD, compressed_len, slice.into_inner())
@@ -373,7 +359,7 @@ impl<const BUFFERED: bool> BlobWriter<BUFFERED> {
} else {
self.write_all(srcbuf, ctx).await
};
(srcbuf, res.map(|_| (offset, compression_info)))
(srcbuf, res.map(|_| offset))
}
}
@@ -430,14 +416,12 @@ pub(crate) mod tests {
let mut wtr = BlobWriter::<BUFFERED>::new(file, 0);
for blob in blobs.iter() {
let (_, res) = if compression {
let res = wtr
.write_blob_maybe_compressed(
blob.clone(),
ctx,
ImageCompressionAlgorithm::Zstd { level: Some(1) },
)
.await;
(res.0, res.1.map(|(off, _)| off))
wtr.write_blob_maybe_compressed(
blob.clone(),
ctx,
ImageCompressionAlgorithm::Zstd { level: Some(1) },
)
.await
} else {
wtr.write_blob(blob.clone(), ctx).await
};

View File

@@ -296,19 +296,13 @@ where
let mut stack = Vec::new();
stack.push((self.root_blk, None));
let block_cursor = self.reader.block_cursor();
let mut node_buf = [0_u8; PAGE_SZ];
while let Some((node_blknum, opt_iter)) = stack.pop() {
// Read the node, through the PS PageCache, into local variable `node_buf`.
// We could keep the page cache read guard alive, but, at the time of writing,
// we run quite small PS PageCache s => can't risk running out of
// PageCache space because this stream isn't consumed fast enough.
let page_read_guard = block_cursor
// Locate the node.
let node_buf = block_cursor
.read_blk(self.start_blk + node_blknum, ctx)
.await?;
node_buf.copy_from_slice(page_read_guard.as_ref());
drop(page_read_guard); // drop page cache read guard early
let node = OnDiskNode::deparse(&node_buf)?;
let node = OnDiskNode::deparse(node_buf.as_ref())?;
let prefix_len = node.prefix_len as usize;
let suffix_len = node.suffix_len as usize;
@@ -351,7 +345,6 @@ where
Either::Left(idx..node.num_children.into())
};
// idx points to the first match now. Keep going from there
while let Some(idx) = iter.next() {
let key_off = idx * suffix_len;

View File

@@ -51,8 +51,7 @@ use crate::keyspace::KeyPartitioning;
use crate::repository::Key;
use crate::tenant::storage_layer::InMemoryLayer;
use anyhow::Result;
use pageserver_api::keyspace::{KeySpace, KeySpaceAccum};
use range_set_blaze::{CheckSortedDisjoint, RangeSetBlaze};
use pageserver_api::keyspace::KeySpaceAccum;
use std::collections::{HashMap, VecDeque};
use std::iter::Peekable;
use std::ops::Range;
@@ -62,7 +61,7 @@ use utils::lsn::Lsn;
use historic_layer_coverage::BufferedHistoricLayerCoverage;
pub use historic_layer_coverage::LayerKey;
use super::storage_layer::{LayerVisibilityHint, PersistentLayerDesc};
use super::storage_layer::PersistentLayerDesc;
///
/// LayerMap tracks what layers exist on a timeline.
@@ -872,183 +871,11 @@ impl LayerMap {
println!("End dump LayerMap");
Ok(())
}
/// `read_points` represent the tip of a timeline and any branch points, i.e. the places
/// where we expect to serve reads.
///
/// This function is O(N) and should be called infrequently. The caller is responsible for
/// looking up and updating the Layer objects for these layer descriptors.
pub fn get_visibility(
&self,
mut read_points: Vec<Lsn>,
) -> (
Vec<(Arc<PersistentLayerDesc>, LayerVisibilityHint)>,
KeySpace,
) {
// This is like a KeySpace, but this type is intended for efficient unions with image layer ranges, whereas
// KeySpace is intended to be composed statically and iterated over.
struct KeyShadow {
// Map of range start to range end
inner: RangeSetBlaze<i128>,
}
impl KeyShadow {
fn new() -> Self {
Self {
inner: Default::default(),
}
}
fn contains(&self, range: Range<Key>) -> bool {
let range_incl = range.start.to_i128()..=range.end.to_i128() - 1;
self.inner.is_superset(&RangeSetBlaze::from_sorted_disjoint(
CheckSortedDisjoint::from([range_incl]),
))
}
/// Add the input range to the keys covered by self.
///
/// Return true if inserting this range covered some keys that were previously not covered
fn cover(&mut self, insert: Range<Key>) -> bool {
let range_incl = insert.start.to_i128()..=insert.end.to_i128() - 1;
self.inner.ranges_insert(range_incl)
}
fn reset(&mut self) {
self.inner = Default::default();
}
fn to_keyspace(&self) -> KeySpace {
let mut accum = KeySpaceAccum::new();
for range_incl in self.inner.ranges() {
let range = Range {
start: Key::from_i128(*range_incl.start()),
end: Key::from_i128(range_incl.end() + 1),
};
accum.add_range(range)
}
accum.to_keyspace()
}
}
// The 'shadow' will be updated as we sweep through the layers: an image layer subtracts from the shadow,
// and a ReadPoint
read_points.sort_by_key(|rp| rp.0);
let mut shadow = KeyShadow::new();
// We will interleave all our read points and layers into a sorted collection
enum Item {
ReadPoint { lsn: Lsn },
Layer(Arc<PersistentLayerDesc>),
}
let mut items = Vec::with_capacity(self.historic.len() + read_points.len());
items.extend(self.iter_historic_layers().map(Item::Layer));
items.extend(
read_points
.into_iter()
.map(|rp| Item::ReadPoint { lsn: rp }),
);
// Ordering: we want to iterate like this:
// 1. Highest LSNs first
// 2. Consider images before deltas if they end at the same LSNs (images cover deltas)
// 3. Consider ReadPoints before image layers if they're at the same LSN (readpoints make that image visible)
items.sort_by_key(|item| {
std::cmp::Reverse(match item {
Item::Layer(layer) => {
if layer.is_delta() {
(Lsn(layer.get_lsn_range().end.0 - 1), 0)
} else {
(layer.image_layer_lsn(), 1)
}
}
Item::ReadPoint { lsn } => (*lsn, 2),
})
});
let mut results = Vec::with_capacity(self.historic.len());
let mut maybe_covered_deltas: Vec<Arc<PersistentLayerDesc>> = Vec::new();
for item in items {
let (reached_lsn, is_readpoint) = match &item {
Item::ReadPoint { lsn } => (lsn, true),
Item::Layer(layer) => (&layer.lsn_range.start, false),
};
maybe_covered_deltas.retain(|d| {
if *reached_lsn >= d.lsn_range.start && is_readpoint {
// We encountered a readpoint within the delta layer: it is visible
results.push((d.clone(), LayerVisibilityHint::Visible));
false
} else if *reached_lsn < d.lsn_range.start {
// We passed the layer's range without encountering a read point: it is not visible
results.push((d.clone(), LayerVisibilityHint::Covered));
false
} else {
// We're still in the delta layer: continue iterating
true
}
});
match item {
Item::ReadPoint { lsn: _lsn } => {
// TODO: propagate the child timeline's shadow from their own run of this function, so that we don't have
// to assume that the whole key range is visible at the branch point.
shadow.reset();
}
Item::Layer(layer) => {
let visibility = if layer.is_delta() {
if shadow.contains(layer.get_key_range()) {
// If a layer isn't visible based on current state, we must defer deciding whether
// it is truly not visible until we have advanced past the delta's range: we might
// encounter another branch point within this delta layer's LSN range.
maybe_covered_deltas.push(layer);
continue;
} else {
LayerVisibilityHint::Visible
}
} else {
let modified = shadow.cover(layer.get_key_range());
if modified {
// An image layer in a region which wasn't fully covered yet: this layer is visible, but layers below it will be covered
LayerVisibilityHint::Visible
} else {
// An image layer in a region that was already covered
LayerVisibilityHint::Covered
}
};
results.push((layer, visibility));
}
}
}
// Drain any remaining maybe_covered deltas
results.extend(
maybe_covered_deltas
.into_iter()
.map(|d| (d, LayerVisibilityHint::Covered)),
);
(results, shadow.to_keyspace())
}
}
#[cfg(test)]
mod tests {
use crate::tenant::{storage_layer::LayerName, IndexPart};
use pageserver_api::{
key::DBDIR_KEY,
keyspace::{KeySpace, KeySpaceRandomAccum},
};
use std::{collections::HashMap, path::PathBuf};
use utils::{
id::{TenantId, TimelineId},
shard::TenantShardId,
};
use pageserver_api::keyspace::KeySpace;
use super::*;
@@ -1175,299 +1002,4 @@ mod tests {
}
}
}
#[test]
fn layer_visibility_basic() {
// A simple synthetic input, as a smoke test.
let tenant_shard_id = TenantShardId::unsharded(TenantId::generate());
let timeline_id = TimelineId::generate();
let mut layer_map = LayerMap::default();
let mut updates = layer_map.batch_update();
const FAKE_LAYER_SIZE: u64 = 1024;
let inject_delta = |updates: &mut BatchedUpdates,
key_start: i128,
key_end: i128,
lsn_start: u64,
lsn_end: u64| {
let desc = PersistentLayerDesc::new_delta(
tenant_shard_id,
timeline_id,
Range {
start: Key::from_i128(key_start),
end: Key::from_i128(key_end),
},
Range {
start: Lsn(lsn_start),
end: Lsn(lsn_end),
},
1024,
);
updates.insert_historic(desc.clone());
desc
};
let inject_image =
|updates: &mut BatchedUpdates, key_start: i128, key_end: i128, lsn: u64| {
let desc = PersistentLayerDesc::new_img(
tenant_shard_id,
timeline_id,
Range {
start: Key::from_i128(key_start),
end: Key::from_i128(key_end),
},
Lsn(lsn),
FAKE_LAYER_SIZE,
);
updates.insert_historic(desc.clone());
desc
};
//
// Construct our scenario: the following lines go in backward-LSN order, constructing the various scenarios
// we expect to handle. You can follow these examples through in the same order as they would be processed
// by the function under test.
//
let mut read_points = vec![Lsn(1000)];
// A delta ahead of any image layer
let ahead_layer = inject_delta(&mut updates, 10, 20, 101, 110);
// An image layer is visible and covers some layers beneath itself
let visible_covering_img = inject_image(&mut updates, 5, 25, 99);
// A delta layer covered by the image layer: should be covered
let covered_delta = inject_delta(&mut updates, 10, 20, 90, 100);
// A delta layer partially covered by an image layer: should be visible
let partially_covered_delta = inject_delta(&mut updates, 1, 7, 90, 100);
// A delta layer not covered by an image layer: should be visible
let not_covered_delta = inject_delta(&mut updates, 1, 4, 90, 100);
// An image layer covered by the image layer above: should be covered
let covered_image = inject_image(&mut updates, 10, 20, 89);
// An image layer partially covered by an image layer: should be visible
let partially_covered_image = inject_image(&mut updates, 1, 7, 89);
// An image layer not covered by an image layer: should be visible
let not_covered_image = inject_image(&mut updates, 1, 4, 89);
// A read point: this will make subsequent layers below here visible, even if there are
// more recent layers covering them.
read_points.push(Lsn(80));
// A delta layer covered by an earlier image layer, but visible to a readpoint below that covering layer
let covered_delta_below_read_point = inject_delta(&mut updates, 10, 20, 70, 79);
// A delta layer whose end LSN is covered, but where a read point is present partway through its LSN range:
// the read point should make it visible, even though its end LSN is covered
let covering_img_between_read_points = inject_image(&mut updates, 10, 20, 69);
let covered_delta_between_read_points = inject_delta(&mut updates, 10, 15, 67, 69);
read_points.push(Lsn(65));
let covered_delta_intersects_read_point = inject_delta(&mut updates, 15, 20, 60, 69);
let visible_img_after_last_read_point = inject_image(&mut updates, 10, 20, 65);
updates.flush();
let (layer_visibilities, shadow) = layer_map.get_visibility(read_points);
let layer_visibilities = layer_visibilities.into_iter().collect::<HashMap<_, _>>();
assert_eq!(
layer_visibilities.get(&ahead_layer),
Some(&LayerVisibilityHint::Visible)
);
assert_eq!(
layer_visibilities.get(&visible_covering_img),
Some(&LayerVisibilityHint::Visible)
);
assert_eq!(
layer_visibilities.get(&covered_delta),
Some(&LayerVisibilityHint::Covered)
);
assert_eq!(
layer_visibilities.get(&partially_covered_delta),
Some(&LayerVisibilityHint::Visible)
);
assert_eq!(
layer_visibilities.get(&not_covered_delta),
Some(&LayerVisibilityHint::Visible)
);
assert_eq!(
layer_visibilities.get(&covered_image),
Some(&LayerVisibilityHint::Covered)
);
assert_eq!(
layer_visibilities.get(&partially_covered_image),
Some(&LayerVisibilityHint::Visible)
);
assert_eq!(
layer_visibilities.get(&not_covered_image),
Some(&LayerVisibilityHint::Visible)
);
assert_eq!(
layer_visibilities.get(&covered_delta_below_read_point),
Some(&LayerVisibilityHint::Visible)
);
assert_eq!(
layer_visibilities.get(&covering_img_between_read_points),
Some(&LayerVisibilityHint::Visible)
);
assert_eq!(
layer_visibilities.get(&covered_delta_between_read_points),
Some(&LayerVisibilityHint::Covered)
);
assert_eq!(
layer_visibilities.get(&covered_delta_intersects_read_point),
Some(&LayerVisibilityHint::Visible)
);
assert_eq!(
layer_visibilities.get(&visible_img_after_last_read_point),
Some(&LayerVisibilityHint::Visible)
);
// Shadow should include all the images below the last read point
let expected_shadow = KeySpace {
ranges: vec![Key::from_i128(10)..Key::from_i128(20)],
};
assert_eq!(shadow, expected_shadow);
}
fn fixture_path(relative: &str) -> PathBuf {
PathBuf::from(env!("CARGO_MANIFEST_DIR")).join(relative)
}
#[test]
fn layer_visibility_realistic() {
// Load a large example layermap
let index_raw = std::fs::read_to_string(fixture_path(
"test_data/indices/mixed_workload/index_part.json",
))
.unwrap();
let index: IndexPart = serde_json::from_str::<IndexPart>(&index_raw).unwrap();
let tenant_id = TenantId::generate();
let tenant_shard_id = TenantShardId::unsharded(tenant_id);
let timeline_id = TimelineId::generate();
let mut layer_map = LayerMap::default();
let mut updates = layer_map.batch_update();
for (layer_name, layer_metadata) in index.layer_metadata {
let layer_desc = match layer_name {
LayerName::Image(layer_name) => PersistentLayerDesc {
key_range: layer_name.key_range.clone(),
lsn_range: layer_name.lsn_as_range(),
tenant_shard_id,
timeline_id,
is_delta: false,
file_size: layer_metadata.file_size,
},
LayerName::Delta(layer_name) => PersistentLayerDesc {
key_range: layer_name.key_range,
lsn_range: layer_name.lsn_range,
tenant_shard_id,
timeline_id,
is_delta: true,
file_size: layer_metadata.file_size,
},
};
updates.insert_historic(layer_desc);
}
updates.flush();
let read_points = vec![index.metadata.disk_consistent_lsn()];
let (layer_visibilities, shadow) = layer_map.get_visibility(read_points);
for (layer_desc, visibility) in &layer_visibilities {
tracing::info!("{layer_desc:?}: {visibility:?}");
eprintln!("{layer_desc:?}: {visibility:?}");
}
// The shadow should be non-empty, since there were some image layers
assert!(!shadow.ranges.is_empty());
// At least some layers should be marked covered
assert!(layer_visibilities
.iter()
.any(|i| matches!(i.1, LayerVisibilityHint::Covered)));
let layer_visibilities = layer_visibilities.into_iter().collect::<HashMap<_, _>>();
// Brute force validation: a layer should be marked covered if and only if there are image layers above it in LSN order which cover it
for (layer_desc, visible) in &layer_visibilities {
let mut coverage = KeySpaceRandomAccum::new();
let mut covered_by = Vec::new();
for other_layer in layer_map.iter_historic_layers() {
if &other_layer == layer_desc {
continue;
}
if !other_layer.is_delta()
&& other_layer.image_layer_lsn() >= Lsn(layer_desc.get_lsn_range().end.0 - 1)
&& other_layer.key_range.start <= layer_desc.key_range.end
&& layer_desc.key_range.start <= other_layer.key_range.end
{
coverage.add_range(other_layer.get_key_range());
covered_by.push((*other_layer).clone());
}
}
let coverage = coverage.to_keyspace();
let expect_visible = if coverage.ranges.len() == 1
&& coverage.contains(&layer_desc.key_range.start)
&& coverage.contains(&Key::from_i128(layer_desc.key_range.end.to_i128() - 1))
{
LayerVisibilityHint::Covered
} else {
LayerVisibilityHint::Visible
};
if expect_visible != *visible {
eprintln!(
"Layer {}..{} @ {}..{} (delta={}) is {visible:?}, should be {expect_visible:?}",
layer_desc.key_range.start,
layer_desc.key_range.end,
layer_desc.lsn_range.start,
layer_desc.lsn_range.end,
layer_desc.is_delta()
);
if expect_visible == LayerVisibilityHint::Covered {
eprintln!("Covered by:");
for other in covered_by {
eprintln!(
" {}..{} @ {}",
other.get_key_range().start,
other.get_key_range().end,
other.image_layer_lsn()
);
}
if let Some(range) = coverage.ranges.first() {
eprintln!(
"Total coverage from contributing layers: {}..{}",
range.start, range.end
);
} else {
eprintln!(
"Total coverage from contributing layers: {:?}",
coverage.ranges
);
}
}
}
assert_eq!(expect_visible, *visible);
}
// Sanity: the layer that holds latest data for the DBDIR key should always be visible
// (just using this key as a key that will always exist for any layermap fixture)
let dbdir_layer = layer_map
.search(DBDIR_KEY, index.metadata.disk_consistent_lsn())
.unwrap();
assert!(matches!(
layer_visibilities.get(&dbdir_layer.layer).unwrap(),
LayerVisibilityHint::Visible
));
}
}

View File

@@ -521,10 +521,6 @@ impl<Value: Clone> BufferedHistoricLayerCoverage<Value> {
Ok(&self.historic_coverage)
}
pub(crate) fn len(&self) -> usize {
self.layers.len()
}
}
#[test]

View File

@@ -111,7 +111,7 @@ impl TryFrom<&TimelineMetadataBodyV2> for TimelineMetadataHeader {
#[error("re-serializing for crc32 failed")]
struct Crc32CalculationFailed(#[source] utils::bin_ser::SerializeError);
const METADATA_HDR_SIZE: usize = size_of::<TimelineMetadataHeader>();
const METADATA_HDR_SIZE: usize = std::mem::size_of::<TimelineMetadataHeader>();
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
struct TimelineMetadataBodyV2 {

View File

@@ -54,8 +54,8 @@ use utils::id::{TenantId, TimelineId};
use super::remote_timeline_client::remote_tenant_path;
use super::secondary::SecondaryTenant;
use super::timeline::detach_ancestor::PreparedTimelineDetach;
use super::{GlobalShutDown, TenantSharedResources};
use super::timeline::detach_ancestor::{self, PreparedTimelineDetach};
use super::TenantSharedResources;
/// For a tenant that appears in TenantsMap, it may either be
/// - `Attached`: has a full Tenant object, is elegible to service
@@ -116,6 +116,8 @@ pub(crate) enum ShardSelector {
/// Only return the 0th shard, if it is present. If a non-0th shard is present,
/// ignore it.
Zero,
/// Pick the first shard we find for the TenantId
First,
/// Pick the shard that holds this key
Page(Key),
/// The shard ID is known: pick the given shard
@@ -665,20 +667,18 @@ pub async fn init_tenant_mgr(
let tenant_dir_path = conf.tenant_path(&tenant_shard_id);
let shard_identity = location_conf.shard;
let slot = match location_conf.mode {
LocationMode::Attached(attached_conf) => TenantSlot::Attached(
tenant_spawn(
conf,
tenant_shard_id,
&tenant_dir_path,
resources.clone(),
AttachedTenantConf::new(location_conf.tenant_conf, attached_conf),
shard_identity,
Some(init_order.clone()),
SpawnMode::Lazy,
&ctx,
)
.expect("global shutdown during init_tenant_mgr cannot happen"),
),
LocationMode::Attached(attached_conf) => TenantSlot::Attached(tenant_spawn(
conf,
tenant_shard_id,
&tenant_dir_path,
resources.clone(),
AttachedTenantConf::new(location_conf.tenant_conf, attached_conf),
shard_identity,
Some(init_order.clone()),
SpawnMode::Lazy,
None,
&ctx,
)),
LocationMode::Secondary(secondary_conf) => {
info!(
tenant_id = %tenant_shard_id.tenant_id,
@@ -725,8 +725,9 @@ fn tenant_spawn(
shard_identity: ShardIdentity,
init_order: Option<InitializationOrder>,
mode: SpawnMode,
existing_detach_attempt: Option<&detach_ancestor::Attempt>,
ctx: &RequestContext,
) -> Result<Arc<Tenant>, GlobalShutDown> {
) -> Arc<Tenant> {
// All these conditions should have been satisfied by our caller: the tenant dir exists, is a well formed
// path, and contains a configuration file. Assertions that do synchronous I/O are limited to debug mode
// to avoid impacting prod runtime performance.
@@ -745,6 +746,7 @@ fn tenant_spawn(
shard_identity,
init_order,
mode,
existing_detach_attempt,
ctx,
)
}
@@ -1192,11 +1194,9 @@ impl TenantManager {
shard_identity,
None,
spawn_mode,
None,
ctx,
)
.map_err(|_: GlobalShutDown| {
UpsertLocationError::Unavailable(TenantMapError::ShuttingDown)
})?;
);
TenantSlot::Attached(tenant)
}
@@ -1316,8 +1316,9 @@ impl TenantManager {
shard_identity,
None,
SpawnMode::Eager,
None,
ctx,
)?;
);
slot_guard.upsert(TenantSlot::Attached(tenant))?;
@@ -1388,32 +1389,34 @@ impl TenantManager {
tenant_shard_id: TenantShardId,
) -> Result<(), DeleteTenantError> {
let remote_path = remote_tenant_path(&tenant_shard_id);
let mut keys_stream = self.resources.remote_storage.list_streaming(
Some(&remote_path),
remote_storage::ListingMode::NoDelimiter,
None,
&self.cancel,
);
while let Some(chunk) = keys_stream.next().await {
let keys = match chunk {
Ok(listing) => listing.keys,
Err(remote_storage::DownloadError::Cancelled) => {
return Err(DeleteTenantError::Cancelled)
}
Err(remote_storage::DownloadError::NotFound) => return Ok(()),
Err(other) => return Err(DeleteTenantError::Other(anyhow::anyhow!(other))),
};
if keys.is_empty() {
tracing::info!("Remote storage already deleted");
} else {
tracing::info!("Deleting {} keys from remote storage", keys.len());
let keys = keys.into_iter().map(|o| o.key).collect::<Vec<_>>();
self.resources
.remote_storage
.delete_objects(&keys, &self.cancel)
.await?;
let keys = match self
.resources
.remote_storage
.list(
Some(&remote_path),
remote_storage::ListingMode::NoDelimiter,
None,
&self.cancel,
)
.await
{
Ok(listing) => listing.keys,
Err(remote_storage::DownloadError::Cancelled) => {
return Err(DeleteTenantError::Cancelled)
}
Err(remote_storage::DownloadError::NotFound) => return Ok(()),
Err(other) => return Err(DeleteTenantError::Other(anyhow::anyhow!(other))),
};
if keys.is_empty() {
tracing::info!("Remote storage already deleted");
} else {
tracing::info!("Deleting {} keys from remote storage", keys.len());
let keys = keys.into_iter().map(|o| o.key).collect::<Vec<_>>();
self.resources
.remote_storage
.delete_objects(&keys, &self.cancel)
.await?;
}
Ok(())
@@ -1970,8 +1973,10 @@ impl TenantManager {
tenant_shard_id: TenantShardId,
timeline_id: TimelineId,
prepared: PreparedTimelineDetach,
mut attempt: detach_ancestor::Attempt,
ctx: &RequestContext,
) -> Result<Vec<TimelineId>, anyhow::Error> {
// FIXME: this is unnecessary, slotguard already has these semantics
struct RevertOnDropSlot(Option<SlotGuard>);
impl Drop for RevertOnDropSlot {
@@ -2019,12 +2024,14 @@ impl TenantManager {
let timeline = tenant.get_timeline(timeline_id, true)?;
let reparented = timeline
let resp = timeline
.complete_detaching_timeline_ancestor(&tenant, prepared, ctx)
.await?;
let mut slot_guard = slot_guard.into_inner();
attempt.before_shutdown();
let (_guard, progress) = utils::completion::channel();
match tenant.shutdown(progress, ShutdownMode::Hard).await {
Ok(()) => {
@@ -2050,12 +2057,18 @@ impl TenantManager {
shard_identity,
None,
SpawnMode::Eager,
Some(&attempt),
ctx,
)?;
);
slot_guard.upsert(TenantSlot::Attached(tenant))?;
slot_guard.upsert(TenantSlot::Attached(tenant.clone()))?;
Ok(reparented)
tenant
.ongoing_timeline_detach
.complete(attempt, &tenant)
.await?;
Ok(resp)
}
/// A page service client sends a TenantId, and to look up the correct Tenant we must
@@ -2092,6 +2105,7 @@ impl TenantManager {
};
match selector {
ShardSelector::First => return ShardResolveResult::Found(tenant.clone()),
ShardSelector::Zero if slot.0.shard_number == ShardNumber(0) => {
return ShardResolveResult::Found(tenant.clone())
}
@@ -2173,9 +2187,6 @@ pub(crate) enum GetActiveTenantError {
/// never happen.
#[error("Tenant is broken: {0}")]
Broken(String),
#[error("reconnect to switch tenant id")]
SwitchedTenant,
}
#[derive(Debug, thiserror::Error)]

View File

@@ -187,7 +187,7 @@ use camino::Utf8Path;
use chrono::{NaiveDateTime, Utc};
pub(crate) use download::download_initdb_tar_zst;
use pageserver_api::models::{AuxFilePolicy, TimelineArchivalState};
use pageserver_api::models::AuxFilePolicy;
use pageserver_api::shard::{ShardIndex, TenantShardId};
use scopeguard::ScopeGuard;
use tokio_util::sync::CancellationToken;
@@ -457,17 +457,6 @@ impl RemoteTimelineClient {
.unwrap_or(false)
}
/// Returns whether the timeline is archived.
/// Return None if the remote index_part hasn't been downloaded yet.
pub(crate) fn is_archived(&self) -> Option<bool> {
self.upload_queue
.lock()
.unwrap()
.initialized_mut()
.map(|q| q.clean.0.archived_at.is_some())
.ok()
}
fn update_remote_physical_size_gauge(&self, current_remote_index_part: Option<&IndexPart>) {
let size: u64 = if let Some(current_remote_index_part) = current_remote_index_part {
current_remote_index_part
@@ -628,7 +617,7 @@ impl RemoteTimelineClient {
Ok(())
}
/// Launch an index-file upload operation in the background, with only the `aux_file_policy` flag updated.
/// Launch an index-file upload operation in the background, with only aux_file_policy flag updated.
pub(crate) fn schedule_index_upload_for_aux_file_policy_update(
self: &Arc<Self>,
last_aux_file_policy: Option<AuxFilePolicy>,
@@ -639,48 +628,6 @@ impl RemoteTimelineClient {
self.schedule_index_upload(upload_queue)?;
Ok(())
}
/// Launch an index-file upload operation in the background, with only the `archived_at` field updated.
///
/// Returns whether it is required to wait for the queue to be empty to ensure that the change is uploaded,
/// so either if the change is already sitting in the queue, but not commited yet, or the change has not
/// been in the queue yet.
pub(crate) fn schedule_index_upload_for_timeline_archival_state(
self: &Arc<Self>,
state: TimelineArchivalState,
) -> anyhow::Result<bool> {
let mut guard = self.upload_queue.lock().unwrap();
let upload_queue = guard.initialized_mut()?;
/// Returns Some(_) if a change is needed, and Some(true) if it's a
/// change needed to set archived_at.
fn need_change(
archived_at: &Option<NaiveDateTime>,
state: TimelineArchivalState,
) -> Option<bool> {
match (archived_at, state) {
(Some(_), TimelineArchivalState::Archived)
| (None, TimelineArchivalState::Unarchived) => {
// Nothing to do
tracing::info!("intended state matches present state");
None
}
(None, TimelineArchivalState::Archived) => Some(true),
(Some(_), TimelineArchivalState::Unarchived) => Some(false),
}
}
let need_upload_scheduled = need_change(&upload_queue.dirty.archived_at, state);
if let Some(archived_at_set) = need_upload_scheduled {
let intended_archived_at = archived_at_set.then(|| Utc::now().naive_utc());
upload_queue.dirty.archived_at = intended_archived_at;
self.schedule_index_upload(upload_queue)?;
}
let need_wait = need_change(&upload_queue.clean.0.archived_at, state).is_some();
Ok(need_wait)
}
///
/// Launch an index-file upload operation in the background, if necessary.
///
@@ -740,8 +687,6 @@ impl RemoteTimelineClient {
self: &Arc<Self>,
new_parent: &TimelineId,
) -> anyhow::Result<()> {
// FIXME: because of how Timeline::schedule_uploads works when called from layer flushing
// and reads the in-memory part we cannot do the detaching like this
let receiver = {
let mut guard = self.upload_queue.lock().unwrap();
let upload_queue = guard.initialized_mut()?;
@@ -1378,18 +1323,6 @@ impl RemoteTimelineClient {
.dirty
.layer_metadata
.drain()
.filter(|(_file_name, meta)| {
// Filter out layers that belonged to an ancestor shard. Since we are deleting the whole timeline from
// all shards anyway, we _could_ delete these, but
// - it creates a potential race if other shards are still
// using the layers while this shard deletes them.
// - it means that if we rolled back the shard split, the ancestor shards would be in a state where
// these timelines are present but corrupt (their index exists but some layers don't)
//
// These layers will eventually be cleaned up by the scrubber when it does physical GC.
meta.shard.shard_number == self.tenant_shard_id.shard_number
&& meta.shard.shard_count == self.tenant_shard_id.shard_count
})
.map(|(file_name, meta)| {
remote_layer_path(
&self.tenant_shard_id.tenant_id,

View File

@@ -32,10 +32,6 @@ pub struct IndexPart {
#[serde(skip_serializing_if = "Option::is_none")]
pub deleted_at: Option<NaiveDateTime>,
#[serde(default)]
#[serde(skip_serializing_if = "Option::is_none")]
pub archived_at: Option<NaiveDateTime>,
/// Per layer file name metadata, which can be present for a present or missing layer file.
///
/// Older versions of `IndexPart` will not have this property or have only a part of metadata
@@ -84,11 +80,10 @@ impl IndexPart {
/// - 5: lineage was added
/// - 6: last_aux_file_policy is added.
/// - 7: metadata_bytes is no longer written, but still read
/// - 8: added `archived_at`
const LATEST_VERSION: usize = 8;
const LATEST_VERSION: usize = 7;
// Versions we may see when reading from a bucket.
pub const KNOWN_VERSIONS: &'static [usize] = &[1, 2, 3, 4, 5, 6, 7, 8];
pub const KNOWN_VERSIONS: &'static [usize] = &[1, 2, 3, 4, 5, 6, 7];
pub const FILE_NAME: &'static str = "index_part.json";
@@ -99,7 +94,6 @@ impl IndexPart {
disk_consistent_lsn: metadata.disk_consistent_lsn(),
metadata,
deleted_at: None,
archived_at: None,
lineage: Default::default(),
last_aux_file_policy: None,
}
@@ -242,7 +236,8 @@ impl Lineage {
.is_some_and(|(_, ancestor_lsn, _)| ancestor_lsn == lsn)
}
pub(crate) fn is_detached_from_original_ancestor(&self) -> bool {
/// Returns true if the timeline originally had an ancestor, and no longer has one.
pub(crate) fn is_detached_from_ancestor(&self) -> bool {
self.original_ancestor.is_some()
}
@@ -290,7 +285,6 @@ mod tests {
disk_consistent_lsn: "0/16960E8".parse::<Lsn>().unwrap(),
metadata: TimelineMetadata::from_bytes(&[113,11,159,210,0,54,0,4,0,0,0,0,1,105,96,232,1,0,0,0,0,1,105,96,112,0,0,0,0,0,0,0,0,0,0,0,0,0,1,105,96,112,0,0,0,0,1,105,96,112,0,0,0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0]).unwrap(),
deleted_at: None,
archived_at: None,
lineage: Lineage::default(),
last_aux_file_policy: None,
};
@@ -333,7 +327,6 @@ mod tests {
disk_consistent_lsn: "0/16960E8".parse::<Lsn>().unwrap(),
metadata: TimelineMetadata::from_bytes(&[113,11,159,210,0,54,0,4,0,0,0,0,1,105,96,232,1,0,0,0,0,1,105,96,112,0,0,0,0,0,0,0,0,0,0,0,0,0,1,105,96,112,0,0,0,0,1,105,96,112,0,0,0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0]).unwrap(),
deleted_at: None,
archived_at: None,
lineage: Lineage::default(),
last_aux_file_policy: None,
};
@@ -377,7 +370,6 @@ mod tests {
disk_consistent_lsn: "0/16960E8".parse::<Lsn>().unwrap(),
metadata: TimelineMetadata::from_bytes(&[113,11,159,210,0,54,0,4,0,0,0,0,1,105,96,232,1,0,0,0,0,1,105,96,112,0,0,0,0,0,0,0,0,0,0,0,0,0,1,105,96,112,0,0,0,0,1,105,96,112,0,0,0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0]).unwrap(),
deleted_at: Some(parse_naive_datetime("2023-07-31T09:00:00.123000000")),
archived_at: None,
lineage: Lineage::default(),
last_aux_file_policy: None,
};
@@ -424,7 +416,6 @@ mod tests {
])
.unwrap(),
deleted_at: None,
archived_at: None,
lineage: Lineage::default(),
last_aux_file_policy: None,
};
@@ -466,7 +457,6 @@ mod tests {
disk_consistent_lsn: "0/16960E8".parse::<Lsn>().unwrap(),
metadata: TimelineMetadata::from_bytes(&[113,11,159,210,0,54,0,4,0,0,0,0,1,105,96,232,1,0,0,0,0,1,105,96,112,0,0,0,0,0,0,0,0,0,0,0,0,0,1,105,96,112,0,0,0,0,1,105,96,112,0,0,0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0]).unwrap(),
deleted_at: Some(parse_naive_datetime("2023-07-31T09:00:00.123000000")),
archived_at: None,
lineage: Lineage::default(),
last_aux_file_policy: None,
};
@@ -507,7 +497,6 @@ mod tests {
disk_consistent_lsn: Lsn::from_str("0/15A7618").unwrap(),
metadata: TimelineMetadata::from_bytes(&[226,88,25,241,0,46,0,4,0,0,0,0,1,90,118,24,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,78,244,32,0,0,0,0,1,78,244,32,0,0,0,16,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0]).unwrap(),
deleted_at: None,
archived_at: None,
lineage: Lineage {
reparenting_history_truncated: false,
reparenting_history: vec![TimelineId::from_str("e1bfd8c633d713d279e6fcd2bcc15b6d").unwrap()],
@@ -557,7 +546,6 @@ mod tests {
disk_consistent_lsn: "0/16960E8".parse::<Lsn>().unwrap(),
metadata: TimelineMetadata::from_bytes(&[113,11,159,210,0,54,0,4,0,0,0,0,1,105,96,232,1,0,0,0,0,1,105,96,112,0,0,0,0,0,0,0,0,0,0,0,0,0,1,105,96,112,0,0,0,0,1,105,96,112,0,0,0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0]).unwrap(),
deleted_at: Some(parse_naive_datetime("2023-07-31T09:00:00.123000000")),
archived_at: None,
lineage: Lineage {
reparenting_history_truncated: false,
reparenting_history: vec![TimelineId::from_str("e1bfd8c633d713d279e6fcd2bcc15b6d").unwrap()],
@@ -616,63 +604,6 @@ mod tests {
14,
).with_recalculated_checksum().unwrap(),
deleted_at: Some(parse_naive_datetime("2023-07-31T09:00:00.123000000")),
archived_at: None,
lineage: Default::default(),
last_aux_file_policy: Default::default(),
};
let part = IndexPart::from_s3_bytes(example.as_bytes()).unwrap();
assert_eq!(part, expected);
}
#[test]
fn v8_indexpart_is_parsed() {
let example = r#"{
"version": 8,
"layer_metadata":{
"000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9": { "file_size": 25600000 },
"000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D8-00000000016B5A51": { "file_size": 9007199254741001 }
},
"disk_consistent_lsn":"0/16960E8",
"metadata": {
"disk_consistent_lsn": "0/16960E8",
"prev_record_lsn": "0/1696070",
"ancestor_timeline": "e45a7f37d3ee2ff17dc14bf4f4e3f52e",
"ancestor_lsn": "0/0",
"latest_gc_cutoff_lsn": "0/1696070",
"initdb_lsn": "0/1696070",
"pg_version": 14
},
"deleted_at": "2023-07-31T09:00:00.123",
"archived_at": "2023-04-29T09:00:00.123"
}"#;
let expected = IndexPart {
version: 8,
layer_metadata: HashMap::from([
("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9".parse().unwrap(), LayerFileMetadata {
file_size: 25600000,
generation: Generation::none(),
shard: ShardIndex::unsharded()
}),
("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D8-00000000016B5A51".parse().unwrap(), LayerFileMetadata {
file_size: 9007199254741001,
generation: Generation::none(),
shard: ShardIndex::unsharded()
})
]),
disk_consistent_lsn: "0/16960E8".parse::<Lsn>().unwrap(),
metadata: TimelineMetadata::new(
Lsn::from_str("0/16960E8").unwrap(),
Some(Lsn::from_str("0/1696070").unwrap()),
Some(TimelineId::from_str("e45a7f37d3ee2ff17dc14bf4f4e3f52e").unwrap()),
Lsn::INVALID,
Lsn::from_str("0/1696070").unwrap(),
Lsn::from_str("0/1696070").unwrap(),
14,
).with_recalculated_checksum().unwrap(),
deleted_at: Some(parse_naive_datetime("2023-07-31T09:00:00.123000000")),
archived_at: Some(parse_naive_datetime("2023-04-29T09:00:00.123000000")),
lineage: Default::default(),
last_aux_file_policy: Default::default(),
};

View File

@@ -451,14 +451,20 @@ pub enum ValueReconstructResult {
/// than an authoritative value, so that we do not have to update it synchronously when changing the visibility
/// of layers (for example when creating a branch that makes some previously covered layers visible). It should
/// be used for cache management but not for correctness-critical checks.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum LayerVisibilityHint {
#[derive(Default, Debug, Clone, PartialEq, Eq)]
pub(crate) enum LayerVisibilityHint {
/// A Visible layer might be read while serving a read, because there is not an image layer between it
/// and a readable LSN (the tip of the branch or a child's branch point)
Visible,
/// A Covered layer probably won't be read right now, but _can_ be read in future if someone creates
/// a branch or ephemeral endpoint at an LSN below the layer that covers this.
#[allow(unused)]
Covered,
/// Calculating layer visibilty requires I/O, so until this has happened layers are loaded
/// in this state. Note that newly written layers may be called Visible immediately, this uninitialized
/// state is for when existing layers are constructed while loading a timeline.
#[default]
Uninitialized,
}
pub(crate) struct LayerAccessStats(std::sync::atomic::AtomicU64);
@@ -620,29 +626,22 @@ impl LayerAccessStats {
}
}
/// Helper for extracting the visibility hint from the literal value of our inner u64
fn decode_visibility(&self, bits: u64) -> LayerVisibilityHint {
match (bits >> Self::VISIBILITY_SHIFT) & 0x1 {
1 => LayerVisibilityHint::Visible,
0 => LayerVisibilityHint::Covered,
_ => unreachable!(),
}
}
/// Returns the old value which has been replaced
pub(crate) fn set_visibility(&self, visibility: LayerVisibilityHint) -> LayerVisibilityHint {
pub(crate) fn set_visibility(&self, visibility: LayerVisibilityHint) {
let value = match visibility {
LayerVisibilityHint::Visible => 0x1 << Self::VISIBILITY_SHIFT,
LayerVisibilityHint::Covered => 0x0,
LayerVisibilityHint::Covered | LayerVisibilityHint::Uninitialized => 0x0,
};
let old_bits = self.write_bits(0x1 << Self::VISIBILITY_SHIFT, value);
self.decode_visibility(old_bits)
self.write_bits(0x1 << Self::VISIBILITY_SHIFT, value);
}
pub(crate) fn visibility(&self) -> LayerVisibilityHint {
let read = self.0.load(std::sync::atomic::Ordering::Relaxed);
self.decode_visibility(read)
match (read >> Self::VISIBILITY_SHIFT) & 0x1 {
1 => LayerVisibilityHint::Visible,
0 => LayerVisibilityHint::Covered,
_ => unreachable!(),
}
}
}

View File

@@ -307,10 +307,12 @@ impl DeltaLayer {
.with_context(|| format!("Failed to load delta layer {}", self.path()))
}
async fn load_inner(&self, ctx: &RequestContext) -> anyhow::Result<Arc<DeltaLayerInner>> {
async fn load_inner(&self, ctx: &RequestContext) -> Result<Arc<DeltaLayerInner>> {
let path = self.path();
let loaded = DeltaLayerInner::load(&path, None, None, ctx).await?;
let loaded = DeltaLayerInner::load(&path, None, None, ctx)
.await
.and_then(|res| res)?;
// not production code
let actual_layer_name = LayerName::from_str(path.file_name().unwrap()).unwrap();
@@ -467,7 +469,7 @@ impl DeltaLayerWriterInner {
.write_blob_maybe_compressed(val, ctx, compression)
.await;
let off = match res {
Ok((off, _)) => off,
Ok(off) => off,
Err(e) => return (val, Err(anyhow::anyhow!(e))),
};
@@ -758,24 +760,27 @@ impl DeltaLayerInner {
&self.layer_lsn_range
}
/// Returns nested result following Result<Result<_, OpErr>, Critical>:
/// - inner has the success or transient failure
/// - outer has the permanent failure
pub(super) async fn load(
path: &Utf8Path,
summary: Option<Summary>,
max_vectored_read_bytes: Option<MaxVectoredReadBytes>,
ctx: &RequestContext,
) -> anyhow::Result<Self> {
let file = VirtualFile::open(path, ctx)
.await
.context("open layer file")?;
) -> Result<Result<Self, anyhow::Error>, anyhow::Error> {
let file = match VirtualFile::open(path, ctx).await {
Ok(file) => file,
Err(e) => return Ok(Err(anyhow::Error::new(e).context("open layer file"))),
};
let file_id = page_cache::next_file_id();
let block_reader = FileBlockReader::new(&file, file_id);
let summary_blk = block_reader
.read_blk(0, ctx)
.await
.context("read first block")?;
let summary_blk = match block_reader.read_blk(0, ctx).await {
Ok(blk) => blk,
Err(e) => return Ok(Err(anyhow::Error::new(e).context("read first block"))),
};
// TODO: this should be an assertion instead; see ImageLayerInner::load
let actual_summary =
@@ -797,7 +802,7 @@ impl DeltaLayerInner {
}
}
Ok(DeltaLayerInner {
Ok(Ok(DeltaLayerInner {
file,
file_id,
index_start_blk: actual_summary.index_start_blk,
@@ -805,7 +810,7 @@ impl DeltaLayerInner {
max_vectored_read_bytes,
layer_key_range: actual_summary.key_range,
layer_lsn_range: actual_summary.lsn_range,
})
}))
}
pub(super) async fn get_value_reconstruct_data(

View File

@@ -265,8 +265,9 @@ impl ImageLayer {
async fn load_inner(&self, ctx: &RequestContext) -> Result<ImageLayerInner> {
let path = self.path();
let loaded =
ImageLayerInner::load(&path, self.desc.image_layer_lsn(), None, None, ctx).await?;
let loaded = ImageLayerInner::load(&path, self.desc.image_layer_lsn(), None, None, ctx)
.await
.and_then(|res| res)?;
// not production code
let actual_layer_name = LayerName::from_str(path.file_name().unwrap()).unwrap();
@@ -384,16 +385,17 @@ impl ImageLayerInner {
summary: Option<Summary>,
max_vectored_read_bytes: Option<MaxVectoredReadBytes>,
ctx: &RequestContext,
) -> anyhow::Result<Self> {
let file = VirtualFile::open(path, ctx)
.await
.context("open layer file")?;
) -> Result<Result<Self, anyhow::Error>, anyhow::Error> {
let file = match VirtualFile::open(path, ctx).await {
Ok(file) => file,
Err(e) => return Ok(Err(anyhow::Error::new(e).context("open layer file"))),
};
let file_id = page_cache::next_file_id();
let block_reader = FileBlockReader::new(&file, file_id);
let summary_blk = block_reader
.read_blk(0, ctx)
.await
.context("read first block")?;
let summary_blk = match block_reader.read_blk(0, ctx).await {
Ok(blk) => blk,
Err(e) => return Ok(Err(anyhow::Error::new(e).context("read first block"))),
};
// length is the only way how this could fail, so it's not actually likely at all unless
// read_blk returns wrong sized block.
@@ -418,7 +420,7 @@ impl ImageLayerInner {
}
}
Ok(ImageLayerInner {
Ok(Ok(ImageLayerInner {
index_start_blk: actual_summary.index_start_blk,
index_root_blk: actual_summary.index_root_blk,
lsn,
@@ -426,7 +428,7 @@ impl ImageLayerInner {
file_id,
max_vectored_read_bytes,
key_range: actual_summary.key_range,
})
}))
}
pub(super) async fn get_value_reconstruct_data(
@@ -734,14 +736,6 @@ struct ImageLayerWriterInner {
// Total uncompressed bytes passed into put_image
uncompressed_bytes: u64,
// Like `uncompressed_bytes`,
// but only of images we might consider for compression
uncompressed_bytes_eligible: u64,
// Like `uncompressed_bytes`, but only of images
// where we have chosen their compressed form
uncompressed_bytes_chosen: u64,
blob_writer: BlobWriter<false>,
tree: DiskBtreeBuilder<BlockBuf, KEY_SIZE>,
}
@@ -798,8 +792,6 @@ impl ImageLayerWriterInner {
tree: tree_builder,
blob_writer,
uncompressed_bytes: 0,
uncompressed_bytes_eligible: 0,
uncompressed_bytes_chosen: 0,
};
Ok(writer)
@@ -818,22 +810,13 @@ impl ImageLayerWriterInner {
) -> anyhow::Result<()> {
ensure!(self.key_range.contains(&key));
let compression = self.conf.image_compression;
let uncompressed_len = img.len() as u64;
self.uncompressed_bytes += uncompressed_len;
self.uncompressed_bytes += img.len() as u64;
let (_img, res) = self
.blob_writer
.write_blob_maybe_compressed(img, ctx, compression)
.await;
// TODO: re-use the buffer for `img` further upstack
let (off, compression_info) = res?;
if compression_info.compressed_size.is_some() {
// The image has been considered for compression at least
self.uncompressed_bytes_eligible += uncompressed_len;
}
if compression_info.written_compressed {
// The image has been compressed
self.uncompressed_bytes_chosen += uncompressed_len;
}
let off = res?;
let mut keybuf: [u8; KEY_SIZE] = [0u8; KEY_SIZE];
key.write_to_byte_slice(&mut keybuf);
@@ -856,9 +839,6 @@ impl ImageLayerWriterInner {
// Calculate compression ratio
let compressed_size = self.blob_writer.size() - PAGE_SZ as u64; // Subtract PAGE_SZ for header
crate::metrics::COMPRESSION_IMAGE_INPUT_BYTES.inc_by(self.uncompressed_bytes);
crate::metrics::COMPRESSION_IMAGE_INPUT_BYTES_CONSIDERED
.inc_by(self.uncompressed_bytes_eligible);
crate::metrics::COMPRESSION_IMAGE_INPUT_BYTES_CHOSEN.inc_by(self.uncompressed_bytes_chosen);
crate::metrics::COMPRESSION_IMAGE_OUTPUT_BYTES.inc_by(compressed_size);
let mut file = self.blob_writer.into_inner();

View File

@@ -24,8 +24,7 @@ use super::delta_layer::{self, DeltaEntry};
use super::image_layer::{self};
use super::{
AsLayerDesc, ImageLayerWriter, LayerAccessStats, LayerAccessStatsReset, LayerName,
LayerVisibilityHint, PersistentLayerDesc, ValueReconstructResult, ValueReconstructState,
ValuesReconstructState,
PersistentLayerDesc, ValueReconstructResult, ValueReconstructState, ValuesReconstructState,
};
use utils::generation::Generation;
@@ -247,7 +246,7 @@ impl Layer {
&timeline.generation,
);
LayerInner::new(
let layer = LayerInner::new(
conf,
timeline,
local_path,
@@ -255,7 +254,14 @@ impl Layer {
Some(inner),
timeline.generation,
timeline.get_shard_index(),
)
);
// Newly created layers are marked visible by default: the usual case is that they were created to be read.
layer
.access_stats
.set_visibility(super::LayerVisibilityHint::Visible);
layer
}));
let downloaded = resident.expect("just initialized");
@@ -487,32 +493,6 @@ impl Layer {
}
}
}
pub(crate) fn set_visibility(&self, visibility: LayerVisibilityHint) {
let old_visibility = self.access_stats().set_visibility(visibility.clone());
use LayerVisibilityHint::*;
match (old_visibility, visibility) {
(Visible, Covered) => {
// Subtract this layer's contribution to the visible size metric
if let Some(tl) = self.0.timeline.upgrade() {
tl.metrics
.visible_physical_size_gauge
.sub(self.0.desc.file_size)
}
}
(Covered, Visible) => {
// Add this layer's contribution to the visible size metric
if let Some(tl) = self.0.timeline.upgrade() {
tl.metrics
.visible_physical_size_gauge
.add(self.0.desc.file_size)
}
}
(Covered, Covered) | (Visible, Visible) => {
// no change
}
}
}
}
/// The download-ness ([`DownloadedLayer`]) can be either resident or wanted evicted.
@@ -713,13 +693,6 @@ impl Drop for LayerInner {
timeline.metrics.layer_count_image.dec();
timeline.metrics.layer_size_image.sub(self.desc.file_size);
}
if matches!(self.access_stats.visibility(), LayerVisibilityHint::Visible) {
timeline
.metrics
.visible_physical_size_gauge
.sub(self.desc.file_size);
}
}
if !*self.wanted_deleted.get_mut() {
@@ -828,12 +801,6 @@ impl LayerInner {
timeline.metrics.layer_size_image.add(desc.file_size);
}
// New layers are visible by default. This metric is later updated on drop or in set_visibility
timeline
.metrics
.visible_physical_size_gauge
.add(desc.file_size);
LayerInner {
conf,
debug_str: {
@@ -1684,9 +1651,8 @@ impl Drop for DownloadedLayer {
}
impl DownloadedLayer {
/// Initializes the `DeltaLayerInner` or `ImageLayerInner` within [`LayerKind`].
/// Failure to load the layer is sticky, i.e., future `get()` calls will return
/// the initial load failure immediately.
/// Initializes the `DeltaLayerInner` or `ImageLayerInner` within [`LayerKind`], or fails to
/// initialize it permanently.
///
/// `owner` parameter is a strong reference at the same `LayerInner` as the
/// `DownloadedLayer::owner` would be when upgraded. Given how this method ends up called,
@@ -1717,7 +1683,7 @@ impl DownloadedLayer {
ctx,
)
.await
.map(LayerKind::Delta)
.map(|res| res.map(LayerKind::Delta))
} else {
let lsn = owner.desc.image_layer_lsn();
let summary = Some(image_layer::Summary::expected(
@@ -1734,29 +1700,32 @@ impl DownloadedLayer {
ctx,
)
.await
.map(LayerKind::Image)
.map(|res| res.map(LayerKind::Image))
};
match res {
Ok(layer) => Ok(layer),
Err(err) => {
Ok(Ok(layer)) => Ok(Ok(layer)),
Ok(Err(transient)) => Err(transient),
Err(permanent) => {
LAYER_IMPL_METRICS.inc_permanent_loading_failures();
// We log this message once over the lifetime of `Self`
// => Ok and good to log backtrace and path here.
tracing::error!(
"layer load failed, assuming permanent failure: {}: {err:?}",
owner.path
);
Err(err)
// TODO(#5815): we are not logging all errors, so temporarily log them **once**
// here as well
let permanent = permanent.context("load layer");
tracing::error!("layer loading failed permanently: {permanent:#}");
Ok(Err(permanent))
}
}
};
self.kind
.get_or_init(init)
.await
.get_or_try_init(init)
// return transient errors using `?`
.await?
.as_ref()
// We already logged the full backtrace above, once. Don't repeat that here.
.map_err(|e| anyhow::anyhow!("layer load failed earlier: {e}"))
.map_err(|e| {
// errors are not clonabled, cannot but stringify
// test_broken_timeline matches this string
anyhow::anyhow!("layer loading failed: {e:#}")
})
}
async fn get_value_reconstruct_data(
@@ -1791,11 +1760,7 @@ impl DownloadedLayer {
) -> Result<(), GetVectoredError> {
use LayerKind::*;
match self
.get(owner, ctx)
.await
.map_err(GetVectoredError::Other)?
{
match self.get(owner, ctx).await.map_err(GetVectoredError::from)? {
Delta(d) => {
d.get_values_reconstruct_data(keyspace, lsn_range, reconstruct_data, ctx)
.await

View File

@@ -828,9 +828,9 @@ async fn eviction_cancellation_on_drop() {
#[test]
#[cfg(target_arch = "x86_64")]
fn layer_size() {
assert_eq!(size_of::<LayerAccessStats>(), 8);
assert_eq!(size_of::<PersistentLayerDesc>(), 104);
assert_eq!(size_of::<LayerInner>(), 312);
assert_eq!(std::mem::size_of::<LayerAccessStats>(), 8);
assert_eq!(std::mem::size_of::<PersistentLayerDesc>(), 104);
assert_eq!(std::mem::size_of::<LayerInner>(), 312);
// it also has the utf8 path
}

View File

@@ -41,20 +41,6 @@ pub struct PersistentLayerKey {
pub is_delta: bool,
}
impl std::fmt::Display for PersistentLayerKey {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
write!(
f,
"{}..{} {}..{} is_delta={}",
self.key_range.start,
self.key_range.end,
self.lsn_range.start,
self.lsn_range.end,
self.is_delta
)
}
}
impl PersistentLayerDesc {
pub fn key(&self) -> PersistentLayerKey {
PersistentLayerKey {

View File

@@ -204,11 +204,9 @@ impl<'a> IteratorWrapper<'a> {
/// A merge iterator over delta/image layer iterators. When duplicated records are
/// found, the iterator will not perform any deduplication, and the caller should handle
/// these situation. By saying duplicated records, there are many possibilities:
///
/// * Two same delta at the same LSN.
/// * Two same image at the same LSN.
/// * Delta/image at the same LSN where the image has already applied the delta.
///
/// The iterator will always put the image before the delta.
pub struct MergeIterator<'a> {
heap: BinaryHeap<IteratorWrapper<'a>>,

View File

@@ -129,9 +129,11 @@ pub fn start_background_loops(
let background_jobs_can_start = background_jobs_can_start.cloned();
async move {
let cancel = task_mgr::shutdown_token();
let can_start = completion::Barrier::maybe_wait(background_jobs_can_start);
let can_start = tenant.ongoing_timeline_detach.gc_sleeping_while(can_start);
tokio::select! {
_ = cancel.cancelled() => { return Ok(()) },
_ = completion::Barrier::maybe_wait(background_jobs_can_start) => {}
_ = can_start => {}
};
gc_loop(tenant, cancel)
.instrument(info_span!("gc_loop", tenant_id = %tenant_shard_id.tenant_id, shard_id = %tenant_shard_id.shard_slug()))
@@ -210,28 +212,24 @@ async fn compaction_loop(tenant: Arc<Tenant>, cancel: CancellationToken) {
Duration::from_secs(10)
} else {
// Run compaction
match tenant.compaction_iteration(&cancel, &ctx).await {
Err(e) => {
let wait_duration = backoff::exponential_backoff_duration_seconds(
error_run_count + 1,
1.0,
MAX_BACKOFF_SECS,
);
error_run_count += 1;
let wait_duration = Duration::from_secs_f64(wait_duration);
log_compaction_error(
&e,
error_run_count,
&wait_duration,
cancel.is_cancelled(),
);
wait_duration
}
Ok(has_pending_task) => {
error_run_count = 0;
// schedule the next compaction immediately in case there is a pending compaction task
if has_pending_task { Duration::from_secs(0) } else { period }
}
if let Err(e) = tenant.compaction_iteration(&cancel, &ctx).await {
let wait_duration = backoff::exponential_backoff_duration_seconds(
error_run_count + 1,
1.0,
MAX_BACKOFF_SECS,
);
error_run_count += 1;
let wait_duration = Duration::from_secs_f64(wait_duration);
log_compaction_error(
&e,
error_run_count,
&wait_duration,
cancel.is_cancelled(),
);
wait_duration
} else {
error_run_count = 0;
period
}
};
@@ -365,14 +363,13 @@ async fn gc_loop(tenant: Arc<Tenant>, cancel: CancellationToken) {
if first {
first = false;
if delay_by_lease_length(tenant.get_lsn_lease_length(), &cancel)
.await
.is_err()
{
break;
}
let delays = async {
delay_by_lease_length(tenant.get_lsn_lease_length(), &cancel).await?;
random_init_delay(period, &cancel).await?;
Ok::<_, Cancelled>(())
};
if random_init_delay(period, &cancel).await.is_err() {
if tenant.ongoing_timeline_detach.gc_sleeping_while(delays).await.is_err() {
break;
}
}
@@ -408,8 +405,8 @@ async fn gc_loop(tenant: Arc<Tenant>, cancel: CancellationToken) {
let wait_duration = Duration::from_secs_f64(wait_duration);
error!(
"Gc failed {error_run_count} times, retrying in {wait_duration:?}: {e:?}",
);
"Gc failed {error_run_count} times, retrying in {wait_duration:?}: {e:?}",
);
wait_duration
}
}
@@ -418,7 +415,9 @@ async fn gc_loop(tenant: Arc<Tenant>, cancel: CancellationToken) {
warn_when_period_overrun(started_at.elapsed(), period, BackgroundLoopKind::Gc);
// Sleep
if tokio::time::timeout(sleep_duration, cancel.cancelled())
let cancelled = cancel.cancelled();
let cancelled = tenant.ongoing_timeline_detach.gc_sleeping_while(cancelled);
if tokio::time::timeout(sleep_duration, cancelled)
.await
.is_ok()
{

View File

@@ -3,7 +3,6 @@ pub(crate) mod compaction;
pub mod delete;
pub(crate) mod detach_ancestor;
mod eviction_task;
pub(crate) mod handle;
mod init;
pub mod layer_manager;
pub(crate) mod logical_size;
@@ -18,7 +17,6 @@ use camino::Utf8Path;
use chrono::{DateTime, Utc};
use enumset::EnumSet;
use fail::fail_point;
use handle::ShardTimelineId;
use once_cell::sync::Lazy;
use pageserver_api::{
key::{
@@ -60,7 +58,7 @@ use std::{
sync::atomic::AtomicU64,
};
use std::{
cmp::{max, min},
cmp::{max, min, Ordering},
ops::ControlFlow,
};
use std::{
@@ -76,7 +74,6 @@ use crate::{
metadata::TimelineMetadata,
storage_layer::PersistentLayerDesc,
},
walredo,
};
use crate::{
context::{DownloadBehavior, RequestContext},
@@ -180,6 +177,25 @@ impl std::fmt::Display for ImageLayerCreationMode {
}
}
/// Wrapper for key range to provide reverse ordering by range length for BinaryHeap
#[derive(Debug, Clone, PartialEq, Eq)]
pub(crate) struct Hole {
key_range: Range<Key>,
coverage_size: usize,
}
impl Ord for Hole {
fn cmp(&self, other: &Self) -> Ordering {
other.coverage_size.cmp(&self.coverage_size) // inverse order
}
}
impl PartialOrd for Hole {
fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
Some(self.cmp(other))
}
}
/// Temporary function for immutable storage state refactor, ensures we are dropping mutex guard instead of other things.
/// Can be removed after all refactors are done.
fn drop_rlock<T>(rlock: tokio::sync::RwLockReadGuard<T>) {
@@ -427,8 +443,6 @@ pub struct Timeline {
pub(crate) extra_test_dense_keyspace: ArcSwap<KeySpace>,
pub(crate) l0_flush_global_state: L0FlushGlobalState,
pub(crate) handles: handle::PerTimelineState<crate::page_service::TenantManagerTypes>,
}
pub struct WalReceiverInfo {
@@ -534,6 +548,7 @@ impl GetVectoredError {
}
}
#[derive(Debug)]
pub struct MissingKeyError {
key: Key,
shard: ShardNumber,
@@ -544,12 +559,6 @@ pub struct MissingKeyError {
backtrace: Option<std::backtrace::Backtrace>,
}
impl std::fmt::Debug for MissingKeyError {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
write!(f, "{}", self)
}
}
impl std::fmt::Display for MissingKeyError {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
write!(
@@ -1001,10 +1010,7 @@ impl Timeline {
.for_get_kind(GetKind::Singular)
.observe(elapsed.as_secs_f64());
if cfg!(feature = "testing")
&& res.is_err()
&& !matches!(res, Err(PageReconstructError::Cancelled))
{
if cfg!(feature = "testing") && res.is_err() {
// it can only be walredo issue
use std::fmt::Write;
@@ -1763,14 +1769,13 @@ impl Timeline {
}
}
/// Outermost timeline compaction operation; downloads needed layers. Returns whether we have pending
/// compaction tasks.
/// Outermost timeline compaction operation; downloads needed layers.
pub(crate) async fn compact(
self: &Arc<Self>,
cancel: &CancellationToken,
flags: EnumSet<CompactFlags>,
ctx: &RequestContext,
) -> Result<bool, CompactionError> {
) -> Result<(), CompactionError> {
// most likely the cancellation token is from background task, but in tests it could be the
// request task as well.
@@ -1790,8 +1795,8 @@ impl Timeline {
// compaction task goes over it's period (20s) which is quite often in production.
let (_guard, _permit) = tokio::select! {
tuple = prepare => { tuple },
_ = self.cancel.cancelled() => return Ok(false),
_ = cancel.cancelled() => return Ok(false),
_ = self.cancel.cancelled() => return Ok(()),
_ = cancel.cancelled() => return Ok(()),
};
let last_record_lsn = self.get_last_record_lsn();
@@ -1799,14 +1804,11 @@ impl Timeline {
// Last record Lsn could be zero in case the timeline was just created
if !last_record_lsn.is_valid() {
warn!("Skipping compaction for potentially just initialized timeline, it has invalid last record lsn: {last_record_lsn}");
return Ok(false);
return Ok(());
}
match self.get_compaction_algorithm_settings().kind {
CompactionAlgorithm::Tiered => {
self.compact_tiered(cancel, ctx).await?;
Ok(false)
}
CompactionAlgorithm::Tiered => self.compact_tiered(cancel, ctx).await,
CompactionAlgorithm::Legacy => self.compact_legacy(cancel, flags, ctx).await,
}
}
@@ -1923,9 +1925,6 @@ impl Timeline {
tracing::debug!("Cancelling CancellationToken");
self.cancel.cancel();
// Ensure Prevent new page service requests from starting.
self.handles.shutdown();
// Transition the remote_client into a state where it's only useful for timeline deletion.
// (The deletion use case is why we can't just hook up remote_client to Self::cancel).)
self.remote_client.stop();
@@ -1998,11 +1997,6 @@ impl Timeline {
self.current_state() == TimelineState::Active
}
#[allow(unused)]
pub(crate) fn is_archived(&self) -> Option<bool> {
self.remote_client.is_archived()
}
pub(crate) fn is_stopping(&self) -> bool {
self.current_state() == TimelineState::Stopping
}
@@ -2451,8 +2445,6 @@ impl Timeline {
extra_test_dense_keyspace: ArcSwap::new(Arc::new(KeySpace::default())),
l0_flush_global_state: resources.l0_flush_global_state,
handles: Default::default(),
};
result.repartition_threshold =
result.get_checkpoint_distance() / REPARTITION_FREQ_IN_CHECKPOINT_DISTANCE;
@@ -2736,10 +2728,6 @@ impl Timeline {
// Tenant::create_timeline will wait for these uploads to happen before returning, or
// on retry.
// Now that we have the full layer map, we may calculate the visibility of layers within it (a global scan)
drop(guard); // drop write lock, update_layer_visibility will take a read lock.
self.update_layer_visibility().await;
info!(
"loaded layer map with {} layers at {}, total physical size: {}",
num_layers, disk_consistent_lsn, total_physical_size
@@ -3433,6 +3421,7 @@ impl Timeline {
}
}
#[allow(unknown_lints)] // doc_lazy_continuation is still a new lint
#[allow(clippy::doc_lazy_continuation)]
/// Get the data needed to reconstruct all keys in the provided keyspace
///
@@ -3726,17 +3715,6 @@ impl Timeline {
&self.shard_identity
}
#[inline(always)]
pub(crate) fn shard_timeline_id(&self) -> ShardTimelineId {
ShardTimelineId {
shard_index: ShardIndex {
shard_number: self.shard_identity.number,
shard_count: self.shard_identity.count,
},
timeline_id: self.timeline_id,
}
}
///
/// Get a handle to the latest layer for appending.
///
@@ -4681,6 +4659,27 @@ impl Timeline {
}
}
// The writer.finish() above already did the fsync of the inodes.
// We just need to fsync the directory in which these inodes are linked,
// which we know to be the timeline directory.
if !image_layers.is_empty() {
// We use fatal_err() below because the after writer.finish() returns with success,
// the in-memory state of the filesystem already has the layer file in its final place,
// and subsequent pageserver code could think it's durable while it really isn't.
let timeline_dir = VirtualFile::open(
&self
.conf
.timeline_path(&self.tenant_shard_id, &self.timeline_id),
ctx,
)
.await
.fatal_err("VirtualFile::open for timeline dir fsync");
timeline_dir
.sync_all()
.await
.fatal_err("VirtualFile::sync_all timeline dir");
}
let mut guard = self.layers.write().await;
// FIXME: we could add the images to be uploaded *before* returning from here, but right
@@ -4689,9 +4688,6 @@ impl Timeline {
drop_wlock(guard);
timer.stop_and_record();
// Creating image layers may have caused some previously visible layers to be covered
self.update_layer_visibility().await;
Ok(image_layers)
}
@@ -5456,22 +5452,20 @@ impl Timeline {
} else {
trace!("found {} WAL records that will init the page for {} at {}, performing WAL redo", data.records.len(), key, request_lsn);
};
let res = self
let img = match self
.walredo_mgr
.as_ref()
.context("timeline has no walredo manager")
.map_err(PageReconstructError::WalRedo)?
.request_redo(key, request_lsn, data.img, data.records, self.pg_version)
.await;
let img = match res {
.await
.context("reconstruct a page image")
{
Ok(img) => img,
Err(walredo::Error::Cancelled) => return Err(PageReconstructError::Cancelled),
Err(walredo::Error::Other(e)) => {
return Err(PageReconstructError::WalRedo(
e.context("reconstruct a page image"),
))
}
Err(e) => return Err(PageReconstructError::WalRedo(e)),
};
Ok(img)
}
}

File diff suppressed because it is too large Load Diff

View File

@@ -206,10 +206,11 @@ impl DeleteTimelineFlow {
// NB: If this fails half-way through, and is retried, the retry will go through
// all the same steps again. Make sure the code here is idempotent, and don't
// error out if some of the shutdown tasks have already been completed!
#[instrument(skip_all)]
#[instrument(skip_all, fields(%inplace))]
pub async fn run(
tenant: &Arc<Tenant>,
timeline_id: TimelineId,
inplace: bool,
) -> Result<(), DeleteTimelineError> {
super::debug_assert_current_span_has_tenant_and_timeline_id();
@@ -234,7 +235,11 @@ impl DeleteTimelineFlow {
))?
});
Self::schedule_background(guard, tenant.conf, Arc::clone(tenant), timeline);
if inplace {
Self::background(guard, tenant.conf, tenant, &timeline).await?
} else {
Self::schedule_background(guard, tenant.conf, Arc::clone(tenant), timeline);
}
Ok(())
}

View File

@@ -92,7 +92,7 @@ impl From<FlushLayerError> for Error {
}
pub(crate) enum Progress {
Prepared(completion::Completion, PreparedTimelineDetach),
Prepared(Attempt, PreparedTimelineDetach),
Done(AncestorDetached),
}
@@ -116,6 +116,130 @@ impl Default for Options {
}
}
/// SharedState manages the pausing of background tasks (GC) for the duration of timeline detach
/// ancestor.
///
/// Currently this is tracked at tenant level, but it could be moved to be on the roots
/// of each timeline tree.
#[derive(Default)]
pub(crate) struct SharedState {
inner: std::sync::Mutex<Option<(TimelineId, completion::Barrier)>>,
}
impl SharedState {
/// Notify an uninitialized shared state that an attempt to detach timeline ancestor continues
/// from previous instance.
pub(crate) fn continue_existing_attempt(&self, _attempt: &Attempt) {}
/// Only GC must be paused while a detach ancestor is ongoing. Compaction can happen, to aid
/// with any ongoing ingestion. Compaction even after restart is ok because layers will not be
/// removed until the detach has been persistently completed.
pub(crate) fn attempt_blocks_gc(&self) -> bool {
// if we have any started and not finished ancestor detaches, we must remain paused
// and also let any trying to start operation know that we've paused.
// Two cases:
// - there is an actual attempt started
// - we have learned from indexparts that an attempt will be retried in near future
self.inner.lock().unwrap().is_some()
}
/// Sleep for the duration, while letting any ongoing ancestor_detach attempt know that gc has
/// been paused.
///
/// Cancellation safe.
pub(crate) async fn gc_sleeping_while<T, F: std::future::Future<Output = T>>(
&self,
fut: F,
) -> T {
// this needs to wrap the sleeping so that we can quickly let ancestor_detach continue
fut.await
}
/// Acquire the exclusive lock for a new detach ancestor attempt and ensure that GC task has
/// been persistently paused via [`crate::tenant::IndexPart`], awaiting for completion.
///
/// Cancellation safe.
async fn start_new_attempt(&self, detached: &Arc<Timeline>) -> Result<Attempt, Error> {
if detached.cancel.is_cancelled() {
return Err(Error::ShuttingDown);
}
let (guard, barrier) = completion::channel();
{
let mut guard = self.inner.lock().unwrap();
if let Some((tl, other)) = guard.as_ref() {
if !other.is_ready() {
return Err(Error::OtherTimelineDetachOngoing(*tl));
}
}
*guard = Some((detached.timeline_id, barrier));
}
// FIXME: modify the index part to have a "detach-ancestor: inprogress { started_at }"
// unsure if it should be awaited to upload yet...
// finally
let gate_entered = detached.gate.enter().map_err(|_| Error::ShuttingDown)?;
Ok(Attempt {
_guard: guard,
gate_entered: Some(gate_entered),
})
}
/// Completes a previously started detach ancestor attempt. To be called *after* the operation
/// including the tenant has been restarted. The completion is persistent, and no reparentings
/// can be done afterwards.
///
/// Cancellation safe.
pub(crate) async fn complete(
&self,
_attempt: Attempt,
_tenant: &Arc<Tenant>,
) -> Result<(), Error> {
// do we need the tenant to actually activate...? yes we do, but that can happen via
// background task starting, because we will similarly want to confirm that the gc has
// paused, before we unpause it?
//
// assert that such and such state has been collected
// find the timeline the attempt represents
// using the timelines remote client, upload an index part with completion information
Ok(())
}
}
/// Token which represents a persistent, exclusive, awaitable single attempt.
pub(crate) struct Attempt {
_guard: completion::Completion,
gate_entered: Option<utils::sync::gate::GateGuard>,
}
impl Attempt {
pub(crate) fn before_shutdown(&mut self) {
let taken = self.gate_entered.take();
assert!(taken.is_some());
}
}
#[derive(Default)]
pub(crate) struct SharedStateBuilder {}
impl SharedStateBuilder {
/// While loading, visit a timelines persistent [`crate::tenant::IndexPart`] and record if it is being
/// detached.
pub(crate) fn record_loading_timeline(
&mut self,
_timeline_id: &TimelineId,
_index_part: &crate::tenant::IndexPart,
) {
}
/// Merge the loaded not yet deleting in-progress to the existing datastructure.
pub(crate) fn build(self, _target: &SharedState) {}
}
/// See [`Timeline::prepare_to_detach_from_ancestor`]
pub(super) async fn prepare(
detached: &Arc<Timeline>,
@@ -136,7 +260,7 @@ pub(super) async fn prepare(
// we are safe to inspect the latest uploaded, because we can only witness this after
// restart is complete and ancestor is no more.
let latest = accessor.latest_uploaded_index_part();
if !latest.lineage.is_detached_from_original_ancestor() {
if !latest.lineage.is_detached_from_ancestor() {
return Err(NoAncestor);
}
}
@@ -150,6 +274,11 @@ pub(super) async fn prepare(
.values()
.filter(|tl| matches!(tl.ancestor_timeline.as_ref(), Some(ancestor) if Arc::ptr_eq(ancestor, detached)))
.map(|tl| (tl.ancestor_lsn, tl.clone()))
// FIXME: instead of collecting, partition based on if they are still reparentable
// if gc has been blocked, we can still reparent these timelines
//
// Collect to avoid lock taking order problem with Tenant::timelines and
// Timeline::remote_client
.collect::<Vec<_>>();
let mut any_shutdown = false;
@@ -174,12 +303,19 @@ pub(super) async fn prepare(
return Err(Error::ShuttingDown);
}
// FIXME: reparent the reparentable (return Progress::Prepared if there were any) -- we
// will need to acquire the lock as well..? so it would make sense that at load time we
// would detect the in-progress ness and "soft-acquire it for us".
// FIXME: otherwise release gc blocking if it still is there, and wait for upload
let mut reparented = all_direct_children;
// why this instead of hashset? there is a reason, but I've forgotten it many times.
//
// maybe if this was a hashset we would not be able to distinguish some race condition.
reparented.sort_unstable_by_key(|(lsn, tl)| (*lsn, tl.timeline_id));
// FIXME: add the non-reparented in to the response -- these would be the reparentable, but
// no longer reparentable because they appeared *after* gc blocking was released.
return Ok(Progress::Done(AncestorDetached {
reparented_timelines: reparented
.into_iter()
@@ -200,22 +336,10 @@ pub(super) async fn prepare(
return Err(TooManyAncestors);
}
// before we acquire the gate, we must mark the ancestor as having a detach operation
// ongoing which will block other concurrent detach operations so we don't get to ackward
// situations where there would be two branches trying to reparent earlier branches.
let (guard, barrier) = completion::channel();
{
let mut guard = tenant.ongoing_timeline_detach.lock().unwrap();
if let Some((tl, other)) = guard.as_ref() {
if !other.is_ready() {
return Err(OtherTimelineDetachOngoing(*tl));
}
}
*guard = Some((detached.timeline_id, barrier));
}
let _gate_entered = detached.gate.enter().map_err(|_| ShuttingDown)?;
let attempt = tenant
.ongoing_timeline_detach
.start_new_attempt(detached)
.await?;
utils::pausable_failpoint!("timeline-detach-ancestor::before_starting_after_locking_pausable");
@@ -304,20 +428,26 @@ pub(super) async fn prepare(
let timeline = detached.clone();
let ctx = ctx.detached_child(TaskKind::DetachAncestor, DownloadBehavior::Download);
tasks.spawn(async move {
let _permit = limiter.acquire().await;
let copied =
upload_rewritten_layer(end_lsn, &layer, &timeline, &timeline.cancel, &ctx)
.await?;
Ok(copied)
});
let span = tracing::info_span!("upload_rewritten_layer", %layer);
tasks.spawn(
async move {
let _permit = limiter.acquire().await;
let copied =
upload_rewritten_layer(end_lsn, &layer, &timeline, &timeline.cancel, &ctx)
.await?;
if let Some(copied) = copied.as_ref() {
tracing::info!(%copied, "rewrote and uploaded");
}
Ok(copied)
}
.instrument(span),
);
}
while let Some(res) = tasks.join_next().await {
match res {
Ok(Ok(Some(copied))) => {
wrote_any = true;
tracing::info!(layer=%copied, "rewrote and uploaded");
new_layers.push(copied);
}
Ok(Ok(None)) => {}
@@ -378,7 +508,7 @@ pub(super) async fn prepare(
let prepared = PreparedTimelineDetach { layers: new_layers };
Ok(Progress::Prepared(guard, prepared))
Ok(Progress::Prepared(attempt, prepared))
}
fn partition_work(
@@ -562,6 +692,9 @@ pub(super) async fn complete(
)
.await?;
// FIXME: assert that the persistent record of inprogress detach exists
// FIXME: assert that gc is still blocked
let mut tasks = tokio::task::JoinSet::new();
// because we are now keeping the slot in progress, it is unlikely that there will be any
@@ -647,6 +780,7 @@ pub(super) async fn complete(
}
if reparenting_candidates != reparented.len() {
// FIXME: we must return 503 kind of response
tracing::info!("failed to reparent some candidates");
}
@@ -657,5 +791,8 @@ pub(super) async fn complete(
.map(|(_, timeline_id)| timeline_id)
.collect();
// FIXME: here everything has gone peachy, the tenant will be restarted next.
// after restart and before returning the response, the gc blocking must be undone
Ok(reparented)
}

View File

@@ -1,967 +0,0 @@
//! An efficient way to keep the timeline gate open without preventing
//! timeline shutdown for longer than a single call to a timeline method.
//!
//! # Motivation
//!
//! On a single page service connection, we're typically serving a single TenantTimelineId.
//!
//! Without sharding, there is a single Timeline object to which we dispatch
//! all requests. For example, a getpage request gets dispatched to the
//! Timeline::get method of the Timeline object that represents the
//! (tenant,timeline) of that connection.
//!
//! With sharding, for each request that comes in on the connection,
//! we first have to perform shard routing based on the requested key (=~ page number).
//! The result of shard routing is a Timeline object.
//! We then dispatch the request to that Timeline object.
//!
//! Regardless of whether the tenant is sharded or not, we want to ensure that
//! we hold the Timeline gate open while we're invoking the method on the
//! Timeline object.
//!
//! However, we want to avoid the overhead of entering the gate for every
//! method invocation.
//!
//! Further, for shard routing, we want to avoid calling the tenant manager to
//! resolve the shard for every request. Instead, we want to cache the
//! routing result so we can bypass the tenant manager for all subsequent requests
//! that get routed to that shard.
//!
//! Regardless of how we accomplish the above, it should not
//! prevent the Timeline from shutting down promptly.
//!
//! # Design
//!
//! There are three user-facing data structures:
//! - `PerTimelineState`: a struct embedded into each Timeline struct. Lifetime == Timeline lifetime.
//! - `Cache`: a struct private to each connection handler; Lifetime == connection lifetime.
//! - `Handle`: a smart pointer that holds the Timeline gate open and derefs to `&Timeline`.
//! Lifetime: for a single request dispatch on the Timeline (i.e., one getpage request)
//!
//! The `Handle` is just a wrapper around an `Arc<HandleInner>`.
//!
//! There is one long-lived `Arc<HandleInner>`, which is stored in the `PerTimelineState`.
//! The `Cache` stores a `Weak<HandleInner>` for each cached Timeline.
//!
//! To dispatch a request, the page service connection calls `Cache::get`.
//!
//! A cache miss means we consult the tenant manager for shard routing,
//! resulting in an `Arc<Timeline>`. We enter its gate _once_ and construct an
//! `Arc<HandleInner>`. We store a `Weak<HandleInner>` in the cache
//! and the `Arc<HandleInner>` in the `PerTimelineState`.
//!
//! For subsequent requests, `Cache::get` will perform a "fast path" shard routing
//! and find the `Weak<HandleInner>` in the cache.
//! We upgrade the `Weak<HandleInner>` to an `Arc<HandleInner>` and wrap it in the user-facing `Handle` type.
//!
//! The request handler dispatches the request to the right `<Handle as Deref<Target = Timeline>>::$request_method`.
//! It then drops the `Handle`, which drops the `Arc<HandleInner>`.
//!
//! # Memory Management / How The Reference Cycle Is Broken
//!
//! The attentive reader may have noticed the strong reference cycle
//! from `Arc<HandleInner>` to `PerTimelineState` to `Arc<Timeline>`.
//!
//! This cycle is intentional: while it exists, the `Cache` can upgrade its
//! `Weak<HandleInner>` to an `Arc<HandleInner>` in a single atomic operation.
//!
//! The cycle is broken by either
//! - `PerTimelineState::shutdown` or
//! - dropping the `Cache`.
//!
//! Concurrently existing `Handle`s will extend the existence of the cycle.
//! However, since `Handle`s are short-lived and new `Handle`s are not
//! handed out after either `PerTimelineState::shutdown` or `Cache` drop,
//! that extension of the cycle is bounded.
//!
//! # Fast Path for Shard Routing
//!
//! The `Cache` has a fast path for shard routing to avoid calling into
//! the tenant manager for every request.
//!
//! The `Cache` maintains a hash map of `ShardTimelineId` to `Weak<HandleInner>`.
//!
//! The current implementation uses the first entry in the hash map
//! to determine the `ShardParameters` and derive the correct
//! `ShardIndex` for the requested key.
//!
//! It then looks up the hash map for that `ShardTimelineId := {ShardIndex,TimelineId}`.
//!
//! If the lookup is successful and the `Weak<HandleInner>` can be upgraded,
//! it's a hit.
//!
//! ## Cache invalidation
//!
//! The insight is that cache invalidation is sufficient and most efficiently done lazily.
//! The only reasons why an entry in the cache can become stale are:
//! 1. The `PerTimelineState` / Timeline is shutting down e.g. because the shard is
//! being detached, timeline or shard deleted, or pageserver is shutting down.
//! 2. We're doing a shard split and new traffic should be routed to the child shards.
//!
//! Regarding (1), we will eventually fail to upgrade the `Weak<HandleInner>` once the
//! timeline has shut down, and when that happens, we remove the entry from the cache.
//!
//! Regarding (2), the insight is that it is toally fine to keep dispatching requests
//! to the parent shard during a shard split. Eventually, the shard split task will
//! shut down the parent => case (1).
use std::collections::hash_map;
use std::collections::HashMap;
use std::sync::atomic::AtomicBool;
use std::sync::atomic::Ordering;
use std::sync::Arc;
use std::sync::Mutex;
use std::sync::Weak;
use pageserver_api::shard::ShardIdentity;
use tracing::instrument;
use tracing::trace;
use utils::id::TimelineId;
use utils::shard::ShardIndex;
use utils::shard::ShardNumber;
use crate::tenant::mgr::ShardSelector;
/// The requirement for Debug is so that #[derive(Debug)] works in some places.
pub(crate) trait Types: Sized + std::fmt::Debug {
type TenantManagerError: Sized + std::fmt::Debug;
type TenantManager: TenantManager<Self> + Sized;
type Timeline: ArcTimeline<Self> + Sized;
}
/// Uniquely identifies a [`Cache`] instance over the lifetime of the process.
/// Required so [`Cache::drop`] can take out the handles from the [`PerTimelineState`].
/// Alternative to this would be to allocate [`Cache`] in a `Box` and identify it by the pointer.
#[derive(Debug, Hash, PartialEq, Eq, Clone, Copy)]
struct CacheId(u64);
impl CacheId {
fn next() -> Self {
static NEXT_ID: std::sync::atomic::AtomicU64 = std::sync::atomic::AtomicU64::new(1);
let id = NEXT_ID.fetch_add(1, std::sync::atomic::Ordering::Relaxed);
if id == 0 {
panic!("CacheId::new() returned 0, overflow");
}
Self(id)
}
}
/// See module-level comment.
pub(crate) struct Cache<T: Types> {
id: CacheId,
map: Map<T>,
}
type Map<T> = HashMap<ShardTimelineId, Weak<HandleInner<T>>>;
impl<T: Types> Default for Cache<T> {
fn default() -> Self {
Self {
id: CacheId::next(),
map: Default::default(),
}
}
}
#[derive(PartialEq, Eq, Debug, Hash, Clone, Copy)]
pub(crate) struct ShardTimelineId {
pub(crate) shard_index: ShardIndex,
pub(crate) timeline_id: TimelineId,
}
/// See module-level comment.
pub(crate) struct Handle<T: Types>(Arc<HandleInner<T>>);
struct HandleInner<T: Types> {
shut_down: AtomicBool,
timeline: T::Timeline,
// The timeline's gate held open.
_gate_guard: utils::sync::gate::GateGuard,
}
/// Embedded in each [`Types::Timeline`] as the anchor for the only long-lived strong ref to `HandleInner`.
///
/// See module-level comment for details.
pub struct PerTimelineState<T: Types> {
// None = shutting down
handles: Mutex<Option<HashMap<CacheId, Arc<HandleInner<T>>>>>,
}
impl<T: Types> Default for PerTimelineState<T> {
fn default() -> Self {
Self {
handles: Mutex::new(Some(Default::default())),
}
}
}
/// Abstract view of [`crate::tenant::mgr`], for testability.
pub(crate) trait TenantManager<T: Types> {
/// Invoked by [`Cache::get`] to resolve a [`ShardTimelineId`] to a [`Types::Timeline`].
/// Errors are returned as [`GetError::TenantManager`].
async fn resolve(
&self,
timeline_id: TimelineId,
shard_selector: ShardSelector,
) -> Result<T::Timeline, T::TenantManagerError>;
}
/// Abstract view of an [`Arc<Timeline>`], for testability.
pub(crate) trait ArcTimeline<T: Types>: Clone {
fn gate(&self) -> &utils::sync::gate::Gate;
fn shard_timeline_id(&self) -> ShardTimelineId;
fn get_shard_identity(&self) -> &ShardIdentity;
fn per_timeline_state(&self) -> &PerTimelineState<T>;
}
/// Errors returned by [`Cache::get`].
#[derive(Debug)]
pub(crate) enum GetError<T: Types> {
TenantManager(T::TenantManagerError),
TimelineGateClosed,
PerTimelineStateShutDown,
}
/// Internal type used in [`Cache::get`].
enum RoutingResult<T: Types> {
FastPath(Handle<T>),
SlowPath(ShardTimelineId),
NeedConsultTenantManager,
}
impl<T: Types> Cache<T> {
/// See module-level comment for details.
///
/// Does NOT check for the shutdown state of [`Types::Timeline`].
/// Instead, the methods of [`Types::Timeline`] that are invoked through
/// the [`Handle`] are responsible for checking these conditions
/// and if so, return an error that causes the page service to
/// close the connection.
#[instrument(level = "trace", skip_all)]
pub(crate) async fn get(
&mut self,
timeline_id: TimelineId,
shard_selector: ShardSelector,
tenant_manager: &T::TenantManager,
) -> Result<Handle<T>, GetError<T>> {
// terminates because each iteration removes an element from the map
loop {
let handle = self
.get_impl(timeline_id, shard_selector, tenant_manager)
.await?;
if handle.0.shut_down.load(Ordering::Relaxed) {
let removed = self
.map
.remove(&handle.0.timeline.shard_timeline_id())
.expect("invariant of get_impl is that the returned handle is in the map");
assert!(
Weak::ptr_eq(&removed, &Arc::downgrade(&handle.0)),
"shard_timeline_id() incorrect?"
);
} else {
return Ok(handle);
}
}
}
#[instrument(level = "trace", skip_all)]
async fn get_impl(
&mut self,
timeline_id: TimelineId,
shard_selector: ShardSelector,
tenant_manager: &T::TenantManager,
) -> Result<Handle<T>, GetError<T>> {
let miss: ShardSelector = {
let routing_state = self.shard_routing(timeline_id, shard_selector);
match routing_state {
RoutingResult::FastPath(handle) => return Ok(handle),
RoutingResult::SlowPath(key) => match self.map.get(&key) {
Some(cached) => match cached.upgrade() {
Some(upgraded) => return Ok(Handle(upgraded)),
None => {
trace!("handle cache stale");
self.map.remove(&key).unwrap();
ShardSelector::Known(key.shard_index)
}
},
None => ShardSelector::Known(key.shard_index),
},
RoutingResult::NeedConsultTenantManager => shard_selector,
}
};
self.get_miss(timeline_id, miss, tenant_manager).await
}
#[inline(always)]
fn shard_routing(
&mut self,
timeline_id: TimelineId,
shard_selector: ShardSelector,
) -> RoutingResult<T> {
loop {
// terminates because when every iteration we remove an element from the map
let Some((first_key, first_handle)) = self.map.iter().next() else {
return RoutingResult::NeedConsultTenantManager;
};
let Some(first_handle) = first_handle.upgrade() else {
// TODO: dedup with get()
trace!("handle cache stale");
let first_key_owned = *first_key;
self.map.remove(&first_key_owned).unwrap();
continue;
};
let first_handle_shard_identity = first_handle.timeline.get_shard_identity();
let make_shard_index = |shard_num: ShardNumber| ShardIndex {
shard_number: shard_num,
shard_count: first_handle_shard_identity.count,
};
let need_idx = match shard_selector {
ShardSelector::Page(key) => {
make_shard_index(first_handle_shard_identity.get_shard_number(&key))
}
ShardSelector::Zero => make_shard_index(ShardNumber(0)),
ShardSelector::Known(shard_idx) => shard_idx,
};
let need_shard_timeline_id = ShardTimelineId {
shard_index: need_idx,
timeline_id,
};
let first_handle_shard_timeline_id = ShardTimelineId {
shard_index: first_handle_shard_identity.shard_index(),
timeline_id: first_handle.timeline.shard_timeline_id().timeline_id,
};
if need_shard_timeline_id == first_handle_shard_timeline_id {
return RoutingResult::FastPath(Handle(first_handle));
} else {
return RoutingResult::SlowPath(need_shard_timeline_id);
}
}
}
#[instrument(level = "trace", skip_all)]
#[inline(always)]
async fn get_miss(
&mut self,
timeline_id: TimelineId,
shard_selector: ShardSelector,
tenant_manager: &T::TenantManager,
) -> Result<Handle<T>, GetError<T>> {
match tenant_manager.resolve(timeline_id, shard_selector).await {
Ok(timeline) => {
let key = timeline.shard_timeline_id();
match &shard_selector {
ShardSelector::Zero => assert_eq!(key.shard_index.shard_number, ShardNumber(0)),
ShardSelector::Page(_) => (), // gotta trust tenant_manager
ShardSelector::Known(idx) => assert_eq!(idx, &key.shard_index),
}
let gate_guard = match timeline.gate().enter() {
Ok(guard) => guard,
Err(_) => {
return Err(GetError::TimelineGateClosed);
}
};
trace!("creating new HandleInner");
let handle = Arc::new(
// TODO: global metric that keeps track of the number of live HandlerTimeline instances
// so we can identify reference cycle bugs.
HandleInner {
shut_down: AtomicBool::new(false),
_gate_guard: gate_guard,
timeline: timeline.clone(),
},
);
let handle = {
let mut lock_guard = timeline
.per_timeline_state()
.handles
.lock()
.expect("mutex poisoned");
match &mut *lock_guard {
Some(per_timeline_state) => {
let replaced = per_timeline_state.insert(self.id, Arc::clone(&handle));
assert!(replaced.is_none(), "some earlier code left a stale handle");
match self.map.entry(key) {
hash_map::Entry::Occupied(_o) => {
// This cannot not happen because
// 1. we're the _miss_ handle, i.e., `self.map` didn't contain an entry and
// 2. we were holding &mut self during .resolve().await above, so, no other thread can have inserted a handle
// while we were waiting for the tenant manager.
unreachable!()
}
hash_map::Entry::Vacant(v) => {
v.insert(Arc::downgrade(&handle));
handle
}
}
}
None => {
return Err(GetError::PerTimelineStateShutDown);
}
}
};
Ok(Handle(handle))
}
Err(e) => Err(GetError::TenantManager(e)),
}
}
}
impl<T: Types> PerTimelineState<T> {
/// After this method returns, [`Cache::get`] will never again return a [`Handle`]
/// to the [`Types::Timeline`] that embeds this per-timeline state.
/// Even if [`TenantManager::resolve`] would still resolve to it.
///
/// Already-alive [`Handle`]s for will remain open, usable, and keeping the [`ArcTimeline`] alive.
/// That's ok because they're short-lived. See module-level comment for details.
#[instrument(level = "trace", skip_all)]
pub(super) fn shutdown(&self) {
let handles = self
.handles
.lock()
.expect("mutex poisoned")
// NB: this .take() sets locked to None.
// That's what makes future `Cache::get` misses fail.
// Cache hits are taken care of below.
.take();
let Some(handles) = handles else {
trace!("already shut down");
return;
};
for handle in handles.values() {
// Make hits fail.
handle.shut_down.store(true, Ordering::Relaxed);
}
drop(handles);
}
}
impl<T: Types> std::ops::Deref for Handle<T> {
type Target = T::Timeline;
fn deref(&self) -> &Self::Target {
&self.0.timeline
}
}
#[cfg(test)]
impl<T: Types> Drop for HandleInner<T> {
fn drop(&mut self) {
trace!("HandleInner dropped");
}
}
// When dropping a [`Cache`], prune its handles in the [`PerTimelineState`] to break the reference cycle.
impl<T: Types> Drop for Cache<T> {
fn drop(&mut self) {
for (_, weak) in self.map.drain() {
if let Some(strong) = weak.upgrade() {
// handle is still being kept alive in PerTimelineState
let timeline = strong.timeline.per_timeline_state();
let mut handles = timeline.handles.lock().expect("mutex poisoned");
if let Some(handles) = &mut *handles {
let Some(removed) = handles.remove(&self.id) else {
// There could have been a shutdown inbetween us upgrading the weak and locking the mutex.
continue;
};
assert!(Arc::ptr_eq(&removed, &strong));
}
}
}
}
}
#[cfg(test)]
mod tests {
use pageserver_api::{
key::{rel_block_to_key, Key, DBDIR_KEY},
models::ShardParameters,
reltag::RelTag,
shard::ShardStripeSize,
};
use utils::shard::ShardCount;
use super::*;
const FOREVER: std::time::Duration = std::time::Duration::from_secs(u64::MAX);
#[derive(Debug)]
struct TestTypes;
impl Types for TestTypes {
type TenantManagerError = anyhow::Error;
type TenantManager = StubManager;
type Timeline = Arc<StubTimeline>;
}
struct StubManager {
shards: Vec<Arc<StubTimeline>>,
}
struct StubTimeline {
gate: utils::sync::gate::Gate,
id: TimelineId,
shard: ShardIdentity,
per_timeline_state: PerTimelineState<TestTypes>,
myself: Weak<StubTimeline>,
}
impl StubTimeline {
fn getpage(&self) {
// do nothing
}
}
impl ArcTimeline<TestTypes> for Arc<StubTimeline> {
fn gate(&self) -> &utils::sync::gate::Gate {
&self.gate
}
fn shard_timeline_id(&self) -> ShardTimelineId {
ShardTimelineId {
shard_index: self.shard.shard_index(),
timeline_id: self.id,
}
}
fn get_shard_identity(&self) -> &ShardIdentity {
&self.shard
}
fn per_timeline_state(&self) -> &PerTimelineState<TestTypes> {
&self.per_timeline_state
}
}
impl TenantManager<TestTypes> for StubManager {
async fn resolve(
&self,
timeline_id: TimelineId,
shard_selector: ShardSelector,
) -> anyhow::Result<Arc<StubTimeline>> {
for timeline in &self.shards {
if timeline.id == timeline_id {
match &shard_selector {
ShardSelector::Zero if timeline.shard.is_shard_zero() => {
return Ok(Arc::clone(timeline));
}
ShardSelector::Zero => continue,
ShardSelector::Page(key) if timeline.shard.is_key_local(key) => {
return Ok(Arc::clone(timeline));
}
ShardSelector::Page(_) => continue,
ShardSelector::Known(idx) if idx == &timeline.shard.shard_index() => {
return Ok(Arc::clone(timeline));
}
ShardSelector::Known(_) => continue,
}
}
}
anyhow::bail!("not found")
}
}
#[tokio::test(start_paused = true)]
async fn test_timeline_shutdown() {
crate::tenant::harness::setup_logging();
let timeline_id = TimelineId::generate();
let shard0 = Arc::new_cyclic(|myself| StubTimeline {
gate: Default::default(),
id: timeline_id,
shard: ShardIdentity::unsharded(),
per_timeline_state: PerTimelineState::default(),
myself: myself.clone(),
});
let mgr = StubManager {
shards: vec![shard0.clone()],
};
let key = DBDIR_KEY;
let mut cache = Cache::<TestTypes>::default();
//
// fill the cache
//
assert_eq!(
(Arc::strong_count(&shard0), Arc::weak_count(&shard0)),
(2, 1),
"strong: shard0, mgr; weak: myself"
);
let handle: Handle<_> = cache
.get(timeline_id, ShardSelector::Page(key), &mgr)
.await
.expect("we have the timeline");
let handle_inner_weak = Arc::downgrade(&handle.0);
assert!(Weak::ptr_eq(&handle.myself, &shard0.myself));
assert_eq!(
(
Weak::strong_count(&handle_inner_weak),
Weak::weak_count(&handle_inner_weak)
),
(2, 2),
"strong: handle, per_timeline_state, weak: handle_inner_weak, cache"
);
assert_eq!(cache.map.len(), 1);
assert_eq!(
(Arc::strong_count(&shard0), Arc::weak_count(&shard0)),
(3, 1),
"strong: handleinner(per_timeline_state), shard0, mgr; weak: myself"
);
drop(handle);
assert_eq!(
(Arc::strong_count(&shard0), Arc::weak_count(&shard0)),
(3, 1),
"strong: handleinner(per_timeline_state), shard0, mgr; weak: myself"
);
//
// demonstrate that Handle holds up gate closure
// but shutdown prevents new handles from being handed out
//
tokio::select! {
_ = shard0.gate.close() => {
panic!("cache and per-timeline handler state keep cache open");
}
_ = tokio::time::sleep(FOREVER) => {
// NB: first poll of close() makes it enter closing state
}
}
let handle = cache
.get(timeline_id, ShardSelector::Page(key), &mgr)
.await
.expect("we have the timeline");
assert!(Weak::ptr_eq(&handle.myself, &shard0.myself));
// SHUTDOWN
shard0.per_timeline_state.shutdown(); // keeping handle alive across shutdown
assert_eq!(
1,
Weak::strong_count(&handle_inner_weak),
"through local var handle"
);
assert_eq!(
cache.map.len(),
1,
"this is an implementation detail but worth pointing out: we can't clear the cache from shutdown(), it's cleared on first access after"
);
assert_eq!(
(Arc::strong_count(&shard0), Arc::weak_count(&shard0)),
(3, 1),
"strong: handleinner(via handle), shard0, mgr; weak: myself"
);
// this handle is perfectly usable
handle.getpage();
cache
.get(timeline_id, ShardSelector::Page(key), &mgr)
.await
.err()
.expect("documented behavior: can't get new handle after shutdown, even if there is an alive Handle");
assert_eq!(
cache.map.len(),
0,
"first access after shutdown cleans up the Weak's from the cache"
);
tokio::select! {
_ = shard0.gate.close() => {
panic!("handle is keeping gate open");
}
_ = tokio::time::sleep(FOREVER) => { }
}
drop(handle);
assert_eq!(
0,
Weak::strong_count(&handle_inner_weak),
"the HandleInner destructor already ran"
);
assert_eq!(
(Arc::strong_count(&shard0), Arc::weak_count(&shard0)),
(2, 1),
"strong: shard0, mgr; weak: myself"
);
// closing gate succeeds after dropping handle
tokio::select! {
_ = shard0.gate.close() => { }
_ = tokio::time::sleep(FOREVER) => {
panic!("handle is dropped, no other gate holders exist")
}
}
// map gets cleaned on next lookup
cache
.get(timeline_id, ShardSelector::Page(key), &mgr)
.await
.err()
.expect("documented behavior: can't get new handle after shutdown");
assert_eq!(cache.map.len(), 0);
// ensure all refs to shard0 are gone and we're not leaking anything
let myself = Weak::clone(&shard0.myself);
drop(shard0);
drop(mgr);
assert_eq!(Weak::strong_count(&myself), 0);
}
#[tokio::test]
async fn test_multiple_timelines_and_deletion() {
crate::tenant::harness::setup_logging();
let timeline_a = TimelineId::generate();
let timeline_b = TimelineId::generate();
assert_ne!(timeline_a, timeline_b);
let timeline_a = Arc::new_cyclic(|myself| StubTimeline {
gate: Default::default(),
id: timeline_a,
shard: ShardIdentity::unsharded(),
per_timeline_state: PerTimelineState::default(),
myself: myself.clone(),
});
let timeline_b = Arc::new_cyclic(|myself| StubTimeline {
gate: Default::default(),
id: timeline_b,
shard: ShardIdentity::unsharded(),
per_timeline_state: PerTimelineState::default(),
myself: myself.clone(),
});
let mut mgr = StubManager {
shards: vec![timeline_a.clone(), timeline_b.clone()],
};
let key = DBDIR_KEY;
let mut cache = Cache::<TestTypes>::default();
cache
.get(timeline_a.id, ShardSelector::Page(key), &mgr)
.await
.expect("we have it");
cache
.get(timeline_b.id, ShardSelector::Page(key), &mgr)
.await
.expect("we have it");
assert_eq!(cache.map.len(), 2);
// delete timeline A
timeline_a.per_timeline_state.shutdown();
mgr.shards.retain(|t| t.id != timeline_a.id);
assert!(
mgr.resolve(timeline_a.id, ShardSelector::Page(key))
.await
.is_err(),
"broken StubManager implementation"
);
assert_eq!(
cache.map.len(),
2,
"cache still has a Weak handle to Timeline A"
);
cache
.get(timeline_a.id, ShardSelector::Page(key), &mgr)
.await
.err()
.expect("documented behavior: can't get new handle after shutdown");
assert_eq!(cache.map.len(), 1, "next access cleans up the cache");
cache
.get(timeline_b.id, ShardSelector::Page(key), &mgr)
.await
.expect("we still have it");
}
fn make_relation_key_for_shard(shard: ShardNumber, params: &ShardParameters) -> Key {
rel_block_to_key(
RelTag {
spcnode: 1663,
dbnode: 208101,
relnode: 2620,
forknum: 0,
},
shard.0 as u32 * params.stripe_size.0,
)
}
#[tokio::test(start_paused = true)]
async fn test_shard_split() {
crate::tenant::harness::setup_logging();
let timeline_id = TimelineId::generate();
let parent = Arc::new_cyclic(|myself| StubTimeline {
gate: Default::default(),
id: timeline_id,
shard: ShardIdentity::unsharded(),
per_timeline_state: PerTimelineState::default(),
myself: myself.clone(),
});
let child_params = ShardParameters {
count: ShardCount(2),
stripe_size: ShardStripeSize::default(),
};
let child0 = Arc::new_cyclic(|myself| StubTimeline {
gate: Default::default(),
id: timeline_id,
shard: ShardIdentity::from_params(ShardNumber(0), &child_params),
per_timeline_state: PerTimelineState::default(),
myself: myself.clone(),
});
let child1 = Arc::new_cyclic(|myself| StubTimeline {
gate: Default::default(),
id: timeline_id,
shard: ShardIdentity::from_params(ShardNumber(1), &child_params),
per_timeline_state: PerTimelineState::default(),
myself: myself.clone(),
});
let child_shards_by_shard_number = [child0.clone(), child1.clone()];
let mut cache = Cache::<TestTypes>::default();
// fill the cache with the parent
for i in 0..2 {
let handle = cache
.get(
timeline_id,
ShardSelector::Page(make_relation_key_for_shard(ShardNumber(i), &child_params)),
&StubManager {
shards: vec![parent.clone()],
},
)
.await
.expect("we have it");
assert!(
Weak::ptr_eq(&handle.myself, &parent.myself),
"mgr returns parent first"
);
drop(handle);
}
//
// SHARD SPLIT: tenant manager changes, but the cache isn't informed
//
// while we haven't shut down the parent, the cache will return the cached parent, even
// if the tenant manager returns the child
for i in 0..2 {
let handle = cache
.get(
timeline_id,
ShardSelector::Page(make_relation_key_for_shard(ShardNumber(i), &child_params)),
&StubManager {
shards: vec![], // doesn't matter what's in here, the cache is fully loaded
},
)
.await
.expect("we have it");
assert!(
Weak::ptr_eq(&handle.myself, &parent.myself),
"mgr returns parent"
);
drop(handle);
}
let parent_handle = cache
.get(
timeline_id,
ShardSelector::Page(make_relation_key_for_shard(ShardNumber(0), &child_params)),
&StubManager {
shards: vec![parent.clone()],
},
)
.await
.expect("we have it");
assert!(Weak::ptr_eq(&parent_handle.myself, &parent.myself));
// invalidate the cache
parent.per_timeline_state.shutdown();
// the cache will now return the child, even though the parent handle still exists
for i in 0..2 {
let handle = cache
.get(
timeline_id,
ShardSelector::Page(make_relation_key_for_shard(ShardNumber(i), &child_params)),
&StubManager {
shards: vec![child0.clone(), child1.clone()], // <====== this changed compared to previous loop
},
)
.await
.expect("we have it");
assert!(
Weak::ptr_eq(
&handle.myself,
&child_shards_by_shard_number[i as usize].myself
),
"mgr returns child"
);
drop(handle);
}
// all the while the parent handle kept the parent gate open
tokio::select! {
_ = parent_handle.gate.close() => {
panic!("parent handle is keeping gate open");
}
_ = tokio::time::sleep(FOREVER) => { }
}
drop(parent_handle);
tokio::select! {
_ = parent.gate.close() => { }
_ = tokio::time::sleep(FOREVER) => {
panic!("parent handle is dropped, no other gate holders exist")
}
}
}
#[tokio::test(start_paused = true)]
async fn test_connection_handler_exit() {
crate::tenant::harness::setup_logging();
let timeline_id = TimelineId::generate();
let shard0 = Arc::new_cyclic(|myself| StubTimeline {
gate: Default::default(),
id: timeline_id,
shard: ShardIdentity::unsharded(),
per_timeline_state: PerTimelineState::default(),
myself: myself.clone(),
});
let mgr = StubManager {
shards: vec![shard0.clone()],
};
let key = DBDIR_KEY;
// Simulate 10 connections that's opened, used, and closed
let mut used_handles = vec![];
for _ in 0..10 {
let mut cache = Cache::<TestTypes>::default();
let handle = {
let handle = cache
.get(timeline_id, ShardSelector::Page(key), &mgr)
.await
.expect("we have the timeline");
assert!(Weak::ptr_eq(&handle.myself, &shard0.myself));
handle
};
handle.getpage();
used_handles.push(Arc::downgrade(&handle.0));
}
// No handles exist, thus gates are closed and don't require shutdown
assert!(used_handles
.iter()
.all(|weak| Weak::strong_count(weak) == 0));
// ... thus the gate should close immediately, even without shutdown
tokio::select! {
_ = shard0.gate.close() => { }
_ = tokio::time::sleep(FOREVER) => {
panic!("handle is dropped, no other gate holders exist")
}
}
}
}

View File

@@ -35,10 +35,6 @@ impl LayerManager {
self.layer_fmgr.get_from_desc(desc)
}
pub(crate) fn get_from_key(&self, desc: &PersistentLayerKey) -> Layer {
self.layer_fmgr.get_from_key(desc)
}
/// Get an immutable reference to the layer map.
///
/// We expect users only to be able to get an immutable layer map. If users want to make modifications,
@@ -369,20 +365,16 @@ impl<T> Default for LayerFileManager<T> {
}
impl<T: AsLayerDesc + Clone> LayerFileManager<T> {
fn get_from_key(&self, key: &PersistentLayerKey) -> T {
fn get_from_desc(&self, desc: &PersistentLayerDesc) -> T {
// The assumption for the `expect()` is that all code maintains the following invariant:
// A layer's descriptor is present in the LayerMap => the LayerFileManager contains a layer for the descriptor.
self.0
.get(key)
.with_context(|| format!("get layer from key: {}", key))
.get(&desc.key())
.with_context(|| format!("get layer from desc: {}", desc.layer_name()))
.expect("not found")
.clone()
}
fn get_from_desc(&self, desc: &PersistentLayerDesc) -> T {
self.get_from_key(&desc.key())
}
fn contains_key(&self, key: &PersistentLayerKey) -> bool {
self.0.contains_key(key)
}

View File

@@ -618,7 +618,7 @@ impl WalIngest {
// the offsets array is omitted if XLOG_HEAP_INIT_PAGE is set
0
} else {
size_of::<u16>() * xlrec.ntuples as usize
std::mem::size_of::<u16>() * xlrec.ntuples as usize
};
assert_eq!(offset_array_len, buf.remaining());
@@ -685,7 +685,7 @@ impl WalIngest {
// the offsets array is omitted if XLOG_HEAP_INIT_PAGE is set
0
} else {
size_of::<u16>() * xlrec.ntuples as usize
std::mem::size_of::<u16>() * xlrec.ntuples as usize
};
assert_eq!(offset_array_len, buf.remaining());
@@ -752,7 +752,7 @@ impl WalIngest {
// the offsets array is omitted if XLOG_HEAP_INIT_PAGE is set
0
} else {
size_of::<u16>() * xlrec.ntuples as usize
std::mem::size_of::<u16>() * xlrec.ntuples as usize
};
assert_eq!(offset_array_len, buf.remaining());
@@ -920,7 +920,7 @@ impl WalIngest {
// the offsets array is omitted if XLOG_HEAP_INIT_PAGE is set
0
} else {
size_of::<u16>() * xlrec.ntuples as usize
std::mem::size_of::<u16>() * xlrec.ntuples as usize
};
assert_eq!(offset_array_len, buf.remaining());

View File

@@ -241,9 +241,6 @@ impl PostgresRedoManager {
/// Shut down the WAL redo manager.
///
/// Returns `true` if this call was the one that initiated shutdown.
/// `true` may be observed by no caller if the first caller stops polling.
///
/// After this future completes
/// - no redo process is running
/// - no new redo process will be spawned
@@ -253,32 +250,22 @@ impl PostgresRedoManager {
/// # Cancel-Safety
///
/// This method is cancellation-safe.
pub async fn shutdown(&self) -> bool {
pub async fn shutdown(&self) {
// prevent new processes from being spawned
let maybe_permit = match self.redo_process.get_or_init_detached().await {
let permit = match self.redo_process.get_or_init_detached().await {
Ok(guard) => {
if matches!(&*guard, ProcessOnceCell::ManagerShutDown) {
None
} else {
let (proc, permit) = guard.take_and_deinit();
drop(proc); // this just drops the Arc, its refcount may not be zero yet
Some(permit)
}
let (proc, permit) = guard.take_and_deinit();
drop(proc); // this just drops the Arc, its refcount may not be zero yet
permit
}
Err(permit) => Some(permit),
};
let it_was_us = if let Some(permit) = maybe_permit {
self.redo_process
.set(ProcessOnceCell::ManagerShutDown, permit);
true
} else {
false
Err(permit) => permit,
};
self.redo_process
.set(ProcessOnceCell::ManagerShutDown, permit);
// wait for ongoing requests to drain and the refcounts of all Arc<WalRedoProcess> that
// we ever launched to drop to zero, which when it happens synchronously kill()s & wait()s
// for the underlying process.
self.launched_processes.close().await;
it_was_us
}
/// This type doesn't have its own background task to check for idleness: we

View File

@@ -1,7 +0,0 @@
# This was captured from one shard of a large tenant in staging.
# It has a mixture of deltas and image layers, >1000 layers in total.
# This is suitable for general smoke tests that want an index which is not
# trivially small, but doesn't contain weird/pathological cases.

File diff suppressed because one or more lines are too long

153
poetry.lock generated
View File

@@ -870,96 +870,6 @@ files = [
[package.dependencies]
colorama = {version = "*", markers = "platform_system == \"Windows\""}
[[package]]
name = "clickhouse-connect"
version = "0.7.17"
description = "ClickHouse Database Core Driver for Python, Pandas, and Superset"
optional = false
python-versions = "~=3.8"
files = [
{file = "clickhouse-connect-0.7.17.tar.gz", hash = "sha256:854f1f9f3e024e7f89ae5d57cd3289d7a4c3dc91a9f24c4d233014f0ea19cb2d"},
{file = "clickhouse_connect-0.7.17-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:aca36f5f28be1ada2981fce87724bbf451f267c918015baec59e527de3c9c882"},
{file = "clickhouse_connect-0.7.17-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:66209e4634f457604c263bea176336079d26c284e251e68a8435b0b80c1a25ff"},
{file = "clickhouse_connect-0.7.17-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:e4d86c5a561a2a99321c8b4af22257461b8e67142f34cfea6e70f39b45b1f406"},
{file = "clickhouse_connect-0.7.17-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d200c9afa2725a96f9f3718221f641276b80c11bf504d8a2fbaafb5a05b2f0d3"},
{file = "clickhouse_connect-0.7.17-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:004d867b1005445a46e6742db1054bf2a717a451372663b46e09b5e9e90a31e3"},
{file = "clickhouse_connect-0.7.17-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:4ef94a4a8e008882259151833c3c47cfbb9c8f08de0f100aaf3b95c366dcfb24"},
{file = "clickhouse_connect-0.7.17-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:ee732c3df50c8b07d16b5836ff85e6b84569922455c03837c3add5cf1388fe1f"},
{file = "clickhouse_connect-0.7.17-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:d9dbe1235465bb946e24b90b0ca5b8800b5d645acb2d7d6ee819448c3e2fd959"},
{file = "clickhouse_connect-0.7.17-cp310-cp310-win32.whl", hash = "sha256:e5db0d68dfb63db0297d44dc91406bcfd7d333708d7cd55086c8550fbf870b78"},
{file = "clickhouse_connect-0.7.17-cp310-cp310-win_amd64.whl", hash = "sha256:800750f568c097ea312887785025006d6098bffd8ed2dd6a57048fb3ced6d778"},
{file = "clickhouse_connect-0.7.17-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:4eb390623b3d15dc9cda78f5c68f83ef9ad11743797e70af8fabc384b015a73c"},
{file = "clickhouse_connect-0.7.17-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:35f172ca950f218f63072024c81d5b4ff6e5399620c255506c321ccc7b17c9a5"},
{file = "clickhouse_connect-0.7.17-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:ae7918f060f7576fc931c692e0122b1b07576fabd81444af22e1f8582300d200"},
{file = "clickhouse_connect-0.7.17-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ff2881b93c7a1afb9c99fb59ad5fd666850421325d0931e2b77f3f4ba872303d"},
{file = "clickhouse_connect-0.7.17-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:8a4d9b4f97271addf66aadbaf7f154f19a0ad6c22026d575a995c55ebd8576db"},
{file = "clickhouse_connect-0.7.17-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:e431469b1ff2d5c3e4c406d55c6afdf7102f5d2524c2ceb5481b94ac24412aa3"},
{file = "clickhouse_connect-0.7.17-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:2b6f80115176559f181a6b3ecad11aa3d70ef6014c3d2905b90fcef3f27d25c2"},
{file = "clickhouse_connect-0.7.17-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:d8ac694f40dfafc8a3cc877116b4bc73e8877ebf66d4d96ee092484ee4c0b481"},
{file = "clickhouse_connect-0.7.17-cp311-cp311-win32.whl", hash = "sha256:78b7a3f6b0fad4eaf8afb5f9a2e855bde53e82ea5804960e9cf779538f4606a1"},
{file = "clickhouse_connect-0.7.17-cp311-cp311-win_amd64.whl", hash = "sha256:efd390cc045334ecc3f2a9c18cc07c041d0288b145967805fdcab65abeefa75f"},
{file = "clickhouse_connect-0.7.17-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:9228334a17dc0a7842222f54ba5b89fc563532424aad4f66be799df70ab37e9f"},
{file = "clickhouse_connect-0.7.17-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:e432a42bb788bda77e88eda2774392a60fbbb5ee2a79cb2881d182d26c45fe49"},
{file = "clickhouse_connect-0.7.17-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:c85152ed2879965ee1fa2bd5e31fb27d281fd5f50d6e86a401efd95cd85b29ef"},
{file = "clickhouse_connect-0.7.17-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:29a126104aa5e11df570cbd89fca4988784084602ba77d17b2396b334c54fd75"},
{file = "clickhouse_connect-0.7.17-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:882d8f9570549258e6eb6a97915fbf64ed29fe395d5e360866ea8d42c8283a35"},
{file = "clickhouse_connect-0.7.17-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:06ebf99111171442f462fb8b357364c3e276da3e8f8557b2e8fee9eb55ab37d1"},
{file = "clickhouse_connect-0.7.17-cp312-cp312-musllinux_1_1_i686.whl", hash = "sha256:e0cf6f99b2777b0d164bf8b65ec39104cdc0789a56bcb52d98289bbd6f5cc70e"},
{file = "clickhouse_connect-0.7.17-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:ee46c508fddfff3b7ac52326788e0c6dd8dfb416b6d7e02e5d30e8110749dac2"},
{file = "clickhouse_connect-0.7.17-cp312-cp312-win32.whl", hash = "sha256:eb708b590a37d56b069a6088254ffa55d73b8cb65527339df81ef03fe67ffdec"},
{file = "clickhouse_connect-0.7.17-cp312-cp312-win_amd64.whl", hash = "sha256:17f00dccddaeaf43733faa1fa21f7d24641454a73669fda862545ba7c88627f5"},
{file = "clickhouse_connect-0.7.17-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:ab5d4b37a6dcc39e94c63beac0f22d9dda914f5eb865d166c64cf04dfadb7d16"},
{file = "clickhouse_connect-0.7.17-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:32aa90387f45f34cbc5a984789ed4c12760a3c0056c190ab0123ceafc36b1002"},
{file = "clickhouse_connect-0.7.17-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:21277b6bdd6c8ff14170bfcd52125c5c39f442ec4bafbb643ad7d0ca915f0029"},
{file = "clickhouse_connect-0.7.17-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ca68d8b7dee3fb4e7229e06152f5b0faaccafb4c87d9c2d48fa5bd117a3cc1c0"},
{file = "clickhouse_connect-0.7.17-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:841c56282102b2fba1e0b332bb1c7a0c50992fbc321746af8d3e0e6ca2450e8b"},
{file = "clickhouse_connect-0.7.17-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:8d7ffde5a4b95d8fe9ed38e08e504e497310e3d7a17691bd40bf65734648fdfc"},
{file = "clickhouse_connect-0.7.17-cp38-cp38-musllinux_1_1_i686.whl", hash = "sha256:055960086b6b92b6e44f5ba04c81c40c10b038588e4b3908b033c99f66125332"},
{file = "clickhouse_connect-0.7.17-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:36491fec63ceb8503b6344c23477647030139f346b749dc5ee672c505939dbbe"},
{file = "clickhouse_connect-0.7.17-cp38-cp38-win32.whl", hash = "sha256:8779a907e026db32e6bc0bc0c8d5de0e2e3afd166afc2d4adcc0603399af5539"},
{file = "clickhouse_connect-0.7.17-cp38-cp38-win_amd64.whl", hash = "sha256:309854fa197885c6278438ddd032ab52e6fec56f162074e343c3635ca7266078"},
{file = "clickhouse_connect-0.7.17-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:e8009f94550178dc971aeb4f8787ba7a5b473c22647490428b7229f540a51d2b"},
{file = "clickhouse_connect-0.7.17-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:70f8422f407b13a404b3670fd097855abd5adaf890c710d6678d2b46ab61ac48"},
{file = "clickhouse_connect-0.7.17-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:082783eb1e8baf7b3465dd045132dc5cb5a91432c899dc4e19891c5f782d8d23"},
{file = "clickhouse_connect-0.7.17-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:c1c30aad2a9c7584c4ee19e646a087b3bbd2d4daab3d88a2afeeae1a7f6febf9"},
{file = "clickhouse_connect-0.7.17-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:fc8e245a9f4f0dce39f155e626405f60f1d3cf4d1e52dd2c793ea6b603ca111b"},
{file = "clickhouse_connect-0.7.17-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:802372cb8a69c9ffdf4260e9f01616c8601ba531825ed6f08834827e0b880cd1"},
{file = "clickhouse_connect-0.7.17-cp39-cp39-musllinux_1_1_i686.whl", hash = "sha256:193a60271a3b105cdbde96fb20b40eab8a50fca3bb1f397546f7a18b53d9aa9c"},
{file = "clickhouse_connect-0.7.17-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:59d58932916792fdbd09cb961a245a0c2d87b07b8296f9138915b998f4522941"},
{file = "clickhouse_connect-0.7.17-cp39-cp39-win32.whl", hash = "sha256:3cfd0edabb589f640636a97ffc38d1b3d760faef208d44e50829cc1ad3f0d3e5"},
{file = "clickhouse_connect-0.7.17-cp39-cp39-win_amd64.whl", hash = "sha256:5661b4629aac228481219abf2e149119af1a71d897f191665e182d9d192d7033"},
{file = "clickhouse_connect-0.7.17-pp310-pypy310_pp73-macosx_10_9_x86_64.whl", hash = "sha256:7429d309109e7e4a70fd867d69fcfea9ddcb1a1e910caa6b0e2c3776b71f4613"},
{file = "clickhouse_connect-0.7.17-pp310-pypy310_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:e5ae619151006da84a0b1585a9bcc81be32459d8061aeb2e116bad5bbaa7d108"},
{file = "clickhouse_connect-0.7.17-pp310-pypy310_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ec0c84a0880621cb2389656a89886ef3133f0b3f8dc016eee6f25bbb49ff6f70"},
{file = "clickhouse_connect-0.7.17-pp310-pypy310_pp73-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:705464c23f821666b76f8f619cf2870225156276562756b3933aaa24708e0ff8"},
{file = "clickhouse_connect-0.7.17-pp310-pypy310_pp73-win_amd64.whl", hash = "sha256:1822016f4b769e89264fe26cefe0bc5e50e4c3ca0747d89bb52d57dc4f1e5ffb"},
{file = "clickhouse_connect-0.7.17-pp38-pypy38_pp73-macosx_10_9_x86_64.whl", hash = "sha256:6c92b0c342c1fbfa666010e8175e05026dc570a7ef91d8fa81ce503180f318aa"},
{file = "clickhouse_connect-0.7.17-pp38-pypy38_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d2e106536540e906c3c866f8615fcf870a9a77c1bfab9ef4b042febfd2fdb953"},
{file = "clickhouse_connect-0.7.17-pp38-pypy38_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:bac9a32e62384b4341ba51a451084eb3b00c6e59aaac1499145dd8b897cb585c"},
{file = "clickhouse_connect-0.7.17-pp38-pypy38_pp73-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:0feed93b9912b7862a8c41be1febcd44b68a824a5c1059b19d5c567afdaa6273"},
{file = "clickhouse_connect-0.7.17-pp38-pypy38_pp73-win_amd64.whl", hash = "sha256:2e2dd6db52e799f065fd565143fde5a872cfe903de1bee7775bc3a349856a790"},
{file = "clickhouse_connect-0.7.17-pp39-pypy39_pp73-macosx_10_9_x86_64.whl", hash = "sha256:ed13add5d579a5960155f3000420544368501c9703d2fb94f103b4a6126081f6"},
{file = "clickhouse_connect-0.7.17-pp39-pypy39_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:c257a23ed3bf1858593fb03927d9d073fbbdfa24dc2afee537c3314bd66b4e24"},
{file = "clickhouse_connect-0.7.17-pp39-pypy39_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d47866f64cbdc2d5cc4f8a7a8c49e3ee90c9e487091b9eda7c3a3576418e1cbe"},
{file = "clickhouse_connect-0.7.17-pp39-pypy39_pp73-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:9b850e2f17e0a0b5a37d996d3fb728050227489d64d271d678d166abea94f26e"},
{file = "clickhouse_connect-0.7.17-pp39-pypy39_pp73-win_amd64.whl", hash = "sha256:349682288987dc84ac7695f7cd6b510be8d0ec0eee7c1b72dbf2146b4e9efdb8"},
]
[package.dependencies]
certifi = "*"
lz4 = "*"
pytz = "*"
urllib3 = ">=1.26"
zstandard = "*"
[package.extras]
arrow = ["pyarrow"]
numpy = ["numpy"]
orjson = ["orjson"]
pandas = ["pandas"]
sqlalchemy = ["sqlalchemy (>1.3.21,<2.0)"]
tzlocal = ["tzlocal (>=4.0)"]
[[package]]
name = "colorama"
version = "0.4.5"
@@ -1560,56 +1470,6 @@ files = [
{file = "lazy_object_proxy-1.10.0-pp310.pp311.pp312.pp38.pp39-none-any.whl", hash = "sha256:80fa48bd89c8f2f456fc0765c11c23bf5af827febacd2f523ca5bc1893fcc09d"},
]
[[package]]
name = "lz4"
version = "4.3.3"
description = "LZ4 Bindings for Python"
optional = false
python-versions = ">=3.8"
files = [
{file = "lz4-4.3.3-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:b891880c187e96339474af2a3b2bfb11a8e4732ff5034be919aa9029484cd201"},
{file = "lz4-4.3.3-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:222a7e35137d7539c9c33bb53fcbb26510c5748779364014235afc62b0ec797f"},
{file = "lz4-4.3.3-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f76176492ff082657ada0d0f10c794b6da5800249ef1692b35cf49b1e93e8ef7"},
{file = "lz4-4.3.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f1d18718f9d78182c6b60f568c9a9cec8a7204d7cb6fad4e511a2ef279e4cb05"},
{file = "lz4-4.3.3-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:6cdc60e21ec70266947a48839b437d46025076eb4b12c76bd47f8e5eb8a75dcc"},
{file = "lz4-4.3.3-cp310-cp310-win32.whl", hash = "sha256:c81703b12475da73a5d66618856d04b1307e43428a7e59d98cfe5a5d608a74c6"},
{file = "lz4-4.3.3-cp310-cp310-win_amd64.whl", hash = "sha256:43cf03059c0f941b772c8aeb42a0813d68d7081c009542301637e5782f8a33e2"},
{file = "lz4-4.3.3-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:30e8c20b8857adef7be045c65f47ab1e2c4fabba86a9fa9a997d7674a31ea6b6"},
{file = "lz4-4.3.3-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:2f7b1839f795315e480fb87d9bc60b186a98e3e5d17203c6e757611ef7dcef61"},
{file = "lz4-4.3.3-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:edfd858985c23523f4e5a7526ca6ee65ff930207a7ec8a8f57a01eae506aaee7"},
{file = "lz4-4.3.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0e9c410b11a31dbdc94c05ac3c480cb4b222460faf9231f12538d0074e56c563"},
{file = "lz4-4.3.3-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:d2507ee9c99dbddd191c86f0e0c8b724c76d26b0602db9ea23232304382e1f21"},
{file = "lz4-4.3.3-cp311-cp311-win32.whl", hash = "sha256:f180904f33bdd1e92967923a43c22899e303906d19b2cf8bb547db6653ea6e7d"},
{file = "lz4-4.3.3-cp311-cp311-win_amd64.whl", hash = "sha256:b14d948e6dce389f9a7afc666d60dd1e35fa2138a8ec5306d30cd2e30d36b40c"},
{file = "lz4-4.3.3-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:e36cd7b9d4d920d3bfc2369840da506fa68258f7bb176b8743189793c055e43d"},
{file = "lz4-4.3.3-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:31ea4be9d0059c00b2572d700bf2c1bc82f241f2c3282034a759c9a4d6ca4dc2"},
{file = "lz4-4.3.3-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:33c9a6fd20767ccaf70649982f8f3eeb0884035c150c0b818ea660152cf3c809"},
{file = "lz4-4.3.3-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:bca8fccc15e3add173da91be8f34121578dc777711ffd98d399be35487c934bf"},
{file = "lz4-4.3.3-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:e7d84b479ddf39fe3ea05387f10b779155fc0990125f4fb35d636114e1c63a2e"},
{file = "lz4-4.3.3-cp312-cp312-win32.whl", hash = "sha256:337cb94488a1b060ef1685187d6ad4ba8bc61d26d631d7ba909ee984ea736be1"},
{file = "lz4-4.3.3-cp312-cp312-win_amd64.whl", hash = "sha256:5d35533bf2cee56f38ced91f766cd0038b6abf46f438a80d50c52750088be93f"},
{file = "lz4-4.3.3-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:363ab65bf31338eb364062a15f302fc0fab0a49426051429866d71c793c23394"},
{file = "lz4-4.3.3-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:0a136e44a16fc98b1abc404fbabf7f1fada2bdab6a7e970974fb81cf55b636d0"},
{file = "lz4-4.3.3-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:abc197e4aca8b63f5ae200af03eb95fb4b5055a8f990079b5bdf042f568469dd"},
{file = "lz4-4.3.3-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:56f4fe9c6327adb97406f27a66420b22ce02d71a5c365c48d6b656b4aaeb7775"},
{file = "lz4-4.3.3-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:f0e822cd7644995d9ba248cb4b67859701748a93e2ab7fc9bc18c599a52e4604"},
{file = "lz4-4.3.3-cp38-cp38-win32.whl", hash = "sha256:24b3206de56b7a537eda3a8123c644a2b7bf111f0af53bc14bed90ce5562d1aa"},
{file = "lz4-4.3.3-cp38-cp38-win_amd64.whl", hash = "sha256:b47839b53956e2737229d70714f1d75f33e8ac26e52c267f0197b3189ca6de24"},
{file = "lz4-4.3.3-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:6756212507405f270b66b3ff7f564618de0606395c0fe10a7ae2ffcbbe0b1fba"},
{file = "lz4-4.3.3-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:ee9ff50557a942d187ec85462bb0960207e7ec5b19b3b48949263993771c6205"},
{file = "lz4-4.3.3-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:2b901c7784caac9a1ded4555258207d9e9697e746cc8532129f150ffe1f6ba0d"},
{file = "lz4-4.3.3-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:b6d9ec061b9eca86e4dcc003d93334b95d53909afd5a32c6e4f222157b50c071"},
{file = "lz4-4.3.3-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:f4c7bf687303ca47d69f9f0133274958fd672efaa33fb5bcde467862d6c621f0"},
{file = "lz4-4.3.3-cp39-cp39-win32.whl", hash = "sha256:054b4631a355606e99a42396f5db4d22046a3397ffc3269a348ec41eaebd69d2"},
{file = "lz4-4.3.3-cp39-cp39-win_amd64.whl", hash = "sha256:eac9af361e0d98335a02ff12fb56caeb7ea1196cf1a49dbf6f17828a131da807"},
{file = "lz4-4.3.3.tar.gz", hash = "sha256:01fe674ef2889dbb9899d8a67361e0c4a2c833af5aeb37dd505727cf5d2a131e"},
]
[package.extras]
docs = ["sphinx (>=1.6.0)", "sphinx-bootstrap-theme"]
flake8 = ["flake8"]
tests = ["psutil", "pytest (!=3.3.0)", "pytest-cov"]
[[package]]
name = "markupsafe"
version = "2.1.1"
@@ -2501,17 +2361,6 @@ files = [
[package.dependencies]
six = ">=1.5"
[[package]]
name = "pytz"
version = "2024.1"
description = "World timezone definitions, modern and historical"
optional = false
python-versions = "*"
files = [
{file = "pytz-2024.1-py2.py3-none-any.whl", hash = "sha256:328171f4e3623139da4983451950b28e95ac706e13f3f2630a879749e7a8b319"},
{file = "pytz-2024.1.tar.gz", hash = "sha256:2a29735ea9c18baf14b448846bde5a48030ed267578472d8955cd0e7443a9812"},
]
[[package]]
name = "pywin32"
version = "301"
@@ -3357,4 +3206,4 @@ cffi = ["cffi (>=1.11)"]
[metadata]
lock-version = "2.0"
python-versions = "^3.9"
content-hash = "7cee6a8c30bc7f4bfb0a87c6bad3952dfb4da127fad853d2710a93ac3eab8a00"
content-hash = "16ebd6a46768be7f67dbdb4ee5903b167d94edc9965f29252f038c67e9e907b0"

View File

@@ -1,7 +1,7 @@
//! Man-in-the-middle tests
//!
//! Channel binding should prevent a proxy server
//! *that has access to create valid certificates*
//! - that has access to create valid certificates -
//! from controlling the TLS connection.
use std::fmt::Debug;

View File

@@ -158,7 +158,7 @@ mod tests {
let N = 1021 * 4096;
let sketch = CountMinSketch::with_params(p / N as f64, 1.0 - q);
let memory = size_of::<u32>() * sketch.buckets.len();
let memory = std::mem::size_of::<u32>() * sketch.buckets.len();
let time = sketch.depth;
(memory, time)
}

View File

@@ -41,7 +41,6 @@ zstandard = "^0.21.0"
httpx = {extras = ["http2"], version = "^0.26.0"}
pytest-repeat = "^0.9.3"
websockets = "^12.0"
clickhouse-connect = "^0.7.16"
[tool.poetry.group.dev.dependencies]
mypy = "==1.3.0"

View File

@@ -1,5 +1,5 @@
[toolchain]
channel = "1.80.0"
channel = "1.79.0"
profile = "default"
# The default profile includes rustc, rust-std, cargo, rust-docs, rustfmt and clippy.
# https://rust-lang.github.io/rustup/concepts/profiles.html

View File

@@ -27,7 +27,7 @@ pub const SK_FORMAT_VERSION: u32 = 9;
pub const CONTROL_FILE_NAME: &str = "safekeeper.control";
// needed to atomically update the state using `rename`
const CONTROL_FILE_NAME_PARTIAL: &str = "safekeeper.control.partial";
pub const CHECKSUM_SIZE: usize = size_of::<u32>();
pub const CHECKSUM_SIZE: usize = std::mem::size_of::<u32>();
/// Storage should keep actual state inside of it. It should implement Deref
/// trait to access state fields and have persist method for updating that state.

View File

@@ -143,12 +143,7 @@ impl<IO: AsyncRead + AsyncWrite + Unpin + Send> postgres_backend::Handler<IO>
self.tenant_id.unwrap_or(TenantId::from([0u8; 16])),
self.timeline_id.unwrap_or(TimelineId::from([0u8; 16])),
);
tracing::Span::current()
.record("ttid", tracing::field::display(ttid))
.record(
"application_name",
tracing::field::debug(self.appname.clone()),
);
tracing::Span::current().record("ttid", tracing::field::display(ttid));
Ok(())
} else {

View File

@@ -43,7 +43,7 @@ pub async fn task_main(
error!("connection handler exited: {}", err);
}
}
.instrument(info_span!("", cid = %conn_id, ttid = field::Empty, application_name = field::Empty)),
.instrument(info_span!("", cid = %conn_id, ttid = field::Empty)),
);
}
}

View File

@@ -172,7 +172,7 @@ fn write_walrecord_to_disk(
let mut freespace = insert_freespace(curr_ptr);
let mut written: usize = 0;
assert!(freespace >= size_of::<u32>());
assert!(freespace >= std::mem::size_of::<u32>());
for mut rdata in rdatas {
while rdata.len() >= freespace {

View File

@@ -67,7 +67,6 @@ FALLBACK_DURATION = {
"test_runner/performance/test_copy.py::test_copy[neon]": 13.817,
"test_runner/performance/test_copy.py::test_copy[vanilla]": 11.736,
"test_runner/performance/test_gc_feedback.py::test_gc_feedback": 575.735,
"test_runner/performance/test_gc_feedback.py::test_gc_feedback_with_snapshots": 575.735,
"test_runner/performance/test_gist_build.py::test_gist_buffering_build[neon]": 14.868,
"test_runner/performance/test_gist_build.py::test_gist_buffering_build[vanilla]": 14.393,
"test_runner/performance/test_latency.py::test_measure_read_latency_heavy_write_workload[neon-1]": 20.588,

View File

@@ -18,7 +18,6 @@ anyhow.workspace = true
aws-config.workspace = true
bytes.workspace = true
camino.workspace = true
chrono.workspace = true
clap.workspace = true
fail.workspace = true
futures.workspace = true
@@ -45,12 +44,7 @@ scopeguard.workspace = true
strum.workspace = true
strum_macros.workspace = true
diesel = { version = "2.1.4", features = [
"serde_json",
"postgres",
"r2d2",
"chrono",
] }
diesel = { version = "2.1.4", features = ["serde_json", "postgres", "r2d2"] }
diesel_migrations = { version = "2.1.0" }
r2d2 = { version = "0.8.10" }
@@ -58,3 +52,4 @@ utils = { path = "../libs/utils/" }
metrics = { path = "../libs/metrics/" }
control_plane = { path = "../control_plane" }
workspace_hack = { version = "0.1", path = "../workspace_hack" }

View File

@@ -1 +0,0 @@
DROP TABLE metadata_health;

View File

@@ -1,14 +0,0 @@
CREATE TABLE metadata_health (
tenant_id VARCHAR NOT NULL,
shard_number INTEGER NOT NULL,
shard_count INTEGER NOT NULL,
PRIMARY KEY(tenant_id, shard_number, shard_count),
-- Rely on cascade behavior for delete
FOREIGN KEY(tenant_id, shard_number, shard_count) REFERENCES tenant_shards ON DELETE CASCADE,
healthy BOOLEAN NOT NULL DEFAULT TRUE,
last_scrubbed_at TIMESTAMPTZ NOT NULL DEFAULT NOW()
);
INSERT INTO metadata_health(tenant_id, shard_number, shard_count)
SELECT tenant_id, shard_number, shard_count FROM tenant_shards;

View File

@@ -3,18 +3,14 @@ use crate::metrics::{
METRICS_REGISTRY,
};
use crate::reconciler::ReconcileError;
use crate::service::{LeadershipStatus, Service, STARTUP_RECONCILE_TIMEOUT};
use crate::service::{Service, STARTUP_RECONCILE_TIMEOUT};
use anyhow::Context;
use futures::Future;
use hyper::header::CONTENT_TYPE;
use hyper::{Body, Request, Response};
use hyper::{StatusCode, Uri};
use metrics::{BuildInfo, NeonMetrics};
use pageserver_api::controller_api::{
MetadataHealthListOutdatedRequest, MetadataHealthListOutdatedResponse,
MetadataHealthListUnhealthyResponse, MetadataHealthUpdateRequest, MetadataHealthUpdateResponse,
TenantCreateRequest,
};
use pageserver_api::controller_api::TenantCreateRequest;
use pageserver_api::models::{
TenantConfigRequest, TenantLocationConfigRequest, TenantShardSplitRequest,
TenantTimeTravelRequest, TimelineCreateRequest,
@@ -564,51 +560,6 @@ async fn handle_cancel_node_fill(req: Request<Body>) -> Result<Response<Body>, A
json_response(StatusCode::ACCEPTED, ())
}
async fn handle_metadata_health_update(mut req: Request<Body>) -> Result<Response<Body>, ApiError> {
check_permissions(&req, Scope::Scrubber)?;
let update_req = json_request::<MetadataHealthUpdateRequest>(&mut req).await?;
let state = get_state(&req);
state.service.metadata_health_update(update_req).await?;
json_response(StatusCode::OK, MetadataHealthUpdateResponse {})
}
async fn handle_metadata_health_list_unhealthy(
req: Request<Body>,
) -> Result<Response<Body>, ApiError> {
check_permissions(&req, Scope::Admin)?;
let state = get_state(&req);
let unhealthy_tenant_shards = state.service.metadata_health_list_unhealthy().await?;
json_response(
StatusCode::OK,
MetadataHealthListUnhealthyResponse {
unhealthy_tenant_shards,
},
)
}
async fn handle_metadata_health_list_outdated(
mut req: Request<Body>,
) -> Result<Response<Body>, ApiError> {
check_permissions(&req, Scope::Admin)?;
let list_outdated_req = json_request::<MetadataHealthListOutdatedRequest>(&mut req).await?;
let state = get_state(&req);
let health_records = state
.service
.metadata_health_list_outdated(list_outdated_req.not_scrubbed_for)
.await?;
json_response(
StatusCode::OK,
MetadataHealthListOutdatedResponse { health_records },
)
}
async fn handle_tenant_shard_split(
service: Arc<Service>,
mut req: Request<Body>,
@@ -656,13 +607,6 @@ async fn handle_tenant_update_policy(mut req: Request<Body>) -> Result<Response<
)
}
async fn handle_step_down(req: Request<Body>) -> Result<Response<Body>, ApiError> {
check_permissions(&req, Scope::Admin)?;
let state = get_state(&req);
json_response(StatusCode::OK, state.service.step_down().await)
}
async fn handle_tenant_drop(req: Request<Body>) -> Result<Response<Body>, ApiError> {
let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?;
check_permissions(&req, Scope::PageServerApi)?;
@@ -790,47 +734,6 @@ struct RequestMeta {
at: Instant,
}
pub fn prologue_leadership_status_check_middleware<
B: hyper::body::HttpBody + Send + Sync + 'static,
>() -> Middleware<B, ApiError> {
Middleware::pre(move |req| async move {
let state = get_state(&req);
let leadership_status = state.service.get_leadership_status();
enum AllowedRoutes<'a> {
All,
Some(Vec<&'a str>),
}
let allowed_routes = match leadership_status {
LeadershipStatus::Leader => AllowedRoutes::All,
LeadershipStatus::SteppedDown => {
// TODO: does it make sense to allow /status here?
AllowedRoutes::Some(["/control/v1/step_down", "/status", "/metrics"].to_vec())
}
LeadershipStatus::Candidate => {
AllowedRoutes::Some(["/ready", "/status", "/metrics"].to_vec())
}
};
let uri = req.uri().to_string();
match allowed_routes {
AllowedRoutes::All => Ok(req),
AllowedRoutes::Some(allowed) if allowed.contains(&uri.as_str()) => Ok(req),
_ => {
tracing::info!(
"Request {} not allowed due to current leadership state",
req.uri()
);
Err(ApiError::ResourceUnavailable(
format!("Current leadership status is {leadership_status}").into(),
))
}
}
})
}
fn prologue_metrics_middleware<B: hyper::body::HttpBody + Send + Sync + 'static>(
) -> Middleware<B, ApiError> {
Middleware::pre(move |req| async move {
@@ -917,7 +820,6 @@ pub fn make_router(
build_info: BuildInfo,
) -> RouterBuilder<hyper::Body, ApiError> {
let mut router = endpoint::make_router()
.middleware(prologue_leadership_status_check_middleware())
.middleware(prologue_metrics_middleware())
.middleware(epilogue_metrics_middleware());
if auth.is_some() {
@@ -1036,28 +938,6 @@ pub fn make_router(
RequestName("control_v1_cancel_node_fill"),
)
})
// Metadata health operations
.post("/control/v1/metadata_health/update", |r| {
named_request_span(
r,
handle_metadata_health_update,
RequestName("control_v1_metadata_health_update"),
)
})
.get("/control/v1/metadata_health/unhealthy", |r| {
named_request_span(
r,
handle_metadata_health_list_unhealthy,
RequestName("control_v1_metadata_health_list_unhealthy"),
)
})
.post("/control/v1/metadata_health/outdated", |r| {
named_request_span(
r,
handle_metadata_health_list_outdated,
RequestName("control_v1_metadata_health_list_outdated"),
)
})
// TODO(vlad): endpoint for cancelling drain and fill
// Tenant Shard operations
.put("/control/v1/tenant/:tenant_shard_id/migrate", |r| {
@@ -1091,9 +971,6 @@ pub fn make_router(
RequestName("control_v1_tenant_policy"),
)
})
.put("/control/v1/step_down", |r| {
named_request_span(r, handle_step_down, RequestName("control_v1_step_down"))
})
// Tenant operations
// The ^/v1/ endpoints act as a "Virtual Pageserver", enabling shard-naive clients to call into
// this service to manage tenants that actually consist of many tenant shards, as if they are a single entity.

View File

@@ -13,10 +13,7 @@ use metrics::NeonMetrics;
use once_cell::sync::Lazy;
use std::sync::Mutex;
use crate::{
persistence::{DatabaseError, DatabaseOperation},
service::LeadershipStatus,
};
use crate::persistence::{DatabaseError, DatabaseOperation};
pub(crate) static METRICS_REGISTRY: Lazy<StorageControllerMetrics> =
Lazy::new(StorageControllerMetrics::default);
@@ -84,8 +81,6 @@ pub(crate) struct StorageControllerMetricGroup {
#[metric(metadata = histogram::Thresholds::exponential_buckets(0.1, 2.0))]
pub(crate) storage_controller_database_query_latency:
measured::HistogramVec<DatabaseQueryLatencyLabelGroupSet, 5>,
pub(crate) storage_controller_leadership_status: measured::GaugeVec<LeadershipStatusGroupSet>,
}
impl StorageControllerMetrics {
@@ -161,12 +156,6 @@ pub(crate) struct DatabaseQueryLatencyLabelGroup {
pub(crate) operation: DatabaseOperation,
}
#[derive(measured::LabelGroup)]
#[label(set = LeadershipStatusGroupSet)]
pub(crate) struct LeadershipStatusGroup {
pub(crate) status: LeadershipStatus,
}
#[derive(FixedCardinalityLabel, Clone, Copy)]
pub(crate) enum ReconcileOutcome {
#[label(rename = "ok")]

View File

@@ -8,7 +8,6 @@ use self::split_state::SplitState;
use diesel::pg::PgConnection;
use diesel::prelude::*;
use diesel::Connection;
use pageserver_api::controller_api::MetadataHealthRecord;
use pageserver_api::controller_api::ShardSchedulingPolicy;
use pageserver_api::controller_api::{NodeSchedulingPolicy, PlacementPolicy};
use pageserver_api::models::TenantConfig;
@@ -91,10 +90,6 @@ pub(crate) enum DatabaseOperation {
UpdateTenantShard,
DeleteTenant,
UpdateTenantConfig,
UpdateMetadataHealth,
ListMetadataHealth,
ListMetadataHealthUnhealthy,
ListMetadataHealthOutdated,
}
#[must_use]
@@ -312,32 +307,15 @@ impl Persistence {
&self,
shards: Vec<TenantShardPersistence>,
) -> DatabaseResult<()> {
use crate::schema::metadata_health;
use crate::schema::tenant_shards;
let now = chrono::Utc::now();
let metadata_health_records = shards
.iter()
.map(|t| MetadataHealthPersistence {
tenant_id: t.tenant_id.clone(),
shard_number: t.shard_number,
shard_count: t.shard_count,
healthy: true,
last_scrubbed_at: now,
})
.collect::<Vec<_>>();
use crate::schema::tenant_shards::dsl::*;
self.with_measured_conn(
DatabaseOperation::InsertTenantShards,
move |conn| -> DatabaseResult<()> {
diesel::insert_into(tenant_shards::table)
.values(&shards)
.execute(conn)?;
diesel::insert_into(metadata_health::table)
.values(&metadata_health_records)
.execute(conn)?;
for tenant in &shards {
diesel::insert_into(tenant_shards)
.values(tenant)
.execute(conn)?;
}
Ok(())
},
)
@@ -351,10 +329,10 @@ impl Persistence {
self.with_measured_conn(
DatabaseOperation::DeleteTenant,
move |conn| -> DatabaseResult<()> {
// `metadata_health` status (if exists) is also deleted based on the cascade behavior.
diesel::delete(tenant_shards)
.filter(tenant_id.eq(del_tenant_id.to_string()))
.execute(conn)?;
Ok(())
},
)
@@ -697,94 +675,6 @@ impl Persistence {
)
.await
}
/// Stores all the latest metadata health updates durably. Updates existing entry on conflict.
///
/// **Correctness:** `metadata_health_updates` should all belong the tenant shards managed by the storage controller.
#[allow(dead_code)]
pub(crate) async fn update_metadata_health_records(
&self,
healthy_records: Vec<MetadataHealthPersistence>,
unhealthy_records: Vec<MetadataHealthPersistence>,
now: chrono::DateTime<chrono::Utc>,
) -> DatabaseResult<()> {
use crate::schema::metadata_health::dsl::*;
self.with_measured_conn(
DatabaseOperation::UpdateMetadataHealth,
move |conn| -> DatabaseResult<_> {
diesel::insert_into(metadata_health)
.values(&healthy_records)
.on_conflict((tenant_id, shard_number, shard_count))
.do_update()
.set((healthy.eq(true), last_scrubbed_at.eq(now)))
.execute(conn)?;
diesel::insert_into(metadata_health)
.values(&unhealthy_records)
.on_conflict((tenant_id, shard_number, shard_count))
.do_update()
.set((healthy.eq(false), last_scrubbed_at.eq(now)))
.execute(conn)?;
Ok(())
},
)
.await
}
/// Lists all the metadata health records.
#[allow(dead_code)]
pub(crate) async fn list_metadata_health_records(
&self,
) -> DatabaseResult<Vec<MetadataHealthPersistence>> {
self.with_measured_conn(
DatabaseOperation::ListMetadataHealth,
move |conn| -> DatabaseResult<_> {
Ok(
crate::schema::metadata_health::table
.load::<MetadataHealthPersistence>(conn)?,
)
},
)
.await
}
/// Lists all the metadata health records that is unhealthy.
#[allow(dead_code)]
pub(crate) async fn list_unhealthy_metadata_health_records(
&self,
) -> DatabaseResult<Vec<MetadataHealthPersistence>> {
use crate::schema::metadata_health::dsl::*;
self.with_measured_conn(
DatabaseOperation::ListMetadataHealthUnhealthy,
move |conn| -> DatabaseResult<_> {
Ok(crate::schema::metadata_health::table
.filter(healthy.eq(false))
.load::<MetadataHealthPersistence>(conn)?)
},
)
.await
}
/// Lists all the metadata health records that have not been updated since an `earlier` time.
#[allow(dead_code)]
pub(crate) async fn list_outdated_metadata_health_records(
&self,
earlier: chrono::DateTime<chrono::Utc>,
) -> DatabaseResult<Vec<MetadataHealthPersistence>> {
use crate::schema::metadata_health::dsl::*;
self.with_measured_conn(
DatabaseOperation::ListMetadataHealthOutdated,
move |conn| -> DatabaseResult<_> {
let query = metadata_health.filter(last_scrubbed_at.lt(earlier));
let res = query.load::<MetadataHealthPersistence>(conn)?;
Ok(res)
},
)
.await
}
}
/// Parts of [`crate::tenant_shard::TenantShard`] that are stored durably
@@ -854,59 +744,3 @@ pub(crate) struct NodePersistence {
pub(crate) listen_pg_addr: String,
pub(crate) listen_pg_port: i32,
}
/// Tenant metadata health status that are stored durably.
#[derive(Queryable, Selectable, Insertable, Serialize, Deserialize, Clone, Eq, PartialEq)]
#[diesel(table_name = crate::schema::metadata_health)]
pub(crate) struct MetadataHealthPersistence {
#[serde(default)]
pub(crate) tenant_id: String,
#[serde(default)]
pub(crate) shard_number: i32,
#[serde(default)]
pub(crate) shard_count: i32,
pub(crate) healthy: bool,
pub(crate) last_scrubbed_at: chrono::DateTime<chrono::Utc>,
}
impl MetadataHealthPersistence {
pub fn new(
tenant_shard_id: TenantShardId,
healthy: bool,
last_scrubbed_at: chrono::DateTime<chrono::Utc>,
) -> Self {
let tenant_id = tenant_shard_id.tenant_id.to_string();
let shard_number = tenant_shard_id.shard_number.0 as i32;
let shard_count = tenant_shard_id.shard_count.literal() as i32;
MetadataHealthPersistence {
tenant_id,
shard_number,
shard_count,
healthy,
last_scrubbed_at,
}
}
#[allow(dead_code)]
pub(crate) fn get_tenant_shard_id(&self) -> Result<TenantShardId, hex::FromHexError> {
Ok(TenantShardId {
tenant_id: TenantId::from_str(self.tenant_id.as_str())?,
shard_number: ShardNumber(self.shard_number as u8),
shard_count: ShardCount::new(self.shard_count as u8),
})
}
}
impl From<MetadataHealthPersistence> for MetadataHealthRecord {
fn from(value: MetadataHealthPersistence) -> Self {
MetadataHealthRecord {
tenant_shard_id: value
.get_tenant_shard_id()
.expect("stored tenant id should be valid"),
healthy: value.healthy,
last_scrubbed_at: value.last_scrubbed_at,
}
}
}

View File

@@ -12,7 +12,6 @@ use std::collections::HashMap;
use std::sync::Arc;
use std::time::{Duration, Instant};
use tokio_util::sync::CancellationToken;
use utils::failpoint_support;
use utils::generation::Generation;
use utils::id::{NodeId, TimelineId};
use utils::lsn::Lsn;
@@ -656,8 +655,11 @@ impl Reconciler {
// reconcile this location. This includes locations with different configurations, as well
// as locations with unknown (None) observed state.
// Incrementing generation is the safe general case, but is inefficient for changes that only
// modify some details (e.g. the tenant's config).
// The general case is to increment the generation. However, there are cases
// where this is not necessary:
// - if we are only updating the TenantConf part of the location
// - if we are only changing the attachment mode (e.g. going to attachedmulti or attachedstale)
// and the location was already in the correct generation
let increment_generation = match observed {
None => true,
Some(ObservedStateLocation { conf: None }) => true,
@@ -666,11 +668,18 @@ impl Reconciler {
}) => {
let generations_match = observed.generation == wanted_conf.generation;
// We may skip incrementing the generation if the location is already in the expected mode and
// generation. In principle it would also be safe to skip from certain other modes (e.g. AttachedStale),
// but such states are handled inside `live_migrate`, and if we see that state here we're cleaning up
// after a restart/crash, so fall back to the universally safe path of incrementing generation.
!generations_match || (observed.mode != wanted_conf.mode)
use LocationConfigMode::*;
let mode_transition_requires_gen_inc =
match (observed.mode, wanted_conf.mode) {
// Usually the short-lived attachment modes (multi and stale) are only used
// in the case of [`Self::live_migrate`], but it is simple to handle them correctly
// here too. Locations are allowed to go Single->Stale and Multi->Single within the same generation.
(AttachedSingle, AttachedStale) => false,
(AttachedMulti, AttachedSingle) => false,
(lhs, rhs) => lhs != rhs,
};
!generations_match || mode_transition_requires_gen_inc
}
};
@@ -740,8 +749,6 @@ impl Reconciler {
self.location_config(&node, conf, None, false).await?;
}
failpoint_support::sleep_millis_async!("sleep-on-reconcile-epilogue");
Ok(())
}

View File

@@ -1,15 +1,5 @@
// @generated automatically by Diesel CLI.
diesel::table! {
metadata_health (tenant_id, shard_number, shard_count) {
tenant_id -> Varchar,
shard_number -> Int4,
shard_count -> Int4,
healthy -> Bool,
last_scrubbed_at -> Timestamptz,
}
}
diesel::table! {
nodes (node_id) {
node_id -> Int8,
@@ -36,4 +26,4 @@ diesel::table! {
}
}
diesel::allow_tables_to_appear_in_same_query!(metadata_health, nodes, tenant_shards,);
diesel::allow_tables_to_appear_in_same_query!(nodes, tenant_shards,);

View File

@@ -15,8 +15,7 @@ use crate::{
},
compute_hook::NotifyError,
id_lock_map::{trace_exclusive_lock, trace_shared_lock, IdLockMap, TracingExclusiveGuard},
metrics::LeadershipStatusGroup,
persistence::{AbortShardSplitStatus, MetadataHealthPersistence, TenantFilter},
persistence::{AbortShardSplitStatus, TenantFilter},
reconciler::{ReconcileError, ReconcileUnits},
scheduler::{MaySchedule, ScheduleContext, ScheduleMode},
tenant_shard::{
@@ -33,11 +32,11 @@ use futures::{stream::FuturesUnordered, StreamExt};
use itertools::Itertools;
use pageserver_api::{
controller_api::{
MetadataHealthRecord, MetadataHealthUpdateRequest, NodeAvailability, NodeRegisterRequest,
NodeSchedulingPolicy, PlacementPolicy, ShardSchedulingPolicy, TenantCreateRequest,
TenantCreateResponse, TenantCreateResponseShard, TenantDescribeResponse,
TenantDescribeResponseShard, TenantLocateResponse, TenantPolicyRequest,
TenantShardMigrateRequest, TenantShardMigrateResponse, UtilizationScore,
NodeAvailability, NodeRegisterRequest, NodeSchedulingPolicy, PlacementPolicy,
ShardSchedulingPolicy, TenantCreateRequest, TenantCreateResponse,
TenantCreateResponseShard, TenantDescribeResponse, TenantDescribeResponseShard,
TenantLocateResponse, TenantPolicyRequest, TenantShardMigrateRequest,
TenantShardMigrateResponse, UtilizationScore,
},
models::{SecondaryProgress, TenantConfigRequest, TopTenantShardsRequest},
};
@@ -82,7 +81,6 @@ use crate::{
ReconcilerWaiter, TenantShard,
},
};
use serde::{Deserialize, Serialize};
// For operations that should be quick, like attaching a new tenant
const SHORT_RECONCILE_TIMEOUT: Duration = Duration::from_secs(5);
@@ -133,24 +131,6 @@ enum NodeOperations {
Delete,
}
/// The leadership status for the storage controller process.
/// Allowed transitions are:
/// 1. Leader -> SteppedDown
/// 2. Candidate -> Leader
#[derive(Copy, Clone, strum_macros::Display, measured::FixedCardinalityLabel)]
#[strum(serialize_all = "snake_case")]
pub(crate) enum LeadershipStatus {
/// This is the steady state where the storage controller can produce
/// side effects in the cluster.
Leader,
/// We've been notified to step down by another candidate. No reconciliations
/// take place in this state.
SteppedDown,
/// Initial state for a new storage controller instance. Will attempt to assume leadership.
#[allow(unused)]
Candidate,
}
pub const RECONCILER_CONCURRENCY_DEFAULT: usize = 128;
// Depth of the channel used to enqueue shards for reconciliation when they can't do it immediately.
@@ -160,8 +140,6 @@ const MAX_DELAYED_RECONCILES: usize = 10000;
// Top level state available to all HTTP handlers
struct ServiceState {
leadership_status: LeadershipStatus,
tenants: BTreeMap<TenantShardId, TenantShard>,
nodes: Arc<HashMap<NodeId, Node>>,
@@ -224,21 +202,7 @@ impl ServiceState {
scheduler: Scheduler,
delayed_reconcile_rx: tokio::sync::mpsc::Receiver<TenantShardId>,
) -> Self {
let status = &crate::metrics::METRICS_REGISTRY
.metrics_group
.storage_controller_leadership_status;
status.set(
LeadershipStatusGroup {
status: LeadershipStatus::Leader,
},
1,
);
Self {
// TODO: Starting up as Leader is a transient state. Once we enable rolling
// upgrades on the k8s side, we should start up as Candidate.
leadership_status: LeadershipStatus::Leader,
tenants,
nodes: Arc::new(nodes),
scheduler,
@@ -256,37 +220,6 @@ impl ServiceState {
) {
(&mut self.nodes, &mut self.tenants, &mut self.scheduler)
}
fn get_leadership_status(&self) -> LeadershipStatus {
self.leadership_status
}
fn step_down(&mut self) {
self.leadership_status = LeadershipStatus::SteppedDown;
let status = &crate::metrics::METRICS_REGISTRY
.metrics_group
.storage_controller_leadership_status;
status.set(
LeadershipStatusGroup {
status: LeadershipStatus::SteppedDown,
},
1,
);
status.set(
LeadershipStatusGroup {
status: LeadershipStatus::Leader,
},
0,
);
status.set(
LeadershipStatusGroup {
status: LeadershipStatus::Candidate,
},
0,
);
}
}
#[derive(Clone)]
@@ -470,30 +403,11 @@ struct ShardUpdate {
generation: Option<Generation>,
}
enum StopReconciliationsReason {
ShuttingDown,
SteppingDown,
}
impl std::fmt::Display for StopReconciliationsReason {
fn fmt(&self, writer: &mut std::fmt::Formatter) -> std::fmt::Result {
let s = match self {
Self::ShuttingDown => "Shutting down",
Self::SteppingDown => "Stepping down",
};
write!(writer, "{}", s)
}
}
pub(crate) enum ReconcileResultRequest {
ReconcileResult(ReconcileResult),
Stop,
}
// TODO: move this into the storcon peer client when that gets added
#[derive(Serialize, Deserialize, Debug, Default)]
pub(crate) struct GlobalObservedState(HashMap<TenantShardId, ObservedState>);
impl Service {
pub fn get_config(&self) -> &Config {
&self.config
@@ -2944,7 +2858,8 @@ impl Service {
Error::ApiError(StatusCode::BAD_REQUEST, msg) => {
ApiError::BadRequest(anyhow::anyhow!("{node}: {msg}"))
}
// rest can be mapped
// rest can be mapped as usual
// FIXME: this converts some 500 to 409 which is not per openapi
other => passthrough_api_error(&node, other),
}
})
@@ -5689,22 +5604,17 @@ impl Service {
Ok(std::cmp::max(waiter_count, reconciles_spawned))
}
async fn stop_reconciliations(&self, reason: StopReconciliationsReason) {
pub async fn shutdown(&self) {
// Cancel all on-going reconciles and wait for them to exit the gate.
tracing::info!("{reason}: cancelling and waiting for in-flight reconciles");
tracing::info!("Shutting down: cancelling and waiting for in-flight reconciles");
self.reconcilers_cancel.cancel();
self.reconcilers_gate.close().await;
// Signal the background loop in [`Service::process_results`] to exit once
// it has proccessed the results from all the reconciles we cancelled earlier.
tracing::info!("{reason}: processing results from previously in-flight reconciles");
tracing::info!("Shutting down: processing results from previously in-flight reconciles");
self.result_tx.send(ReconcileResultRequest::Stop).ok();
self.result_tx.closed().await;
}
pub async fn shutdown(&self) {
self.stop_reconciliations(StopReconciliationsReason::ShuttingDown)
.await;
// Background tasks hold gate guards: this notifies them of the cancellation and
// waits for them all to complete.
@@ -6094,89 +6004,4 @@ impl Service {
Ok(())
}
/// Updates scrubber metadata health check results.
pub(crate) async fn metadata_health_update(
&self,
update_req: MetadataHealthUpdateRequest,
) -> Result<(), ApiError> {
let now = chrono::offset::Utc::now();
let (healthy_records, unhealthy_records) = {
let locked = self.inner.read().unwrap();
let healthy_records = update_req
.healthy_tenant_shards
.into_iter()
// Retain only health records associated with tenant shards managed by storage controller.
.filter(|tenant_shard_id| locked.tenants.contains_key(tenant_shard_id))
.map(|tenant_shard_id| MetadataHealthPersistence::new(tenant_shard_id, true, now))
.collect();
let unhealthy_records = update_req
.unhealthy_tenant_shards
.into_iter()
.filter(|tenant_shard_id| locked.tenants.contains_key(tenant_shard_id))
.map(|tenant_shard_id| MetadataHealthPersistence::new(tenant_shard_id, false, now))
.collect();
(healthy_records, unhealthy_records)
};
self.persistence
.update_metadata_health_records(healthy_records, unhealthy_records, now)
.await?;
Ok(())
}
/// Lists the tenant shards that has unhealthy metadata status.
pub(crate) async fn metadata_health_list_unhealthy(
&self,
) -> Result<Vec<TenantShardId>, ApiError> {
let result = self
.persistence
.list_unhealthy_metadata_health_records()
.await?
.iter()
.map(|p| p.get_tenant_shard_id().unwrap())
.collect();
Ok(result)
}
/// Lists the tenant shards that have not been scrubbed for some duration.
pub(crate) async fn metadata_health_list_outdated(
&self,
not_scrubbed_for: Duration,
) -> Result<Vec<MetadataHealthRecord>, ApiError> {
let earlier = chrono::offset::Utc::now() - not_scrubbed_for;
let result = self
.persistence
.list_outdated_metadata_health_records(earlier)
.await?
.into_iter()
.map(|record| record.into())
.collect();
Ok(result)
}
pub(crate) fn get_leadership_status(&self) -> LeadershipStatus {
self.inner.read().unwrap().get_leadership_status()
}
pub(crate) async fn step_down(&self) -> GlobalObservedState {
tracing::info!("Received step down request from peer");
self.inner.write().unwrap().step_down();
// TODO: would it make sense to have a time-out for this?
self.stop_reconciliations(StopReconciliationsReason::SteppingDown)
.await;
let mut global_observed = GlobalObservedState::default();
let locked = self.inner.read().unwrap();
for (tid, tenant_shard) in locked.tenants.iter() {
global_observed
.0
.insert(*tid, tenant_shard.observed.clone());
}
global_observed
}
}

View File

@@ -18,7 +18,7 @@ use pageserver_api::{
models::{LocationConfig, LocationConfigMode, TenantConfig},
shard::{ShardIdentity, TenantShardId},
};
use serde::{Deserialize, Serialize};
use serde::Serialize;
use tokio::task::JoinHandle;
use tokio_util::sync::CancellationToken;
use tracing::{instrument, Instrument};
@@ -284,7 +284,7 @@ impl Drop for IntentState {
}
}
#[derive(Default, Clone, Serialize, Deserialize, Debug)]
#[derive(Default, Clone, Serialize)]
pub(crate) struct ObservedState {
pub(crate) locations: HashMap<NodeId, ObservedStateLocation>,
}
@@ -298,7 +298,7 @@ pub(crate) struct ObservedState {
/// what it is (e.g. we failed partway through configuring it)
/// * Instance exists with conf==Some: this tells us what we last successfully configured on this node,
/// and that configuration will still be present unless something external interfered.
#[derive(Clone, Serialize, Deserialize, Debug)]
#[derive(Clone, Serialize)]
pub(crate) struct ObservedStateLocation {
/// If None, it means we do not know the status of this shard's location on this node, but
/// we know that we might have some state on this node.

View File

@@ -10,7 +10,6 @@ aws-smithy-async.workspace = true
either.workspace = true
tokio-rustls.workspace = true
anyhow.workspace = true
git-version.workspace = true
hex.workspace = true
humantime.workspace = true
thiserror.workspace = true

View File

@@ -40,11 +40,6 @@ impl TimelineAnalysis {
garbage_keys: Vec::new(),
}
}
/// Whether a timeline is healthy.
pub(crate) fn is_healthy(&self) -> bool {
self.errors.is_empty() && self.warnings.is_empty()
}
}
pub(crate) async fn branch_cleanup_and_check_errors(
@@ -92,8 +87,7 @@ pub(crate) async fn branch_cleanup_and_check_errors(
.push(format!("index_part.json version: {}", index_part.version()))
}
let mut newest_versions = IndexPart::KNOWN_VERSIONS.iter().rev().take(2);
if !newest_versions.any(|ip| ip == &index_part.version()) {
if &index_part.version() != IndexPart::KNOWN_VERSIONS.last().unwrap() {
info!(
"index_part.json version is not latest: {}",
index_part.version()

View File

@@ -1,13 +1,10 @@
use std::pin::pin;
use futures::{StreamExt, TryStreamExt};
use pageserver::tenant::storage_layer::LayerName;
use remote_storage::ListingMode;
use serde::{Deserialize, Serialize};
use crate::{
checks::parse_layer_object_name, init_remote_generic, metadata_stream::stream_tenants_generic,
stream_objects_with_retries, BucketConfig, NodeKind,
checks::parse_layer_object_name, init_remote, list_objects_with_retries,
metadata_stream::stream_tenants, BucketConfig, NodeKind,
};
#[derive(Serialize, Deserialize, Clone, Copy, PartialEq, Eq)]
@@ -50,38 +47,45 @@ pub async fn find_large_objects(
ignore_deltas: bool,
concurrency: usize,
) -> anyhow::Result<LargeObjectListing> {
let (remote_client, target) =
init_remote_generic(bucket_config.clone(), NodeKind::Pageserver).await?;
let tenants = pin!(stream_tenants_generic(&remote_client, &target));
let (s3_client, target) = init_remote(bucket_config.clone(), NodeKind::Pageserver).await?;
let tenants = std::pin::pin!(stream_tenants(&s3_client, &target));
let objects_stream = tenants.map_ok(|tenant_shard_id| {
let mut tenant_root = target.tenant_root(&tenant_shard_id);
let remote_client = remote_client.clone();
let s3_client = s3_client.clone();
async move {
let mut objects = Vec::new();
let mut total_objects_ctr = 0u64;
// We want the objects and not just common prefixes
tenant_root.delimiter.clear();
let mut objects_stream = pin!(stream_objects_with_retries(
&remote_client,
ListingMode::NoDelimiter,
&tenant_root
));
while let Some(listing) = objects_stream.next().await {
let listing = listing?;
for obj in listing.keys.iter().filter(|obj| min_size <= obj.size) {
let key = obj.key.to_string();
let mut continuation_token = None;
loop {
let fetch_response =
list_objects_with_retries(&s3_client, &tenant_root, continuation_token.clone())
.await?;
for obj in fetch_response.contents().iter().filter(|o| {
if let Some(obj_size) = o.size {
min_size as i64 <= obj_size
} else {
false
}
}) {
let key = obj.key().expect("couldn't get key").to_owned();
let kind = LargeObjectKind::from_key(&key);
if ignore_deltas && kind == LargeObjectKind::DeltaLayer {
continue;
}
objects.push(LargeObject {
key,
size: obj.size,
size: obj.size.unwrap() as u64,
kind,
})
}
total_objects_ctr += listing.keys.len() as u64;
total_objects_ctr += fetch_response.contents().len() as u64;
match fetch_response.next_continuation_token {
Some(new_token) => continuation_token = Some(new_token),
None => break,
}
}
Ok((tenant_shard_id, objects, total_objects_ctr))

View File

@@ -5,7 +5,6 @@
use std::{
collections::{HashMap, HashSet},
sync::Arc,
time::Duration,
};
use anyhow::Context;
@@ -19,8 +18,8 @@ use utils::id::TenantId;
use crate::{
cloud_admin_api::{CloudAdminApiClient, MaybeDeleted, ProjectData},
init_remote_generic, list_objects_with_retries_generic,
metadata_stream::{stream_tenant_timelines_generic, stream_tenants_generic},
init_remote, init_remote_generic,
metadata_stream::{stream_tenant_timelines, stream_tenants},
BucketConfig, ConsoleConfig, NodeKind, TenantShardTimelineId, TraversingDepth,
};
@@ -28,11 +27,6 @@ use crate::{
enum GarbageReason {
DeletedInConsole,
MissingInConsole,
// The remaining data relates to a known deletion issue, and we're sure that purging this
// will not delete any real data, for example https://github.com/neondatabase/neon/pull/7928 where
// there is nothing in a tenant path apart from a heatmap file.
KnownBug,
}
#[derive(Serialize, Deserialize, Debug)]
@@ -78,15 +72,6 @@ impl GarbageList {
}
}
/// If an entity has been identified as requiring purge due to a known bug, e.g.
/// a particular type of object left behind after an incomplete deletion.
fn append_buggy(&mut self, entity: GarbageEntity) {
self.items.push(GarbageItem {
entity,
reason: GarbageReason::KnownBug,
});
}
/// Return true if appended, false if not. False means the result was not garbage.
fn maybe_append<T>(&mut self, entity: GarbageEntity, result: Option<T>) -> bool
where
@@ -153,7 +138,7 @@ async fn find_garbage_inner(
node_kind: NodeKind,
) -> anyhow::Result<GarbageList> {
// Construct clients for S3 and for Console API
let (remote_client, target) = init_remote_generic(bucket_config.clone(), node_kind).await?;
let (s3_client, target) = init_remote(bucket_config.clone(), node_kind).await?;
let cloud_admin_api_client = Arc::new(CloudAdminApiClient::new(console_config));
// Build a set of console-known tenants, for quickly eliminating known-active tenants without having
@@ -179,7 +164,7 @@ async fn find_garbage_inner(
// Enumerate Tenants in S3, and check if each one exists in Console
tracing::info!("Finding all tenants in bucket {}...", bucket_config.bucket);
let tenants = stream_tenants_generic(&remote_client, &target);
let tenants = stream_tenants(&s3_client, &target);
let tenants_checked = tenants.map_ok(|t| {
let api_client = cloud_admin_api_client.clone();
let console_cache = console_cache.clone();
@@ -234,66 +219,6 @@ async fn find_garbage_inner(
assert!(project.tenant == tenant_shard_id.tenant_id);
}
// Special case: If it's missing in console, check for known bugs that would enable us to conclusively
// identify it as purge-able anyway
if console_result.is_none() {
let timelines =
stream_tenant_timelines_generic(&remote_client, &target, tenant_shard_id)
.await?
.collect::<Vec<_>>()
.await;
if timelines.is_empty() {
// No timelines, but a heatmap: the deletion bug where we deleted everything but heatmaps
let tenant_objects = list_objects_with_retries_generic(
&remote_client,
ListingMode::WithDelimiter,
&target.tenant_root(&tenant_shard_id),
)
.await?;
let object = tenant_objects.keys.first().unwrap();
if object.key.get_path().as_str().ends_with("heatmap-v1.json") {
tracing::info!("Tenant {tenant_shard_id}: is missing in console and is only a heatmap (known historic deletion bug)");
garbage.append_buggy(GarbageEntity::Tenant(tenant_shard_id));
continue;
} else {
tracing::info!("Tenant {tenant_shard_id} is missing in console and contains one object: {}", object.key);
}
} else {
// A console-unknown tenant with timelines: check if these timelines only contain initdb.tar.zst, from the initial
// rollout of WAL DR in which we never deleted these.
let mut any_non_initdb = false;
for timeline_r in timelines {
let timeline = timeline_r?;
let timeline_objects = list_objects_with_retries_generic(
&remote_client,
ListingMode::WithDelimiter,
&target.timeline_root(&timeline),
)
.await?;
if !timeline_objects.prefixes.is_empty() {
// Sub-paths? Unexpected
any_non_initdb = true;
} else {
let object = timeline_objects.keys.first().unwrap();
if object.key.get_path().as_str().ends_with("initdb.tar.zst") {
tracing::info!("Timeline {timeline} contains only initdb.tar.zst");
} else {
any_non_initdb = true;
}
}
}
if any_non_initdb {
tracing::info!("Tenant {tenant_shard_id}: is missing in console and contains timelines, one or more of which are more than just initdb");
} else {
tracing::info!("Tenant {tenant_shard_id}: is missing in console and contains only timelines that only contain initdb");
garbage.append_buggy(GarbageEntity::Tenant(tenant_shard_id));
continue;
}
}
}
if garbage.maybe_append(GarbageEntity::Tenant(tenant_shard_id), console_result) {
tracing::debug!("Tenant {tenant_shard_id} is garbage");
} else {
@@ -331,8 +256,7 @@ async fn find_garbage_inner(
// Construct a stream of all timelines within active tenants
let active_tenants = tokio_stream::iter(active_tenants.iter().map(Ok));
let timelines =
active_tenants.map_ok(|t| stream_tenant_timelines_generic(&remote_client, &target, *t));
let timelines = active_tenants.map_ok(|t| stream_tenant_timelines(&s3_client, &target, *t));
let timelines = timelines.try_buffer_unordered(S3_CONCURRENCY);
let timelines = timelines.try_flatten();
@@ -425,6 +349,9 @@ pub async fn get_timeline_objects(
tracing::debug!("Listing objects in timeline {ttid}");
let timeline_root = super::remote_timeline_path_id(&ttid);
// TODO: apply extra validation based on object modification time. Don't purge
// timelines whose index_part.json has been touched recently.
let list = s3_client
.list(
Some(&timeline_root),
@@ -495,7 +422,6 @@ impl DeletionProgressTracker {
pub async fn purge_garbage(
input_path: String,
mode: PurgeMode,
min_age: Duration,
dry_run: bool,
) -> anyhow::Result<()> {
let list_bytes = tokio::fs::read(&input_path).await?;
@@ -506,7 +432,7 @@ pub async fn purge_garbage(
input_path
);
let (remote_client, _target) =
let remote_client =
init_remote_generic(garbage_list.bucket_config.clone(), garbage_list.node_kind).await?;
assert_eq!(
@@ -533,7 +459,6 @@ pub async fn purge_garbage(
.filter(|i| match (&mode, &i.reason) {
(PurgeMode::DeletedAndMissing, _) => true,
(PurgeMode::DeletedOnly, GarbageReason::DeletedInConsole) => true,
(PurgeMode::DeletedOnly, GarbageReason::KnownBug) => true,
(PurgeMode::DeletedOnly, GarbageReason::MissingInConsole) => false,
});
@@ -562,37 +487,6 @@ pub async fn purge_garbage(
let mut progress_tracker = DeletionProgressTracker::default();
while let Some(result) = get_objects_results.next().await {
let mut object_list = result?;
// Extra safety check: even if a collection of objects is garbage, check max() of modification
// times before purging, so that if we incorrectly marked a live tenant as garbage then we would
// notice that its index has been written recently and would omit deleting it.
if object_list.is_empty() {
// Simplify subsequent code by ensuring list always has at least one item
// Usually, this only occurs if there is parallel deletions racing us, as there is no empty prefixes
continue;
}
let max_mtime = object_list.iter().map(|o| o.last_modified).max().unwrap();
let age = max_mtime.elapsed();
match age {
Err(_) => {
tracing::warn!("Bad last_modified time");
continue;
}
Ok(a) if a < min_age => {
// Failed age check. This doesn't mean we did something wrong: a tenant might really be garbage and recently
// written, but out of an abundance of caution we still don't purge it.
tracing::info!(
"Skipping tenant with young objects {}..{}",
object_list.first().as_ref().unwrap().key,
object_list.last().as_ref().unwrap().key
);
continue;
}
Ok(_) => {
// Passed age check
}
}
objects_to_delete.append(&mut object_list);
if objects_to_delete.len() >= MAX_KEYS_PER_DELETE {
do_delete(

View File

@@ -16,26 +16,22 @@ use std::sync::Arc;
use std::time::Duration;
use anyhow::{anyhow, Context};
use aws_config::retry::{RetryConfigBuilder, RetryMode};
use aws_sdk_s3::config::Region;
use aws_sdk_s3::error::DisplayErrorContext;
use aws_sdk_s3::Client;
use camino::{Utf8Path, Utf8PathBuf};
use clap::ValueEnum;
use futures::{Stream, StreamExt};
use pageserver::tenant::remote_timeline_client::{remote_tenant_path, remote_timeline_path};
use pageserver::tenant::TENANTS_SEGMENT_NAME;
use pageserver_api::shard::TenantShardId;
use remote_storage::{
GenericRemoteStorage, Listing, ListingMode, RemotePath, RemoteStorageConfig, RemoteStorageKind,
S3Config, DEFAULT_MAX_KEYS_PER_LIST_RESPONSE, DEFAULT_REMOTE_STORAGE_S3_CONCURRENCY_LIMIT,
GenericRemoteStorage, RemotePath, RemoteStorageConfig, RemoteStorageKind, S3Config,
DEFAULT_MAX_KEYS_PER_LIST_RESPONSE, DEFAULT_REMOTE_STORAGE_S3_CONCURRENCY_LIMIT,
};
use reqwest::Url;
use serde::{Deserialize, Serialize};
use storage_controller_client::control_api;
use tokio::io::AsyncReadExt;
use tokio_util::sync::CancellationToken;
use tracing::error;
use tracing_appender::non_blocking::WorkerGuard;
use tracing_subscriber::{fmt, prelude::*, EnvFilter};
@@ -257,12 +253,6 @@ pub struct ControllerClientConfig {
pub controller_jwt: String,
}
impl ControllerClientConfig {
pub fn build_client(self) -> control_api::Client {
control_api::Client::new(self.controller_api, Some(self.controller_jwt))
}
}
pub struct ConsoleConfig {
pub token: String,
pub base_url: Url,
@@ -315,15 +305,8 @@ pub fn init_logging(file_name: &str) -> Option<WorkerGuard> {
}
async fn init_s3_client(bucket_region: Region) -> Client {
let mut retry_config_builder = RetryConfigBuilder::new();
retry_config_builder
.set_max_attempts(Some(3))
.set_mode(Some(RetryMode::Adaptive));
let config = aws_config::defaults(aws_config::BehaviorVersion::v2024_03_28())
.region(bucket_region)
.retry_config(retry_config_builder.build())
.load()
.await;
Client::new(&config)
@@ -336,35 +319,27 @@ fn default_prefix_in_bucket(node_kind: NodeKind) -> &'static str {
}
}
fn make_root_target(
bucket_name: String,
prefix_in_bucket: String,
node_kind: NodeKind,
) -> RootTarget {
let s3_target = S3Target {
bucket_name,
prefix_in_bucket,
delimiter: "/".to_string(),
};
match node_kind {
NodeKind::Pageserver => RootTarget::Pageserver(s3_target),
NodeKind::Safekeeper => RootTarget::Safekeeper(s3_target),
}
}
async fn init_remote(
bucket_config: BucketConfig,
node_kind: NodeKind,
) -> anyhow::Result<(Arc<Client>, RootTarget)> {
let bucket_region = Region::new(bucket_config.region);
let delimiter = "/".to_string();
let s3_client = Arc::new(init_s3_client(bucket_region).await);
let default_prefix = default_prefix_in_bucket(node_kind).to_string();
let s3_root = make_root_target(
bucket_config.bucket,
bucket_config.prefix_in_bucket.unwrap_or(default_prefix),
node_kind,
);
let s3_root = match node_kind {
NodeKind::Pageserver => RootTarget::Pageserver(S3Target {
bucket_name: bucket_config.bucket,
prefix_in_bucket: bucket_config.prefix_in_bucket.unwrap_or(default_prefix),
delimiter,
}),
NodeKind::Safekeeper => RootTarget::Safekeeper(S3Target {
bucket_name: bucket_config.bucket,
prefix_in_bucket: bucket_config.prefix_in_bucket.unwrap_or(default_prefix),
delimiter,
}),
};
Ok((s3_client, s3_root))
}
@@ -372,12 +347,12 @@ async fn init_remote(
async fn init_remote_generic(
bucket_config: BucketConfig,
node_kind: NodeKind,
) -> anyhow::Result<(GenericRemoteStorage, RootTarget)> {
) -> anyhow::Result<GenericRemoteStorage> {
let endpoint = env::var("AWS_ENDPOINT_URL").ok();
let default_prefix = default_prefix_in_bucket(node_kind).to_string();
let prefix_in_bucket = Some(bucket_config.prefix_in_bucket.unwrap_or(default_prefix));
let storage = S3Config {
bucket_name: bucket_config.bucket.clone(),
bucket_name: bucket_config.bucket,
bucket_region: bucket_config.region,
prefix_in_bucket,
endpoint,
@@ -391,13 +366,7 @@ async fn init_remote_generic(
storage: RemoteStorageKind::AwsS3(storage),
timeout: RemoteStorageConfig::DEFAULT_TIMEOUT,
};
// We already pass the prefix to the remote client above
let prefix_in_root_target = String::new();
let s3_root = make_root_target(bucket_config.bucket, prefix_in_root_target, node_kind);
let client = GenericRemoteStorage::from_config(&storage_config).await?;
Ok((client, s3_root))
GenericRemoteStorage::from_config(&storage_config).await
}
async fn list_objects_with_retries(
@@ -435,84 +404,6 @@ async fn list_objects_with_retries(
Err(anyhow!("unreachable unless MAX_RETRIES==0"))
}
/// Listing possibly large amounts of keys in a streaming fashion.
fn stream_objects_with_retries<'a>(
storage_client: &'a GenericRemoteStorage,
listing_mode: ListingMode,
s3_target: &'a S3Target,
) -> impl Stream<Item = Result<Listing, anyhow::Error>> + 'a {
async_stream::stream! {
let mut trial = 0;
let cancel = CancellationToken::new();
let prefix_str = &s3_target
.prefix_in_bucket
.strip_prefix("/")
.unwrap_or(&s3_target.prefix_in_bucket);
let prefix = RemotePath::from_string(prefix_str)?;
let mut list_stream =
storage_client.list_streaming(Some(&prefix), listing_mode, None, &cancel);
while let Some(res) = list_stream.next().await {
if let Err(err) = res {
let yield_err = if err.is_permanent() {
true
} else {
let backoff_time = 1 << trial.max(5);
tokio::time::sleep(Duration::from_secs(backoff_time)).await;
trial += 1;
trial == MAX_RETRIES - 1
};
if yield_err {
yield Err(err)
.with_context(|| format!("Failed to list objects {MAX_RETRIES} times"));
break;
}
} else {
trial = 0;
yield res.map_err(anyhow::Error::from);
}
}
}
}
/// If you want to list a bounded amount of prefixes or keys. For larger numbers of keys/prefixes,
/// use [`stream_objects_with_retries`] instead.
async fn list_objects_with_retries_generic(
remote_client: &GenericRemoteStorage,
listing_mode: ListingMode,
s3_target: &S3Target,
) -> anyhow::Result<Listing> {
let cancel = CancellationToken::new();
let prefix_str = &s3_target
.prefix_in_bucket
.strip_prefix("/")
.unwrap_or(&s3_target.prefix_in_bucket);
let prefix = RemotePath::from_string(prefix_str)?;
for trial in 0..MAX_RETRIES {
match remote_client
.list(Some(&prefix), listing_mode, None, &cancel)
.await
{
Ok(response) => return Ok(response),
Err(e) => {
if trial == MAX_RETRIES - 1 {
return Err(e)
.with_context(|| format!("Failed to list objects {MAX_RETRIES} times"));
}
error!(
"list_objects_v2 query failed: bucket_name={}, prefix={}, delimiter={}, error={}",
s3_target.bucket_name,
s3_target.prefix_in_bucket,
s3_target.delimiter,
DisplayErrorContext(e),
);
let backoff_time = 1 << trial.max(5);
tokio::time::sleep(Duration::from_secs(backoff_time)).await;
}
}
}
panic!("MAX_RETRIES is not allowed to be 0");
}
async fn download_object_with_retries(
s3_client: &Client,
bucket_name: &str,

View File

@@ -1,8 +1,7 @@
use anyhow::{anyhow, bail};
use camino::Utf8PathBuf;
use pageserver_api::controller_api::{MetadataHealthUpdateRequest, MetadataHealthUpdateResponse};
use pageserver_api::shard::TenantShardId;
use reqwest::{Method, Url};
use reqwest::Url;
use storage_scrubber::garbage::{find_garbage, purge_garbage, PurgeMode};
use storage_scrubber::pageserver_physical_gc::GcMode;
use storage_scrubber::scan_pageserver_metadata::scan_metadata;
@@ -17,11 +16,6 @@ use storage_scrubber::{
use clap::{Parser, Subcommand};
use utils::id::TenantId;
use utils::{project_build_tag, project_git_version};
project_git_version!(GIT_VERSION);
project_build_tag!(BUILD_TAG);
#[derive(Parser)]
#[command(author, version, about, long_about = None)]
#[command(arg_required_else_help(true))]
@@ -56,8 +50,6 @@ enum Command {
input_path: String,
#[arg(short, long, default_value_t = PurgeMode::DeletedOnly)]
mode: PurgeMode,
#[arg(long = "min-age")]
min_age: humantime::Duration,
},
#[command(verbatim_doc_comment)]
ScanMetadata {
@@ -67,8 +59,6 @@ enum Command {
json: bool,
#[arg(long = "tenant-id", num_args = 0..)]
tenant_ids: Vec<TenantShardId>,
#[arg(long = "post", default_value_t = false)]
post_to_storage_controller: bool,
#[arg(long, default_value = None)]
/// For safekeeper node_kind only, points to db with debug dump
dump_db_connstr: Option<String>,
@@ -106,8 +96,6 @@ enum Command {
async fn main() -> anyhow::Result<()> {
let cli = Cli::parse();
tracing::info!("version: {}, build_tag {}", GIT_VERSION, BUILD_TAG);
let bucket_config = BucketConfig::from_env()?;
let command_log_name = match &cli.command {
@@ -126,20 +114,11 @@ async fn main() -> anyhow::Result<()> {
chrono::Utc::now().format("%Y_%m_%d__%H_%M_%S")
));
let controller_client_conf = cli.controller_api.map(|controller_api| {
ControllerClientConfig {
controller_api,
// Default to no key: this is a convenience when working in a development environment
controller_jwt: cli.controller_jwt.unwrap_or("".to_owned()),
}
});
match cli.command {
Command::ScanMetadata {
json,
tenant_ids,
node_kind,
post_to_storage_controller,
dump_db_connstr,
dump_db_table,
} => {
@@ -178,9 +157,6 @@ async fn main() -> anyhow::Result<()> {
}
Ok(())
} else {
if controller_client_conf.is_none() && post_to_storage_controller {
return Err(anyhow!("Posting pageserver scan health status to storage controller requires `--controller-api` and `--controller-jwt` to run"));
}
match scan_metadata(bucket_config.clone(), tenant_ids).await {
Err(e) => {
tracing::error!("Failed: {e}");
@@ -192,21 +168,6 @@ async fn main() -> anyhow::Result<()> {
} else {
println!("{}", summary.summary_string());
}
if post_to_storage_controller {
if let Some(conf) = controller_client_conf {
let controller_client = conf.build_client();
let body = summary.build_health_update_request();
controller_client
.dispatch::<MetadataHealthUpdateRequest, MetadataHealthUpdateResponse>(
Method::POST,
"control/v1/metadata_health/update".to_string(),
Some(body),
)
.await?;
}
}
if summary.is_fatal() {
Err(anyhow::anyhow!("Fatal scrub errors detected"))
} else if summary.is_empty() {
@@ -235,11 +196,9 @@ async fn main() -> anyhow::Result<()> {
let console_config = ConsoleConfig::from_env()?;
find_garbage(bucket_config, console_config, depth, node_kind, output_path).await
}
Command::PurgeGarbage {
input_path,
mode,
min_age,
} => purge_garbage(input_path, mode, min_age.into(), !cli.delete).await,
Command::PurgeGarbage { input_path, mode } => {
purge_garbage(input_path, mode, !cli.delete).await
}
Command::TenantSnapshot {
tenant_id,
output_path,
@@ -254,6 +213,14 @@ async fn main() -> anyhow::Result<()> {
min_age,
mode,
} => {
let controller_client_conf = cli.controller_api.map(|controller_api| {
ControllerClientConfig {
controller_api,
// Default to no key: this is a convenience when working in a development environment
controller_jwt: cli.controller_jwt.unwrap_or("".to_owned()),
}
});
match (&controller_client_conf, mode) {
(Some(_), _) => {
// Any mode may run when controller API is set

View File

@@ -1,41 +1,12 @@
use std::str::FromStr;
use anyhow::{anyhow, Context};
use anyhow::Context;
use async_stream::{stream, try_stream};
use aws_sdk_s3::{types::ObjectIdentifier, Client};
use futures::StreamExt;
use remote_storage::{GenericRemoteStorage, ListingMode};
use tokio_stream::Stream;
use crate::{
list_objects_with_retries, stream_objects_with_retries, RootTarget, S3Target,
TenantShardTimelineId,
};
use crate::{list_objects_with_retries, RootTarget, S3Target, TenantShardTimelineId};
use pageserver_api::shard::TenantShardId;
use utils::id::{TenantId, TimelineId};
/// Given a remote storage and a target, output a stream of TenantIds discovered via listing prefixes
pub fn stream_tenants_generic<'a>(
remote_client: &'a GenericRemoteStorage,
target: &'a RootTarget,
) -> impl Stream<Item = anyhow::Result<TenantShardId>> + 'a {
try_stream! {
let tenants_target = target.tenants_root();
let mut tenants_stream =
std::pin::pin!(stream_objects_with_retries(remote_client, ListingMode::WithDelimiter, &tenants_target));
while let Some(chunk) = tenants_stream.next().await {
let chunk = chunk?;
let entry_ids = chunk.prefixes.iter()
.map(|prefix| prefix.get_path().file_name().ok_or_else(|| anyhow!("no final component in path '{prefix}'")));
for dir_name_res in entry_ids {
let dir_name = dir_name_res?;
let id = TenantShardId::from_str(dir_name)?;
yield id;
}
}
}
}
/// Given an S3 bucket, output a stream of TenantIds discovered via ListObjectsv2
pub fn stream_tenants<'a>(
s3_client: &'a Client,
@@ -189,63 +160,6 @@ pub async fn stream_tenant_timelines<'a>(
})
}
/// Given a `TenantShardId`, output a stream of the timelines within that tenant, discovered
/// using a listing. The listing is done before the stream is built, so that this
/// function can be used to generate concurrency on a stream using buffer_unordered.
pub async fn stream_tenant_timelines_generic<'a>(
remote_client: &'a GenericRemoteStorage,
target: &'a RootTarget,
tenant: TenantShardId,
) -> anyhow::Result<impl Stream<Item = Result<TenantShardTimelineId, anyhow::Error>> + 'a> {
let mut timeline_ids: Vec<Result<TimelineId, anyhow::Error>> = Vec::new();
let timelines_target = target.timelines_root(&tenant);
let mut objects_stream = std::pin::pin!(stream_objects_with_retries(
remote_client,
ListingMode::WithDelimiter,
&timelines_target
));
loop {
tracing::debug!("Listing in {tenant}");
let fetch_response = match objects_stream.next().await {
None => break,
Some(Err(e)) => {
timeline_ids.push(Err(e));
break;
}
Some(Ok(r)) => r,
};
let new_entry_ids = fetch_response
.prefixes
.iter()
.filter_map(|prefix| -> Option<&str> {
prefix
.get_path()
.as_str()
.strip_prefix(&timelines_target.prefix_in_bucket)?
.strip_suffix('/')
})
.map(|entry_id_str| {
entry_id_str
.parse::<TimelineId>()
.with_context(|| format!("Incorrect entry id str: {entry_id_str}"))
});
for i in new_entry_ids {
timeline_ids.push(i);
}
}
tracing::debug!("Yielding for {}", tenant);
Ok(stream! {
for i in timeline_ids {
let id = i?;
yield Ok(TenantShardTimelineId::new(tenant, id));
}
})
}
pub(crate) fn stream_listing<'a>(
s3_client: &'a Client,
target: &'a S3Target,

View File

@@ -567,7 +567,13 @@ pub async fn pageserver_physical_gc(
}
// Execute cross-shard GC, using the accumulator's full view of all the shards built in the per-shard GC
let Some(controller_client) = controller_client_conf.map(|c| c.build_client()) else {
let Some(controller_client) = controller_client_conf.as_ref().map(|c| {
let ControllerClientConfig {
controller_api,
controller_jwt,
} = c;
control_api::Client::new(controller_api.clone(), Some(controller_jwt.clone()))
}) else {
tracing::info!("Skipping ancestor layer GC, because no `--controller-api` was specified");
return Ok(summary);
};

View File

@@ -9,13 +9,12 @@ use crate::{init_remote, BucketConfig, NodeKind, RootTarget, TenantShardTimeline
use aws_sdk_s3::Client;
use futures_util::{StreamExt, TryStreamExt};
use pageserver::tenant::remote_timeline_client::remote_layer_path;
use pageserver_api::controller_api::MetadataHealthUpdateRequest;
use pageserver_api::shard::TenantShardId;
use serde::Serialize;
use utils::id::TenantId;
use utils::shard::ShardCount;
#[derive(Serialize, Default)]
#[derive(Serialize)]
pub struct MetadataSummary {
tenant_count: usize,
timeline_count: usize,
@@ -24,16 +23,19 @@ pub struct MetadataSummary {
with_warnings: HashSet<TenantShardTimelineId>,
with_orphans: HashSet<TenantShardTimelineId>,
indices_by_version: HashMap<usize, usize>,
#[serde(skip)]
pub(crate) healthy_tenant_shards: HashSet<TenantShardId>,
#[serde(skip)]
pub(crate) unhealthy_tenant_shards: HashSet<TenantShardId>,
}
impl MetadataSummary {
fn new() -> Self {
Self::default()
Self {
tenant_count: 0,
timeline_count: 0,
timeline_shard_count: 0,
with_errors: HashSet::new(),
with_warnings: HashSet::new(),
with_orphans: HashSet::new(),
indices_by_version: HashMap::new(),
}
}
fn update_data(&mut self, data: &S3TimelineBlobData) {
@@ -52,13 +54,6 @@ impl MetadataSummary {
}
fn update_analysis(&mut self, id: &TenantShardTimelineId, analysis: &TimelineAnalysis) {
if analysis.is_healthy() {
self.healthy_tenant_shards.insert(id.tenant_shard_id);
} else {
self.healthy_tenant_shards.remove(&id.tenant_shard_id);
self.unhealthy_tenant_shards.insert(id.tenant_shard_id);
}
if !analysis.errors.is_empty() {
self.with_errors.insert(*id);
}
@@ -106,13 +101,6 @@ Index versions: {version_summary}
pub fn is_empty(&self) -> bool {
self.timeline_shard_count == 0
}
pub fn build_health_update_request(&self) -> MetadataHealthUpdateRequest {
MetadataHealthUpdateRequest {
healthy_tenant_shards: self.healthy_tenant_shards.clone(),
unhealthy_tenant_shards: self.unhealthy_tenant_shards.clone(),
}
}
}
/// Scan the pageserver metadata in an S3 bucket, reporting errors and statistics.

View File

@@ -81,7 +81,7 @@ should go.
Useful parameters and commands:
`--preserve-database-files` to preserve pageserver (layer) and safekeer (segment) timeline files on disk
after running a test suite. Such files might be large, so removed by default; but might be useful for debugging or creation of svg images with layer file contents. If `NeonEnvBuilder#preserve_database_files` set to `True` for a particular test, the whole `repo` directory will be attached to Allure report (thus uploaded to S3) as `everything.tar.zst` for this test.
after running a test suite. Such files might be large, so removed by default; but might be useful for debugging or creation of svg images with layer file contents.
Let stdout, stderr and `INFO` log messages go to the terminal instead of capturing them:
`./scripts/pytest -s --log-cli-level=INFO ...`

View File

@@ -222,8 +222,6 @@ class NeonBenchmarker:
function by the zenbenchmark fixture
"""
PROPERTY_PREFIX = "neon_benchmarker_"
def __init__(self, property_recorder: Callable[[str, object], None]):
# property recorder here is a pytest fixture provided by junitxml module
# https://docs.pytest.org/en/6.2.x/reference.html#pytest.junitxml.record_property
@@ -240,7 +238,7 @@ class NeonBenchmarker:
Record a benchmark result.
"""
# just to namespace the value
name = f"{self.PROPERTY_PREFIX}_{metric_name}"
name = f"neon_benchmarker_{metric_name}"
self.property_recorder(
name,
{
@@ -251,18 +249,6 @@ class NeonBenchmarker:
},
)
@classmethod
def records(
cls, user_properties: list[tuple[str, object]]
) -> Iterator[tuple[str, dict[str, object]]]:
"""
Yield all records related to benchmarks
"""
for property_name, recorded_property in user_properties:
if property_name.startswith(cls.PROPERTY_PREFIX):
assert isinstance(recorded_property, dict)
yield recorded_property["name"], recorded_property
@contextmanager
def record_duration(self, metric_name: str) -> Iterator[None]:
"""
@@ -439,11 +425,10 @@ def zenbenchmark(
yield benchmarker
results = {}
for _, recorded_property in NeonBenchmarker.records(request.node.user_properties):
for _, recorded_property in request.node.user_properties:
name = recorded_property["name"]
value = str(recorded_property["value"])
unit = str(recorded_property["unit"]).strip()
if unit != "":
if (unit := recorded_property["unit"].strip()) != "":
value += f" {unit}"
results[name] = value
@@ -492,7 +477,7 @@ def pytest_terminal_summary(
for test_report in terminalreporter.stats.get("passed", []):
result_entry = []
for _, recorded_property in NeonBenchmarker.records(test_report.user_properties):
for _, recorded_property in test_report.user_properties:
if not is_header_printed:
terminalreporter.section("Benchmark results", "-")
is_header_printed = True

View File

@@ -150,7 +150,6 @@ PAGESERVER_PER_TENANT_METRICS: Tuple[str, ...] = (
"pageserver_pitr_history_size",
"pageserver_layer_bytes",
"pageserver_layer_count",
"pageserver_visible_physical_size",
"pageserver_storage_operations_seconds_count_total",
"pageserver_storage_operations_seconds_sum_total",
"pageserver_evictions_total",

View File

@@ -449,7 +449,6 @@ class TokenScope(str, Enum):
GENERATIONS_API = "generations_api"
SAFEKEEPER_DATA = "safekeeperdata"
TENANT = "tenant"
SCRUBBER = "scrubber"
class NeonEnvBuilder:
@@ -543,6 +542,21 @@ class NeonEnvBuilder:
f"Overriding pageserver default compaction algorithm to {self.pageserver_default_tenant_config_compaction_algorithm}"
)
self.pageserver_get_vectored_impl: Optional[str] = None
if os.getenv("PAGESERVER_GET_VECTORED_IMPL", "") == "vectored":
self.pageserver_get_vectored_impl = "vectored"
log.debug('Overriding pageserver get_vectored_impl config to "vectored"')
self.pageserver_get_impl: Optional[str] = None
if os.getenv("PAGESERVER_GET_IMPL", "") == "vectored":
self.pageserver_get_impl = "vectored"
log.debug('Overriding pageserver get_impl config to "vectored"')
self.pageserver_validate_vectored_get: Optional[bool] = None
if (validate := os.getenv("PAGESERVER_VALIDATE_VEC_GET")) is not None:
self.pageserver_validate_vectored_get = bool(validate)
log.debug(f'Overriding pageserver validate_vectored_get config to "{validate}"')
self.pageserver_aux_file_policy = pageserver_aux_file_policy
self.safekeeper_extra_opts = safekeeper_extra_opts
@@ -1143,6 +1157,12 @@ class NeonEnv:
}
if self.pageserver_virtual_file_io_engine is not None:
ps_cfg["virtual_file_io_engine"] = self.pageserver_virtual_file_io_engine
if config.pageserver_get_vectored_impl is not None:
ps_cfg["get_vectored_impl"] = config.pageserver_get_vectored_impl
if config.pageserver_get_impl is not None:
ps_cfg["get_impl"] = config.pageserver_get_impl
if config.pageserver_validate_vectored_get is not None:
ps_cfg["validate_vectored_get"] = config.pageserver_validate_vectored_get
if config.pageserver_default_tenant_config_compaction_algorithm is not None:
tenant_config = ps_cfg.setdefault("tenant_config", {})
tenant_config[
@@ -1395,7 +1415,7 @@ def _shared_simple_env(
pg_distrib_dir=pg_distrib_dir,
pg_version=pg_version,
run_id=run_id,
preserve_database_files=cast(bool, pytestconfig.getoption("--preserve-database-files")),
preserve_database_files=pytestconfig.getoption("--preserve-database-files"),
test_name=request.node.name,
test_output_dir=test_output_dir,
pageserver_virtual_file_io_engine=pageserver_virtual_file_io_engine,
@@ -1442,7 +1462,6 @@ def neon_env_builder(
pageserver_virtual_file_io_engine: str,
pageserver_default_tenant_config_compaction_algorithm: Optional[Dict[str, Any]],
pageserver_aux_file_policy: Optional[AuxFileStore],
record_property: Callable[[str, object], None],
) -> Iterator[NeonEnvBuilder]:
"""
Fixture to create a Neon environment for test.
@@ -1471,7 +1490,7 @@ def neon_env_builder(
pg_version=pg_version,
broker=default_broker,
run_id=run_id,
preserve_database_files=cast(bool, pytestconfig.getoption("--preserve-database-files")),
preserve_database_files=pytestconfig.getoption("--preserve-database-files"),
pageserver_virtual_file_io_engine=pageserver_virtual_file_io_engine,
test_name=request.node.name,
test_output_dir=test_output_dir,
@@ -1480,9 +1499,6 @@ def neon_env_builder(
pageserver_default_tenant_config_compaction_algorithm=pageserver_default_tenant_config_compaction_algorithm,
) as builder:
yield builder
# Propogate `preserve_database_files` to make it possible to use in other fixtures,
# like `test_output_dir` fixture for attaching all database files to Allure report.
record_property("preserve_database_files", builder.preserve_database_files)
@dataclass
@@ -2587,62 +2603,6 @@ class NeonStorageController(MetricsGetter, LogUtils):
time.sleep(backoff)
def metadata_health_update(self, healthy: List[TenantShardId], unhealthy: List[TenantShardId]):
body: Dict[str, Any] = {
"healthy_tenant_shards": [str(t) for t in healthy],
"unhealthy_tenant_shards": [str(t) for t in unhealthy],
}
self.request(
"POST",
f"{self.env.storage_controller_api}/control/v1/metadata_health/update",
json=body,
headers=self.headers(TokenScope.SCRUBBER),
)
def metadata_health_list_unhealthy(self):
response = self.request(
"GET",
f"{self.env.storage_controller_api}/control/v1/metadata_health/unhealthy",
headers=self.headers(TokenScope.ADMIN),
)
return response.json()
def metadata_health_list_outdated(self, duration: str):
body: Dict[str, Any] = {"not_scrubbed_for": duration}
response = self.request(
"POST",
f"{self.env.storage_controller_api}/control/v1/metadata_health/outdated",
json=body,
headers=self.headers(TokenScope.ADMIN),
)
return response.json()
def metadata_health_is_healthy(self, outdated_duration: str = "1h") -> bool:
"""Metadata is healthy if there is no unhealthy or outdated health records."""
unhealthy = self.metadata_health_list_unhealthy()
outdated = self.metadata_health_list_outdated(outdated_duration)
healthy = (
len(unhealthy["unhealthy_tenant_shards"]) == 0 and len(outdated["health_records"]) == 0
)
if not healthy:
log.info(f"{unhealthy=}, {outdated=}")
return healthy
def step_down(self):
log.info("Asking storage controller to step down")
response = self.request(
"PUT",
f"{self.env.storage_controller_api}/control/v1/step_down",
headers=self.headers(TokenScope.ADMIN),
)
response.raise_for_status()
return response.json()
def configure_failpoints(self, config_strings: Tuple[str, str] | List[Tuple[str, str]]):
if isinstance(config_strings, tuple):
pairs = [config_strings]
@@ -4401,11 +4361,10 @@ class StorageScrubber:
assert stdout is not None
return stdout
def scan_metadata(self, post_to_storage_controller: bool = False) -> Any:
args = ["scan-metadata", "--node-kind", "pageserver", "--json"]
if post_to_storage_controller:
args.append("--post")
stdout = self.scrubber_cli(args, timeout=30)
def scan_metadata(self) -> Any:
stdout = self.scrubber_cli(
["scan-metadata", "--node-kind", "pageserver", "--json"], timeout=30
)
try:
return json.loads(stdout)
@@ -4529,23 +4488,7 @@ def test_output_dir(
yield test_dir
# Allure artifacts creation might involve the creation of `.tar.zst` archives,
# which aren't going to be used if Allure results collection is not enabled
# (i.e. --alluredir is not set).
# Skip `allure_attach_from_dir` in this case
if not request.config.getoption("--alluredir"):
return
preserve_database_files = False
for k, v in request.node.user_properties:
# NB: the neon_env_builder fixture uses this fixture (test_output_dir).
# So, neon_env_builder's cleanup runs before here.
# The cleanup propagates NeonEnvBuilder.preserve_database_files into this user property.
if k == "preserve_database_files":
assert isinstance(v, bool)
preserve_database_files = v
allure_attach_from_dir(test_dir, preserve_database_files)
allure_attach_from_dir(test_dir)
class FileAndThreadLock:

View File

@@ -240,18 +240,9 @@ ATTACHMENT_NAME_REGEX: re.Pattern = re.compile( # type: ignore[type-arg]
)
def allure_attach_from_dir(dir: Path, preserve_database_files: bool = False):
def allure_attach_from_dir(dir: Path):
"""Attach all non-empty files from `dir` that matches `ATTACHMENT_NAME_REGEX` to Allure report"""
if preserve_database_files:
zst_file = dir.with_suffix(".tar.zst")
with zst_file.open("wb") as zst:
cctx = zstandard.ZstdCompressor()
with cctx.stream_writer(zst) as compressor:
with tarfile.open(fileobj=compressor, mode="w") as tar:
tar.add(dir, arcname="")
allure.attach.file(zst_file, "everything.tar.zst", "application/zstd", "tar.zst")
for attachment in Path(dir).glob("**/*"):
if ATTACHMENT_NAME_REGEX.fullmatch(attachment.name) and attachment.stat().st_size > 0:
name = str(attachment.relative_to(dir))

View File

@@ -1,88 +0,0 @@
"""
Test the logical replication in Neon with the different consumers
"""
import hashlib
import time
import clickhouse_connect
import psycopg2
import pytest
from fixtures.log_helper import log
from fixtures.neon_fixtures import RemotePostgres
from fixtures.utils import wait_until
def query_clickhouse(
client,
query: str,
digest: str,
) -> None:
"""
Run the query on the client
return answer if successful, raise an exception otherwise
"""
log.debug("Query: %s", query)
res = client.query(query)
log.debug(res.result_rows)
m = hashlib.sha1()
m.update(repr(tuple(res.result_rows)).encode())
hash_res = m.hexdigest()
log.debug("Hash: %s", hash_res)
if hash_res == digest:
return
raise ValueError("Hash mismatch")
@pytest.mark.remote_cluster
def test_clickhouse(remote_pg: RemotePostgres):
"""
Test the logical replication having ClickHouse as a client
"""
conn_options = remote_pg.conn_options()
for _ in range(5):
try:
conn = psycopg2.connect(remote_pg.connstr())
except psycopg2.OperationalError as perr:
log.debug(perr)
time.sleep(1)
else:
break
raise TimeoutError
cur = conn.cursor()
cur.execute("DROP TABLE IF EXISTS table1")
cur.execute("CREATE TABLE table1 (id integer primary key, column1 varchar(10));")
cur.execute("INSERT INTO table1 (id, column1) VALUES (1, 'abc'), (2, 'def');")
conn.commit()
client = clickhouse_connect.get_client(host="clickhouse")
client.command("SET allow_experimental_database_materialized_postgresql=1")
client.command(
"CREATE DATABASE db1_postgres ENGINE = "
f"MaterializedPostgreSQL('{conn_options['host']}', "
f"'{conn_options['dbname']}', "
f"'{conn_options['user']}', '{conn_options['password']}') "
"SETTINGS materialized_postgresql_tables_list = 'table1';"
)
wait_until(
120,
0.5,
lambda: query_clickhouse(
client,
"select * from db1_postgres.table1 order by 1",
"ee600d8f7cd05bd0b169fa81f44300a9dd10085a",
),
)
cur.execute("INSERT INTO table1 (id, column1) VALUES (3, 'ghi'), (4, 'jkl');")
conn.commit()
wait_until(
120,
0.5,
lambda: query_clickhouse(
client,
"select * from db1_postgres.table1 order by 1",
"9eba2daaf7e4d7d27ac849525f68b562ab53947d",
),
)
log.debug("Sleeping before final checking if Neon is still alive")
time.sleep(3)
cur.execute("SELECT 1")

View File

@@ -6,8 +6,21 @@ from fixtures.log_helper import log
from fixtures.neon_fixtures import NeonEnvBuilder
def gc_feedback_impl(neon_env_builder: NeonEnvBuilder, zenbenchmark: NeonBenchmarker, mode: str):
assert mode == "normal" or mode == "with_snapshots"
@pytest.mark.timeout(10000)
def test_gc_feedback(neon_env_builder: NeonEnvBuilder, zenbenchmark: NeonBenchmarker):
"""
Test that GC is able to collect all old layers even if them are forming
"stairs" and there are not three delta layers since last image layer.
Information about image layers needed to collect old layers should
be propagated by GC to compaction task which should take in in account
when make a decision which new image layers needs to be created.
NB: this test demonstrates the problem. The source tree contained the
`gc_feedback` mechanism for about 9 months, but, there were problems
with it and it wasn't enabled at runtime.
This PR removed the code: https://github.com/neondatabase/neon/pull/6863
"""
env = neon_env_builder.init_start()
client = env.pageserver.http_client()
@@ -61,9 +74,6 @@ def gc_feedback_impl(neon_env_builder: NeonEnvBuilder, zenbenchmark: NeonBenchma
physical_size = client.timeline_detail(tenant_id, timeline_id)["current_physical_size"]
log.info(f"Physical storage size {physical_size}")
if mode == "with_snapshots":
if step == n_steps / 2:
env.neon_cli.create_branch("child")
max_num_of_deltas_above_image = 0
max_total_num_of_deltas = 0
@@ -139,37 +149,3 @@ def gc_feedback_impl(neon_env_builder: NeonEnvBuilder, zenbenchmark: NeonBenchma
log.info(f"Writing layer map to {layer_map_path}")
with layer_map_path.open("w") as f:
f.write(json.dumps(client.timeline_layer_map_info(tenant_id, timeline_id)))
@pytest.mark.timeout(10000)
def test_gc_feedback(neon_env_builder: NeonEnvBuilder, zenbenchmark: NeonBenchmarker):
"""
Test that GC is able to collect all old layers even if them are forming
"stairs" and there are not three delta layers since last image layer.
Information about image layers needed to collect old layers should
be propagated by GC to compaction task which should take in in account
when make a decision which new image layers needs to be created.
NB: this test demonstrates the problem. The source tree contained the
`gc_feedback` mechanism for about 9 months, but, there were problems
with it and it wasn't enabled at runtime.
This PR removed the code: https://github.com/neondatabase/neon/pull/6863
And the bottom-most GC-compaction epic resolves the problem.
https://github.com/neondatabase/neon/issues/8002
"""
gc_feedback_impl(neon_env_builder, zenbenchmark, "normal")
@pytest.mark.timeout(10000)
def test_gc_feedback_with_snapshots(
neon_env_builder: NeonEnvBuilder, zenbenchmark: NeonBenchmarker
):
"""
Compared with `test_gc_feedback`, we create a branch without written data (=snapshot) in the middle
of the benchmark, and the bottom-most compaction should collect as much garbage as possible below the GC
horizon. Ideally, there should be images (in an image layer) covering the full range at the branch point,
and images covering the full key range (in a delta layer) at the GC horizon.
"""
gc_feedback_impl(neon_env_builder, zenbenchmark, "with_snapshots")

View File

@@ -389,11 +389,6 @@ def test_duplicate_creation(neon_env_builder: NeonEnvBuilder):
repeat_result = ps_http.timeline_create(
env.pg_version, env.initial_tenant, success_timeline, timeout=60
)
# remote_consistent_lsn_visible will be published only after we've
# confirmed the generation, which is not part of what we await during
# timeline creation (uploads). mask it out here to avoid flakyness.
del success_result["remote_consistent_lsn_visible"]
del repeat_result["remote_consistent_lsn_visible"]
assert repeat_result == success_result
finally:
env.pageserver.stop(immediate=True)

View File

@@ -17,17 +17,22 @@ from fixtures.pg_version import PgVersion
# Test restarting page server, while safekeeper and compute node keep
# running.
def test_local_corruption(neon_env_builder: NeonEnvBuilder):
if neon_env_builder.pageserver_get_impl == "vectored":
reconstruct_function_name = "get_values_reconstruct_data"
else:
reconstruct_function_name = "get_value_reconstruct_data"
env = neon_env_builder.init_start()
env.pageserver.allowed_errors.extend(
[
".*get_values_reconstruct_data for layer .*",
f".*{reconstruct_function_name} for layer .*",
".*could not find data for key.*",
".*is not active. Current state: Broken.*",
".*will not become active. Current state: Broken.*",
".*failed to load metadata.*",
".*load failed.*load local timeline.*",
".*: layer load failed, assuming permanent failure:.*",
".*layer loading failed permanently: load layer: .*",
]
)
@@ -74,7 +79,7 @@ def test_local_corruption(neon_env_builder: NeonEnvBuilder):
# (We don't check layer file contents on startup, when loading the timeline)
#
# This will change when we implement checksums for layers
with pytest.raises(Exception, match="get_values_reconstruct_data for layer ") as err:
with pytest.raises(Exception, match=f"{reconstruct_function_name} for layer ") as err:
pg1.start()
log.info(
f"As expected, compute startup failed for timeline {tenant1}/{timeline1} with corrupt layers: {err}"

Some files were not shown because too many files have changed in this diff Show More