mirror of
https://github.com/neondatabase/neon.git
synced 2026-05-30 19:40:39 +00:00
Compare commits
95 Commits
fcdm/oidc
...
joonas/nth
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
dcfd92b6db | ||
|
|
f58636ffdd | ||
|
|
f3ac5bcbe1 | ||
|
|
eb3711b881 | ||
|
|
c864166b32 | ||
|
|
ce9b5ae7bf | ||
|
|
cd2cbe0691 | ||
|
|
7f241bd379 | ||
|
|
ff52901028 | ||
|
|
bb377a3544 | ||
|
|
5ece7af497 | ||
|
|
2be3027fa5 | ||
|
|
14a0517c7f | ||
|
|
dcff25c293 | ||
|
|
f80c37b733 | ||
|
|
b9d0b26cea | ||
|
|
c2c28f211b | ||
|
|
1ebcb1c45b | ||
|
|
66d750ec20 | ||
|
|
ba3a6645e7 | ||
|
|
8885a8c482 | ||
|
|
c8880b69fb | ||
|
|
274b2a611b | ||
|
|
a7153bf9b2 | ||
|
|
8a4236a441 | ||
|
|
7ec927e43b | ||
|
|
22470ef444 | ||
|
|
8248cbb45b | ||
|
|
4dd805b68a | ||
|
|
f582675452 | ||
|
|
48069f68bb | ||
|
|
8f52139913 | ||
|
|
fc4d80bbf2 | ||
|
|
dc83a5a978 | ||
|
|
f4fb08d869 | ||
|
|
75b326faf4 | ||
|
|
c23cd5c149 | ||
|
|
f4cd9fe40b | ||
|
|
43af9484c0 | ||
|
|
842bd4c2db | ||
|
|
ada9a46dca | ||
|
|
742fcac7b9 | ||
|
|
55aeeb5765 | ||
|
|
89426570d3 | ||
|
|
7f767ca18e | ||
|
|
1348dbf0f1 | ||
|
|
a179283f86 | ||
|
|
deb86c1ea1 | ||
|
|
dfdf40916f | ||
|
|
c6d8015fe9 | ||
|
|
b2233d557b | ||
|
|
ce2552ba67 | ||
|
|
f4d773bb89 | ||
|
|
6f28263428 | ||
|
|
1e380ea5af | ||
|
|
8258385301 | ||
|
|
6a8f00dea0 | ||
|
|
44cdb9fb58 | ||
|
|
cdfaf0700f | ||
|
|
881e1ad056 | ||
|
|
bb3d70e24d | ||
|
|
c6c560e4c8 | ||
|
|
8dd332aed5 | ||
|
|
5c03a17eb8 | ||
|
|
402d66778e | ||
|
|
39e2bc932f | ||
|
|
5fc034fa7f | ||
|
|
f9b12def0b | ||
|
|
5d0071447c | ||
|
|
d9eba3f8c3 | ||
|
|
409e2eff9e | ||
|
|
e6e3b9a716 | ||
|
|
7f31a3f671 | ||
|
|
9971ae3d24 | ||
|
|
48a2a20de3 | ||
|
|
29ef8f15ce | ||
|
|
5e45dd3f86 | ||
|
|
5fced442d7 | ||
|
|
4222610233 | ||
|
|
92deb0dfd7 | ||
|
|
46ca6f17c5 | ||
|
|
14869abb77 | ||
|
|
5330fd9366 | ||
|
|
6c5b3b7812 | ||
|
|
849fe0f191 | ||
|
|
f564b66f21 | ||
|
|
2e58ccee78 | ||
|
|
f398ab0264 | ||
|
|
f23ee2ccdb | ||
|
|
0ad31bb7fb | ||
|
|
86f26d0918 | ||
|
|
4a562dff2e | ||
|
|
f9185b42a9 | ||
|
|
d4f30daa81 | ||
|
|
97ab53e826 |
2
.github/actionlint.yml
vendored
2
.github/actionlint.yml
vendored
@@ -8,8 +8,6 @@ self-hosted-runner:
|
||||
- small-arm64
|
||||
- us-east-2
|
||||
config-variables:
|
||||
- BENCHMARK_PROJECT_ID_PUB
|
||||
- BENCHMARK_PROJECT_ID_SUB
|
||||
- REMOTE_STORAGE_AZURE_CONTAINER
|
||||
- REMOTE_STORAGE_AZURE_REGION
|
||||
- SLACK_UPCOMING_RELEASE_CHANNEL_ID
|
||||
|
||||
12
.github/actions/neon-project-create/action.yml
vendored
12
.github/actions/neon-project-create/action.yml
vendored
@@ -14,8 +14,11 @@ inputs:
|
||||
api_host:
|
||||
description: 'Neon API host'
|
||||
default: console-stage.neon.build
|
||||
provisioner:
|
||||
description: 'k8s-pod or k8s-neonvm'
|
||||
default: 'k8s-pod'
|
||||
compute_units:
|
||||
description: '[Min, Max] compute units'
|
||||
description: '[Min, Max] compute units; Min and Max are used for k8s-neonvm with autoscaling, for k8s-pod values Min and Max should be equal'
|
||||
default: '[1, 1]'
|
||||
|
||||
outputs:
|
||||
@@ -34,6 +37,10 @@ runs:
|
||||
# A shell without `set -x` to not to expose password/dsn in logs
|
||||
shell: bash -euo pipefail {0}
|
||||
run: |
|
||||
if [ "${PROVISIONER}" == "k8s-pod" ] && [ "${MIN_CU}" != "${MAX_CU}" ]; then
|
||||
echo >&2 "For k8s-pod provisioner MIN_CU should be equal to MAX_CU"
|
||||
fi
|
||||
|
||||
project=$(curl \
|
||||
"https://${API_HOST}/api/v2/projects" \
|
||||
--fail \
|
||||
@@ -45,7 +52,7 @@ runs:
|
||||
\"name\": \"Created by actions/neon-project-create; GITHUB_RUN_ID=${GITHUB_RUN_ID}\",
|
||||
\"pg_version\": ${POSTGRES_VERSION},
|
||||
\"region_id\": \"${REGION_ID}\",
|
||||
\"provisioner\": \"k8s-neonvm\",
|
||||
\"provisioner\": \"${PROVISIONER}\",
|
||||
\"autoscaling_limit_min_cu\": ${MIN_CU},
|
||||
\"autoscaling_limit_max_cu\": ${MAX_CU},
|
||||
\"settings\": { }
|
||||
@@ -68,5 +75,6 @@ runs:
|
||||
API_KEY: ${{ inputs.api_key }}
|
||||
REGION_ID: ${{ inputs.region_id }}
|
||||
POSTGRES_VERSION: ${{ inputs.postgres_version }}
|
||||
PROVISIONER: ${{ inputs.provisioner }}
|
||||
MIN_CU: ${{ fromJSON(inputs.compute_units)[0] }}
|
||||
MAX_CU: ${{ fromJSON(inputs.compute_units)[1] }}
|
||||
|
||||
@@ -131,8 +131,8 @@ runs:
|
||||
exit 1
|
||||
fi
|
||||
if [[ "${{ inputs.run_in_parallel }}" == "true" ]]; then
|
||||
# -n sets the number of parallel processes that pytest-xdist will run
|
||||
EXTRA_PARAMS="-n12 $EXTRA_PARAMS"
|
||||
# -n16 uses sixteen processes to run tests via pytest-xdist
|
||||
EXTRA_PARAMS="-n16 $EXTRA_PARAMS"
|
||||
|
||||
# --dist=loadgroup points tests marked with @pytest.mark.xdist_group
|
||||
# to the same worker to make @pytest.mark.order work with xdist
|
||||
|
||||
11
.github/workflows/_build-and-test-locally.yml
vendored
11
.github/workflows/_build-and-test-locally.yml
vendored
@@ -19,10 +19,6 @@ on:
|
||||
description: 'debug or release'
|
||||
required: true
|
||||
type: string
|
||||
pg-versions:
|
||||
description: 'a json array of postgres versions to run regression tests on'
|
||||
required: true
|
||||
type: string
|
||||
|
||||
defaults:
|
||||
run:
|
||||
@@ -258,7 +254,7 @@ jobs:
|
||||
strategy:
|
||||
fail-fast: false
|
||||
matrix:
|
||||
pg_version: ${{ fromJson(inputs.pg-versions) }}
|
||||
pg_version: [ v14, v15, v16 ]
|
||||
steps:
|
||||
- uses: actions/checkout@v4
|
||||
with:
|
||||
@@ -282,11 +278,14 @@ jobs:
|
||||
CHECK_ONDISK_DATA_COMPATIBILITY: nonempty
|
||||
BUILD_TAG: ${{ inputs.build-tag }}
|
||||
PAGESERVER_VIRTUAL_FILE_IO_ENGINE: tokio-epoll-uring
|
||||
PAGESERVER_GET_VECTORED_IMPL: vectored
|
||||
PAGESERVER_GET_IMPL: vectored
|
||||
PAGESERVER_VALIDATE_VEC_GET: true
|
||||
|
||||
# Temporary disable this step until we figure out why it's so flaky
|
||||
# Ref https://github.com/neondatabase/neon/issues/4540
|
||||
- name: Merge and upload coverage data
|
||||
if: |
|
||||
false &&
|
||||
inputs.build-type == 'debug' && matrix.pg_version == 'v16'
|
||||
inputs.build-type == 'debug' && matrix.pg_version == 'v14'
|
||||
uses: ./.github/actions/save-coverage-data
|
||||
|
||||
107
.github/workflows/benchmarking.yml
vendored
107
.github/workflows/benchmarking.yml
vendored
@@ -56,23 +56,18 @@ concurrency:
|
||||
jobs:
|
||||
bench:
|
||||
if: ${{ github.event.inputs.run_only_pgvector_tests == 'false' || github.event.inputs.run_only_pgvector_tests == null }}
|
||||
permissions:
|
||||
contents: read
|
||||
id-token: write # Required for OIDC authentication.
|
||||
strategy:
|
||||
fail-fast: false
|
||||
matrix:
|
||||
include:
|
||||
# - DEFAULT_PG_VERSION: 16
|
||||
# PLATFORM: "neon-staging"
|
||||
# region_id: ${{ github.event.inputs.region_id || 'aws-us-east-2' }}
|
||||
# RUNNER: [ self-hosted, us-east-2, x64 ]
|
||||
# IMAGE: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:pinned
|
||||
- DEFAULT_PG_VERSION: 16
|
||||
PLATFORM: "neon-staging"
|
||||
region_id: ${{ github.event.inputs.region_id || 'aws-us-east-2' }}
|
||||
provisioner: 'k8s-pod'
|
||||
- DEFAULT_PG_VERSION: 16
|
||||
PLATFORM: "azure-staging"
|
||||
region_id: 'azure-eastus2'
|
||||
RUNNER: [ self-hosted, eastus2, azure ]
|
||||
IMAGE: neondatabase/build-tools:pinned
|
||||
provisioner: 'k8s-neonvm'
|
||||
env:
|
||||
TEST_PG_BENCH_DURATIONS_MATRIX: "300"
|
||||
TEST_PG_BENCH_SCALES_MATRIX: "10,100"
|
||||
@@ -83,20 +78,14 @@ jobs:
|
||||
SAVE_PERF_REPORT: ${{ github.event.inputs.save_perf_report || ( github.ref_name == 'main' ) }}
|
||||
PLATFORM: ${{ matrix.PLATFORM }}
|
||||
|
||||
runs-on: ${{ matrix.RUNNER }}
|
||||
runs-on: [ self-hosted, us-east-2, x64 ]
|
||||
container:
|
||||
image: ${{ matrix.IMAGE }}
|
||||
image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:pinned
|
||||
options: --init
|
||||
|
||||
steps:
|
||||
- uses: actions/checkout@v4
|
||||
|
||||
- name: Configure AWS credentials
|
||||
uses: aws-actions/configure-aws-credentials@v4
|
||||
with:
|
||||
aws-region: us-east-2
|
||||
role-to-assume: ${{ secrets.DEV_AWS_OIDC_ROLE_ARN }}
|
||||
- run: aws sts get-caller-identity
|
||||
- name: Download Neon artifact
|
||||
uses: ./.github/actions/download
|
||||
with:
|
||||
@@ -111,6 +100,7 @@ jobs:
|
||||
region_id: ${{ matrix.region_id }}
|
||||
postgres_version: ${{ env.DEFAULT_PG_VERSION }}
|
||||
api_key: ${{ secrets.NEON_STAGING_API_KEY }}
|
||||
provisioner: ${{ matrix.provisioner }}
|
||||
|
||||
- name: Run benchmark
|
||||
uses: ./.github/actions/run-python-test-set
|
||||
@@ -160,7 +150,7 @@ jobs:
|
||||
if: ${{ github.event.inputs.run_only_pgvector_tests == 'false' || github.event.inputs.run_only_pgvector_tests == null }}
|
||||
env:
|
||||
POSTGRES_DISTRIB_DIR: /tmp/neon/pg_install
|
||||
DEFAULT_PG_VERSION: 16
|
||||
DEFAULT_PG_VERSION: 14
|
||||
TEST_OUTPUT: /tmp/test_output
|
||||
BUILD_TYPE: remote
|
||||
SAVE_PERF_REPORT: ${{ github.event.inputs.save_perf_report || ( github.ref_name == 'main' ) }}
|
||||
@@ -181,7 +171,7 @@ jobs:
|
||||
path: /tmp/neon/
|
||||
prefix: latest
|
||||
|
||||
- name: Run Logical Replication benchmarks
|
||||
- name: Run benchmark
|
||||
uses: ./.github/actions/run-python-test-set
|
||||
with:
|
||||
build_type: ${{ env.BUILD_TYPE }}
|
||||
@@ -189,15 +179,12 @@ jobs:
|
||||
run_in_parallel: false
|
||||
save_perf_report: ${{ env.SAVE_PERF_REPORT }}
|
||||
extra_params: -m remote_cluster --timeout 5400
|
||||
pg_version: ${{ env.DEFAULT_PG_VERSION }}
|
||||
env:
|
||||
VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}"
|
||||
PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}"
|
||||
NEON_API_KEY: ${{ secrets.NEON_STAGING_API_KEY }}
|
||||
BENCHMARK_PROJECT_ID_PUB: ${{ vars.BENCHMARK_PROJECT_ID_PUB }}
|
||||
BENCHMARK_PROJECT_ID_SUB: ${{ vars.BENCHMARK_PROJECT_ID_SUB }}
|
||||
|
||||
- name: Run Physical Replication benchmarks
|
||||
- name: Run benchmark
|
||||
uses: ./.github/actions/run-python-test-set
|
||||
with:
|
||||
build_type: ${{ env.BUILD_TYPE }}
|
||||
@@ -229,11 +216,11 @@ jobs:
|
||||
# Create matrices for the benchmarking jobs, so we run benchmarks on rds only once a week (on Saturday)
|
||||
#
|
||||
# Available platforms:
|
||||
# - neonvm-captest-new: Freshly created project (1 CU)
|
||||
# - neonvm-captest-freetier: Use freetier-sized compute (0.25 CU)
|
||||
# - neon-captest-new: Freshly created project (1 CU)
|
||||
# - neon-captest-freetier: Use freetier-sized compute (0.25 CU)
|
||||
# - neonvm-captest-azure-new: Freshly created project (1 CU) in azure region
|
||||
# - neonvm-captest-azure-freetier: Use freetier-sized compute (0.25 CU) in azure region
|
||||
# - neonvm-captest-reuse: Reusing existing project
|
||||
# - neon-captest-reuse: Reusing existing project
|
||||
# - rds-aurora: Aurora Postgres Serverless v2 with autoscaling from 0.5 to 2 ACUs
|
||||
# - rds-postgres: RDS Postgres db.m5.large instance (2 vCPU, 8 GiB) with gp3 EBS storage
|
||||
env:
|
||||
@@ -250,8 +237,6 @@ jobs:
|
||||
id: pgbench-compare-matrix
|
||||
run: |
|
||||
region_id_default=${{ env.DEFAULT_REGION_ID }}
|
||||
runner_default="[ self-hosted, us-east-2, x64 ]"
|
||||
image_default="369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:pinned"
|
||||
matrix='{
|
||||
"pg_version" : [
|
||||
16
|
||||
@@ -260,24 +245,23 @@ jobs:
|
||||
"'"$region_id_default"'"
|
||||
],
|
||||
"platform": [
|
||||
"neonvm-captest-new",
|
||||
"neonvm-captest-reuse",
|
||||
"neon-captest-new",
|
||||
"neon-captest-reuse",
|
||||
"neonvm-captest-new"
|
||||
],
|
||||
"db_size": [ "10gb" ],
|
||||
"runner": [ "'"$runner_default"'" ],
|
||||
"image": [ "'"$image_default"'" ],
|
||||
"include": [{ "pg_version": 16, "region_id": "'"$region_id_default"'", "platform": "neonvm-captest-freetier", "db_size": "3gb" ,"runner": "'"$runner_default"'", "image": "'"$image_default"'" },
|
||||
{ "pg_version": 16, "region_id": "'"$region_id_default"'", "platform": "neonvm-captest-new", "db_size": "10gb","runner": "'"$runner_default"'", "image": "'"$image_default"'" },
|
||||
{ "pg_version": 16, "region_id": "'"$region_id_default"'", "platform": "neonvm-captest-new", "db_size": "50gb","runner": "'"$runner_default"'", "image": "'"$image_default"'" },
|
||||
{ "pg_version": 16, "region_id": "azure-eastus2", "platform": "neonvm-azure-captest-freetier", "db_size": "3gb" ,"runner": "[ self-hosted, eastus2, azure ]", "image": "neondatabase/build-tools:pinned" },
|
||||
{ "pg_version": 16, "region_id": "azure-eastus2", "platform": "neonvm-azure-captest-new", "db_size": "10gb","runner": "[ self-hosted, eastus2, azure ]", "image": "neondatabase/build-tools:pinned" },
|
||||
{ "pg_version": 16, "region_id": "azure-eastus2", "platform": "neonvm-azure-captest-new", "db_size": "50gb","runner": "[ self-hosted, eastus2, azure ]", "image": "neondatabase/build-tools:pinned" },
|
||||
{ "pg_version": 16, "region_id": "'"$region_id_default"'", "platform": "neonvm-captest-sharding-reuse", "db_size": "50gb",,"runner": "'"$runner_default"'", "image": "'"$image_default"'" }]
|
||||
"include": [{ "pg_version": 16, "region_id": "'"$region_id_default"'", "platform": "neon-captest-freetier", "db_size": "3gb" },
|
||||
{ "pg_version": 16, "region_id": "'"$region_id_default"'", "platform": "neon-captest-new", "db_size": "50gb" },
|
||||
{ "pg_version": 16, "region_id": "'"$region_id_default"'", "platform": "neonvm-captest-freetier", "db_size": "3gb" },
|
||||
{ "pg_version": 16, "region_id": "'"$region_id_default"'", "platform": "neonvm-captest-new", "db_size": "50gb" },
|
||||
{ "pg_version": 16, "region_id": "azure-eastus2", "platform": "neonvm-azure-captest-freetier", "db_size": "3gb" },
|
||||
{ "pg_version": 16, "region_id": "azure-eastus2", "platform": "neonvm-azure-captest-new", "db_size": "10gb" },
|
||||
{ "pg_version": 16, "region_id": "azure-eastus2", "platform": "neonvm-azure-captest-new", "db_size": "50gb" },
|
||||
{ "pg_version": 16, "region_id": "'"$region_id_default"'", "platform": "neonvm-captest-sharding-reuse", "db_size": "50gb" }]
|
||||
}'
|
||||
|
||||
if [ "$(date +%A)" = "Saturday" ]; then
|
||||
matrix=$(echo "$matrix" | jq '.include += [{ "pg_version": 14, "region_id": "'"$region_id_default"'", "platform": "rds-postgres", "db_size": "10gb","runner": "'"$runner_default"'", "image": "'"$image_default"'" }]')
|
||||
matrix=$(echo "$matrix" | jq '.include += [{ "pg_version": 14, "region_id": "'"$region_id_default"'", "platform": "rds-postgres", "db_size": "10gb"}]')
|
||||
fi
|
||||
|
||||
echo "matrix=$(echo "$matrix" | jq --compact-output '.')" >> $GITHUB_OUTPUT
|
||||
@@ -287,7 +271,7 @@ jobs:
|
||||
run: |
|
||||
matrix='{
|
||||
"platform": [
|
||||
"neonvm-captest-reuse"
|
||||
"neon-captest-reuse"
|
||||
]
|
||||
}'
|
||||
|
||||
@@ -303,7 +287,7 @@ jobs:
|
||||
run: |
|
||||
matrix='{
|
||||
"platform": [
|
||||
"neonvm-captest-reuse"
|
||||
"neon-captest-reuse"
|
||||
],
|
||||
"scale": [
|
||||
"10"
|
||||
@@ -335,9 +319,9 @@ jobs:
|
||||
SAVE_PERF_REPORT: ${{ github.event.inputs.save_perf_report || ( github.ref_name == 'main' ) }}
|
||||
PLATFORM: ${{ matrix.platform }}
|
||||
|
||||
runs-on: ${{ matrix.runner }}
|
||||
runs-on: [ self-hosted, us-east-2, x64 ]
|
||||
container:
|
||||
image: ${{ matrix.image }}
|
||||
image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:pinned
|
||||
options: --init
|
||||
|
||||
# Increase timeout to 8h, default timeout is 6h
|
||||
@@ -354,7 +338,7 @@ jobs:
|
||||
prefix: latest
|
||||
|
||||
- name: Create Neon Project
|
||||
if: contains(fromJson('["neonvm-captest-new", "neonvm-captest-freetier", "neonvm-azure-captest-freetier", "neonvm-azure-captest-new"]'), matrix.platform)
|
||||
if: contains(fromJson('["neon-captest-new", "neon-captest-freetier", "neonvm-captest-new", "neonvm-captest-freetier", "neonvm-azure-captest-freetier", "neonvm-azure-captest-new"]'), matrix.platform)
|
||||
id: create-neon-project
|
||||
uses: ./.github/actions/neon-project-create
|
||||
with:
|
||||
@@ -362,18 +346,19 @@ jobs:
|
||||
postgres_version: ${{ env.DEFAULT_PG_VERSION }}
|
||||
api_key: ${{ secrets.NEON_STAGING_API_KEY }}
|
||||
compute_units: ${{ (contains(matrix.platform, 'captest-freetier') && '[0.25, 0.25]') || '[1, 1]' }}
|
||||
provisioner: ${{ (contains(matrix.platform, 'neonvm-') && 'k8s-neonvm') || 'k8s-pod' }}
|
||||
|
||||
- name: Set up Connection String
|
||||
id: set-up-connstr
|
||||
run: |
|
||||
case "${PLATFORM}" in
|
||||
neonvm-captest-reuse)
|
||||
neon-captest-reuse)
|
||||
CONNSTR=${{ secrets.BENCHMARK_CAPTEST_CONNSTR }}
|
||||
;;
|
||||
neonvm-captest-sharding-reuse)
|
||||
CONNSTR=${{ secrets.BENCHMARK_CAPTEST_SHARDING_CONNSTR }}
|
||||
;;
|
||||
neonvm-captest-new | neonvm-captest-freetier | neonvm-azure-captest-new | neonvm-azure-captest-freetier)
|
||||
neon-captest-new | neon-captest-freetier | neonvm-captest-new | neonvm-captest-freetier | neonvm-azure-captest-new | neonvm-azure-captest-freetier)
|
||||
CONNSTR=${{ steps.create-neon-project.outputs.dsn }}
|
||||
;;
|
||||
rds-aurora)
|
||||
@@ -457,13 +442,9 @@ jobs:
|
||||
fail-fast: false
|
||||
matrix:
|
||||
include:
|
||||
- PLATFORM: "neonvm-captest-pgvector"
|
||||
RUNNER: [ self-hosted, us-east-2, x64 ]
|
||||
IMAGE: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:pinned
|
||||
- PLATFORM: "neon-captest-pgvector"
|
||||
- PLATFORM: "azure-captest-pgvector"
|
||||
RUNNER: [ self-hosted, eastus2, azure ]
|
||||
IMAGE: neondatabase/build-tools:pinned
|
||||
|
||||
|
||||
env:
|
||||
TEST_PG_BENCH_DURATIONS_MATRIX: "15m"
|
||||
TEST_PG_BENCH_SCALES_MATRIX: "1"
|
||||
@@ -475,9 +456,9 @@ jobs:
|
||||
SAVE_PERF_REPORT: ${{ github.event.inputs.save_perf_report || ( github.ref_name == 'main' ) }}
|
||||
PLATFORM: ${{ matrix.PLATFORM }}
|
||||
|
||||
runs-on: ${{ matrix.RUNNER }}
|
||||
runs-on: [ self-hosted, us-east-2, x64 ]
|
||||
container:
|
||||
image: ${{ matrix.IMAGE }}
|
||||
image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:pinned
|
||||
options: --init
|
||||
|
||||
steps:
|
||||
@@ -505,7 +486,7 @@ jobs:
|
||||
id: set-up-connstr
|
||||
run: |
|
||||
case "${PLATFORM}" in
|
||||
neonvm-captest-pgvector)
|
||||
neon-captest-pgvector)
|
||||
CONNSTR=${{ secrets.BENCHMARK_PGVECTOR_CONNSTR }}
|
||||
;;
|
||||
azure-captest-pgvector)
|
||||
@@ -604,7 +585,7 @@ jobs:
|
||||
id: set-up-connstr
|
||||
run: |
|
||||
case "${PLATFORM}" in
|
||||
neonvm-captest-reuse)
|
||||
neon-captest-reuse)
|
||||
CONNSTR=${{ secrets.BENCHMARK_CAPTEST_CLICKBENCH_10M_CONNSTR }}
|
||||
;;
|
||||
rds-aurora)
|
||||
@@ -614,7 +595,7 @@ jobs:
|
||||
CONNSTR=${{ secrets.BENCHMARK_RDS_POSTGRES_CLICKBENCH_10M_CONNSTR }}
|
||||
;;
|
||||
*)
|
||||
echo >&2 "Unknown PLATFORM=${PLATFORM}. Allowed only 'neonvm-captest-reuse', 'rds-aurora', or 'rds-postgres'"
|
||||
echo >&2 "Unknown PLATFORM=${PLATFORM}. Allowed only 'neon-captest-reuse', 'rds-aurora', or 'rds-postgres'"
|
||||
exit 1
|
||||
;;
|
||||
esac
|
||||
@@ -691,7 +672,7 @@ jobs:
|
||||
- name: Get Connstring Secret Name
|
||||
run: |
|
||||
case "${PLATFORM}" in
|
||||
neonvm-captest-reuse)
|
||||
neon-captest-reuse)
|
||||
ENV_PLATFORM=CAPTEST_TPCH
|
||||
;;
|
||||
rds-aurora)
|
||||
@@ -701,7 +682,7 @@ jobs:
|
||||
ENV_PLATFORM=RDS_AURORA_TPCH
|
||||
;;
|
||||
*)
|
||||
echo >&2 "Unknown PLATFORM=${PLATFORM}. Allowed only 'neonvm-captest-reuse', 'rds-aurora', or 'rds-postgres'"
|
||||
echo >&2 "Unknown PLATFORM=${PLATFORM}. Allowed only 'neon-captest-reuse', 'rds-aurora', or 'rds-postgres'"
|
||||
exit 1
|
||||
;;
|
||||
esac
|
||||
@@ -778,7 +759,7 @@ jobs:
|
||||
id: set-up-connstr
|
||||
run: |
|
||||
case "${PLATFORM}" in
|
||||
neonvm-captest-reuse)
|
||||
neon-captest-reuse)
|
||||
CONNSTR=${{ secrets.BENCHMARK_USER_EXAMPLE_CAPTEST_CONNSTR }}
|
||||
;;
|
||||
rds-aurora)
|
||||
@@ -788,7 +769,7 @@ jobs:
|
||||
CONNSTR=${{ secrets.BENCHMARK_USER_EXAMPLE_RDS_POSTGRES_CONNSTR }}
|
||||
;;
|
||||
*)
|
||||
echo >&2 "Unknown PLATFORM=${PLATFORM}. Allowed only 'neonvm-captest-reuse', 'rds-aurora', or 'rds-postgres'"
|
||||
echo >&2 "Unknown PLATFORM=${PLATFORM}. Allowed only 'neon-captest-reuse', 'rds-aurora', or 'rds-postgres'"
|
||||
exit 1
|
||||
;;
|
||||
esac
|
||||
|
||||
35
.github/workflows/build_and_test.yml
vendored
35
.github/workflows/build_and_test.yml
vendored
@@ -203,8 +203,7 @@ jobs:
|
||||
fail-fast: false
|
||||
matrix:
|
||||
arch: [ x64 ]
|
||||
# Do not build or run tests in debug for release branches
|
||||
build-type: ${{ fromJson((startsWith(github.ref_name, 'release' && github.event_name == 'push')) && '["release"]' || '["debug", "release"]') }}
|
||||
build-type: [ debug, release ]
|
||||
include:
|
||||
- build-type: release
|
||||
arch: arm64
|
||||
@@ -214,8 +213,6 @@ jobs:
|
||||
build-tools-image: ${{ needs.build-build-tools-image.outputs.image }}
|
||||
build-tag: ${{ needs.tag.outputs.build-tag }}
|
||||
build-type: ${{ matrix.build-type }}
|
||||
# Run tests on all Postgres versions in release builds and only on the latest version in debug builds
|
||||
pg-versions: ${{ matrix.build-type == 'release' && '["v14", "v15", "v16"]' || '["v16"]' }}
|
||||
secrets: inherit
|
||||
|
||||
# Keep `benchmarks` job outside of `build-and-test-locally` workflow to make job failures non-blocking
|
||||
@@ -289,6 +286,9 @@ jobs:
|
||||
PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}"
|
||||
TEST_RESULT_CONNSTR: "${{ secrets.REGRESS_TEST_RESULT_CONNSTR_NEW }}"
|
||||
PAGESERVER_VIRTUAL_FILE_IO_ENGINE: tokio-epoll-uring
|
||||
PAGESERVER_GET_VECTORED_IMPL: vectored
|
||||
PAGESERVER_GET_IMPL: vectored
|
||||
PAGESERVER_VALIDATE_VEC_GET: false
|
||||
# XXX: no coverage data handling here, since benchmarks are run on release builds,
|
||||
# while coverage is currently collected for the debug ones
|
||||
|
||||
@@ -309,7 +309,7 @@ jobs:
|
||||
SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }}
|
||||
|
||||
create-test-report:
|
||||
needs: [ check-permissions, build-and-test-locally, coverage-report, build-build-tools-image, benchmarks ]
|
||||
needs: [ check-permissions, build-and-test-locally, coverage-report, build-build-tools-image ]
|
||||
if: ${{ !cancelled() && contains(fromJSON('["skipped", "success"]'), needs.check-permissions.result) }}
|
||||
outputs:
|
||||
report-url: ${{ steps.create-allure-report.outputs.report-url }}
|
||||
@@ -836,9 +836,6 @@ jobs:
|
||||
rm -rf .docker-custom
|
||||
|
||||
promote-images:
|
||||
permissions:
|
||||
contents: read # This is required for actions/checkout
|
||||
id-token: write # This is required for Azure Login to work.
|
||||
needs: [ check-permissions, tag, test-images, vm-compute-node-image ]
|
||||
runs-on: ubuntu-22.04
|
||||
|
||||
@@ -865,28 +862,6 @@ jobs:
|
||||
neondatabase/vm-compute-node-${version}:${{ needs.tag.outputs.build-tag }}
|
||||
done
|
||||
|
||||
- name: Azure login
|
||||
if: github.ref_name == 'main'
|
||||
uses: azure/login@6c251865b4e6290e7b78be643ea2d005bc51f69a # @v2.1.1
|
||||
with:
|
||||
client-id: ${{ secrets.AZURE_DEV_CLIENT_ID }}
|
||||
tenant-id: ${{ secrets.AZURE_TENANT_ID }}
|
||||
subscription-id: ${{ secrets.AZURE_DEV_SUBSCRIPTION_ID }}
|
||||
|
||||
- name: Login to ACR
|
||||
if: github.ref_name == 'main'
|
||||
run: |
|
||||
az acr login --name=neoneastus2
|
||||
|
||||
- name: Copy docker images to ACR-dev
|
||||
if: github.ref_name == 'main'
|
||||
run: |
|
||||
for image in neon compute-tools {vm-,}compute-node-{v14,v15,v16}; do
|
||||
docker buildx imagetools create \
|
||||
-t neoneastus2.azurecr.io/neondatabase/${image}:${{ needs.tag.outputs.build-tag }} \
|
||||
neondatabase/${image}:${{ needs.tag.outputs.build-tag }}
|
||||
done
|
||||
|
||||
- name: Add latest tag to images
|
||||
if: github.ref_name == 'main'
|
||||
run: |
|
||||
|
||||
96
.github/workflows/pg-clients.yml
vendored
96
.github/workflows/pg-clients.yml
vendored
@@ -13,7 +13,6 @@ on:
|
||||
paths:
|
||||
- '.github/workflows/pg-clients.yml'
|
||||
- 'test_runner/pg_clients/**'
|
||||
- 'test_runner/logical_repl/**'
|
||||
- 'poetry.lock'
|
||||
workflow_dispatch:
|
||||
|
||||
@@ -50,101 +49,6 @@ jobs:
|
||||
image-tag: ${{ needs.check-build-tools-image.outputs.image-tag }}
|
||||
secrets: inherit
|
||||
|
||||
test-logical-replication:
|
||||
needs: [ build-build-tools-image ]
|
||||
runs-on: ubuntu-22.04
|
||||
|
||||
container:
|
||||
image: ${{ needs.build-build-tools-image.outputs.image }}
|
||||
credentials:
|
||||
username: ${{ secrets.NEON_DOCKERHUB_USERNAME }}
|
||||
password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }}
|
||||
options: --init --user root
|
||||
services:
|
||||
clickhouse:
|
||||
image: clickhouse/clickhouse-server:24.6.3.64
|
||||
ports:
|
||||
- 9000:9000
|
||||
- 8123:8123
|
||||
zookeeper:
|
||||
image: quay.io/debezium/zookeeper:2.7
|
||||
ports:
|
||||
- 2181:2181
|
||||
kafka:
|
||||
image: quay.io/debezium/kafka:2.7
|
||||
env:
|
||||
ZOOKEEPER_CONNECT: "zookeeper:2181"
|
||||
KAFKA_ADVERTISED_LISTENERS: PLAINTEXT://kafka:9092
|
||||
KAFKA_BROKER_ID: 1
|
||||
KAFKA_OFFSETS_TOPIC_REPLICATION_FACTOR: 1
|
||||
KAFKA_JMX_PORT: 9991
|
||||
ports:
|
||||
- 9092:9092
|
||||
debezium:
|
||||
image: quay.io/debezium/connect:2.7
|
||||
env:
|
||||
BOOTSTRAP_SERVERS: kafka:9092
|
||||
GROUP_ID: 1
|
||||
CONFIG_STORAGE_TOPIC: debezium-config
|
||||
OFFSET_STORAGE_TOPIC: debezium-offset
|
||||
STATUS_STORAGE_TOPIC: debezium-status
|
||||
DEBEZIUM_CONFIG_CONNECTOR_CLASS: io.debezium.connector.postgresql.PostgresConnector
|
||||
ports:
|
||||
- 8083:8083
|
||||
steps:
|
||||
- uses: actions/checkout@v4
|
||||
|
||||
- name: Download Neon artifact
|
||||
uses: ./.github/actions/download
|
||||
with:
|
||||
name: neon-${{ runner.os }}-${{ runner.arch }}-release-artifact
|
||||
path: /tmp/neon/
|
||||
prefix: latest
|
||||
|
||||
- name: Create Neon Project
|
||||
id: create-neon-project
|
||||
uses: ./.github/actions/neon-project-create
|
||||
with:
|
||||
api_key: ${{ secrets.NEON_STAGING_API_KEY }}
|
||||
postgres_version: ${{ env.DEFAULT_PG_VERSION }}
|
||||
|
||||
- name: Run tests
|
||||
uses: ./.github/actions/run-python-test-set
|
||||
with:
|
||||
build_type: remote
|
||||
test_selection: logical_repl
|
||||
run_in_parallel: false
|
||||
extra_params: -m remote_cluster
|
||||
pg_version: ${{ env.DEFAULT_PG_VERSION }}
|
||||
env:
|
||||
BENCHMARK_CONNSTR: ${{ steps.create-neon-project.outputs.dsn }}
|
||||
|
||||
- name: Delete Neon Project
|
||||
if: always()
|
||||
uses: ./.github/actions/neon-project-delete
|
||||
with:
|
||||
project_id: ${{ steps.create-neon-project.outputs.project_id }}
|
||||
api_key: ${{ secrets.NEON_STAGING_API_KEY }}
|
||||
|
||||
- name: Create Allure report
|
||||
if: ${{ !cancelled() }}
|
||||
id: create-allure-report
|
||||
uses: ./.github/actions/allure-report-generate
|
||||
with:
|
||||
store-test-results-into-db: true
|
||||
env:
|
||||
REGRESS_TEST_RESULT_CONNSTR_NEW: ${{ secrets.REGRESS_TEST_RESULT_CONNSTR_NEW }}
|
||||
|
||||
- name: Post to a Slack channel
|
||||
if: github.event.schedule && failure()
|
||||
uses: slackapi/slack-github-action@v1
|
||||
with:
|
||||
channel-id: "C06KHQVQ7U3" # on-call-qa-staging-stream
|
||||
slack-message: |
|
||||
Testing the logical replication: <${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}|${{ job.status }}> (<${{ steps.create-allure-report.outputs.report-url }}|test report>)
|
||||
env:
|
||||
SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }}
|
||||
|
||||
test-postgres-client-libs:
|
||||
needs: [ build-build-tools-image ]
|
||||
runs-on: ubuntu-22.04
|
||||
|
||||
16
.github/workflows/pin-build-tools-image.yml
vendored
16
.github/workflows/pin-build-tools-image.yml
vendored
@@ -66,22 +66,8 @@ jobs:
|
||||
username: ${{ secrets.AWS_ACCESS_KEY_DEV }}
|
||||
password: ${{ secrets.AWS_SECRET_KEY_DEV }}
|
||||
|
||||
- name: Azure login
|
||||
if: steps.check-manifests.outputs.skip == 'false'
|
||||
uses: azure/login@6c251865b4e6290e7b78be643ea2d005bc51f69a # @v2.1.1
|
||||
with:
|
||||
client-id: ${{ secrets.AZURE_DEV_CLIENT_ID }}
|
||||
tenant-id: ${{ secrets.AZURE_TENANT_ID }}
|
||||
subscription-id: ${{ secrets.AZURE_DEV_SUBSCRIPTION_ID }}
|
||||
|
||||
- name: Login to ACR
|
||||
if: steps.check-manifests.outputs.skip == 'false'
|
||||
run: |
|
||||
az acr login --name=neoneastus2
|
||||
|
||||
- name: Tag build-tools with `${{ env.TO_TAG }}` in ECR and ACR
|
||||
- name: Tag build-tools with `${{ env.TO_TAG }}` in ECR
|
||||
if: steps.check-manifests.outputs.skip == 'false'
|
||||
run: |
|
||||
docker buildx imagetools create -t 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:${TO_TAG} \
|
||||
-t neoneastus2.azurecr.io/neondatabase/build-tools:${TO_TAG} \
|
||||
neondatabase/build-tools:${FROM_TAG}
|
||||
|
||||
38
.github/workflows/trigger-e2e-tests.yml
vendored
38
.github/workflows/trigger-e2e-tests.yml
vendored
@@ -13,6 +13,8 @@ defaults:
|
||||
env:
|
||||
# A concurrency group that we use for e2e-tests runs, matches `concurrency.group` above with `github.repository` as a prefix
|
||||
E2E_CONCURRENCY_GROUP: ${{ github.repository }}-e2e-tests-${{ github.ref_name }}-${{ github.ref_name == 'main' && github.sha || 'anysha' }}
|
||||
AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_DEV }}
|
||||
AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_KEY_DEV }}
|
||||
|
||||
jobs:
|
||||
cancel-previous-e2e-tests:
|
||||
@@ -62,35 +64,19 @@ jobs:
|
||||
needs: [ tag ]
|
||||
runs-on: ubuntu-22.04
|
||||
env:
|
||||
EVENT_ACTION: ${{ github.event.action }}
|
||||
GH_TOKEN: ${{ secrets.CI_ACCESS_TOKEN }}
|
||||
TAG: ${{ needs.tag.outputs.build-tag }}
|
||||
steps:
|
||||
- name: Wait for `promote-images` job to finish
|
||||
# It's important to have a timeout here, the script in the step can run infinitely
|
||||
timeout-minutes: 60
|
||||
- name: check if ecr image are present
|
||||
env:
|
||||
AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_DEV }}
|
||||
AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_KEY_DEV }}
|
||||
run: |
|
||||
if [ "${GITHUB_EVENT_NAME}" != "pull_request" ] || [ "${EVENT_ACTION}" != "ready_for_review" ]; then
|
||||
exit 0
|
||||
fi
|
||||
|
||||
# For PRs we use the run id as the tag
|
||||
BUILD_AND_TEST_RUN_ID=${TAG}
|
||||
while true; do
|
||||
conclusion=$(gh run --repo ${GITHUB_REPOSITORY} view ${BUILD_AND_TEST_RUN_ID} --json jobs --jq '.jobs[] | select(.name == "promote-images") | .conclusion')
|
||||
case "$conclusion" in
|
||||
success)
|
||||
break
|
||||
;;
|
||||
failure | cancelled | skipped)
|
||||
echo "The 'promote-images' job didn't succeed: '${conclusion}'. Exiting..."
|
||||
exit 1
|
||||
;;
|
||||
*)
|
||||
echo "The 'promote-images' hasn't succeed yet. Waiting..."
|
||||
sleep 60
|
||||
;;
|
||||
esac
|
||||
for REPO in neon compute-tools compute-node-v14 vm-compute-node-v14 compute-node-v15 vm-compute-node-v15 compute-node-v16 vm-compute-node-v16; do
|
||||
OUTPUT=$(aws ecr describe-images --repository-name ${REPO} --region eu-central-1 --query "imageDetails[?imageTags[?contains(@, '${TAG}')]]" --output text)
|
||||
if [ "$OUTPUT" == "" ]; then
|
||||
echo "$REPO with image tag $TAG not found" >> $GITHUB_OUTPUT
|
||||
exit 1
|
||||
fi
|
||||
done
|
||||
|
||||
- name: Set e2e-platforms
|
||||
|
||||
@@ -1,13 +1,13 @@
|
||||
/compute_tools/ @neondatabase/control-plane @neondatabase/compute
|
||||
/storage_controller @neondatabase/storage
|
||||
/libs/pageserver_api/ @neondatabase/storage
|
||||
/libs/postgres_ffi/ @neondatabase/compute @neondatabase/storage
|
||||
/libs/postgres_ffi/ @neondatabase/compute @neondatabase/safekeepers
|
||||
/libs/remote_storage/ @neondatabase/storage
|
||||
/libs/safekeeper_api/ @neondatabase/storage
|
||||
/libs/safekeeper_api/ @neondatabase/safekeepers
|
||||
/libs/vm_monitor/ @neondatabase/autoscaling
|
||||
/pageserver/ @neondatabase/storage
|
||||
/pgxn/ @neondatabase/compute
|
||||
/pgxn/neon/ @neondatabase/compute @neondatabase/storage
|
||||
/pgxn/neon/ @neondatabase/compute @neondatabase/safekeepers
|
||||
/proxy/ @neondatabase/proxy
|
||||
/safekeeper/ @neondatabase/storage
|
||||
/safekeeper/ @neondatabase/safekeepers
|
||||
/vendor/ @neondatabase/compute
|
||||
|
||||
193
Cargo.lock
generated
193
Cargo.lock
generated
@@ -1418,7 +1418,7 @@ dependencies = [
|
||||
"clap",
|
||||
"criterion-plot",
|
||||
"is-terminal",
|
||||
"itertools 0.10.5",
|
||||
"itertools",
|
||||
"num-traits",
|
||||
"once_cell",
|
||||
"oorandom",
|
||||
@@ -1439,7 +1439,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "6b50826342786a51a89e2da3a28f1c32b06e387201bc2d19791f622c673706b1"
|
||||
dependencies = [
|
||||
"cast",
|
||||
"itertools 0.10.5",
|
||||
"itertools",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
@@ -1672,7 +1672,6 @@ checksum = "62d6dcd069e7b5fe49a302411f759d4cf1cf2c27fe798ef46fb8baefc053dd2b"
|
||||
dependencies = [
|
||||
"bitflags 2.4.1",
|
||||
"byteorder",
|
||||
"chrono",
|
||||
"diesel_derives",
|
||||
"itoa",
|
||||
"pq-sys",
|
||||
@@ -2134,12 +2133,6 @@ dependencies = [
|
||||
"slab",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "gen_ops"
|
||||
version = "0.4.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "304de19db7028420975a296ab0fcbbc8e69438c4ed254a1e41e2a7f37d5f0e0a"
|
||||
|
||||
[[package]]
|
||||
name = "generic-array"
|
||||
version = "0.14.7"
|
||||
@@ -2716,6 +2709,17 @@ version = "3.0.4"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "8bb03732005da905c88227371639bf1ad885cc712789c011c31c5fb3ab3ccf02"
|
||||
|
||||
[[package]]
|
||||
name = "io-lifetimes"
|
||||
version = "1.0.11"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "eae7b9aee968036d54dce06cebaefd919e4472e753296daccd6d344e3e2df0c2"
|
||||
dependencies = [
|
||||
"hermit-abi",
|
||||
"libc",
|
||||
"windows-sys 0.48.0",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "io-uring"
|
||||
version = "0.6.2"
|
||||
@@ -2734,13 +2738,14 @@ checksum = "8f518f335dce6725a761382244631d86cf0ccb2863413590b31338feb467f9c3"
|
||||
|
||||
[[package]]
|
||||
name = "is-terminal"
|
||||
version = "0.4.12"
|
||||
version = "0.4.7"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "f23ff5ef2b80d608d61efee834934d862cd92461afc0560dedf493e4c033738b"
|
||||
checksum = "adcf93614601c8129ddf72e2d5633df827ba6551541c6d8c59520a371475be1f"
|
||||
dependencies = [
|
||||
"hermit-abi",
|
||||
"libc",
|
||||
"windows-sys 0.52.0",
|
||||
"io-lifetimes",
|
||||
"rustix 0.37.25",
|
||||
"windows-sys 0.48.0",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
@@ -2752,15 +2757,6 @@ dependencies = [
|
||||
"either",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "itertools"
|
||||
version = "0.12.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "ba291022dbbd398a455acf126c1e341954079855bc60dfdda641363bd6922569"
|
||||
dependencies = [
|
||||
"either",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "itoa"
|
||||
version = "1.0.6"
|
||||
@@ -2875,6 +2871,18 @@ version = "0.2.8"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "4ec2a862134d2a7d32d7983ddcdd1c4923530833c9f2ea1a44fc5fa473989058"
|
||||
|
||||
[[package]]
|
||||
name = "linux-raw-sys"
|
||||
version = "0.1.4"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "f051f77a7c8e6957c0696eac88f26b0117e54f52d3fc682ab19397a8812846a4"
|
||||
|
||||
[[package]]
|
||||
name = "linux-raw-sys"
|
||||
version = "0.3.8"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "ef53942eb7bf7ff43a617b3e2c1c4a5ecf5944a7c1bc12d7ee39bbb15e5c1519"
|
||||
|
||||
[[package]]
|
||||
name = "linux-raw-sys"
|
||||
version = "0.4.13"
|
||||
@@ -2992,7 +3000,7 @@ checksum = "7c4b80445aeb08e832d87bf1830049a924cdc1d6b7ef40b6b9b365bff17bf8ec"
|
||||
dependencies = [
|
||||
"libc",
|
||||
"measured",
|
||||
"procfs",
|
||||
"procfs 0.16.0",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
@@ -3037,7 +3045,7 @@ dependencies = [
|
||||
"measured",
|
||||
"measured-process",
|
||||
"once_cell",
|
||||
"procfs",
|
||||
"procfs 0.14.2",
|
||||
"prometheus",
|
||||
"rand 0.8.5",
|
||||
"rand_distr",
|
||||
@@ -3566,7 +3574,7 @@ dependencies = [
|
||||
"humantime",
|
||||
"humantime-serde",
|
||||
"hyper 0.14.26",
|
||||
"itertools 0.10.5",
|
||||
"itertools",
|
||||
"leaky-bucket",
|
||||
"md5",
|
||||
"metrics",
|
||||
@@ -3584,9 +3592,8 @@ dependencies = [
|
||||
"postgres_connection",
|
||||
"postgres_ffi",
|
||||
"pq_proto",
|
||||
"procfs",
|
||||
"procfs 0.14.2",
|
||||
"rand 0.8.5",
|
||||
"range-set-blaze",
|
||||
"regex",
|
||||
"remote_storage",
|
||||
"reqwest 0.12.4",
|
||||
@@ -3637,7 +3644,7 @@ dependencies = [
|
||||
"hex",
|
||||
"humantime",
|
||||
"humantime-serde",
|
||||
"itertools 0.10.5",
|
||||
"itertools",
|
||||
"postgres_ffi",
|
||||
"rand 0.8.5",
|
||||
"serde",
|
||||
@@ -3695,7 +3702,7 @@ dependencies = [
|
||||
"hex-literal",
|
||||
"humantime",
|
||||
"humantime-serde",
|
||||
"itertools 0.10.5",
|
||||
"itertools",
|
||||
"metrics",
|
||||
"once_cell",
|
||||
"pageserver_api",
|
||||
@@ -4027,7 +4034,7 @@ name = "postgres_connection"
|
||||
version = "0.1.0"
|
||||
dependencies = [
|
||||
"anyhow",
|
||||
"itertools 0.10.5",
|
||||
"itertools",
|
||||
"once_cell",
|
||||
"postgres",
|
||||
"tokio-postgres",
|
||||
@@ -4085,7 +4092,7 @@ version = "0.1.0"
|
||||
dependencies = [
|
||||
"byteorder",
|
||||
"bytes",
|
||||
"itertools 0.10.5",
|
||||
"itertools",
|
||||
"pin-project-lite",
|
||||
"postgres-protocol",
|
||||
"rand 0.8.5",
|
||||
@@ -4131,6 +4138,21 @@ dependencies = [
|
||||
"unicode-ident",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "procfs"
|
||||
version = "0.14.2"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "b1de8dacb0873f77e6aefc6d71e044761fcc68060290f5b1089fcdf84626bb69"
|
||||
dependencies = [
|
||||
"bitflags 1.3.2",
|
||||
"byteorder",
|
||||
"chrono",
|
||||
"flate2",
|
||||
"hex",
|
||||
"lazy_static",
|
||||
"rustix 0.36.16",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "procfs"
|
||||
version = "0.16.0"
|
||||
@@ -4138,12 +4160,10 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "731e0d9356b0c25f16f33b5be79b1c57b562f141ebfcdb0ad8ac2c13a24293b4"
|
||||
dependencies = [
|
||||
"bitflags 2.4.1",
|
||||
"chrono",
|
||||
"flate2",
|
||||
"hex",
|
||||
"lazy_static",
|
||||
"procfs-core",
|
||||
"rustix",
|
||||
"rustix 0.38.28",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
@@ -4153,15 +4173,14 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "2d3554923a69f4ce04c4a754260c338f505ce22642d3830e049a399fc2059a29"
|
||||
dependencies = [
|
||||
"bitflags 2.4.1",
|
||||
"chrono",
|
||||
"hex",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "prometheus"
|
||||
version = "0.13.4"
|
||||
version = "0.13.3"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "3d33c28a30771f7f96db69893f78b857f7450d7e0237e9c8fc6427a81bae7ed1"
|
||||
checksum = "449811d15fbdf5ceb5c1144416066429cf82316e2ec8ce0c1f6f8a02e7bbcf8c"
|
||||
dependencies = [
|
||||
"cfg-if",
|
||||
"fnv",
|
||||
@@ -4169,7 +4188,7 @@ dependencies = [
|
||||
"libc",
|
||||
"memchr",
|
||||
"parking_lot 0.12.1",
|
||||
"procfs",
|
||||
"procfs 0.14.2",
|
||||
"thiserror",
|
||||
]
|
||||
|
||||
@@ -4191,7 +4210,7 @@ checksum = "119533552c9a7ffacc21e099c24a0ac8bb19c2a2a3f363de84cd9b844feab270"
|
||||
dependencies = [
|
||||
"bytes",
|
||||
"heck 0.4.1",
|
||||
"itertools 0.10.5",
|
||||
"itertools",
|
||||
"lazy_static",
|
||||
"log",
|
||||
"multimap",
|
||||
@@ -4212,7 +4231,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "e5d2d8d10f3c6ded6da8b05b5fb3b8a5082514344d56c9f871412d29b4e075b4"
|
||||
dependencies = [
|
||||
"anyhow",
|
||||
"itertools 0.10.5",
|
||||
"itertools",
|
||||
"proc-macro2",
|
||||
"quote",
|
||||
"syn 1.0.109",
|
||||
@@ -4269,7 +4288,7 @@ dependencies = [
|
||||
"hyper-util",
|
||||
"indexmap 2.0.1",
|
||||
"ipnet",
|
||||
"itertools 0.10.5",
|
||||
"itertools",
|
||||
"lasso",
|
||||
"md5",
|
||||
"measured",
|
||||
@@ -4324,7 +4343,6 @@ dependencies = [
|
||||
"tracing-opentelemetry",
|
||||
"tracing-subscriber",
|
||||
"tracing-utils",
|
||||
"try-lock",
|
||||
"typed-json",
|
||||
"url",
|
||||
"urlencoding",
|
||||
@@ -4446,18 +4464,6 @@ dependencies = [
|
||||
"rand_core 0.5.1",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "range-set-blaze"
|
||||
version = "0.1.16"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "8421b5d459262eabbe49048d362897ff3e3830b44eac6cfe341d6acb2f0f13d2"
|
||||
dependencies = [
|
||||
"gen_ops",
|
||||
"itertools 0.12.1",
|
||||
"num-integer",
|
||||
"num-traits",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "rayon"
|
||||
version = "1.7.0"
|
||||
@@ -4626,7 +4632,7 @@ dependencies = [
|
||||
"humantime",
|
||||
"humantime-serde",
|
||||
"hyper 0.14.26",
|
||||
"itertools 0.10.5",
|
||||
"itertools",
|
||||
"metrics",
|
||||
"once_cell",
|
||||
"pin-project-lite",
|
||||
@@ -4936,6 +4942,34 @@ dependencies = [
|
||||
"nom",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "rustix"
|
||||
version = "0.36.16"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "6da3636faa25820d8648e0e31c5d519bbb01f72fdf57131f0f5f7da5fed36eab"
|
||||
dependencies = [
|
||||
"bitflags 1.3.2",
|
||||
"errno",
|
||||
"io-lifetimes",
|
||||
"libc",
|
||||
"linux-raw-sys 0.1.4",
|
||||
"windows-sys 0.45.0",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "rustix"
|
||||
version = "0.37.25"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "d4eb579851244c2c03e7c24f501c3432bed80b8f720af1d6e5b0e0f01555a035"
|
||||
dependencies = [
|
||||
"bitflags 1.3.2",
|
||||
"errno",
|
||||
"io-lifetimes",
|
||||
"libc",
|
||||
"linux-raw-sys 0.3.8",
|
||||
"windows-sys 0.48.0",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "rustix"
|
||||
version = "0.38.28"
|
||||
@@ -5684,7 +5718,6 @@ dependencies = [
|
||||
"aws-config",
|
||||
"bytes",
|
||||
"camino",
|
||||
"chrono",
|
||||
"clap",
|
||||
"control_plane",
|
||||
"diesel",
|
||||
@@ -5695,7 +5728,7 @@ dependencies = [
|
||||
"hex",
|
||||
"humantime",
|
||||
"hyper 0.14.26",
|
||||
"itertools 0.10.5",
|
||||
"itertools",
|
||||
"lasso",
|
||||
"measured",
|
||||
"metrics",
|
||||
@@ -5704,7 +5737,6 @@ dependencies = [
|
||||
"pageserver_client",
|
||||
"postgres_connection",
|
||||
"r2d2",
|
||||
"rand 0.8.5",
|
||||
"reqwest 0.12.4",
|
||||
"routerify",
|
||||
"scopeguard",
|
||||
@@ -5760,10 +5792,9 @@ dependencies = [
|
||||
"either",
|
||||
"futures",
|
||||
"futures-util",
|
||||
"git-version",
|
||||
"hex",
|
||||
"humantime",
|
||||
"itertools 0.10.5",
|
||||
"itertools",
|
||||
"once_cell",
|
||||
"pageserver",
|
||||
"pageserver_api",
|
||||
@@ -5940,15 +5971,15 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "tempfile"
|
||||
version = "3.9.0"
|
||||
version = "3.5.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "01ce4141aa927a6d1bd34a041795abd0db1cccba5d5f24b009f694bdf3a1f3fa"
|
||||
checksum = "b9fbec84f381d5795b08656e4912bec604d162bff9291d6189a78f4c8ab87998"
|
||||
dependencies = [
|
||||
"cfg-if",
|
||||
"fastrand 2.0.0",
|
||||
"redox_syscall 0.4.1",
|
||||
"rustix",
|
||||
"windows-sys 0.52.0",
|
||||
"fastrand 1.9.0",
|
||||
"redox_syscall 0.3.5",
|
||||
"rustix 0.37.25",
|
||||
"windows-sys 0.45.0",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
@@ -6564,9 +6595,9 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "try-lock"
|
||||
version = "0.2.5"
|
||||
version = "0.2.4"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "e421abadd41a4225275504ea4d6566923418b7f05506fbc9c0fe86ba7396114b"
|
||||
checksum = "3528ecfd12c466c6f163363caf2d02a71161dd5e1cc6ae7b34207ea2d42d81ed"
|
||||
|
||||
[[package]]
|
||||
name = "tungstenite"
|
||||
@@ -7145,6 +7176,15 @@ dependencies = [
|
||||
"windows_x86_64_msvc 0.42.2",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "windows-sys"
|
||||
version = "0.45.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "75283be5efb2831d37ea142365f009c02ec203cd29a3ebecbc093d52315b66d0"
|
||||
dependencies = [
|
||||
"windows-targets 0.42.2",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "windows-sys"
|
||||
version = "0.48.0"
|
||||
@@ -7163,6 +7203,21 @@ dependencies = [
|
||||
"windows-targets 0.52.4",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "windows-targets"
|
||||
version = "0.42.2"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "8e5180c00cd44c9b1c88adb3693291f1cd93605ded80c250a75d472756b4d071"
|
||||
dependencies = [
|
||||
"windows_aarch64_gnullvm 0.42.2",
|
||||
"windows_aarch64_msvc 0.42.2",
|
||||
"windows_i686_gnu 0.42.2",
|
||||
"windows_i686_msvc 0.42.2",
|
||||
"windows_x86_64_gnu 0.42.2",
|
||||
"windows_x86_64_gnullvm 0.42.2",
|
||||
"windows_x86_64_msvc 0.42.2",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "windows-targets"
|
||||
version = "0.48.0"
|
||||
@@ -7392,7 +7447,7 @@ dependencies = [
|
||||
"hmac",
|
||||
"hyper 0.14.26",
|
||||
"indexmap 1.9.3",
|
||||
"itertools 0.10.5",
|
||||
"itertools",
|
||||
"libc",
|
||||
"log",
|
||||
"memchr",
|
||||
|
||||
@@ -126,7 +126,7 @@ parquet = { version = "51.0.0", default-features = false, features = ["zstd"] }
|
||||
parquet_derive = "51.0.0"
|
||||
pbkdf2 = { version = "0.12.1", features = ["simple", "std"] }
|
||||
pin-project-lite = "0.2"
|
||||
procfs = "0.16"
|
||||
procfs = "0.14"
|
||||
prometheus = {version = "0.13", default-features=false, features = ["process"]} # removes protobuf dependency
|
||||
prost = "0.11"
|
||||
rand = "0.8"
|
||||
@@ -184,7 +184,6 @@ tracing = "0.1"
|
||||
tracing-error = "0.2.0"
|
||||
tracing-opentelemetry = "0.21.0"
|
||||
tracing-subscriber = { version = "0.3", default-features = false, features = ["smallvec", "fmt", "tracing-log", "std", "env-filter", "json"] }
|
||||
try-lock = "0.2.5"
|
||||
twox-hash = { version = "1.6.3", default-features = false }
|
||||
typed-json = "0.1"
|
||||
url = "2.2"
|
||||
|
||||
@@ -192,7 +192,7 @@ WORKDIR /home/nonroot
|
||||
|
||||
# Rust
|
||||
# Please keep the version of llvm (installed above) in sync with rust llvm (`rustc --version --verbose | grep LLVM`)
|
||||
ENV RUSTC_VERSION=1.80.0
|
||||
ENV RUSTC_VERSION=1.79.0
|
||||
ENV RUSTUP_HOME="/home/nonroot/.rustup"
|
||||
ENV PATH="/home/nonroot/.cargo/bin:${PATH}"
|
||||
RUN curl -sSO https://static.rust-lang.org/rustup/dist/$(uname -m)-unknown-linux-gnu/rustup-init && whoami && \
|
||||
|
||||
@@ -933,8 +933,7 @@ COPY --from=pgjwt-pg-build /pgjwt.tar.gz /ext-src
|
||||
#COPY --from=pg-tiktoken-pg-build /home/nonroot/pg_tiktoken.tar.gz /ext-src
|
||||
COPY --from=hypopg-pg-build /hypopg.tar.gz /ext-src
|
||||
COPY --from=pg-hashids-pg-build /pg_hashids.tar.gz /ext-src
|
||||
COPY --from=rum-pg-build /rum.tar.gz /ext-src
|
||||
COPY patches/rum.patch /ext-src
|
||||
#COPY --from=rum-pg-build /rum.tar.gz /ext-src
|
||||
#COPY --from=pgtap-pg-build /pgtap.tar.gz /ext-src
|
||||
COPY --from=ip4r-pg-build /ip4r.tar.gz /ext-src
|
||||
COPY --from=prefix-pg-build /prefix.tar.gz /ext-src
|
||||
@@ -946,7 +945,7 @@ COPY patches/pg_hintplan.patch /ext-src
|
||||
COPY --from=pg-cron-pg-build /pg_cron.tar.gz /ext-src
|
||||
COPY patches/pg_cron.patch /ext-src
|
||||
#COPY --from=pg-pgx-ulid-build /home/nonroot/pgx_ulid.tar.gz /ext-src
|
||||
#COPY --from=rdkit-pg-build /rdkit.tar.gz /ext-src
|
||||
COPY --from=rdkit-pg-build /rdkit.tar.gz /ext-src
|
||||
COPY --from=pg-uuidv7-pg-build /pg_uuidv7.tar.gz /ext-src
|
||||
COPY --from=pg-roaringbitmap-pg-build /pg_roaringbitmap.tar.gz /ext-src
|
||||
COPY --from=pg-semver-pg-build /pg_semver.tar.gz /ext-src
|
||||
@@ -961,7 +960,6 @@ RUN cd /ext-src/ && for f in *.tar.gz; \
|
||||
rm -rf $dname; mkdir $dname; tar xzf $f --strip-components=1 -C $dname \
|
||||
|| exit 1; rm -f $f; done
|
||||
RUN cd /ext-src/pgvector-src && patch -p1 <../pgvector.patch
|
||||
RUN cd /ext-src/rum-src && patch -p1 <../rum.patch
|
||||
# cmake is required for the h3 test
|
||||
RUN apt-get update && apt-get install -y cmake
|
||||
RUN patch -p1 < /ext-src/pg_hintplan.patch
|
||||
|
||||
@@ -4,11 +4,6 @@ version = "0.1.0"
|
||||
edition.workspace = true
|
||||
license.workspace = true
|
||||
|
||||
[features]
|
||||
default = []
|
||||
# Enables test specific features.
|
||||
testing = []
|
||||
|
||||
[dependencies]
|
||||
anyhow.workspace = true
|
||||
async-compression.workspace = true
|
||||
|
||||
@@ -400,15 +400,7 @@ impl ComputeNode {
|
||||
pub fn get_basebackup(&self, compute_state: &ComputeState, lsn: Lsn) -> Result<()> {
|
||||
let mut retry_period_ms = 500.0;
|
||||
let mut attempts = 0;
|
||||
const DEFAULT_ATTEMPTS: u16 = 10;
|
||||
#[cfg(feature = "testing")]
|
||||
let max_attempts = if let Ok(v) = env::var("NEON_COMPUTE_TESTING_BASEBACKUP_RETRIES") {
|
||||
u16::from_str(&v).unwrap()
|
||||
} else {
|
||||
DEFAULT_ATTEMPTS
|
||||
};
|
||||
#[cfg(not(feature = "testing"))]
|
||||
let max_attempts = DEFAULT_ATTEMPTS;
|
||||
let max_attempts = 10;
|
||||
loop {
|
||||
let result = self.try_get_basebackup(compute_state, lsn);
|
||||
match result {
|
||||
|
||||
@@ -289,7 +289,7 @@ fn fill_remote_storage_secrets_vars(mut cmd: &mut Command) -> &mut Command {
|
||||
|
||||
fn fill_env_vars_prefixed_neon(mut cmd: &mut Command) -> &mut Command {
|
||||
for (var, val) in std::env::vars() {
|
||||
if var.starts_with("NEON_") {
|
||||
if var.starts_with("NEON_PAGESERVER_") {
|
||||
cmd = cmd.env(var, val);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -514,6 +514,7 @@ impl LocalEnv {
|
||||
#[derive(serde::Serialize, serde::Deserialize)]
|
||||
// (allow unknown fields, unlike PageServerConf)
|
||||
struct PageserverConfigTomlSubset {
|
||||
id: NodeId,
|
||||
listen_pg_addr: String,
|
||||
listen_http_addr: String,
|
||||
pg_auth_type: AuthType,
|
||||
@@ -525,30 +526,18 @@ impl LocalEnv {
|
||||
.with_context(|| format!("read {:?}", config_toml_path))?,
|
||||
)
|
||||
.context("parse pageserver.toml")?;
|
||||
let identity_toml_path = dentry.path().join("identity.toml");
|
||||
#[derive(serde::Serialize, serde::Deserialize)]
|
||||
struct IdentityTomlSubset {
|
||||
id: NodeId,
|
||||
}
|
||||
let identity_toml: IdentityTomlSubset = toml_edit::de::from_str(
|
||||
&std::fs::read_to_string(&identity_toml_path)
|
||||
.with_context(|| format!("read {:?}", identity_toml_path))?,
|
||||
)
|
||||
.context("parse identity.toml")?;
|
||||
let PageserverConfigTomlSubset {
|
||||
id: config_toml_id,
|
||||
listen_pg_addr,
|
||||
listen_http_addr,
|
||||
pg_auth_type,
|
||||
http_auth_type,
|
||||
} = config_toml;
|
||||
let IdentityTomlSubset {
|
||||
id: identity_toml_id,
|
||||
} = identity_toml;
|
||||
let conf = PageServerConf {
|
||||
id: {
|
||||
anyhow::ensure!(
|
||||
identity_toml_id == id,
|
||||
"id mismatch: identity.toml:id={identity_toml_id} pageserver_(.*) id={id}",
|
||||
config_toml_id == id,
|
||||
"id mismatch: config_toml.id={config_toml_id} id={id}",
|
||||
);
|
||||
id
|
||||
},
|
||||
|
||||
@@ -127,13 +127,10 @@ impl PageServerNode {
|
||||
}
|
||||
|
||||
// Apply the user-provided overrides
|
||||
overrides.push({
|
||||
let mut doc =
|
||||
toml_edit::ser::to_document(&conf).expect("we deserialized this from toml earlier");
|
||||
// `id` is written out to `identity.toml` instead of `pageserver.toml`
|
||||
doc.remove("id").expect("it's part of the struct");
|
||||
doc.to_string()
|
||||
});
|
||||
overrides.push(
|
||||
toml_edit::ser::to_string_pretty(&conf)
|
||||
.expect("we deserialized this from toml earlier"),
|
||||
);
|
||||
|
||||
// Turn `overrides` into a toml document.
|
||||
// TODO: above code is legacy code, it should be refactored to use toml_edit directly.
|
||||
|
||||
@@ -78,7 +78,7 @@ for pg_version in 14 15 16; do
|
||||
docker cp $TMPDIR/data $COMPUTE_CONTAINER_NAME:/ext-src/pg_hint_plan-src/
|
||||
rm -rf $TMPDIR
|
||||
# We are running tests now
|
||||
if docker exec -e SKIP=timescaledb-src,rdkit-src,postgis-src,pgx_ulid-src,pgtap-src,pg_tiktoken-src,pg_jsonschema-src,pg_graphql-src,kq_imcx-src,wal2json_2_5-src \
|
||||
if docker exec -e SKIP=rum-src,timescaledb-src,rdkit-src,postgis-src,pgx_ulid-src,pgtap-src,pg_tiktoken-src,pg_jsonschema-src,pg_graphql-src,kq_imcx-src,wal2json_2_5-src \
|
||||
$TEST_CONTAINER_NAME /run-tests.sh | tee testout.txt
|
||||
then
|
||||
cleanup
|
||||
|
||||
@@ -1,15 +1,15 @@
|
||||
#!/bin/bash
|
||||
set -x
|
||||
|
||||
cd /ext-src || exit 2
|
||||
cd /ext-src
|
||||
FAILED=
|
||||
LIST=$( (echo "${SKIP//","/"\n"}"; ls -d -- *-src) | sort | uniq -u)
|
||||
LIST=$((echo ${SKIP} | sed 's/,/\n/g'; ls -d *-src) | sort | uniq -u)
|
||||
for d in ${LIST}
|
||||
do
|
||||
[ -d "${d}" ] || continue
|
||||
[ -d ${d} ] || continue
|
||||
psql -c "select 1" >/dev/null || break
|
||||
USE_PGXS=1 make -C "${d}" installcheck || FAILED="${d} ${FAILED}"
|
||||
make -C ${d} installcheck || FAILED="${d} ${FAILED}"
|
||||
done
|
||||
[ -z "${FAILED}" ] && exit 0
|
||||
echo "${FAILED}"
|
||||
echo ${FAILED}
|
||||
exit 1
|
||||
@@ -1,6 +1,5 @@
|
||||
use std::collections::HashSet;
|
||||
use std::str::FromStr;
|
||||
use std::time::{Duration, Instant};
|
||||
use std::time::Instant;
|
||||
|
||||
/// Request/response types for the storage controller
|
||||
/// API (`/control/v1` prefix). Implemented by the server
|
||||
@@ -295,42 +294,6 @@ pub enum PlacementPolicy {
|
||||
#[derive(Serialize, Deserialize, Debug)]
|
||||
pub struct TenantShardMigrateResponse {}
|
||||
|
||||
/// Metadata health record posted from scrubber.
|
||||
#[derive(Serialize, Deserialize, Debug)]
|
||||
pub struct MetadataHealthRecord {
|
||||
pub tenant_shard_id: TenantShardId,
|
||||
pub healthy: bool,
|
||||
pub last_scrubbed_at: chrono::DateTime<chrono::Utc>,
|
||||
}
|
||||
|
||||
#[derive(Serialize, Deserialize, Debug)]
|
||||
pub struct MetadataHealthUpdateRequest {
|
||||
pub healthy_tenant_shards: HashSet<TenantShardId>,
|
||||
pub unhealthy_tenant_shards: HashSet<TenantShardId>,
|
||||
}
|
||||
|
||||
#[derive(Serialize, Deserialize, Debug)]
|
||||
pub struct MetadataHealthUpdateResponse {}
|
||||
|
||||
#[derive(Serialize, Deserialize, Debug)]
|
||||
|
||||
pub struct MetadataHealthListUnhealthyResponse {
|
||||
pub unhealthy_tenant_shards: Vec<TenantShardId>,
|
||||
}
|
||||
|
||||
#[derive(Serialize, Deserialize, Debug)]
|
||||
|
||||
pub struct MetadataHealthListOutdatedRequest {
|
||||
#[serde(with = "humantime_serde")]
|
||||
pub not_scrubbed_for: Duration,
|
||||
}
|
||||
|
||||
#[derive(Serialize, Deserialize, Debug)]
|
||||
|
||||
pub struct MetadataHealthListOutdatedResponse {
|
||||
pub health_records: Vec<MetadataHealthRecord>,
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod test {
|
||||
use super::*;
|
||||
|
||||
@@ -107,10 +107,7 @@ impl Key {
|
||||
/// As long as Neon does not support tablespace (because of lack of access to local file system),
|
||||
/// we can assume that only some predefined namespace OIDs are used which can fit in u16
|
||||
pub fn to_i128(&self) -> i128 {
|
||||
assert!(
|
||||
self.field2 <= 0xFFFF || self.field2 == 0xFFFFFFFF || self.field2 == 0x22222222,
|
||||
"invalid key: {self}",
|
||||
);
|
||||
assert!(self.field2 <= 0xFFFF || self.field2 == 0xFFFFFFFF || self.field2 == 0x22222222);
|
||||
(((self.field1 & 0x7F) as i128) << 120)
|
||||
| (((self.field2 & 0xFFFF) as i128) << 104)
|
||||
| ((self.field3 as i128) << 72)
|
||||
|
||||
@@ -637,13 +637,6 @@ pub struct TenantInfo {
|
||||
pub current_physical_size: Option<u64>, // physical size is only included in `tenant_status` endpoint
|
||||
pub attachment_status: TenantAttachmentStatus,
|
||||
pub generation: u32,
|
||||
|
||||
/// Opaque explanation if gc is being blocked.
|
||||
///
|
||||
/// Only looked up for the individual tenant detail, not the listing. This is purely for
|
||||
/// debugging, not included in openapi.
|
||||
#[serde(skip_serializing_if = "Option::is_none")]
|
||||
pub gc_blocking: Option<String>,
|
||||
}
|
||||
|
||||
#[derive(Serialize, Deserialize, Clone)]
|
||||
@@ -947,8 +940,6 @@ pub struct TopTenantShardsResponse {
|
||||
}
|
||||
|
||||
pub mod virtual_file {
|
||||
use std::path::PathBuf;
|
||||
|
||||
#[derive(
|
||||
Copy,
|
||||
Clone,
|
||||
@@ -967,53 +958,6 @@ pub mod virtual_file {
|
||||
#[cfg(target_os = "linux")]
|
||||
TokioEpollUring,
|
||||
}
|
||||
|
||||
/// Direct IO modes for a pageserver.
|
||||
#[derive(Debug, PartialEq, Eq, Clone, serde::Deserialize, serde::Serialize, Default)]
|
||||
#[serde(tag = "mode", rename_all = "kebab-case", deny_unknown_fields)]
|
||||
pub enum DirectIoMode {
|
||||
/// Direct IO disabled (uses usual buffered IO).
|
||||
#[default]
|
||||
Disabled,
|
||||
/// Direct IO disabled (performs checks and perf simulations).
|
||||
Evaluate {
|
||||
/// Alignment check level
|
||||
alignment_check: DirectIoAlignmentCheckLevel,
|
||||
/// Latency padded for performance simulation.
|
||||
latency_padding: DirectIoLatencyPadding,
|
||||
},
|
||||
/// Direct IO enabled.
|
||||
Enabled {
|
||||
/// Actions to perform on alignment error.
|
||||
on_alignment_error: DirectIoOnAlignmentErrorAction,
|
||||
},
|
||||
}
|
||||
|
||||
#[derive(Debug, PartialEq, Eq, Clone, serde::Deserialize, serde::Serialize, Default)]
|
||||
#[serde(rename_all = "kebab-case")]
|
||||
pub enum DirectIoAlignmentCheckLevel {
|
||||
#[default]
|
||||
Error,
|
||||
Log,
|
||||
None,
|
||||
}
|
||||
|
||||
#[derive(Debug, PartialEq, Eq, Clone, serde::Deserialize, serde::Serialize, Default)]
|
||||
#[serde(rename_all = "kebab-case")]
|
||||
pub enum DirectIoOnAlignmentErrorAction {
|
||||
Error,
|
||||
#[default]
|
||||
FallbackToBuffered,
|
||||
}
|
||||
|
||||
#[derive(Debug, PartialEq, Eq, Clone, serde::Deserialize, serde::Serialize, Default)]
|
||||
#[serde(tag = "type", rename_all = "kebab-case")]
|
||||
pub enum DirectIoLatencyPadding {
|
||||
/// Pad virtual file operations with IO to a fake file.
|
||||
FakeFileRW { path: PathBuf },
|
||||
#[default]
|
||||
None,
|
||||
}
|
||||
}
|
||||
|
||||
// Wrapped in libpq CopyData
|
||||
@@ -1483,7 +1427,6 @@ mod tests {
|
||||
current_physical_size: Some(42),
|
||||
attachment_status: TenantAttachmentStatus::Attached,
|
||||
generation: 1,
|
||||
gc_blocking: None,
|
||||
};
|
||||
let expected_active = json!({
|
||||
"id": original_active.id.to_string(),
|
||||
@@ -1506,7 +1449,6 @@ mod tests {
|
||||
current_physical_size: Some(42),
|
||||
attachment_status: TenantAttachmentStatus::Attached,
|
||||
generation: 1,
|
||||
gc_blocking: None,
|
||||
};
|
||||
let expected_broken = json!({
|
||||
"id": original_broken.id.to_string(),
|
||||
|
||||
@@ -29,7 +29,7 @@ use anyhow::{bail, Result};
|
||||
use bytes::{Bytes, BytesMut};
|
||||
|
||||
/// Equivalent to sizeof(ControlFileData) in C
|
||||
const SIZEOF_CONTROLDATA: usize = size_of::<ControlFileData>();
|
||||
const SIZEOF_CONTROLDATA: usize = std::mem::size_of::<ControlFileData>();
|
||||
|
||||
impl ControlFileData {
|
||||
/// Compute the offset of the `crc` field within the `ControlFileData` struct.
|
||||
|
||||
@@ -31,7 +31,7 @@ pub const SMGR_TRUNCATE_FSM: u32 = 0x0004;
|
||||
//
|
||||
|
||||
// Assumes 8 byte alignment
|
||||
const SIZEOF_PAGE_HEADER_DATA: usize = size_of::<PageHeaderData>();
|
||||
const SIZEOF_PAGE_HEADER_DATA: usize = std::mem::size_of::<PageHeaderData>();
|
||||
pub const MAXALIGN_SIZE_OF_PAGE_HEADER_DATA: usize = (SIZEOF_PAGE_HEADER_DATA + 7) & !7;
|
||||
|
||||
//
|
||||
@@ -191,7 +191,7 @@ pub const XLR_RMGR_INFO_MASK: u8 = 0xF0;
|
||||
pub const XLOG_TBLSPC_CREATE: u8 = 0x00;
|
||||
pub const XLOG_TBLSPC_DROP: u8 = 0x10;
|
||||
|
||||
pub const SIZEOF_XLOGRECORD: u32 = size_of::<XLogRecord>() as u32;
|
||||
pub const SIZEOF_XLOGRECORD: u32 = std::mem::size_of::<XLogRecord>() as u32;
|
||||
|
||||
//
|
||||
// from xlogrecord.h
|
||||
|
||||
@@ -42,9 +42,9 @@ pub const XLP_FIRST_IS_CONTRECORD: u16 = 0x0001;
|
||||
pub const XLP_REM_LEN_OFFS: usize = 2 + 2 + 4 + 8;
|
||||
pub const XLOG_RECORD_CRC_OFFS: usize = 4 + 4 + 8 + 1 + 1 + 2;
|
||||
|
||||
pub const XLOG_SIZE_OF_XLOG_SHORT_PHD: usize = size_of::<XLogPageHeaderData>();
|
||||
pub const XLOG_SIZE_OF_XLOG_LONG_PHD: usize = size_of::<XLogLongPageHeaderData>();
|
||||
pub const XLOG_SIZE_OF_XLOG_RECORD: usize = size_of::<XLogRecord>();
|
||||
pub const XLOG_SIZE_OF_XLOG_SHORT_PHD: usize = std::mem::size_of::<XLogPageHeaderData>();
|
||||
pub const XLOG_SIZE_OF_XLOG_LONG_PHD: usize = std::mem::size_of::<XLogLongPageHeaderData>();
|
||||
pub const XLOG_SIZE_OF_XLOG_RECORD: usize = std::mem::size_of::<XLogRecord>();
|
||||
#[allow(clippy::identity_op)]
|
||||
pub const SIZE_OF_XLOG_RECORD_DATA_HEADER_SHORT: usize = 1 * 2;
|
||||
|
||||
@@ -311,7 +311,7 @@ impl XLogLongPageHeaderData {
|
||||
}
|
||||
}
|
||||
|
||||
pub const SIZEOF_CHECKPOINT: usize = size_of::<CheckPoint>();
|
||||
pub const SIZEOF_CHECKPOINT: usize = std::mem::size_of::<CheckPoint>();
|
||||
|
||||
impl CheckPoint {
|
||||
pub fn encode(&self) -> Result<Bytes, SerializeError> {
|
||||
|
||||
@@ -178,7 +178,7 @@ pub fn test_find_end_of_wal_last_crossing_segment() {
|
||||
/// currently 1024.
|
||||
#[test]
|
||||
pub fn test_update_next_xid() {
|
||||
let checkpoint_buf = [0u8; size_of::<CheckPoint>()];
|
||||
let checkpoint_buf = [0u8; std::mem::size_of::<CheckPoint>()];
|
||||
let mut checkpoint = CheckPoint::decode(&checkpoint_buf).unwrap();
|
||||
|
||||
checkpoint.nextXid = FullTransactionId { value: 10 };
|
||||
@@ -204,7 +204,7 @@ pub fn test_update_next_xid() {
|
||||
|
||||
#[test]
|
||||
pub fn test_update_next_multixid() {
|
||||
let checkpoint_buf = [0u8; size_of::<CheckPoint>()];
|
||||
let checkpoint_buf = [0u8; std::mem::size_of::<CheckPoint>()];
|
||||
let mut checkpoint = CheckPoint::decode(&checkpoint_buf).unwrap();
|
||||
|
||||
// simple case
|
||||
|
||||
@@ -355,8 +355,7 @@ impl RemoteStorage for AzureBlobStorage {
|
||||
.blobs()
|
||||
.map(|k| ListingObject{
|
||||
key: self.name_to_relative_path(&k.name),
|
||||
last_modified: k.properties.last_modified.into(),
|
||||
size: k.properties.content_length,
|
||||
last_modified: k.properties.last_modified.into()
|
||||
}
|
||||
);
|
||||
|
||||
|
||||
@@ -144,7 +144,6 @@ impl RemotePath {
|
||||
///
|
||||
/// The WithDelimiter mode will populate `prefixes` and `keys` in the result. The
|
||||
/// NoDelimiter mode will only populate `keys`.
|
||||
#[derive(Copy, Clone)]
|
||||
pub enum ListingMode {
|
||||
WithDelimiter,
|
||||
NoDelimiter,
|
||||
@@ -154,7 +153,6 @@ pub enum ListingMode {
|
||||
pub struct ListingObject {
|
||||
pub key: RemotePath,
|
||||
pub last_modified: SystemTime,
|
||||
pub size: u64,
|
||||
}
|
||||
|
||||
#[derive(Default)]
|
||||
@@ -196,7 +194,7 @@ pub trait RemoteStorage: Send + Sync + 'static {
|
||||
mode: ListingMode,
|
||||
max_keys: Option<NonZeroU32>,
|
||||
cancel: &CancellationToken,
|
||||
) -> impl Stream<Item = Result<Listing, DownloadError>> + Send;
|
||||
) -> impl Stream<Item = Result<Listing, DownloadError>>;
|
||||
|
||||
async fn list(
|
||||
&self,
|
||||
@@ -353,10 +351,10 @@ impl<Other: RemoteStorage> GenericRemoteStorage<Arc<Other>> {
|
||||
mode: ListingMode,
|
||||
max_keys: Option<NonZeroU32>,
|
||||
cancel: &'a CancellationToken,
|
||||
) -> impl Stream<Item = Result<Listing, DownloadError>> + 'a + Send {
|
||||
) -> impl Stream<Item = Result<Listing, DownloadError>> + 'a {
|
||||
match self {
|
||||
Self::LocalFs(s) => Box::pin(s.list_streaming(prefix, mode, max_keys, cancel))
|
||||
as Pin<Box<dyn Stream<Item = Result<Listing, DownloadError>> + Send>>,
|
||||
as Pin<Box<dyn Stream<Item = Result<Listing, DownloadError>>>>,
|
||||
Self::AwsS3(s) => Box::pin(s.list_streaming(prefix, mode, max_keys, cancel)),
|
||||
Self::AzureBlob(s) => Box::pin(s.list_streaming(prefix, mode, max_keys, cancel)),
|
||||
Self::Unreliable(s) => Box::pin(s.list_streaming(prefix, mode, max_keys, cancel)),
|
||||
|
||||
@@ -368,7 +368,6 @@ impl RemoteStorage for LocalFs {
|
||||
key: k.clone(),
|
||||
// LocalFs is just for testing, so just specify a dummy time
|
||||
last_modified: SystemTime::now(),
|
||||
size: 0,
|
||||
})
|
||||
}
|
||||
})
|
||||
@@ -412,7 +411,6 @@ impl RemoteStorage for LocalFs {
|
||||
key: RemotePath::from_string(&relative_key).unwrap(),
|
||||
// LocalFs is just for testing
|
||||
last_modified: SystemTime::now(),
|
||||
size: 0,
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
@@ -565,12 +565,9 @@ impl RemoteStorage for S3Bucket {
|
||||
}
|
||||
};
|
||||
|
||||
let size = object.size.unwrap_or(0) as u64;
|
||||
|
||||
result.keys.push(ListingObject{
|
||||
key,
|
||||
last_modified,
|
||||
size,
|
||||
last_modified
|
||||
});
|
||||
if let Some(mut mk) = max_keys {
|
||||
assert!(mk > 0);
|
||||
|
||||
@@ -114,7 +114,7 @@ impl RemoteStorage for UnreliableWrapper {
|
||||
mode: ListingMode,
|
||||
max_keys: Option<NonZeroU32>,
|
||||
cancel: &CancellationToken,
|
||||
) -> impl Stream<Item = Result<Listing, DownloadError>> + Send {
|
||||
) -> impl Stream<Item = Result<Listing, DownloadError>> {
|
||||
async_stream::stream! {
|
||||
self.attempt(RemoteOp::ListPrefixes(prefix.cloned()))
|
||||
.map_err(DownloadError::Other)?;
|
||||
|
||||
@@ -18,20 +18,20 @@ const STORAGE_TOKEN_ALGORITHM: Algorithm = Algorithm::EdDSA;
|
||||
#[derive(Debug, Serialize, Deserialize, Clone, Copy, PartialEq)]
|
||||
#[serde(rename_all = "lowercase")]
|
||||
pub enum Scope {
|
||||
/// Provides access to all data for a specific tenant (specified in `struct Claims` below)
|
||||
// Provides access to all data for a specific tenant (specified in `struct Claims` below)
|
||||
// TODO: join these two?
|
||||
Tenant,
|
||||
/// Provides blanket access to all tenants on the pageserver plus pageserver-wide APIs.
|
||||
/// Should only be used e.g. for status check/tenant creation/list.
|
||||
// Provides blanket access to all tenants on the pageserver plus pageserver-wide APIs.
|
||||
// Should only be used e.g. for status check/tenant creation/list.
|
||||
PageServerApi,
|
||||
/// Provides blanket access to all data on the safekeeper plus safekeeper-wide APIs.
|
||||
/// Should only be used e.g. for status check.
|
||||
/// Currently also used for connection from any pageserver to any safekeeper.
|
||||
// Provides blanket access to all data on the safekeeper plus safekeeper-wide APIs.
|
||||
// Should only be used e.g. for status check.
|
||||
// Currently also used for connection from any pageserver to any safekeeper.
|
||||
SafekeeperData,
|
||||
/// The scope used by pageservers in upcalls to storage controller and cloud control plane
|
||||
// The scope used by pageservers in upcalls to storage controller and cloud control plane
|
||||
#[serde(rename = "generations_api")]
|
||||
GenerationsApi,
|
||||
/// Allows access to control plane managment API and some storage controller endpoints.
|
||||
// Allows access to control plane managment API and some storage controller endpoints.
|
||||
Admin,
|
||||
|
||||
/// Allows access to storage controller APIs used by the scrubber, to interrogate the state
|
||||
|
||||
@@ -8,10 +8,33 @@ pub struct Completion {
|
||||
_token: TaskTrackerToken,
|
||||
}
|
||||
|
||||
impl std::fmt::Debug for Completion {
|
||||
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||
f.debug_struct("Completion")
|
||||
.field("siblings", &self._token.task_tracker().len())
|
||||
.finish()
|
||||
}
|
||||
}
|
||||
|
||||
impl Completion {
|
||||
/// Returns true if this completion is associated with the given barrier.
|
||||
pub fn blocks(&self, barrier: &Barrier) -> bool {
|
||||
TaskTracker::ptr_eq(self._token.task_tracker(), &barrier.0)
|
||||
}
|
||||
}
|
||||
|
||||
/// Barrier will wait until all clones of [`Completion`] have been dropped.
|
||||
#[derive(Clone)]
|
||||
pub struct Barrier(TaskTracker);
|
||||
|
||||
impl std::fmt::Debug for Barrier {
|
||||
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||
f.debug_struct("Barrier")
|
||||
.field("remaining", &self.0.len())
|
||||
.finish()
|
||||
}
|
||||
}
|
||||
|
||||
impl Default for Barrier {
|
||||
fn default() -> Self {
|
||||
let (_, rx) = channel();
|
||||
|
||||
@@ -78,9 +78,8 @@ impl Drop for GateGuard {
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Debug, thiserror::Error)]
|
||||
#[derive(Debug)]
|
||||
pub enum GateError {
|
||||
#[error("gate is closed")]
|
||||
GateClosed,
|
||||
}
|
||||
|
||||
|
||||
@@ -49,7 +49,6 @@ postgres_backend.workspace = true
|
||||
postgres-protocol.workspace = true
|
||||
postgres-types.workspace = true
|
||||
rand.workspace = true
|
||||
range-set-blaze = { version = "0.1.16", features = ["alloc"] }
|
||||
regex.workspace = true
|
||||
scopeguard.workspace = true
|
||||
serde.workspace = true
|
||||
@@ -108,7 +107,3 @@ harness = false
|
||||
[[bench]]
|
||||
name = "bench_walredo"
|
||||
harness = false
|
||||
|
||||
[[bench]]
|
||||
name = "bench_ingest"
|
||||
harness = false
|
||||
|
||||
@@ -1,239 +0,0 @@
|
||||
use std::{env, num::NonZeroUsize};
|
||||
|
||||
use bytes::Bytes;
|
||||
use camino::Utf8PathBuf;
|
||||
use criterion::{criterion_group, criterion_main, Criterion};
|
||||
use pageserver::{
|
||||
config::PageServerConf,
|
||||
context::{DownloadBehavior, RequestContext},
|
||||
l0_flush::{L0FlushConfig, L0FlushGlobalState},
|
||||
page_cache,
|
||||
repository::Value,
|
||||
task_mgr::TaskKind,
|
||||
tenant::storage_layer::InMemoryLayer,
|
||||
virtual_file,
|
||||
};
|
||||
use pageserver_api::{key::Key, shard::TenantShardId};
|
||||
use utils::{
|
||||
bin_ser::BeSer,
|
||||
id::{TenantId, TimelineId},
|
||||
};
|
||||
|
||||
// A very cheap hash for generating non-sequential keys.
|
||||
fn murmurhash32(mut h: u32) -> u32 {
|
||||
h ^= h >> 16;
|
||||
h = h.wrapping_mul(0x85ebca6b);
|
||||
h ^= h >> 13;
|
||||
h = h.wrapping_mul(0xc2b2ae35);
|
||||
h ^= h >> 16;
|
||||
h
|
||||
}
|
||||
|
||||
enum KeyLayout {
|
||||
/// Sequential unique keys
|
||||
Sequential,
|
||||
/// Random unique keys
|
||||
Random,
|
||||
/// Random keys, but only use the bits from the mask of them
|
||||
RandomReuse(u32),
|
||||
}
|
||||
|
||||
enum WriteDelta {
|
||||
Yes,
|
||||
No,
|
||||
}
|
||||
|
||||
async fn ingest(
|
||||
conf: &'static PageServerConf,
|
||||
put_size: usize,
|
||||
put_count: usize,
|
||||
key_layout: KeyLayout,
|
||||
write_delta: WriteDelta,
|
||||
) -> anyhow::Result<()> {
|
||||
let mut lsn = utils::lsn::Lsn(1000);
|
||||
let mut key = Key::from_i128(0x0);
|
||||
|
||||
let timeline_id = TimelineId::generate();
|
||||
let tenant_id = TenantId::generate();
|
||||
let tenant_shard_id = TenantShardId::unsharded(tenant_id);
|
||||
|
||||
tokio::fs::create_dir_all(conf.timeline_path(&tenant_shard_id, &timeline_id)).await?;
|
||||
|
||||
let ctx = RequestContext::new(TaskKind::DebugTool, DownloadBehavior::Error);
|
||||
|
||||
let gate = utils::sync::gate::Gate::default();
|
||||
let entered = gate.enter().unwrap();
|
||||
|
||||
let layer =
|
||||
InMemoryLayer::create(conf, timeline_id, tenant_shard_id, lsn, entered, &ctx).await?;
|
||||
|
||||
let data = Value::Image(Bytes::from(vec![0u8; put_size])).ser()?;
|
||||
let ctx = RequestContext::new(
|
||||
pageserver::task_mgr::TaskKind::WalReceiverConnectionHandler,
|
||||
pageserver::context::DownloadBehavior::Download,
|
||||
);
|
||||
|
||||
for i in 0..put_count {
|
||||
lsn += put_size as u64;
|
||||
|
||||
// Generate lots of keys within a single relation, which simulates the typical bulk ingest case: people
|
||||
// usually care the most about write performance when they're blasting a huge batch of data into a huge table.
|
||||
match key_layout {
|
||||
KeyLayout::Sequential => {
|
||||
// Use sequential order to illustrate the experience a user is likely to have
|
||||
// when ingesting bulk data.
|
||||
key.field6 = i as u32;
|
||||
}
|
||||
KeyLayout::Random => {
|
||||
// Use random-order keys to avoid giving a false advantage to data structures that are
|
||||
// faster when inserting on the end.
|
||||
key.field6 = murmurhash32(i as u32);
|
||||
}
|
||||
KeyLayout::RandomReuse(mask) => {
|
||||
// Use low bits only, to limit cardinality
|
||||
key.field6 = murmurhash32(i as u32) & mask;
|
||||
}
|
||||
}
|
||||
|
||||
layer.put_value(key, lsn, &data, &ctx).await?;
|
||||
}
|
||||
layer.freeze(lsn + 1).await;
|
||||
|
||||
if matches!(write_delta, WriteDelta::Yes) {
|
||||
let l0_flush_state = L0FlushGlobalState::new(L0FlushConfig::Direct {
|
||||
max_concurrency: NonZeroUsize::new(1).unwrap(),
|
||||
});
|
||||
let (_desc, path) = layer
|
||||
.write_to_disk(&ctx, None, l0_flush_state.inner())
|
||||
.await?
|
||||
.unwrap();
|
||||
tokio::fs::remove_file(path).await?;
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Wrapper to instantiate a tokio runtime
|
||||
fn ingest_main(
|
||||
conf: &'static PageServerConf,
|
||||
put_size: usize,
|
||||
put_count: usize,
|
||||
key_layout: KeyLayout,
|
||||
write_delta: WriteDelta,
|
||||
) {
|
||||
let runtime = tokio::runtime::Builder::new_current_thread()
|
||||
.enable_all()
|
||||
.build()
|
||||
.unwrap();
|
||||
|
||||
runtime.block_on(async move {
|
||||
let r = ingest(conf, put_size, put_count, key_layout, write_delta).await;
|
||||
if let Err(e) = r {
|
||||
panic!("{e:?}");
|
||||
}
|
||||
});
|
||||
}
|
||||
|
||||
/// Declare a series of benchmarks for the Pageserver's ingest write path.
|
||||
///
|
||||
/// This benchmark does not include WAL decode: it starts at InMemoryLayer::put_value, and ends either
|
||||
/// at freezing the ephemeral layer, or writing the ephemeral layer out to an L0 (depending on whether WriteDelta is set).
|
||||
///
|
||||
/// Genuine disk I/O is used, so expect results to differ depending on storage. However, when running on
|
||||
/// a fast disk, CPU is the bottleneck at time of writing.
|
||||
fn criterion_benchmark(c: &mut Criterion) {
|
||||
let temp_dir_parent: Utf8PathBuf = env::current_dir().unwrap().try_into().unwrap();
|
||||
let temp_dir = camino_tempfile::tempdir_in(temp_dir_parent).unwrap();
|
||||
eprintln!("Data directory: {}", temp_dir.path());
|
||||
|
||||
let conf: &'static PageServerConf = Box::leak(Box::new(
|
||||
pageserver::config::PageServerConf::dummy_conf(temp_dir.path().to_path_buf()),
|
||||
));
|
||||
virtual_file::init(16384, virtual_file::io_engine_for_bench());
|
||||
page_cache::init(conf.page_cache_size);
|
||||
|
||||
{
|
||||
let mut group = c.benchmark_group("ingest-small-values");
|
||||
let put_size = 100usize;
|
||||
let put_count = 128 * 1024 * 1024 / put_size;
|
||||
group.throughput(criterion::Throughput::Bytes((put_size * put_count) as u64));
|
||||
group.sample_size(10);
|
||||
group.bench_function("ingest 128MB/100b seq", |b| {
|
||||
b.iter(|| {
|
||||
ingest_main(
|
||||
conf,
|
||||
put_size,
|
||||
put_count,
|
||||
KeyLayout::Sequential,
|
||||
WriteDelta::Yes,
|
||||
)
|
||||
})
|
||||
});
|
||||
group.bench_function("ingest 128MB/100b rand", |b| {
|
||||
b.iter(|| {
|
||||
ingest_main(
|
||||
conf,
|
||||
put_size,
|
||||
put_count,
|
||||
KeyLayout::Random,
|
||||
WriteDelta::Yes,
|
||||
)
|
||||
})
|
||||
});
|
||||
group.bench_function("ingest 128MB/100b rand-1024keys", |b| {
|
||||
b.iter(|| {
|
||||
ingest_main(
|
||||
conf,
|
||||
put_size,
|
||||
put_count,
|
||||
KeyLayout::RandomReuse(0x3ff),
|
||||
WriteDelta::Yes,
|
||||
)
|
||||
})
|
||||
});
|
||||
group.bench_function("ingest 128MB/100b seq, no delta", |b| {
|
||||
b.iter(|| {
|
||||
ingest_main(
|
||||
conf,
|
||||
put_size,
|
||||
put_count,
|
||||
KeyLayout::Sequential,
|
||||
WriteDelta::No,
|
||||
)
|
||||
})
|
||||
});
|
||||
}
|
||||
|
||||
{
|
||||
let mut group = c.benchmark_group("ingest-big-values");
|
||||
let put_size = 8192usize;
|
||||
let put_count = 128 * 1024 * 1024 / put_size;
|
||||
group.throughput(criterion::Throughput::Bytes((put_size * put_count) as u64));
|
||||
group.sample_size(10);
|
||||
group.bench_function("ingest 128MB/8k seq", |b| {
|
||||
b.iter(|| {
|
||||
ingest_main(
|
||||
conf,
|
||||
put_size,
|
||||
put_count,
|
||||
KeyLayout::Sequential,
|
||||
WriteDelta::Yes,
|
||||
)
|
||||
})
|
||||
});
|
||||
group.bench_function("ingest 128MB/8k seq, no delta", |b| {
|
||||
b.iter(|| {
|
||||
ingest_main(
|
||||
conf,
|
||||
put_size,
|
||||
put_count,
|
||||
KeyLayout::Sequential,
|
||||
WriteDelta::No,
|
||||
)
|
||||
})
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
criterion_group!(benches, criterion_benchmark);
|
||||
criterion_main!(benches);
|
||||
@@ -1,4 +1,3 @@
|
||||
use criterion::measurement::WallTime;
|
||||
use pageserver::keyspace::{KeyPartitioning, KeySpace};
|
||||
use pageserver::repository::Key;
|
||||
use pageserver::tenant::layer_map::LayerMap;
|
||||
@@ -16,11 +15,7 @@ use utils::id::{TenantId, TimelineId};
|
||||
|
||||
use utils::lsn::Lsn;
|
||||
|
||||
use criterion::{black_box, criterion_group, criterion_main, BenchmarkGroup, Criterion};
|
||||
|
||||
fn fixture_path(relative: &str) -> PathBuf {
|
||||
PathBuf::from(env!("CARGO_MANIFEST_DIR")).join(relative)
|
||||
}
|
||||
use criterion::{black_box, criterion_group, criterion_main, Criterion};
|
||||
|
||||
fn build_layer_map(filename_dump: PathBuf) -> LayerMap {
|
||||
let mut layer_map = LayerMap::default();
|
||||
@@ -114,7 +109,7 @@ fn uniform_key_partitioning(layer_map: &LayerMap, _lsn: Lsn) -> KeyPartitioning
|
||||
// between each test run.
|
||||
fn bench_from_captest_env(c: &mut Criterion) {
|
||||
// TODO consider compressing this file
|
||||
let layer_map = build_layer_map(fixture_path("benches/odd-brook-layernames.txt"));
|
||||
let layer_map = build_layer_map(PathBuf::from("benches/odd-brook-layernames.txt"));
|
||||
let queries: Vec<(Key, Lsn)> = uniform_query_pattern(&layer_map);
|
||||
|
||||
// Test with uniform query pattern
|
||||
@@ -144,7 +139,7 @@ fn bench_from_captest_env(c: &mut Criterion) {
|
||||
fn bench_from_real_project(c: &mut Criterion) {
|
||||
// Init layer map
|
||||
let now = Instant::now();
|
||||
let layer_map = build_layer_map(fixture_path("benches/odd-brook-layernames.txt"));
|
||||
let layer_map = build_layer_map(PathBuf::from("benches/odd-brook-layernames.txt"));
|
||||
println!("Finished layer map init in {:?}", now.elapsed());
|
||||
|
||||
// Choose uniformly distributed queries
|
||||
@@ -247,72 +242,7 @@ fn bench_sequential(c: &mut Criterion) {
|
||||
group.finish();
|
||||
}
|
||||
|
||||
fn bench_visibility_with_map(
|
||||
group: &mut BenchmarkGroup<WallTime>,
|
||||
layer_map: LayerMap,
|
||||
read_points: Vec<Lsn>,
|
||||
bench_name: &str,
|
||||
) {
|
||||
group.bench_function(bench_name, |b| {
|
||||
b.iter(|| black_box(layer_map.get_visibility(read_points.clone())));
|
||||
});
|
||||
}
|
||||
|
||||
// Benchmark using synthetic data. Arrange image layers on stacked diagonal lines.
|
||||
fn bench_visibility(c: &mut Criterion) {
|
||||
let mut group = c.benchmark_group("visibility");
|
||||
{
|
||||
// Init layer map. Create 100_000 layers arranged in 1000 diagonal lines.
|
||||
let now = Instant::now();
|
||||
let mut layer_map = LayerMap::default();
|
||||
let mut updates = layer_map.batch_update();
|
||||
for i in 0..100_000 {
|
||||
let i32 = (i as u32) % 100;
|
||||
let zero = Key::from_hex("000000000000000000000000000000000000").unwrap();
|
||||
let layer = PersistentLayerDesc::new_img(
|
||||
TenantShardId::unsharded(TenantId::generate()),
|
||||
TimelineId::generate(),
|
||||
zero.add(10 * i32)..zero.add(10 * i32 + 1),
|
||||
Lsn(i),
|
||||
0,
|
||||
);
|
||||
updates.insert_historic(layer);
|
||||
}
|
||||
updates.flush();
|
||||
println!("Finished layer map init in {:?}", now.elapsed());
|
||||
|
||||
let mut read_points = Vec::new();
|
||||
for i in (0..100_000).step_by(1000) {
|
||||
read_points.push(Lsn(i));
|
||||
}
|
||||
|
||||
bench_visibility_with_map(&mut group, layer_map, read_points, "sequential");
|
||||
}
|
||||
|
||||
{
|
||||
let layer_map = build_layer_map(fixture_path("benches/odd-brook-layernames.txt"));
|
||||
let read_points = vec![Lsn(0x1C760FA190)];
|
||||
bench_visibility_with_map(&mut group, layer_map, read_points, "real_map");
|
||||
|
||||
let layer_map = build_layer_map(fixture_path("benches/odd-brook-layernames.txt"));
|
||||
let read_points = vec![
|
||||
Lsn(0x1C760FA190),
|
||||
Lsn(0x000000931BEAD539),
|
||||
Lsn(0x000000931BF63011),
|
||||
Lsn(0x000000931B33AE68),
|
||||
Lsn(0x00000038E67ABFA0),
|
||||
Lsn(0x000000931B33AE68),
|
||||
Lsn(0x000000914E3F38F0),
|
||||
Lsn(0x000000931B33AE68),
|
||||
];
|
||||
bench_visibility_with_map(&mut group, layer_map, read_points, "real_map_many_branches");
|
||||
}
|
||||
|
||||
group.finish();
|
||||
}
|
||||
|
||||
criterion_group!(group_1, bench_from_captest_env);
|
||||
criterion_group!(group_2, bench_from_real_project);
|
||||
criterion_group!(group_3, bench_sequential);
|
||||
criterion_group!(group_4, bench_visibility);
|
||||
criterion_main!(group_1, group_2, group_3, group_4);
|
||||
criterion_main!(group_1, group_2, group_3);
|
||||
|
||||
@@ -17,9 +17,11 @@ use pageserver::config::PageserverIdentity;
|
||||
use pageserver::control_plane_client::ControlPlaneClient;
|
||||
use pageserver::disk_usage_eviction_task::{self, launch_disk_usage_global_eviction_task};
|
||||
use pageserver::metrics::{STARTUP_DURATION, STARTUP_IS_LOADING};
|
||||
use pageserver::task_mgr::{COMPUTE_REQUEST_RUNTIME, WALRECEIVER_RUNTIME};
|
||||
use pageserver::task_mgr::WALRECEIVER_RUNTIME;
|
||||
use pageserver::tenant::{secondary, TenantSharedResources};
|
||||
use pageserver::{CancellableTask, ConsumptionMetricsTasks, HttpEndpointListener};
|
||||
use pageserver::{
|
||||
CancellableTask, ConsumptionMetricsTasks, HttpEndpointListener, LibpqEndpointListener,
|
||||
};
|
||||
use remote_storage::GenericRemoteStorage;
|
||||
use tokio::signal::unix::SignalKind;
|
||||
use tokio::time::Instant;
|
||||
@@ -29,9 +31,11 @@ use tracing::*;
|
||||
use metrics::set_build_info_metric;
|
||||
use pageserver::{
|
||||
config::PageServerConf,
|
||||
context::{DownloadBehavior, RequestContext},
|
||||
deletion_queue::DeletionQueue,
|
||||
http, page_cache, page_service, task_mgr,
|
||||
task_mgr::{BACKGROUND_RUNTIME, MGMT_REQUEST_RUNTIME},
|
||||
task_mgr::TaskKind,
|
||||
task_mgr::{BACKGROUND_RUNTIME, COMPUTE_REQUEST_RUNTIME, MGMT_REQUEST_RUNTIME},
|
||||
tenant::mgr,
|
||||
virtual_file,
|
||||
};
|
||||
@@ -123,10 +127,8 @@ fn main() -> anyhow::Result<()> {
|
||||
|
||||
// after setting up logging, log the effective IO engine choice and read path implementations
|
||||
info!(?conf.virtual_file_io_engine, "starting with virtual_file IO engine");
|
||||
info!(?conf.virtual_file_direct_io, "starting with virtual_file Direct IO settings");
|
||||
info!(?conf.get_impl, "starting with get page implementation");
|
||||
info!(?conf.get_vectored_impl, "starting with vectored get page implementation");
|
||||
info!(?conf.compact_level0_phase1_value_access, "starting with setting for compact_level0_phase1_value_access");
|
||||
|
||||
let tenants_path = conf.tenants_path();
|
||||
if !tenants_path.exists() {
|
||||
@@ -591,13 +593,30 @@ fn start_pageserver(
|
||||
|
||||
// Spawn a task to listen for libpq connections. It will spawn further tasks
|
||||
// for each connection. We created the listener earlier already.
|
||||
let page_service = page_service::spawn(conf, tenant_manager.clone(), pg_auth, {
|
||||
let _entered = COMPUTE_REQUEST_RUNTIME.enter(); // TcpListener::from_std requires it
|
||||
pageserver_listener
|
||||
.set_nonblocking(true)
|
||||
.context("set listener to nonblocking")?;
|
||||
tokio::net::TcpListener::from_std(pageserver_listener).context("create tokio listener")?
|
||||
});
|
||||
let libpq_listener = {
|
||||
let cancel = CancellationToken::new();
|
||||
let libpq_ctx = RequestContext::todo_child(
|
||||
TaskKind::LibpqEndpointListener,
|
||||
// listener task shouldn't need to download anything. (We will
|
||||
// create a separate sub-contexts for each connection, with their
|
||||
// own download behavior. This context is used only to listen and
|
||||
// accept connections.)
|
||||
DownloadBehavior::Error,
|
||||
);
|
||||
|
||||
let task = COMPUTE_REQUEST_RUNTIME.spawn(task_mgr::exit_on_panic_or_error(
|
||||
"libpq listener",
|
||||
page_service::libpq_listener_main(
|
||||
tenant_manager.clone(),
|
||||
pg_auth,
|
||||
pageserver_listener,
|
||||
conf.pg_auth_type,
|
||||
libpq_ctx,
|
||||
cancel.clone(),
|
||||
),
|
||||
));
|
||||
LibpqEndpointListener(CancellableTask { task, cancel })
|
||||
};
|
||||
|
||||
let mut shutdown_pageserver = Some(shutdown_pageserver.drop_guard());
|
||||
|
||||
@@ -625,7 +644,7 @@ fn start_pageserver(
|
||||
shutdown_pageserver.take();
|
||||
pageserver::shutdown_pageserver(
|
||||
http_endpoint_listener,
|
||||
page_service,
|
||||
libpq_listener,
|
||||
consumption_metrics_tasks,
|
||||
disk_usage_eviction_task,
|
||||
&tenant_manager,
|
||||
|
||||
@@ -29,7 +29,6 @@ use utils::{
|
||||
logging::LogFormat,
|
||||
};
|
||||
|
||||
use crate::tenant::timeline::compaction::CompactL0Phase1ValueAccess;
|
||||
use crate::tenant::vectored_blob_io::MaxVectoredReadBytes;
|
||||
use crate::tenant::{config::TenantConfOpt, timeline::GetImpl};
|
||||
use crate::tenant::{TENANTS_SEGMENT_NAME, TIMELINES_SEGMENT_NAME};
|
||||
@@ -53,7 +52,7 @@ pub mod defaults {
|
||||
use pageserver_api::models::ImageCompressionAlgorithm;
|
||||
pub use storage_broker::DEFAULT_ENDPOINT as BROKER_DEFAULT_ENDPOINT;
|
||||
|
||||
pub const DEFAULT_WAIT_LSN_TIMEOUT: &str = "300 s";
|
||||
pub const DEFAULT_WAIT_LSN_TIMEOUT: &str = "60 s";
|
||||
pub const DEFAULT_WAL_REDO_TIMEOUT: &str = "60 s";
|
||||
|
||||
pub const DEFAULT_SUPERUSER: &str = "cloud_admin";
|
||||
@@ -84,16 +83,16 @@ pub mod defaults {
|
||||
#[cfg(not(target_os = "linux"))]
|
||||
pub const DEFAULT_VIRTUAL_FILE_IO_ENGINE: &str = "std-fs";
|
||||
|
||||
pub const DEFAULT_GET_VECTORED_IMPL: &str = "vectored";
|
||||
pub const DEFAULT_GET_VECTORED_IMPL: &str = "sequential";
|
||||
|
||||
pub const DEFAULT_GET_IMPL: &str = "vectored";
|
||||
pub const DEFAULT_GET_IMPL: &str = "legacy";
|
||||
|
||||
pub const DEFAULT_MAX_VECTORED_READ_BYTES: usize = 128 * 1024; // 128 KiB
|
||||
|
||||
pub const DEFAULT_IMAGE_COMPRESSION: ImageCompressionAlgorithm =
|
||||
ImageCompressionAlgorithm::Disabled;
|
||||
|
||||
pub const DEFAULT_VALIDATE_VECTORED_GET: bool = false;
|
||||
pub const DEFAULT_VALIDATE_VECTORED_GET: bool = true;
|
||||
|
||||
pub const DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB: usize = 0;
|
||||
|
||||
@@ -296,13 +295,6 @@ pub struct PageServerConf {
|
||||
pub ephemeral_bytes_per_memory_kb: usize,
|
||||
|
||||
pub l0_flush: L0FlushConfig,
|
||||
|
||||
/// This flag is temporary and will be removed after gradual rollout.
|
||||
/// See <https://github.com/neondatabase/neon/issues/8184>.
|
||||
pub compact_level0_phase1_value_access: CompactL0Phase1ValueAccess,
|
||||
|
||||
/// Direct IO settings
|
||||
pub virtual_file_direct_io: virtual_file::DirectIoMode,
|
||||
}
|
||||
|
||||
/// We do not want to store this in a PageServerConf because the latter may be logged
|
||||
@@ -364,6 +356,8 @@ struct PageServerConfigBuilder {
|
||||
auth_validation_public_key_path: BuilderValue<Option<Utf8PathBuf>>,
|
||||
remote_storage_config: BuilderValue<Option<RemoteStorageConfig>>,
|
||||
|
||||
id: BuilderValue<NodeId>,
|
||||
|
||||
broker_endpoint: BuilderValue<Uri>,
|
||||
broker_keepalive_interval: BuilderValue<Duration>,
|
||||
|
||||
@@ -409,15 +403,14 @@ struct PageServerConfigBuilder {
|
||||
ephemeral_bytes_per_memory_kb: BuilderValue<usize>,
|
||||
|
||||
l0_flush: BuilderValue<L0FlushConfig>,
|
||||
|
||||
compact_level0_phase1_value_access: BuilderValue<CompactL0Phase1ValueAccess>,
|
||||
|
||||
virtual_file_direct_io: BuilderValue<virtual_file::DirectIoMode>,
|
||||
}
|
||||
|
||||
impl PageServerConfigBuilder {
|
||||
fn new() -> Self {
|
||||
Self::default()
|
||||
fn new(node_id: NodeId) -> Self {
|
||||
let mut this = Self::default();
|
||||
this.id(node_id);
|
||||
|
||||
this
|
||||
}
|
||||
|
||||
#[inline(always)]
|
||||
@@ -445,6 +438,7 @@ impl PageServerConfigBuilder {
|
||||
pg_auth_type: Set(AuthType::Trust),
|
||||
auth_validation_public_key_path: Set(None),
|
||||
remote_storage_config: Set(None),
|
||||
id: NotSet,
|
||||
broker_endpoint: Set(storage_broker::DEFAULT_ENDPOINT
|
||||
.parse()
|
||||
.expect("failed to parse default broker endpoint")),
|
||||
@@ -502,8 +496,6 @@ impl PageServerConfigBuilder {
|
||||
validate_vectored_get: Set(DEFAULT_VALIDATE_VECTORED_GET),
|
||||
ephemeral_bytes_per_memory_kb: Set(DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB),
|
||||
l0_flush: Set(L0FlushConfig::default()),
|
||||
compact_level0_phase1_value_access: Set(CompactL0Phase1ValueAccess::default()),
|
||||
virtual_file_direct_io: Set(virtual_file::DirectIoMode::default()),
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -576,6 +568,10 @@ impl PageServerConfigBuilder {
|
||||
self.broker_keepalive_interval = BuilderValue::Set(broker_keepalive_interval)
|
||||
}
|
||||
|
||||
pub fn id(&mut self, node_id: NodeId) {
|
||||
self.id = BuilderValue::Set(node_id)
|
||||
}
|
||||
|
||||
pub fn log_format(&mut self, log_format: LogFormat) {
|
||||
self.log_format = BuilderValue::Set(log_format)
|
||||
}
|
||||
@@ -687,15 +683,7 @@ impl PageServerConfigBuilder {
|
||||
self.l0_flush = BuilderValue::Set(value);
|
||||
}
|
||||
|
||||
pub fn compact_level0_phase1_value_access(&mut self, value: CompactL0Phase1ValueAccess) {
|
||||
self.compact_level0_phase1_value_access = BuilderValue::Set(value);
|
||||
}
|
||||
|
||||
pub fn virtual_file_direct_io(&mut self, value: virtual_file::DirectIoMode) {
|
||||
self.virtual_file_direct_io = BuilderValue::Set(value);
|
||||
}
|
||||
|
||||
pub fn build(self, id: NodeId) -> anyhow::Result<PageServerConf> {
|
||||
pub fn build(self) -> anyhow::Result<PageServerConf> {
|
||||
let default = Self::default_values();
|
||||
|
||||
macro_rules! conf {
|
||||
@@ -728,6 +716,7 @@ impl PageServerConfigBuilder {
|
||||
pg_auth_type,
|
||||
auth_validation_public_key_path,
|
||||
remote_storage_config,
|
||||
id,
|
||||
broker_endpoint,
|
||||
broker_keepalive_interval,
|
||||
log_format,
|
||||
@@ -752,12 +741,9 @@ impl PageServerConfigBuilder {
|
||||
image_compression,
|
||||
ephemeral_bytes_per_memory_kb,
|
||||
l0_flush,
|
||||
compact_level0_phase1_value_access,
|
||||
virtual_file_direct_io,
|
||||
}
|
||||
CUSTOM LOGIC
|
||||
{
|
||||
id: id,
|
||||
// TenantConf is handled separately
|
||||
default_tenant_conf: TenantConf::default(),
|
||||
concurrent_tenant_warmup: ConfigurableSemaphore::new({
|
||||
@@ -907,7 +893,7 @@ impl PageServerConf {
|
||||
toml: &Document,
|
||||
workdir: &Utf8Path,
|
||||
) -> anyhow::Result<Self> {
|
||||
let mut builder = PageServerConfigBuilder::new();
|
||||
let mut builder = PageServerConfigBuilder::new(node_id);
|
||||
builder.workdir(workdir.to_owned());
|
||||
|
||||
let mut t_conf = TenantConfOpt::default();
|
||||
@@ -938,6 +924,8 @@ impl PageServerConf {
|
||||
"tenant_config" => {
|
||||
t_conf = TenantConfOpt::try_from(item.to_owned()).context(format!("failed to parse: '{key}'"))?;
|
||||
}
|
||||
"id" => {}, // Ignoring `id` field in pageserver.toml - using identity.toml as the source of truth
|
||||
// Logging is not set up yet, so we can't do it.
|
||||
"broker_endpoint" => builder.broker_endpoint(parse_toml_string(key, item)?.parse().context("failed to parse broker endpoint")?),
|
||||
"broker_keepalive_interval" => builder.broker_keepalive_interval(parse_toml_duration(key, item)?),
|
||||
"log_format" => builder.log_format(
|
||||
@@ -1026,17 +1014,11 @@ impl PageServerConf {
|
||||
"l0_flush" => {
|
||||
builder.l0_flush(utils::toml_edit_ext::deserialize_item(item).context("l0_flush")?)
|
||||
}
|
||||
"compact_level0_phase1_value_access" => {
|
||||
builder.compact_level0_phase1_value_access(utils::toml_edit_ext::deserialize_item(item).context("compact_level0_phase1_value_access")?)
|
||||
}
|
||||
"virtual_file_direct_io" => {
|
||||
builder.virtual_file_direct_io(utils::toml_edit_ext::deserialize_item(item).context("virtual_file_direct_io")?)
|
||||
}
|
||||
_ => bail!("unrecognized pageserver option '{key}'"),
|
||||
}
|
||||
}
|
||||
|
||||
let mut conf = builder.build(node_id).context("invalid config")?;
|
||||
let mut conf = builder.build().context("invalid config")?;
|
||||
|
||||
if conf.http_auth_type == AuthType::NeonJWT || conf.pg_auth_type == AuthType::NeonJWT {
|
||||
let auth_validation_public_key_path = conf
|
||||
@@ -1116,8 +1098,6 @@ impl PageServerConf {
|
||||
validate_vectored_get: defaults::DEFAULT_VALIDATE_VECTORED_GET,
|
||||
ephemeral_bytes_per_memory_kb: defaults::DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB,
|
||||
l0_flush: L0FlushConfig::default(),
|
||||
compact_level0_phase1_value_access: CompactL0Phase1ValueAccess::default(),
|
||||
virtual_file_direct_io: virtual_file::DirectIoMode::default(),
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -1275,6 +1255,7 @@ max_file_descriptors = 333
|
||||
|
||||
# initial superuser role name to use when creating a new tenant
|
||||
initial_superuser_name = 'zzzz'
|
||||
id = 10
|
||||
|
||||
metric_collection_interval = '222 s'
|
||||
metric_collection_endpoint = 'http://localhost:80/metrics'
|
||||
@@ -1291,8 +1272,9 @@ background_task_maximum_delay = '334 s'
|
||||
let (workdir, pg_distrib_dir) = prepare_fs(&tempdir)?;
|
||||
let broker_endpoint = storage_broker::DEFAULT_ENDPOINT;
|
||||
// we have to create dummy values to overcome the validation errors
|
||||
let config_string =
|
||||
format!("pg_distrib_dir='{pg_distrib_dir}'\nbroker_endpoint = '{broker_endpoint}'",);
|
||||
let config_string = format!(
|
||||
"pg_distrib_dir='{pg_distrib_dir}'\nid=10\nbroker_endpoint = '{broker_endpoint}'",
|
||||
);
|
||||
let toml = config_string.parse()?;
|
||||
|
||||
let parsed_config = PageServerConf::parse_and_validate(NodeId(10), &toml, &workdir)
|
||||
@@ -1359,8 +1341,6 @@ background_task_maximum_delay = '334 s'
|
||||
image_compression: defaults::DEFAULT_IMAGE_COMPRESSION,
|
||||
ephemeral_bytes_per_memory_kb: defaults::DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB,
|
||||
l0_flush: L0FlushConfig::default(),
|
||||
compact_level0_phase1_value_access: CompactL0Phase1ValueAccess::default(),
|
||||
virtual_file_direct_io: virtual_file::DirectIoMode::default(),
|
||||
},
|
||||
"Correct defaults should be used when no config values are provided"
|
||||
);
|
||||
@@ -1435,8 +1415,6 @@ background_task_maximum_delay = '334 s'
|
||||
image_compression: defaults::DEFAULT_IMAGE_COMPRESSION,
|
||||
ephemeral_bytes_per_memory_kb: defaults::DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB,
|
||||
l0_flush: L0FlushConfig::default(),
|
||||
compact_level0_phase1_value_access: CompactL0Phase1ValueAccess::default(),
|
||||
virtual_file_direct_io: virtual_file::DirectIoMode::default(),
|
||||
},
|
||||
"Should be able to parse all basic config values correctly"
|
||||
);
|
||||
@@ -1601,6 +1579,7 @@ broker_endpoint = '{broker_endpoint}'
|
||||
r#"pg_distrib_dir = "{pg_distrib_dir}"
|
||||
metric_collection_endpoint = "http://sample.url"
|
||||
metric_collection_interval = "10min"
|
||||
id = 222
|
||||
|
||||
[disk_usage_based_eviction]
|
||||
max_usage_pct = 80
|
||||
@@ -1670,6 +1649,7 @@ threshold = "20m"
|
||||
r#"pg_distrib_dir = "{pg_distrib_dir}"
|
||||
metric_collection_endpoint = "http://sample.url"
|
||||
metric_collection_interval = "10min"
|
||||
id = 222
|
||||
|
||||
[tenant_config]
|
||||
evictions_low_residence_duration_metric_threshold = "20m"
|
||||
|
||||
@@ -308,45 +308,6 @@ paths:
|
||||
application/json:
|
||||
schema:
|
||||
type: string
|
||||
|
||||
/v1/tenant/{tenant_shard_id}/timeline/{timeline_id}/block_gc:
|
||||
parameters:
|
||||
- name: tenant_shard_id
|
||||
in: path
|
||||
required: true
|
||||
schema:
|
||||
type: string
|
||||
- name: timeline_id
|
||||
in: path
|
||||
required: true
|
||||
schema:
|
||||
type: string
|
||||
format: hex
|
||||
post:
|
||||
description: Persistently add a gc blocking at the tenant level because of this timeline
|
||||
responses:
|
||||
"200":
|
||||
description: OK
|
||||
|
||||
/v1/tenant/{tenant_shard_id}/timeline/{timeline_id}/unblock_gc:
|
||||
parameters:
|
||||
- name: tenant_shard_id
|
||||
in: path
|
||||
required: true
|
||||
schema:
|
||||
type: string
|
||||
- name: timeline_id
|
||||
in: path
|
||||
required: true
|
||||
schema:
|
||||
type: string
|
||||
format: hex
|
||||
post:
|
||||
description: Persistently remove a tenant level gc blocking for this timeline
|
||||
responses:
|
||||
"200":
|
||||
description: OK
|
||||
|
||||
/v1/tenant/{tenant_shard_id}/location_config:
|
||||
parameters:
|
||||
- name: tenant_shard_id
|
||||
@@ -932,7 +893,7 @@ components:
|
||||
description: Whether to poll remote storage for layers to download. If false, secondary locations don't download anything.
|
||||
ArchivalConfigRequest:
|
||||
type: object
|
||||
required:
|
||||
required
|
||||
- state
|
||||
properties:
|
||||
state:
|
||||
|
||||
@@ -296,11 +296,6 @@ impl From<GetActiveTenantError> for ApiError {
|
||||
GetActiveTenantError::WaitForActiveTimeout { .. } => {
|
||||
ApiError::ResourceUnavailable(format!("{}", e).into())
|
||||
}
|
||||
GetActiveTenantError::SwitchedTenant => {
|
||||
// in our HTTP handlers, this error doesn't happen
|
||||
// TODO: separate error types
|
||||
ApiError::ResourceUnavailable("switched tenant".into())
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -935,7 +930,6 @@ async fn tenant_list_handler(
|
||||
generation: (*gen)
|
||||
.into()
|
||||
.expect("Tenants are always attached with a generation"),
|
||||
gc_blocking: None,
|
||||
})
|
||||
.collect::<Vec<TenantInfo>>();
|
||||
|
||||
@@ -987,7 +981,6 @@ async fn tenant_status(
|
||||
.generation()
|
||||
.into()
|
||||
.expect("Tenants are always attached with a generation"),
|
||||
gc_blocking: tenant.gc_block.summary().map(|x| format!("{x:?}")),
|
||||
},
|
||||
walredo: tenant.wal_redo_manager_status(),
|
||||
timelines: tenant.list_timeline_ids(),
|
||||
@@ -1162,10 +1155,7 @@ async fn layer_map_info_handler(
|
||||
let timeline =
|
||||
active_timeline_of_active_tenant(&state.tenant_manager, tenant_shard_id, timeline_id)
|
||||
.await?;
|
||||
let layer_map_info = timeline
|
||||
.layer_map_info(reset)
|
||||
.await
|
||||
.map_err(|_shutdown| ApiError::ShuttingDown)?;
|
||||
let layer_map_info = timeline.layer_map_info(reset).await;
|
||||
|
||||
json_response(StatusCode::OK, layer_map_info)
|
||||
}
|
||||
@@ -1231,72 +1221,6 @@ async fn evict_timeline_layer_handler(
|
||||
}
|
||||
}
|
||||
|
||||
async fn timeline_gc_blocking_handler(
|
||||
request: Request<Body>,
|
||||
_cancel: CancellationToken,
|
||||
) -> Result<Response<Body>, ApiError> {
|
||||
block_or_unblock_gc(request, true).await
|
||||
}
|
||||
|
||||
async fn timeline_gc_unblocking_handler(
|
||||
request: Request<Body>,
|
||||
_cancel: CancellationToken,
|
||||
) -> Result<Response<Body>, ApiError> {
|
||||
block_or_unblock_gc(request, false).await
|
||||
}
|
||||
|
||||
/// Adding a block is `POST ../block_gc`, removing a block is `POST ../unblock_gc`.
|
||||
///
|
||||
/// Both are technically unsafe because they might fire off index uploads, thus they are POST.
|
||||
async fn block_or_unblock_gc(
|
||||
request: Request<Body>,
|
||||
block: bool,
|
||||
) -> Result<Response<Body>, ApiError> {
|
||||
use crate::tenant::{
|
||||
remote_timeline_client::WaitCompletionError, upload_queue::NotInitialized,
|
||||
};
|
||||
let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?;
|
||||
check_permission(&request, Some(tenant_shard_id.tenant_id))?;
|
||||
let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?;
|
||||
let state = get_state(&request);
|
||||
|
||||
let tenant = state
|
||||
.tenant_manager
|
||||
.get_attached_tenant_shard(tenant_shard_id)?;
|
||||
|
||||
tenant.wait_to_become_active(ACTIVE_TENANT_TIMEOUT).await?;
|
||||
|
||||
let timeline = tenant.get_timeline(timeline_id, true)?;
|
||||
|
||||
let fut = async {
|
||||
if block {
|
||||
timeline.block_gc(&tenant).await.map(|_| ())
|
||||
} else {
|
||||
timeline.unblock_gc(&tenant).await
|
||||
}
|
||||
};
|
||||
|
||||
let span = tracing::info_span!(
|
||||
"block_or_unblock_gc",
|
||||
tenant_id = %tenant_shard_id.tenant_id,
|
||||
shard_id = %tenant_shard_id.shard_slug(),
|
||||
timeline_id = %timeline_id,
|
||||
block = block,
|
||||
);
|
||||
|
||||
let res = fut.instrument(span).await;
|
||||
|
||||
res.map_err(|e| {
|
||||
if e.is::<NotInitialized>() || e.is::<WaitCompletionError>() {
|
||||
ApiError::ShuttingDown
|
||||
} else {
|
||||
ApiError::InternalServerError(e)
|
||||
}
|
||||
})?;
|
||||
|
||||
json_response(StatusCode::OK, ())
|
||||
}
|
||||
|
||||
/// Get tenant_size SVG graph along with the JSON data.
|
||||
fn synthetic_size_html_response(
|
||||
inputs: ModelInputs,
|
||||
@@ -1887,7 +1811,7 @@ async fn timeline_detach_ancestor_handler(
|
||||
// drop(tenant);
|
||||
|
||||
let resp = match progress {
|
||||
detach_ancestor::Progress::Prepared(_guard, prepared) => {
|
||||
detach_ancestor::Progress::Prepared(attempt, prepared) => {
|
||||
// it would be great to tag the guard on to the tenant activation future
|
||||
let reparented_timelines = state
|
||||
.tenant_manager
|
||||
@@ -1895,10 +1819,10 @@ async fn timeline_detach_ancestor_handler(
|
||||
tenant_shard_id,
|
||||
timeline_id,
|
||||
prepared,
|
||||
attempt,
|
||||
ctx,
|
||||
)
|
||||
.await
|
||||
.context("timeline detach ancestor completion")
|
||||
.map_err(ApiError::InternalServerError)?;
|
||||
|
||||
AncestorDetached {
|
||||
@@ -2205,24 +2129,14 @@ async fn secondary_download_handler(
|
||||
|
||||
let timeout = wait.unwrap_or(Duration::MAX);
|
||||
|
||||
let result = tokio::time::timeout(
|
||||
let status = match tokio::time::timeout(
|
||||
timeout,
|
||||
state.secondary_controller.download_tenant(tenant_shard_id),
|
||||
)
|
||||
.await;
|
||||
|
||||
let progress = secondary_tenant.progress.lock().unwrap().clone();
|
||||
|
||||
let status = match result {
|
||||
Ok(Ok(())) => {
|
||||
if progress.layers_downloaded >= progress.layers_total {
|
||||
// Download job ran to completion
|
||||
StatusCode::OK
|
||||
} else {
|
||||
// Download dropped out without errors because it ran out of time budget
|
||||
StatusCode::ACCEPTED
|
||||
}
|
||||
}
|
||||
.await
|
||||
{
|
||||
// Download job ran to completion.
|
||||
Ok(Ok(())) => StatusCode::OK,
|
||||
// Edge case: downloads aren't usually fallible: things like a missing heatmap are considered
|
||||
// okay. We could get an error here in the unlikely edge case that the tenant
|
||||
// was detached between our check above and executing the download job.
|
||||
@@ -2232,6 +2146,8 @@ async fn secondary_download_handler(
|
||||
Err(_) => StatusCode::ACCEPTED,
|
||||
};
|
||||
|
||||
let progress = secondary_tenant.progress.lock().unwrap().clone();
|
||||
|
||||
json_response(status, progress)
|
||||
}
|
||||
|
||||
@@ -2975,14 +2891,6 @@ pub fn make_router(
|
||||
"/v1/tenant/:tenant_shard_id/timeline/:timeline_id/layer/:layer_file_name",
|
||||
|r| api_handler(r, evict_timeline_layer_handler),
|
||||
)
|
||||
.post(
|
||||
"/v1/tenant/:tenant_shard_id/timeline/:timeline_id/block_gc",
|
||||
|r| api_handler(r, timeline_gc_blocking_handler),
|
||||
)
|
||||
.post(
|
||||
"/v1/tenant/:tenant_shard_id/timeline/:timeline_id/unblock_gc",
|
||||
|r| api_handler(r, timeline_gc_unblocking_handler),
|
||||
)
|
||||
.post("/v1/tenant/:tenant_shard_id/heatmap_upload", |r| {
|
||||
api_handler(r, secondary_upload_handler)
|
||||
})
|
||||
|
||||
@@ -2,29 +2,19 @@ use std::{num::NonZeroUsize, sync::Arc};
|
||||
|
||||
use crate::tenant::ephemeral_file;
|
||||
|
||||
#[derive(Debug, PartialEq, Eq, Clone, serde::Deserialize)]
|
||||
#[derive(Default, Debug, PartialEq, Eq, Clone, serde::Deserialize)]
|
||||
#[serde(tag = "mode", rename_all = "kebab-case", deny_unknown_fields)]
|
||||
pub enum L0FlushConfig {
|
||||
#[default]
|
||||
PageCached,
|
||||
#[serde(rename_all = "snake_case")]
|
||||
Direct {
|
||||
max_concurrency: NonZeroUsize,
|
||||
},
|
||||
}
|
||||
|
||||
impl Default for L0FlushConfig {
|
||||
fn default() -> Self {
|
||||
Self::Direct {
|
||||
// TODO: using num_cpus results in different peak memory usage on different instance types.
|
||||
max_concurrency: NonZeroUsize::new(usize::max(1, num_cpus::get())).unwrap(),
|
||||
}
|
||||
}
|
||||
Direct { max_concurrency: NonZeroUsize },
|
||||
}
|
||||
|
||||
#[derive(Clone)]
|
||||
pub struct L0FlushGlobalState(Arc<Inner>);
|
||||
|
||||
pub enum Inner {
|
||||
pub(crate) enum Inner {
|
||||
PageCached,
|
||||
Direct { semaphore: tokio::sync::Semaphore },
|
||||
}
|
||||
@@ -40,7 +30,7 @@ impl L0FlushGlobalState {
|
||||
}
|
||||
}
|
||||
|
||||
pub fn inner(&self) -> &Arc<Inner> {
|
||||
pub(crate) fn inner(&self) -> &Arc<Inner> {
|
||||
&self.0
|
||||
}
|
||||
}
|
||||
|
||||
@@ -12,8 +12,6 @@ pub mod disk_usage_eviction_task;
|
||||
pub mod http;
|
||||
pub mod import_datadir;
|
||||
pub mod l0_flush;
|
||||
|
||||
use futures::{stream::FuturesUnordered, StreamExt};
|
||||
pub use pageserver_api::keyspace;
|
||||
use tokio_util::sync::CancellationToken;
|
||||
pub mod aux_file;
|
||||
@@ -32,13 +30,14 @@ pub mod walingest;
|
||||
pub mod walrecord;
|
||||
pub mod walredo;
|
||||
|
||||
use crate::task_mgr::TaskKind;
|
||||
use camino::Utf8Path;
|
||||
use deletion_queue::DeletionQueue;
|
||||
use tenant::{
|
||||
mgr::{BackgroundPurges, TenantManager},
|
||||
secondary,
|
||||
};
|
||||
use tracing::{info, info_span};
|
||||
use tracing::info;
|
||||
|
||||
/// Current storage format version
|
||||
///
|
||||
@@ -64,6 +63,7 @@ pub struct CancellableTask {
|
||||
pub cancel: CancellationToken,
|
||||
}
|
||||
pub struct HttpEndpointListener(pub CancellableTask);
|
||||
pub struct LibpqEndpointListener(pub CancellableTask);
|
||||
pub struct ConsumptionMetricsTasks(pub CancellableTask);
|
||||
pub struct DiskUsageEvictionTask(pub CancellableTask);
|
||||
impl CancellableTask {
|
||||
@@ -77,7 +77,7 @@ impl CancellableTask {
|
||||
#[allow(clippy::too_many_arguments)]
|
||||
pub async fn shutdown_pageserver(
|
||||
http_listener: HttpEndpointListener,
|
||||
page_service: page_service::Listener,
|
||||
libpq_listener: LibpqEndpointListener,
|
||||
consumption_metrics_worker: ConsumptionMetricsTasks,
|
||||
disk_usage_eviction_task: Option<DiskUsageEvictionTask>,
|
||||
tenant_manager: &TenantManager,
|
||||
@@ -87,83 +87,10 @@ pub async fn shutdown_pageserver(
|
||||
exit_code: i32,
|
||||
) {
|
||||
use std::time::Duration;
|
||||
|
||||
// If the orderly shutdown below takes too long, we still want to make
|
||||
// sure that all walredo processes are killed and wait()ed on by us, not systemd.
|
||||
//
|
||||
// (Leftover walredo processes are the hypothesized trigger for the systemd freezes
|
||||
// that we keep seeing in prod => https://github.com/neondatabase/cloud/issues/11387.
|
||||
//
|
||||
// We use a thread instead of a tokio task because the background runtime is likely busy
|
||||
// with the final flushing / uploads. This activity here has priority, and due to lack
|
||||
// of scheduling priority feature sin the tokio scheduler, using a separate thread is
|
||||
// an effective priority booster.
|
||||
let walredo_extraordinary_shutdown_thread_span = {
|
||||
let span = info_span!(parent: None, "walredo_extraordinary_shutdown_thread");
|
||||
span.follows_from(tracing::Span::current());
|
||||
span
|
||||
};
|
||||
let walredo_extraordinary_shutdown_thread_cancel = CancellationToken::new();
|
||||
let walredo_extraordinary_shutdown_thread = std::thread::spawn({
|
||||
let walredo_extraordinary_shutdown_thread_cancel =
|
||||
walredo_extraordinary_shutdown_thread_cancel.clone();
|
||||
move || {
|
||||
let rt = tokio::runtime::Builder::new_current_thread()
|
||||
.enable_all()
|
||||
.build()
|
||||
.unwrap();
|
||||
let _entered = rt.enter();
|
||||
let _entered = walredo_extraordinary_shutdown_thread_span.enter();
|
||||
if let Ok(()) = rt.block_on(tokio::time::timeout(
|
||||
Duration::from_secs(8),
|
||||
walredo_extraordinary_shutdown_thread_cancel.cancelled(),
|
||||
)) {
|
||||
info!("cancellation requested");
|
||||
return;
|
||||
}
|
||||
let managers = tenant::WALREDO_MANAGERS
|
||||
.lock()
|
||||
.unwrap()
|
||||
// prevents new walredo managers from being inserted
|
||||
.take()
|
||||
.expect("only we take()");
|
||||
// Use FuturesUnordered to get in queue early for each manager's
|
||||
// heavier_once_cell semaphore wait list.
|
||||
// Also, for idle tenants that for some reason haven't
|
||||
// shut down yet, it's quite likely that we're not going
|
||||
// to get Poll::Pending once.
|
||||
let mut futs: FuturesUnordered<_> = managers
|
||||
.into_iter()
|
||||
.filter_map(|(_, mgr)| mgr.upgrade())
|
||||
.map(|mgr| async move { tokio::task::unconstrained(mgr.shutdown()).await })
|
||||
.collect();
|
||||
info!(count=%futs.len(), "built FuturesUnordered");
|
||||
let mut last_log_at = std::time::Instant::now();
|
||||
#[derive(Debug, Default)]
|
||||
struct Results {
|
||||
initiated: u64,
|
||||
already: u64,
|
||||
}
|
||||
let mut results = Results::default();
|
||||
while let Some(we_initiated) = rt.block_on(futs.next()) {
|
||||
if we_initiated {
|
||||
results.initiated += 1;
|
||||
} else {
|
||||
results.already += 1;
|
||||
}
|
||||
if last_log_at.elapsed() > Duration::from_millis(100) {
|
||||
info!(remaining=%futs.len(), ?results, "progress");
|
||||
last_log_at = std::time::Instant::now();
|
||||
}
|
||||
}
|
||||
info!(?results, "done");
|
||||
}
|
||||
});
|
||||
|
||||
// Shut down the libpq endpoint task. This prevents new connections from
|
||||
// being accepted.
|
||||
let remaining_connections = timed(
|
||||
page_service.stop_accepting(),
|
||||
timed(
|
||||
libpq_listener.0.shutdown(),
|
||||
"shutdown LibpqEndpointListener",
|
||||
Duration::from_secs(1),
|
||||
)
|
||||
@@ -181,7 +108,7 @@ pub async fn shutdown_pageserver(
|
||||
// Shut down any page service tasks: any in-progress work for particular timelines or tenants
|
||||
// should already have been canclled via mgr::shutdown_all_tenants
|
||||
timed(
|
||||
remaining_connections.shutdown(),
|
||||
task_mgr::shutdown_tasks(Some(TaskKind::PageRequestHandler), None, None),
|
||||
"shutdown PageRequestHandlers",
|
||||
Duration::from_secs(1),
|
||||
)
|
||||
@@ -235,12 +162,6 @@ pub async fn shutdown_pageserver(
|
||||
Duration::from_secs(1),
|
||||
)
|
||||
.await;
|
||||
|
||||
info!("cancel & join walredo_extraordinary_shutdown_thread");
|
||||
walredo_extraordinary_shutdown_thread_cancel.cancel();
|
||||
walredo_extraordinary_shutdown_thread.join().unwrap();
|
||||
info!("walredo_extraordinary_shutdown_thread done");
|
||||
|
||||
info!("Shut down successfully completed");
|
||||
std::process::exit(exit_code);
|
||||
}
|
||||
|
||||
@@ -525,15 +525,6 @@ static RESIDENT_PHYSICAL_SIZE: Lazy<UIntGaugeVec> = Lazy::new(|| {
|
||||
.expect("failed to define a metric")
|
||||
});
|
||||
|
||||
static VISIBLE_PHYSICAL_SIZE: Lazy<UIntGaugeVec> = Lazy::new(|| {
|
||||
register_uint_gauge_vec!(
|
||||
"pageserver_visible_physical_size",
|
||||
"The size of the layer files present in the pageserver's filesystem.",
|
||||
&["tenant_id", "shard_id", "timeline_id"]
|
||||
)
|
||||
.expect("failed to define a metric")
|
||||
});
|
||||
|
||||
pub(crate) static RESIDENT_PHYSICAL_SIZE_GLOBAL: Lazy<UIntGauge> = Lazy::new(|| {
|
||||
register_uint_gauge!(
|
||||
"pageserver_resident_physical_size_global",
|
||||
@@ -622,23 +613,7 @@ pub(crate) static CIRCUIT_BREAKERS_UNBROKEN: Lazy<IntCounter> = Lazy::new(|| {
|
||||
pub(crate) static COMPRESSION_IMAGE_INPUT_BYTES: Lazy<IntCounter> = Lazy::new(|| {
|
||||
register_int_counter!(
|
||||
"pageserver_compression_image_in_bytes_total",
|
||||
"Size of data written into image layers before compression"
|
||||
)
|
||||
.expect("failed to define a metric")
|
||||
});
|
||||
|
||||
pub(crate) static COMPRESSION_IMAGE_INPUT_BYTES_CONSIDERED: Lazy<IntCounter> = Lazy::new(|| {
|
||||
register_int_counter!(
|
||||
"pageserver_compression_image_in_bytes_considered",
|
||||
"Size of potentially compressible data written into image layers before compression"
|
||||
)
|
||||
.expect("failed to define a metric")
|
||||
});
|
||||
|
||||
pub(crate) static COMPRESSION_IMAGE_INPUT_BYTES_CHOSEN: Lazy<IntCounter> = Lazy::new(|| {
|
||||
register_int_counter!(
|
||||
"pageserver_compression_image_in_bytes_chosen",
|
||||
"Size of data whose compressed form was written into image layers"
|
||||
"Size of uncompressed data written into image layers"
|
||||
)
|
||||
.expect("failed to define a metric")
|
||||
});
|
||||
@@ -2213,7 +2188,6 @@ pub(crate) struct TimelineMetrics {
|
||||
pub(crate) layer_count_delta: UIntGauge,
|
||||
pub standby_horizon_gauge: IntGauge,
|
||||
pub resident_physical_size_gauge: UIntGauge,
|
||||
pub visible_physical_size_gauge: UIntGauge,
|
||||
/// copy of LayeredTimeline.current_logical_size
|
||||
pub current_logical_size_gauge: UIntGauge,
|
||||
pub aux_file_size_gauge: IntGauge,
|
||||
@@ -2336,9 +2310,6 @@ impl TimelineMetrics {
|
||||
let resident_physical_size_gauge = RESIDENT_PHYSICAL_SIZE
|
||||
.get_metric_with_label_values(&[&tenant_id, &shard_id, &timeline_id])
|
||||
.unwrap();
|
||||
let visible_physical_size_gauge = VISIBLE_PHYSICAL_SIZE
|
||||
.get_metric_with_label_values(&[&tenant_id, &shard_id, &timeline_id])
|
||||
.unwrap();
|
||||
// TODO: we shouldn't expose this metric
|
||||
let current_logical_size_gauge = CURRENT_LOGICAL_SIZE
|
||||
.get_metric_with_label_values(&[&tenant_id, &shard_id, &timeline_id])
|
||||
@@ -2393,7 +2364,6 @@ impl TimelineMetrics {
|
||||
layer_count_delta,
|
||||
standby_horizon_gauge,
|
||||
resident_physical_size_gauge,
|
||||
visible_physical_size_gauge,
|
||||
current_logical_size_gauge,
|
||||
aux_file_size_gauge,
|
||||
directory_entries_count_gauge,
|
||||
@@ -2445,7 +2415,6 @@ impl TimelineMetrics {
|
||||
RESIDENT_PHYSICAL_SIZE_GLOBAL.sub(self.resident_physical_size_get());
|
||||
let _ = RESIDENT_PHYSICAL_SIZE.remove_label_values(&[tenant_id, shard_id, timeline_id]);
|
||||
}
|
||||
let _ = VISIBLE_PHYSICAL_SIZE.remove_label_values(&[tenant_id, shard_id, timeline_id]);
|
||||
let _ = CURRENT_LOGICAL_SIZE.remove_label_values(&[tenant_id, shard_id, timeline_id]);
|
||||
if let Some(metric) = Lazy::get(&DIRECTORY_ENTRIES_COUNT) {
|
||||
let _ = metric.remove_label_values(&[tenant_id, shard_id, timeline_id]);
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
@@ -8,7 +8,8 @@ use std::time::Duration;
|
||||
pub use pageserver_api::key::{Key, KEY_SIZE};
|
||||
|
||||
/// A 'value' stored for a one Key.
|
||||
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
|
||||
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||
#[cfg_attr(test, derive(PartialEq))]
|
||||
pub enum Value {
|
||||
/// An Image value contains a full copy of the value
|
||||
Image(Bytes),
|
||||
|
||||
@@ -56,6 +56,7 @@ impl Statvfs {
|
||||
}
|
||||
|
||||
pub mod mock {
|
||||
use anyhow::Context;
|
||||
use camino::Utf8Path;
|
||||
use regex::Regex;
|
||||
use tracing::log::info;
|
||||
@@ -134,30 +135,14 @@ pub mod mock {
|
||||
{
|
||||
continue;
|
||||
}
|
||||
let m = match entry.metadata() {
|
||||
Ok(m) => m,
|
||||
Err(e) if is_not_found(&e) => {
|
||||
// some temp file which got removed right as we are walking
|
||||
continue;
|
||||
}
|
||||
Err(e) => {
|
||||
return Err(anyhow::Error::new(e)
|
||||
.context(format!("get metadata of {:?}", entry.path())))
|
||||
}
|
||||
};
|
||||
total += m.len();
|
||||
total += entry
|
||||
.metadata()
|
||||
.with_context(|| format!("get metadata of {:?}", entry.path()))?
|
||||
.len();
|
||||
}
|
||||
Ok(total)
|
||||
}
|
||||
|
||||
fn is_not_found(e: &walkdir::Error) -> bool {
|
||||
let Some(io_error) = e.io_error() else {
|
||||
return false;
|
||||
};
|
||||
let kind = io_error.kind();
|
||||
matches!(kind, std::io::ErrorKind::NotFound)
|
||||
}
|
||||
|
||||
pub struct Statvfs {
|
||||
pub blocks: u64,
|
||||
pub blocks_available: u64,
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
@@ -28,12 +28,6 @@ use crate::virtual_file::VirtualFile;
|
||||
use std::cmp::min;
|
||||
use std::io::{Error, ErrorKind};
|
||||
|
||||
#[derive(Copy, Clone, Debug)]
|
||||
pub struct CompressionInfo {
|
||||
pub written_compressed: bool,
|
||||
pub compressed_size: Option<usize>,
|
||||
}
|
||||
|
||||
impl<'a> BlockCursor<'a> {
|
||||
/// Read a blob into a new buffer.
|
||||
pub async fn read_blob(
|
||||
@@ -279,10 +273,8 @@ impl<const BUFFERED: bool> BlobWriter<BUFFERED> {
|
||||
srcbuf: B,
|
||||
ctx: &RequestContext,
|
||||
) -> (B::Buf, Result<u64, Error>) {
|
||||
let (buf, res) = self
|
||||
.write_blob_maybe_compressed(srcbuf, ctx, ImageCompressionAlgorithm::Disabled)
|
||||
.await;
|
||||
(buf, res.map(|(off, _compression_info)| off))
|
||||
self.write_blob_maybe_compressed(srcbuf, ctx, ImageCompressionAlgorithm::Disabled)
|
||||
.await
|
||||
}
|
||||
|
||||
/// Write a blob of data. Returns the offset that it was written to,
|
||||
@@ -292,12 +284,8 @@ impl<const BUFFERED: bool> BlobWriter<BUFFERED> {
|
||||
srcbuf: B,
|
||||
ctx: &RequestContext,
|
||||
algorithm: ImageCompressionAlgorithm,
|
||||
) -> (B::Buf, Result<(u64, CompressionInfo), Error>) {
|
||||
) -> (B::Buf, Result<u64, Error>) {
|
||||
let offset = self.offset;
|
||||
let mut compression_info = CompressionInfo {
|
||||
written_compressed: false,
|
||||
compressed_size: None,
|
||||
};
|
||||
|
||||
let len = srcbuf.bytes_init();
|
||||
|
||||
@@ -340,9 +328,7 @@ impl<const BUFFERED: bool> BlobWriter<BUFFERED> {
|
||||
encoder.write_all(&slice[..]).await.unwrap();
|
||||
encoder.shutdown().await.unwrap();
|
||||
let compressed = encoder.into_inner();
|
||||
compression_info.compressed_size = Some(compressed.len());
|
||||
if compressed.len() < len {
|
||||
compression_info.written_compressed = true;
|
||||
let compressed_len = compressed.len();
|
||||
compressed_buf = Some(compressed);
|
||||
(BYTE_ZSTD, compressed_len, slice.into_inner())
|
||||
@@ -373,7 +359,7 @@ impl<const BUFFERED: bool> BlobWriter<BUFFERED> {
|
||||
} else {
|
||||
self.write_all(srcbuf, ctx).await
|
||||
};
|
||||
(srcbuf, res.map(|_| (offset, compression_info)))
|
||||
(srcbuf, res.map(|_| offset))
|
||||
}
|
||||
}
|
||||
|
||||
@@ -430,14 +416,12 @@ pub(crate) mod tests {
|
||||
let mut wtr = BlobWriter::<BUFFERED>::new(file, 0);
|
||||
for blob in blobs.iter() {
|
||||
let (_, res) = if compression {
|
||||
let res = wtr
|
||||
.write_blob_maybe_compressed(
|
||||
blob.clone(),
|
||||
ctx,
|
||||
ImageCompressionAlgorithm::Zstd { level: Some(1) },
|
||||
)
|
||||
.await;
|
||||
(res.0, res.1.map(|(off, _)| off))
|
||||
wtr.write_blob_maybe_compressed(
|
||||
blob.clone(),
|
||||
ctx,
|
||||
ImageCompressionAlgorithm::Zstd { level: Some(1) },
|
||||
)
|
||||
.await
|
||||
} else {
|
||||
wtr.write_blob(blob.clone(), ctx).await
|
||||
};
|
||||
|
||||
@@ -296,19 +296,13 @@ where
|
||||
let mut stack = Vec::new();
|
||||
stack.push((self.root_blk, None));
|
||||
let block_cursor = self.reader.block_cursor();
|
||||
let mut node_buf = [0_u8; PAGE_SZ];
|
||||
while let Some((node_blknum, opt_iter)) = stack.pop() {
|
||||
// Read the node, through the PS PageCache, into local variable `node_buf`.
|
||||
// We could keep the page cache read guard alive, but, at the time of writing,
|
||||
// we run quite small PS PageCache s => can't risk running out of
|
||||
// PageCache space because this stream isn't consumed fast enough.
|
||||
let page_read_guard = block_cursor
|
||||
// Locate the node.
|
||||
let node_buf = block_cursor
|
||||
.read_blk(self.start_blk + node_blknum, ctx)
|
||||
.await?;
|
||||
node_buf.copy_from_slice(page_read_guard.as_ref());
|
||||
drop(page_read_guard); // drop page cache read guard early
|
||||
|
||||
let node = OnDiskNode::deparse(&node_buf)?;
|
||||
let node = OnDiskNode::deparse(node_buf.as_ref())?;
|
||||
let prefix_len = node.prefix_len as usize;
|
||||
let suffix_len = node.suffix_len as usize;
|
||||
|
||||
@@ -351,7 +345,6 @@ where
|
||||
Either::Left(idx..node.num_children.into())
|
||||
};
|
||||
|
||||
|
||||
// idx points to the first match now. Keep going from there
|
||||
while let Some(idx) = iter.next() {
|
||||
let key_off = idx * suffix_len;
|
||||
|
||||
@@ -29,7 +29,6 @@ impl EphemeralFile {
|
||||
conf: &PageServerConf,
|
||||
tenant_shard_id: TenantShardId,
|
||||
timeline_id: TimelineId,
|
||||
gate_guard: utils::sync::gate::GateGuard,
|
||||
ctx: &RequestContext,
|
||||
) -> Result<EphemeralFile, io::Error> {
|
||||
static NEXT_FILENAME: AtomicU64 = AtomicU64::new(1);
|
||||
@@ -52,12 +51,10 @@ impl EphemeralFile {
|
||||
)
|
||||
.await?;
|
||||
|
||||
let prewarm = conf.l0_flush.prewarm_on_write();
|
||||
|
||||
Ok(EphemeralFile {
|
||||
_tenant_shard_id: tenant_shard_id,
|
||||
_timeline_id: timeline_id,
|
||||
rw: page_caching::RW::new(file, prewarm, gate_guard),
|
||||
rw: page_caching::RW::new(file, conf.l0_flush.prewarm_on_write()),
|
||||
})
|
||||
}
|
||||
|
||||
@@ -164,11 +161,7 @@ mod tests {
|
||||
async fn test_ephemeral_blobs() -> Result<(), io::Error> {
|
||||
let (conf, tenant_id, timeline_id, ctx) = harness("ephemeral_blobs")?;
|
||||
|
||||
let gate = utils::sync::gate::Gate::default();
|
||||
|
||||
let entered = gate.enter().unwrap();
|
||||
|
||||
let mut file = EphemeralFile::create(conf, tenant_id, timeline_id, entered, &ctx).await?;
|
||||
let mut file = EphemeralFile::create(conf, tenant_id, timeline_id, &ctx).await?;
|
||||
|
||||
let pos_foo = file.write_blob(b"foo", &ctx).await?;
|
||||
assert_eq!(
|
||||
@@ -222,38 +215,4 @@ mod tests {
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn ephemeral_file_holds_gate_open() {
|
||||
const FOREVER: std::time::Duration = std::time::Duration::from_secs(5);
|
||||
|
||||
let (conf, tenant_id, timeline_id, ctx) =
|
||||
harness("ephemeral_file_holds_gate_open").unwrap();
|
||||
|
||||
let gate = utils::sync::gate::Gate::default();
|
||||
|
||||
let file = EphemeralFile::create(conf, tenant_id, timeline_id, gate.enter().unwrap(), &ctx)
|
||||
.await
|
||||
.unwrap();
|
||||
|
||||
let mut closing = tokio::task::spawn(async move {
|
||||
gate.close().await;
|
||||
});
|
||||
|
||||
// gate is entered until the ephemeral file is dropped
|
||||
// do not start paused tokio-epoll-uring has a sleep loop
|
||||
tokio::time::pause();
|
||||
tokio::time::timeout(FOREVER, &mut closing)
|
||||
.await
|
||||
.expect_err("closing cannot complete before dropping");
|
||||
|
||||
// this is a requirement of the reset_tenant functionality: we have to be able to restart a
|
||||
// tenant fast, and for that, we need all tenant_dir operations be guarded by entering a gate
|
||||
drop(file);
|
||||
|
||||
tokio::time::timeout(FOREVER, &mut closing)
|
||||
.await
|
||||
.expect("closing completes right away")
|
||||
.expect("closing does not panic");
|
||||
}
|
||||
}
|
||||
|
||||
@@ -18,8 +18,6 @@ use super::zero_padded_read_write;
|
||||
pub struct RW {
|
||||
page_cache_file_id: page_cache::FileId,
|
||||
rw: super::zero_padded_read_write::RW<PreWarmingWriter>,
|
||||
/// Gate guard is held on as long as we need to do operations in the path (delete on drop).
|
||||
_gate_guard: utils::sync::gate::GateGuard,
|
||||
}
|
||||
|
||||
/// When we flush a block to the underlying [`crate::virtual_file::VirtualFile`],
|
||||
@@ -31,11 +29,7 @@ pub enum PrewarmOnWrite {
|
||||
}
|
||||
|
||||
impl RW {
|
||||
pub fn new(
|
||||
file: VirtualFile,
|
||||
prewarm_on_write: PrewarmOnWrite,
|
||||
_gate_guard: utils::sync::gate::GateGuard,
|
||||
) -> Self {
|
||||
pub fn new(file: VirtualFile, prewarm_on_write: PrewarmOnWrite) -> Self {
|
||||
let page_cache_file_id = page_cache::next_file_id();
|
||||
Self {
|
||||
page_cache_file_id,
|
||||
@@ -44,7 +38,6 @@ impl RW {
|
||||
file,
|
||||
prewarm_on_write,
|
||||
)),
|
||||
_gate_guard,
|
||||
}
|
||||
}
|
||||
|
||||
@@ -152,7 +145,6 @@ impl Drop for RW {
|
||||
// We leave them there, [`crate::page_cache::PageCache::find_victim`] will evict them when needed.
|
||||
|
||||
// unlink the file
|
||||
// we are clear to do this, because we have entered a gate
|
||||
let res = std::fs::remove_file(&self.rw.as_writer().file.path);
|
||||
if let Err(e) = res {
|
||||
if e.kind() != std::io::ErrorKind::NotFound {
|
||||
|
||||
@@ -1,213 +0,0 @@
|
||||
use std::collections::HashMap;
|
||||
|
||||
use utils::id::TimelineId;
|
||||
|
||||
use super::remote_timeline_client::index::GcBlockingReason;
|
||||
|
||||
type Storage = HashMap<TimelineId, enumset::EnumSet<GcBlockingReason>>;
|
||||
|
||||
#[derive(Default)]
|
||||
pub(crate) struct GcBlock {
|
||||
/// The timelines which have current reasons to block gc.
|
||||
///
|
||||
/// LOCK ORDER: this is held locked while scheduling the next index_part update. This is done
|
||||
/// to keep the this field up to date with RemoteTimelineClient `upload_queue.dirty`.
|
||||
reasons: std::sync::Mutex<Storage>,
|
||||
blocking: tokio::sync::Mutex<()>,
|
||||
}
|
||||
|
||||
impl GcBlock {
|
||||
/// Start another gc iteration.
|
||||
///
|
||||
/// Returns a guard to be held for the duration of gc iteration to allow synchronizing with
|
||||
/// it's ending, or if not currently possible, a value describing the reasons why not.
|
||||
///
|
||||
/// Cancellation safe.
|
||||
pub(super) async fn start(&self) -> Result<Guard<'_>, BlockingReasons> {
|
||||
let reasons = {
|
||||
let g = self.reasons.lock().unwrap();
|
||||
|
||||
// TODO: the assumption is that this method gets called periodically. in prod, we use 1h, in
|
||||
// tests, we use everything. we should warn if the gc has been consecutively blocked
|
||||
// for more than 1h (within single tenant session?).
|
||||
BlockingReasons::clean_and_summarize(g)
|
||||
};
|
||||
|
||||
if let Some(reasons) = reasons {
|
||||
Err(reasons)
|
||||
} else {
|
||||
Ok(Guard {
|
||||
_inner: self.blocking.lock().await,
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
pub(crate) fn summary(&self) -> Option<BlockingReasons> {
|
||||
let g = self.reasons.lock().unwrap();
|
||||
|
||||
BlockingReasons::summarize(&g)
|
||||
}
|
||||
|
||||
/// Start blocking gc for this one timeline for the given reason.
|
||||
///
|
||||
/// This is not a guard based API but instead it mimics set API. The returned future will not
|
||||
/// resolve until an existing gc round has completed.
|
||||
///
|
||||
/// Returns true if this block was new, false if gc was already blocked for this reason.
|
||||
///
|
||||
/// Cancellation safe: cancelling after first poll will keep the reason to block gc, but will
|
||||
/// keep the gc blocking reason.
|
||||
pub(crate) async fn insert(
|
||||
&self,
|
||||
timeline: &super::Timeline,
|
||||
reason: GcBlockingReason,
|
||||
) -> anyhow::Result<bool> {
|
||||
let (added, uploaded) = {
|
||||
let mut g = self.reasons.lock().unwrap();
|
||||
let set = g.entry(timeline.timeline_id).or_default();
|
||||
let added = set.insert(reason);
|
||||
|
||||
// LOCK ORDER: intentionally hold the lock, see self.reasons.
|
||||
let uploaded = timeline
|
||||
.remote_client
|
||||
.schedule_insert_gc_block_reason(reason)?;
|
||||
|
||||
(added, uploaded)
|
||||
};
|
||||
|
||||
uploaded.await?;
|
||||
|
||||
// ensure that any ongoing gc iteration has completed
|
||||
drop(self.blocking.lock().await);
|
||||
|
||||
Ok(added)
|
||||
}
|
||||
|
||||
/// Remove blocking gc for this one timeline and the given reason.
|
||||
pub(crate) async fn remove(
|
||||
&self,
|
||||
timeline: &super::Timeline,
|
||||
reason: GcBlockingReason,
|
||||
) -> anyhow::Result<()> {
|
||||
use std::collections::hash_map::Entry;
|
||||
|
||||
super::span::debug_assert_current_span_has_tenant_and_timeline_id();
|
||||
|
||||
let (remaining_blocks, uploaded) = {
|
||||
let mut g = self.reasons.lock().unwrap();
|
||||
match g.entry(timeline.timeline_id) {
|
||||
Entry::Occupied(mut oe) => {
|
||||
let set = oe.get_mut();
|
||||
set.remove(reason);
|
||||
if set.is_empty() {
|
||||
oe.remove();
|
||||
}
|
||||
}
|
||||
Entry::Vacant(_) => {
|
||||
// we must still do the index_part.json update regardless, in case we had earlier
|
||||
// been cancelled
|
||||
}
|
||||
}
|
||||
|
||||
let remaining_blocks = g.len();
|
||||
|
||||
// LOCK ORDER: intentionally hold the lock while scheduling; see self.reasons
|
||||
let uploaded = timeline
|
||||
.remote_client
|
||||
.schedule_remove_gc_block_reason(reason)?;
|
||||
|
||||
(remaining_blocks, uploaded)
|
||||
};
|
||||
uploaded.await?;
|
||||
|
||||
// no need to synchronize with gc iteration again
|
||||
|
||||
if remaining_blocks > 0 {
|
||||
tracing::info!(remaining_blocks, removed=?reason, "gc blocking removed, but gc remains blocked");
|
||||
} else {
|
||||
tracing::info!("gc is now unblocked for the tenant");
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub(crate) fn before_delete(&self, timeline: &super::Timeline) {
|
||||
let unblocked = {
|
||||
let mut g = self.reasons.lock().unwrap();
|
||||
if g.is_empty() {
|
||||
return;
|
||||
}
|
||||
|
||||
g.remove(&timeline.timeline_id);
|
||||
|
||||
BlockingReasons::clean_and_summarize(g).is_none()
|
||||
};
|
||||
|
||||
if unblocked {
|
||||
tracing::info!("gc is now unblocked following deletion");
|
||||
}
|
||||
}
|
||||
|
||||
/// Initialize with the non-deleted timelines of this tenant.
|
||||
pub(crate) fn set_scanned(&self, scanned: Storage) {
|
||||
let mut g = self.reasons.lock().unwrap();
|
||||
assert!(g.is_empty());
|
||||
g.extend(scanned.into_iter().filter(|(_, v)| !v.is_empty()));
|
||||
|
||||
if let Some(reasons) = BlockingReasons::clean_and_summarize(g) {
|
||||
tracing::info!(summary=?reasons, "initialized with gc blocked");
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
pub(super) struct Guard<'a> {
|
||||
_inner: tokio::sync::MutexGuard<'a, ()>,
|
||||
}
|
||||
|
||||
#[derive(Debug)]
|
||||
pub(crate) struct BlockingReasons {
|
||||
timelines: usize,
|
||||
reasons: enumset::EnumSet<GcBlockingReason>,
|
||||
}
|
||||
|
||||
impl std::fmt::Display for BlockingReasons {
|
||||
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||
write!(
|
||||
f,
|
||||
"{} timelines block for {:?}",
|
||||
self.timelines, self.reasons
|
||||
)
|
||||
}
|
||||
}
|
||||
|
||||
impl BlockingReasons {
|
||||
fn clean_and_summarize(mut g: std::sync::MutexGuard<'_, Storage>) -> Option<Self> {
|
||||
let mut reasons = enumset::EnumSet::empty();
|
||||
g.retain(|_key, value| {
|
||||
reasons = reasons.union(*value);
|
||||
!value.is_empty()
|
||||
});
|
||||
if !g.is_empty() {
|
||||
Some(BlockingReasons {
|
||||
timelines: g.len(),
|
||||
reasons,
|
||||
})
|
||||
} else {
|
||||
None
|
||||
}
|
||||
}
|
||||
|
||||
fn summarize(g: &std::sync::MutexGuard<'_, Storage>) -> Option<Self> {
|
||||
if g.is_empty() {
|
||||
None
|
||||
} else {
|
||||
let reasons = g
|
||||
.values()
|
||||
.fold(enumset::EnumSet::empty(), |acc, next| acc.union(*next));
|
||||
Some(BlockingReasons {
|
||||
timelines: g.len(),
|
||||
reasons,
|
||||
})
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -51,8 +51,7 @@ use crate::keyspace::KeyPartitioning;
|
||||
use crate::repository::Key;
|
||||
use crate::tenant::storage_layer::InMemoryLayer;
|
||||
use anyhow::Result;
|
||||
use pageserver_api::keyspace::{KeySpace, KeySpaceAccum};
|
||||
use range_set_blaze::{CheckSortedDisjoint, RangeSetBlaze};
|
||||
use pageserver_api::keyspace::KeySpaceAccum;
|
||||
use std::collections::{HashMap, VecDeque};
|
||||
use std::iter::Peekable;
|
||||
use std::ops::Range;
|
||||
@@ -62,7 +61,7 @@ use utils::lsn::Lsn;
|
||||
use historic_layer_coverage::BufferedHistoricLayerCoverage;
|
||||
pub use historic_layer_coverage::LayerKey;
|
||||
|
||||
use super::storage_layer::{LayerVisibilityHint, PersistentLayerDesc};
|
||||
use super::storage_layer::PersistentLayerDesc;
|
||||
|
||||
///
|
||||
/// LayerMap tracks what layers exist on a timeline.
|
||||
@@ -846,8 +845,8 @@ impl LayerMap {
|
||||
}
|
||||
|
||||
/// Return all L0 delta layers
|
||||
pub fn level0_deltas(&self) -> &Vec<Arc<PersistentLayerDesc>> {
|
||||
&self.l0_delta_layers
|
||||
pub fn get_level0_deltas(&self) -> Vec<Arc<PersistentLayerDesc>> {
|
||||
self.l0_delta_layers.to_vec()
|
||||
}
|
||||
|
||||
/// debugging function to print out the contents of the layer map
|
||||
@@ -872,183 +871,11 @@ impl LayerMap {
|
||||
println!("End dump LayerMap");
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// `read_points` represent the tip of a timeline and any branch points, i.e. the places
|
||||
/// where we expect to serve reads.
|
||||
///
|
||||
/// This function is O(N) and should be called infrequently. The caller is responsible for
|
||||
/// looking up and updating the Layer objects for these layer descriptors.
|
||||
pub fn get_visibility(
|
||||
&self,
|
||||
mut read_points: Vec<Lsn>,
|
||||
) -> (
|
||||
Vec<(Arc<PersistentLayerDesc>, LayerVisibilityHint)>,
|
||||
KeySpace,
|
||||
) {
|
||||
// This is like a KeySpace, but this type is intended for efficient unions with image layer ranges, whereas
|
||||
// KeySpace is intended to be composed statically and iterated over.
|
||||
struct KeyShadow {
|
||||
// Map of range start to range end
|
||||
inner: RangeSetBlaze<i128>,
|
||||
}
|
||||
|
||||
impl KeyShadow {
|
||||
fn new() -> Self {
|
||||
Self {
|
||||
inner: Default::default(),
|
||||
}
|
||||
}
|
||||
|
||||
fn contains(&self, range: Range<Key>) -> bool {
|
||||
let range_incl = range.start.to_i128()..=range.end.to_i128() - 1;
|
||||
self.inner.is_superset(&RangeSetBlaze::from_sorted_disjoint(
|
||||
CheckSortedDisjoint::from([range_incl]),
|
||||
))
|
||||
}
|
||||
|
||||
/// Add the input range to the keys covered by self.
|
||||
///
|
||||
/// Return true if inserting this range covered some keys that were previously not covered
|
||||
fn cover(&mut self, insert: Range<Key>) -> bool {
|
||||
let range_incl = insert.start.to_i128()..=insert.end.to_i128() - 1;
|
||||
self.inner.ranges_insert(range_incl)
|
||||
}
|
||||
|
||||
fn reset(&mut self) {
|
||||
self.inner = Default::default();
|
||||
}
|
||||
|
||||
fn to_keyspace(&self) -> KeySpace {
|
||||
let mut accum = KeySpaceAccum::new();
|
||||
for range_incl in self.inner.ranges() {
|
||||
let range = Range {
|
||||
start: Key::from_i128(*range_incl.start()),
|
||||
end: Key::from_i128(range_incl.end() + 1),
|
||||
};
|
||||
accum.add_range(range)
|
||||
}
|
||||
|
||||
accum.to_keyspace()
|
||||
}
|
||||
}
|
||||
|
||||
// The 'shadow' will be updated as we sweep through the layers: an image layer subtracts from the shadow,
|
||||
// and a ReadPoint
|
||||
read_points.sort_by_key(|rp| rp.0);
|
||||
let mut shadow = KeyShadow::new();
|
||||
|
||||
// We will interleave all our read points and layers into a sorted collection
|
||||
enum Item {
|
||||
ReadPoint { lsn: Lsn },
|
||||
Layer(Arc<PersistentLayerDesc>),
|
||||
}
|
||||
|
||||
let mut items = Vec::with_capacity(self.historic.len() + read_points.len());
|
||||
items.extend(self.iter_historic_layers().map(Item::Layer));
|
||||
items.extend(
|
||||
read_points
|
||||
.into_iter()
|
||||
.map(|rp| Item::ReadPoint { lsn: rp }),
|
||||
);
|
||||
|
||||
// Ordering: we want to iterate like this:
|
||||
// 1. Highest LSNs first
|
||||
// 2. Consider images before deltas if they end at the same LSNs (images cover deltas)
|
||||
// 3. Consider ReadPoints before image layers if they're at the same LSN (readpoints make that image visible)
|
||||
items.sort_by_key(|item| {
|
||||
std::cmp::Reverse(match item {
|
||||
Item::Layer(layer) => {
|
||||
if layer.is_delta() {
|
||||
(Lsn(layer.get_lsn_range().end.0 - 1), 0)
|
||||
} else {
|
||||
(layer.image_layer_lsn(), 1)
|
||||
}
|
||||
}
|
||||
Item::ReadPoint { lsn } => (*lsn, 2),
|
||||
})
|
||||
});
|
||||
|
||||
let mut results = Vec::with_capacity(self.historic.len());
|
||||
|
||||
let mut maybe_covered_deltas: Vec<Arc<PersistentLayerDesc>> = Vec::new();
|
||||
|
||||
for item in items {
|
||||
let (reached_lsn, is_readpoint) = match &item {
|
||||
Item::ReadPoint { lsn } => (lsn, true),
|
||||
Item::Layer(layer) => (&layer.lsn_range.start, false),
|
||||
};
|
||||
maybe_covered_deltas.retain(|d| {
|
||||
if *reached_lsn >= d.lsn_range.start && is_readpoint {
|
||||
// We encountered a readpoint within the delta layer: it is visible
|
||||
|
||||
results.push((d.clone(), LayerVisibilityHint::Visible));
|
||||
false
|
||||
} else if *reached_lsn < d.lsn_range.start {
|
||||
// We passed the layer's range without encountering a read point: it is not visible
|
||||
results.push((d.clone(), LayerVisibilityHint::Covered));
|
||||
false
|
||||
} else {
|
||||
// We're still in the delta layer: continue iterating
|
||||
true
|
||||
}
|
||||
});
|
||||
|
||||
match item {
|
||||
Item::ReadPoint { lsn: _lsn } => {
|
||||
// TODO: propagate the child timeline's shadow from their own run of this function, so that we don't have
|
||||
// to assume that the whole key range is visible at the branch point.
|
||||
shadow.reset();
|
||||
}
|
||||
Item::Layer(layer) => {
|
||||
let visibility = if layer.is_delta() {
|
||||
if shadow.contains(layer.get_key_range()) {
|
||||
// If a layer isn't visible based on current state, we must defer deciding whether
|
||||
// it is truly not visible until we have advanced past the delta's range: we might
|
||||
// encounter another branch point within this delta layer's LSN range.
|
||||
maybe_covered_deltas.push(layer);
|
||||
continue;
|
||||
} else {
|
||||
LayerVisibilityHint::Visible
|
||||
}
|
||||
} else {
|
||||
let modified = shadow.cover(layer.get_key_range());
|
||||
if modified {
|
||||
// An image layer in a region which wasn't fully covered yet: this layer is visible, but layers below it will be covered
|
||||
LayerVisibilityHint::Visible
|
||||
} else {
|
||||
// An image layer in a region that was already covered
|
||||
LayerVisibilityHint::Covered
|
||||
}
|
||||
};
|
||||
|
||||
results.push((layer, visibility));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Drain any remaining maybe_covered deltas
|
||||
results.extend(
|
||||
maybe_covered_deltas
|
||||
.into_iter()
|
||||
.map(|d| (d, LayerVisibilityHint::Covered)),
|
||||
);
|
||||
|
||||
(results, shadow.to_keyspace())
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use crate::tenant::{storage_layer::LayerName, IndexPart};
|
||||
use pageserver_api::{
|
||||
key::DBDIR_KEY,
|
||||
keyspace::{KeySpace, KeySpaceRandomAccum},
|
||||
};
|
||||
use std::{collections::HashMap, path::PathBuf};
|
||||
use utils::{
|
||||
id::{TenantId, TimelineId},
|
||||
shard::TenantShardId,
|
||||
};
|
||||
use pageserver_api::keyspace::KeySpace;
|
||||
|
||||
use super::*;
|
||||
|
||||
@@ -1175,299 +1002,4 @@ mod tests {
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn layer_visibility_basic() {
|
||||
// A simple synthetic input, as a smoke test.
|
||||
let tenant_shard_id = TenantShardId::unsharded(TenantId::generate());
|
||||
let timeline_id = TimelineId::generate();
|
||||
let mut layer_map = LayerMap::default();
|
||||
let mut updates = layer_map.batch_update();
|
||||
|
||||
const FAKE_LAYER_SIZE: u64 = 1024;
|
||||
|
||||
let inject_delta = |updates: &mut BatchedUpdates,
|
||||
key_start: i128,
|
||||
key_end: i128,
|
||||
lsn_start: u64,
|
||||
lsn_end: u64| {
|
||||
let desc = PersistentLayerDesc::new_delta(
|
||||
tenant_shard_id,
|
||||
timeline_id,
|
||||
Range {
|
||||
start: Key::from_i128(key_start),
|
||||
end: Key::from_i128(key_end),
|
||||
},
|
||||
Range {
|
||||
start: Lsn(lsn_start),
|
||||
end: Lsn(lsn_end),
|
||||
},
|
||||
1024,
|
||||
);
|
||||
updates.insert_historic(desc.clone());
|
||||
desc
|
||||
};
|
||||
|
||||
let inject_image =
|
||||
|updates: &mut BatchedUpdates, key_start: i128, key_end: i128, lsn: u64| {
|
||||
let desc = PersistentLayerDesc::new_img(
|
||||
tenant_shard_id,
|
||||
timeline_id,
|
||||
Range {
|
||||
start: Key::from_i128(key_start),
|
||||
end: Key::from_i128(key_end),
|
||||
},
|
||||
Lsn(lsn),
|
||||
FAKE_LAYER_SIZE,
|
||||
);
|
||||
updates.insert_historic(desc.clone());
|
||||
desc
|
||||
};
|
||||
|
||||
//
|
||||
// Construct our scenario: the following lines go in backward-LSN order, constructing the various scenarios
|
||||
// we expect to handle. You can follow these examples through in the same order as they would be processed
|
||||
// by the function under test.
|
||||
//
|
||||
|
||||
let mut read_points = vec![Lsn(1000)];
|
||||
|
||||
// A delta ahead of any image layer
|
||||
let ahead_layer = inject_delta(&mut updates, 10, 20, 101, 110);
|
||||
|
||||
// An image layer is visible and covers some layers beneath itself
|
||||
let visible_covering_img = inject_image(&mut updates, 5, 25, 99);
|
||||
|
||||
// A delta layer covered by the image layer: should be covered
|
||||
let covered_delta = inject_delta(&mut updates, 10, 20, 90, 100);
|
||||
|
||||
// A delta layer partially covered by an image layer: should be visible
|
||||
let partially_covered_delta = inject_delta(&mut updates, 1, 7, 90, 100);
|
||||
|
||||
// A delta layer not covered by an image layer: should be visible
|
||||
let not_covered_delta = inject_delta(&mut updates, 1, 4, 90, 100);
|
||||
|
||||
// An image layer covered by the image layer above: should be covered
|
||||
let covered_image = inject_image(&mut updates, 10, 20, 89);
|
||||
|
||||
// An image layer partially covered by an image layer: should be visible
|
||||
let partially_covered_image = inject_image(&mut updates, 1, 7, 89);
|
||||
|
||||
// An image layer not covered by an image layer: should be visible
|
||||
let not_covered_image = inject_image(&mut updates, 1, 4, 89);
|
||||
|
||||
// A read point: this will make subsequent layers below here visible, even if there are
|
||||
// more recent layers covering them.
|
||||
read_points.push(Lsn(80));
|
||||
|
||||
// A delta layer covered by an earlier image layer, but visible to a readpoint below that covering layer
|
||||
let covered_delta_below_read_point = inject_delta(&mut updates, 10, 20, 70, 79);
|
||||
|
||||
// A delta layer whose end LSN is covered, but where a read point is present partway through its LSN range:
|
||||
// the read point should make it visible, even though its end LSN is covered
|
||||
let covering_img_between_read_points = inject_image(&mut updates, 10, 20, 69);
|
||||
let covered_delta_between_read_points = inject_delta(&mut updates, 10, 15, 67, 69);
|
||||
read_points.push(Lsn(65));
|
||||
let covered_delta_intersects_read_point = inject_delta(&mut updates, 15, 20, 60, 69);
|
||||
|
||||
let visible_img_after_last_read_point = inject_image(&mut updates, 10, 20, 65);
|
||||
|
||||
updates.flush();
|
||||
|
||||
let (layer_visibilities, shadow) = layer_map.get_visibility(read_points);
|
||||
let layer_visibilities = layer_visibilities.into_iter().collect::<HashMap<_, _>>();
|
||||
|
||||
assert_eq!(
|
||||
layer_visibilities.get(&ahead_layer),
|
||||
Some(&LayerVisibilityHint::Visible)
|
||||
);
|
||||
assert_eq!(
|
||||
layer_visibilities.get(&visible_covering_img),
|
||||
Some(&LayerVisibilityHint::Visible)
|
||||
);
|
||||
assert_eq!(
|
||||
layer_visibilities.get(&covered_delta),
|
||||
Some(&LayerVisibilityHint::Covered)
|
||||
);
|
||||
assert_eq!(
|
||||
layer_visibilities.get(&partially_covered_delta),
|
||||
Some(&LayerVisibilityHint::Visible)
|
||||
);
|
||||
assert_eq!(
|
||||
layer_visibilities.get(¬_covered_delta),
|
||||
Some(&LayerVisibilityHint::Visible)
|
||||
);
|
||||
assert_eq!(
|
||||
layer_visibilities.get(&covered_image),
|
||||
Some(&LayerVisibilityHint::Covered)
|
||||
);
|
||||
assert_eq!(
|
||||
layer_visibilities.get(&partially_covered_image),
|
||||
Some(&LayerVisibilityHint::Visible)
|
||||
);
|
||||
assert_eq!(
|
||||
layer_visibilities.get(¬_covered_image),
|
||||
Some(&LayerVisibilityHint::Visible)
|
||||
);
|
||||
assert_eq!(
|
||||
layer_visibilities.get(&covered_delta_below_read_point),
|
||||
Some(&LayerVisibilityHint::Visible)
|
||||
);
|
||||
assert_eq!(
|
||||
layer_visibilities.get(&covering_img_between_read_points),
|
||||
Some(&LayerVisibilityHint::Visible)
|
||||
);
|
||||
assert_eq!(
|
||||
layer_visibilities.get(&covered_delta_between_read_points),
|
||||
Some(&LayerVisibilityHint::Covered)
|
||||
);
|
||||
assert_eq!(
|
||||
layer_visibilities.get(&covered_delta_intersects_read_point),
|
||||
Some(&LayerVisibilityHint::Visible)
|
||||
);
|
||||
assert_eq!(
|
||||
layer_visibilities.get(&visible_img_after_last_read_point),
|
||||
Some(&LayerVisibilityHint::Visible)
|
||||
);
|
||||
|
||||
// Shadow should include all the images below the last read point
|
||||
let expected_shadow = KeySpace {
|
||||
ranges: vec![Key::from_i128(10)..Key::from_i128(20)],
|
||||
};
|
||||
assert_eq!(shadow, expected_shadow);
|
||||
}
|
||||
|
||||
fn fixture_path(relative: &str) -> PathBuf {
|
||||
PathBuf::from(env!("CARGO_MANIFEST_DIR")).join(relative)
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn layer_visibility_realistic() {
|
||||
// Load a large example layermap
|
||||
let index_raw = std::fs::read_to_string(fixture_path(
|
||||
"test_data/indices/mixed_workload/index_part.json",
|
||||
))
|
||||
.unwrap();
|
||||
let index: IndexPart = serde_json::from_str::<IndexPart>(&index_raw).unwrap();
|
||||
|
||||
let tenant_id = TenantId::generate();
|
||||
let tenant_shard_id = TenantShardId::unsharded(tenant_id);
|
||||
let timeline_id = TimelineId::generate();
|
||||
|
||||
let mut layer_map = LayerMap::default();
|
||||
let mut updates = layer_map.batch_update();
|
||||
for (layer_name, layer_metadata) in index.layer_metadata {
|
||||
let layer_desc = match layer_name {
|
||||
LayerName::Image(layer_name) => PersistentLayerDesc {
|
||||
key_range: layer_name.key_range.clone(),
|
||||
lsn_range: layer_name.lsn_as_range(),
|
||||
tenant_shard_id,
|
||||
timeline_id,
|
||||
is_delta: false,
|
||||
file_size: layer_metadata.file_size,
|
||||
},
|
||||
LayerName::Delta(layer_name) => PersistentLayerDesc {
|
||||
key_range: layer_name.key_range,
|
||||
lsn_range: layer_name.lsn_range,
|
||||
tenant_shard_id,
|
||||
timeline_id,
|
||||
is_delta: true,
|
||||
file_size: layer_metadata.file_size,
|
||||
},
|
||||
};
|
||||
updates.insert_historic(layer_desc);
|
||||
}
|
||||
updates.flush();
|
||||
|
||||
let read_points = vec![index.metadata.disk_consistent_lsn()];
|
||||
let (layer_visibilities, shadow) = layer_map.get_visibility(read_points);
|
||||
for (layer_desc, visibility) in &layer_visibilities {
|
||||
tracing::info!("{layer_desc:?}: {visibility:?}");
|
||||
eprintln!("{layer_desc:?}: {visibility:?}");
|
||||
}
|
||||
|
||||
// The shadow should be non-empty, since there were some image layers
|
||||
assert!(!shadow.ranges.is_empty());
|
||||
|
||||
// At least some layers should be marked covered
|
||||
assert!(layer_visibilities
|
||||
.iter()
|
||||
.any(|i| matches!(i.1, LayerVisibilityHint::Covered)));
|
||||
|
||||
let layer_visibilities = layer_visibilities.into_iter().collect::<HashMap<_, _>>();
|
||||
|
||||
// Brute force validation: a layer should be marked covered if and only if there are image layers above it in LSN order which cover it
|
||||
for (layer_desc, visible) in &layer_visibilities {
|
||||
let mut coverage = KeySpaceRandomAccum::new();
|
||||
let mut covered_by = Vec::new();
|
||||
|
||||
for other_layer in layer_map.iter_historic_layers() {
|
||||
if &other_layer == layer_desc {
|
||||
continue;
|
||||
}
|
||||
if !other_layer.is_delta()
|
||||
&& other_layer.image_layer_lsn() >= Lsn(layer_desc.get_lsn_range().end.0 - 1)
|
||||
&& other_layer.key_range.start <= layer_desc.key_range.end
|
||||
&& layer_desc.key_range.start <= other_layer.key_range.end
|
||||
{
|
||||
coverage.add_range(other_layer.get_key_range());
|
||||
covered_by.push((*other_layer).clone());
|
||||
}
|
||||
}
|
||||
let coverage = coverage.to_keyspace();
|
||||
|
||||
let expect_visible = if coverage.ranges.len() == 1
|
||||
&& coverage.contains(&layer_desc.key_range.start)
|
||||
&& coverage.contains(&Key::from_i128(layer_desc.key_range.end.to_i128() - 1))
|
||||
{
|
||||
LayerVisibilityHint::Covered
|
||||
} else {
|
||||
LayerVisibilityHint::Visible
|
||||
};
|
||||
|
||||
if expect_visible != *visible {
|
||||
eprintln!(
|
||||
"Layer {}..{} @ {}..{} (delta={}) is {visible:?}, should be {expect_visible:?}",
|
||||
layer_desc.key_range.start,
|
||||
layer_desc.key_range.end,
|
||||
layer_desc.lsn_range.start,
|
||||
layer_desc.lsn_range.end,
|
||||
layer_desc.is_delta()
|
||||
);
|
||||
if expect_visible == LayerVisibilityHint::Covered {
|
||||
eprintln!("Covered by:");
|
||||
for other in covered_by {
|
||||
eprintln!(
|
||||
" {}..{} @ {}",
|
||||
other.get_key_range().start,
|
||||
other.get_key_range().end,
|
||||
other.image_layer_lsn()
|
||||
);
|
||||
}
|
||||
if let Some(range) = coverage.ranges.first() {
|
||||
eprintln!(
|
||||
"Total coverage from contributing layers: {}..{}",
|
||||
range.start, range.end
|
||||
);
|
||||
} else {
|
||||
eprintln!(
|
||||
"Total coverage from contributing layers: {:?}",
|
||||
coverage.ranges
|
||||
);
|
||||
}
|
||||
}
|
||||
}
|
||||
assert_eq!(expect_visible, *visible);
|
||||
}
|
||||
|
||||
// Sanity: the layer that holds latest data for the DBDIR key should always be visible
|
||||
// (just using this key as a key that will always exist for any layermap fixture)
|
||||
let dbdir_layer = layer_map
|
||||
.search(DBDIR_KEY, index.metadata.disk_consistent_lsn())
|
||||
.unwrap();
|
||||
assert!(matches!(
|
||||
layer_visibilities.get(&dbdir_layer.layer).unwrap(),
|
||||
LayerVisibilityHint::Visible
|
||||
));
|
||||
}
|
||||
}
|
||||
|
||||
@@ -521,10 +521,6 @@ impl<Value: Clone> BufferedHistoricLayerCoverage<Value> {
|
||||
|
||||
Ok(&self.historic_coverage)
|
||||
}
|
||||
|
||||
pub(crate) fn len(&self) -> usize {
|
||||
self.layers.len()
|
||||
}
|
||||
}
|
||||
|
||||
#[test]
|
||||
|
||||
@@ -111,7 +111,7 @@ impl TryFrom<&TimelineMetadataBodyV2> for TimelineMetadataHeader {
|
||||
#[error("re-serializing for crc32 failed")]
|
||||
struct Crc32CalculationFailed(#[source] utils::bin_ser::SerializeError);
|
||||
|
||||
const METADATA_HDR_SIZE: usize = size_of::<TimelineMetadataHeader>();
|
||||
const METADATA_HDR_SIZE: usize = std::mem::size_of::<TimelineMetadataHeader>();
|
||||
|
||||
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
|
||||
struct TimelineMetadataBodyV2 {
|
||||
@@ -285,21 +285,30 @@ impl TimelineMetadata {
|
||||
}
|
||||
|
||||
/// When reparenting, the `ancestor_lsn` does not change.
|
||||
pub fn reparent(&mut self, timeline: &TimelineId) {
|
||||
///
|
||||
/// Returns true if anything was changed.
|
||||
pub fn reparent(&mut self, timeline: &TimelineId) -> bool {
|
||||
assert!(self.body.ancestor_timeline.is_some());
|
||||
// no assertion for redoing this: it's fine, we may have to repeat this multiple times over
|
||||
let prev = self.body.ancestor_timeline;
|
||||
self.body.ancestor_timeline = Some(*timeline);
|
||||
prev.as_ref() != Some(timeline)
|
||||
}
|
||||
|
||||
pub fn detach_from_ancestor(&mut self, branchpoint: &(TimelineId, Lsn)) {
|
||||
/// Returns true if anything was changed
|
||||
pub fn detach_from_ancestor(&mut self, branchpoint: &(TimelineId, Lsn)) -> bool {
|
||||
let mut changed = false;
|
||||
if let Some(ancestor) = self.body.ancestor_timeline {
|
||||
assert_eq!(ancestor, branchpoint.0);
|
||||
changed = true;
|
||||
}
|
||||
if self.body.ancestor_lsn != Lsn(0) {
|
||||
assert_eq!(self.body.ancestor_lsn, branchpoint.1);
|
||||
changed = true;
|
||||
}
|
||||
self.body.ancestor_timeline = None;
|
||||
self.body.ancestor_lsn = Lsn(0);
|
||||
changed
|
||||
}
|
||||
|
||||
pub fn latest_gc_cutoff_lsn(&self) -> Lsn {
|
||||
|
||||
@@ -54,8 +54,8 @@ use utils::id::{TenantId, TimelineId};
|
||||
|
||||
use super::remote_timeline_client::remote_tenant_path;
|
||||
use super::secondary::SecondaryTenant;
|
||||
use super::timeline::detach_ancestor::PreparedTimelineDetach;
|
||||
use super::{GlobalShutDown, TenantSharedResources};
|
||||
use super::timeline::detach_ancestor::{self, PreparedTimelineDetach};
|
||||
use super::TenantSharedResources;
|
||||
|
||||
/// For a tenant that appears in TenantsMap, it may either be
|
||||
/// - `Attached`: has a full Tenant object, is elegible to service
|
||||
@@ -116,6 +116,8 @@ pub(crate) enum ShardSelector {
|
||||
/// Only return the 0th shard, if it is present. If a non-0th shard is present,
|
||||
/// ignore it.
|
||||
Zero,
|
||||
/// Pick the first shard we find for the TenantId
|
||||
First,
|
||||
/// Pick the shard that holds this key
|
||||
Page(Key),
|
||||
/// The shard ID is known: pick the given shard
|
||||
@@ -224,8 +226,21 @@ async fn safe_rename_tenant_dir(path: impl AsRef<Utf8Path>) -> std::io::Result<U
|
||||
}
|
||||
|
||||
/// See [`Self::spawn`].
|
||||
#[derive(Clone, Default)]
|
||||
pub struct BackgroundPurges(tokio_util::task::TaskTracker);
|
||||
#[derive(Clone)]
|
||||
pub struct BackgroundPurges(Arc<std::sync::Mutex<BackgroundPurgesInner>>);
|
||||
enum BackgroundPurgesInner {
|
||||
Open(tokio::task::JoinSet<()>),
|
||||
// we use the async mutex for coalescing
|
||||
ShuttingDown(Arc<tokio::sync::Mutex<tokio::task::JoinSet<()>>>),
|
||||
}
|
||||
|
||||
impl Default for BackgroundPurges {
|
||||
fn default() -> Self {
|
||||
Self(Arc::new(std::sync::Mutex::new(
|
||||
BackgroundPurgesInner::Open(JoinSet::new()),
|
||||
)))
|
||||
}
|
||||
}
|
||||
|
||||
impl BackgroundPurges {
|
||||
/// When we have moved a tenant's content to a temporary directory, we may delete it lazily in
|
||||
@@ -234,32 +249,24 @@ impl BackgroundPurges {
|
||||
/// Although we are cleaning up the tenant, this task is not meant to be bound by the lifetime of the tenant in memory.
|
||||
/// Thus the [`BackgroundPurges`] type to keep track of these tasks.
|
||||
pub fn spawn(&self, tmp_path: Utf8PathBuf) {
|
||||
// because on shutdown we close and wait, we are misusing TaskTracker a bit.
|
||||
//
|
||||
// so first acquire a token, then check if the tracker has been closed. the tracker might get closed
|
||||
// right after, but at least the shutdown will wait for what we are spawning next.
|
||||
let token = self.0.token();
|
||||
|
||||
if self.0.is_closed() {
|
||||
warn!(
|
||||
%tmp_path,
|
||||
"trying to spawn background purge during shutdown, ignoring"
|
||||
);
|
||||
return;
|
||||
}
|
||||
|
||||
let span = info_span!(parent: None, "background_purge", %tmp_path);
|
||||
|
||||
let task = move || {
|
||||
let _token = token;
|
||||
let _entered = span.entered();
|
||||
if let Err(error) = std::fs::remove_dir_all(tmp_path.as_path()) {
|
||||
// should we fatal_io_error here?
|
||||
warn!(%error, "failed to purge tenant directory");
|
||||
let mut guard = self.0.lock().unwrap();
|
||||
let jset = match &mut *guard {
|
||||
BackgroundPurgesInner::Open(ref mut jset) => jset,
|
||||
BackgroundPurgesInner::ShuttingDown(_) => {
|
||||
warn!("trying to spawn background purge during shutdown, ignoring");
|
||||
return;
|
||||
}
|
||||
};
|
||||
|
||||
BACKGROUND_RUNTIME.spawn_blocking(task);
|
||||
jset.spawn_on(
|
||||
async move {
|
||||
if let Err(error) = fs::remove_dir_all(tmp_path.as_path()).await {
|
||||
// should we fatal_io_error here?
|
||||
warn!(%error, path=%tmp_path, "failed to purge tenant directory");
|
||||
}
|
||||
}
|
||||
.instrument(info_span!(parent: None, "background_purge")),
|
||||
BACKGROUND_RUNTIME.handle(),
|
||||
);
|
||||
}
|
||||
|
||||
/// When this future completes, all background purges have completed.
|
||||
@@ -273,9 +280,42 @@ impl BackgroundPurges {
|
||||
/// instances of this future will continue to be correct.
|
||||
#[instrument(skip_all)]
|
||||
pub async fn shutdown(&self) {
|
||||
// forbid new tasks (can be called many times)
|
||||
self.0.close();
|
||||
self.0.wait().await;
|
||||
let jset = {
|
||||
let mut guard = self.0.lock().unwrap();
|
||||
match &mut *guard {
|
||||
BackgroundPurgesInner::Open(jset) => {
|
||||
*guard = BackgroundPurgesInner::ShuttingDown(Arc::new(tokio::sync::Mutex::new(
|
||||
std::mem::take(jset),
|
||||
)))
|
||||
}
|
||||
BackgroundPurgesInner::ShuttingDown(_) => {
|
||||
// calling shutdown multiple times is most likely a bug in pageserver shutdown code
|
||||
warn!("already shutting down");
|
||||
}
|
||||
};
|
||||
match &mut *guard {
|
||||
BackgroundPurgesInner::ShuttingDown(ref mut jset) => jset.clone(),
|
||||
BackgroundPurgesInner::Open(_) => {
|
||||
unreachable!("above code transitions into shut down state");
|
||||
}
|
||||
}
|
||||
};
|
||||
let mut jset = jset.lock().await; // concurrent callers coalesce here
|
||||
while let Some(res) = jset.join_next().await {
|
||||
match res {
|
||||
Ok(()) => {}
|
||||
Err(e) if e.is_panic() => {
|
||||
// If it panicked, the error is already logged by the panic hook.
|
||||
}
|
||||
Err(e) if e.is_cancelled() => {
|
||||
unreachable!("we don't cancel the joinset or runtime")
|
||||
}
|
||||
Err(e) => {
|
||||
// No idea when this can happen, but let's log it.
|
||||
warn!(%e, "background purge task failed or panicked");
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -627,20 +667,18 @@ pub async fn init_tenant_mgr(
|
||||
let tenant_dir_path = conf.tenant_path(&tenant_shard_id);
|
||||
let shard_identity = location_conf.shard;
|
||||
let slot = match location_conf.mode {
|
||||
LocationMode::Attached(attached_conf) => TenantSlot::Attached(
|
||||
tenant_spawn(
|
||||
conf,
|
||||
tenant_shard_id,
|
||||
&tenant_dir_path,
|
||||
resources.clone(),
|
||||
AttachedTenantConf::new(location_conf.tenant_conf, attached_conf),
|
||||
shard_identity,
|
||||
Some(init_order.clone()),
|
||||
SpawnMode::Lazy,
|
||||
&ctx,
|
||||
)
|
||||
.expect("global shutdown during init_tenant_mgr cannot happen"),
|
||||
),
|
||||
LocationMode::Attached(attached_conf) => TenantSlot::Attached(tenant_spawn(
|
||||
conf,
|
||||
tenant_shard_id,
|
||||
&tenant_dir_path,
|
||||
resources.clone(),
|
||||
AttachedTenantConf::new(location_conf.tenant_conf, attached_conf),
|
||||
shard_identity,
|
||||
Some(init_order.clone()),
|
||||
SpawnMode::Lazy,
|
||||
None,
|
||||
&ctx,
|
||||
)),
|
||||
LocationMode::Secondary(secondary_conf) => {
|
||||
info!(
|
||||
tenant_id = %tenant_shard_id.tenant_id,
|
||||
@@ -687,8 +725,9 @@ fn tenant_spawn(
|
||||
shard_identity: ShardIdentity,
|
||||
init_order: Option<InitializationOrder>,
|
||||
mode: SpawnMode,
|
||||
existing_detach_attempt: Option<&detach_ancestor::Attempt>,
|
||||
ctx: &RequestContext,
|
||||
) -> Result<Arc<Tenant>, GlobalShutDown> {
|
||||
) -> Arc<Tenant> {
|
||||
// All these conditions should have been satisfied by our caller: the tenant dir exists, is a well formed
|
||||
// path, and contains a configuration file. Assertions that do synchronous I/O are limited to debug mode
|
||||
// to avoid impacting prod runtime performance.
|
||||
@@ -707,6 +746,7 @@ fn tenant_spawn(
|
||||
shard_identity,
|
||||
init_order,
|
||||
mode,
|
||||
existing_detach_attempt,
|
||||
ctx,
|
||||
)
|
||||
}
|
||||
@@ -1154,11 +1194,9 @@ impl TenantManager {
|
||||
shard_identity,
|
||||
None,
|
||||
spawn_mode,
|
||||
None,
|
||||
ctx,
|
||||
)
|
||||
.map_err(|_: GlobalShutDown| {
|
||||
UpsertLocationError::Unavailable(TenantMapError::ShuttingDown)
|
||||
})?;
|
||||
);
|
||||
|
||||
TenantSlot::Attached(tenant)
|
||||
}
|
||||
@@ -1278,8 +1316,9 @@ impl TenantManager {
|
||||
shard_identity,
|
||||
None,
|
||||
SpawnMode::Eager,
|
||||
None,
|
||||
ctx,
|
||||
)?;
|
||||
);
|
||||
|
||||
slot_guard.upsert(TenantSlot::Attached(tenant))?;
|
||||
|
||||
@@ -1350,32 +1389,34 @@ impl TenantManager {
|
||||
tenant_shard_id: TenantShardId,
|
||||
) -> Result<(), DeleteTenantError> {
|
||||
let remote_path = remote_tenant_path(&tenant_shard_id);
|
||||
let mut keys_stream = self.resources.remote_storage.list_streaming(
|
||||
Some(&remote_path),
|
||||
remote_storage::ListingMode::NoDelimiter,
|
||||
None,
|
||||
&self.cancel,
|
||||
);
|
||||
while let Some(chunk) = keys_stream.next().await {
|
||||
let keys = match chunk {
|
||||
Ok(listing) => listing.keys,
|
||||
Err(remote_storage::DownloadError::Cancelled) => {
|
||||
return Err(DeleteTenantError::Cancelled)
|
||||
}
|
||||
Err(remote_storage::DownloadError::NotFound) => return Ok(()),
|
||||
Err(other) => return Err(DeleteTenantError::Other(anyhow::anyhow!(other))),
|
||||
};
|
||||
|
||||
if keys.is_empty() {
|
||||
tracing::info!("Remote storage already deleted");
|
||||
} else {
|
||||
tracing::info!("Deleting {} keys from remote storage", keys.len());
|
||||
let keys = keys.into_iter().map(|o| o.key).collect::<Vec<_>>();
|
||||
self.resources
|
||||
.remote_storage
|
||||
.delete_objects(&keys, &self.cancel)
|
||||
.await?;
|
||||
let keys = match self
|
||||
.resources
|
||||
.remote_storage
|
||||
.list(
|
||||
Some(&remote_path),
|
||||
remote_storage::ListingMode::NoDelimiter,
|
||||
None,
|
||||
&self.cancel,
|
||||
)
|
||||
.await
|
||||
{
|
||||
Ok(listing) => listing.keys,
|
||||
Err(remote_storage::DownloadError::Cancelled) => {
|
||||
return Err(DeleteTenantError::Cancelled)
|
||||
}
|
||||
Err(remote_storage::DownloadError::NotFound) => return Ok(()),
|
||||
Err(other) => return Err(DeleteTenantError::Other(anyhow::anyhow!(other))),
|
||||
};
|
||||
|
||||
if keys.is_empty() {
|
||||
tracing::info!("Remote storage already deleted");
|
||||
} else {
|
||||
tracing::info!("Deleting {} keys from remote storage", keys.len());
|
||||
let keys = keys.into_iter().map(|o| o.key).collect::<Vec<_>>();
|
||||
self.resources
|
||||
.remote_storage
|
||||
.delete_objects(&keys, &self.cancel)
|
||||
.await?;
|
||||
}
|
||||
|
||||
Ok(())
|
||||
@@ -1729,9 +1770,14 @@ impl TenantManager {
|
||||
let parent_timelines = timelines.keys().cloned().collect::<Vec<_>>();
|
||||
for timeline in timelines.values() {
|
||||
tracing::info!(timeline_id=%timeline.timeline_id, "Loading list of layers to hardlink");
|
||||
let layers = timeline.layers.read().await;
|
||||
let timeline_layers = timeline
|
||||
.layers
|
||||
.read()
|
||||
.await
|
||||
.likely_resident_layers()
|
||||
.collect::<Vec<_>>();
|
||||
|
||||
for layer in layers.likely_resident_layers() {
|
||||
for layer in timeline_layers {
|
||||
let relative_path = layer
|
||||
.local_path()
|
||||
.strip_prefix(&parent_path)
|
||||
@@ -1927,6 +1973,7 @@ impl TenantManager {
|
||||
tenant_shard_id: TenantShardId,
|
||||
timeline_id: TimelineId,
|
||||
prepared: PreparedTimelineDetach,
|
||||
mut attempt: detach_ancestor::Attempt,
|
||||
ctx: &RequestContext,
|
||||
) -> Result<HashSet<TimelineId>, anyhow::Error> {
|
||||
// FIXME: this is unnecessary, slotguard already has these semantics
|
||||
@@ -1977,43 +2024,66 @@ impl TenantManager {
|
||||
|
||||
let timeline = tenant.get_timeline(timeline_id, true)?;
|
||||
|
||||
let reparented = timeline
|
||||
.complete_detaching_timeline_ancestor(&tenant, prepared, ctx)
|
||||
let resp = timeline
|
||||
.detach_from_ancestor_and_reparent(&tenant, prepared, ctx)
|
||||
.await?;
|
||||
|
||||
let mut slot_guard = slot_guard.into_inner();
|
||||
|
||||
let (_guard, progress) = utils::completion::channel();
|
||||
match tenant.shutdown(progress, ShutdownMode::Hard).await {
|
||||
Ok(()) => {
|
||||
slot_guard.drop_old_value()?;
|
||||
}
|
||||
Err(_barrier) => {
|
||||
slot_guard.revert();
|
||||
// this really should not happen, at all, unless shutdown was already going?
|
||||
anyhow::bail!("Cannot restart Tenant, already shutting down");
|
||||
let tenant = if resp.reset_tenant_required() {
|
||||
attempt.before_shutdown();
|
||||
|
||||
let (_guard, progress) = utils::completion::channel();
|
||||
match tenant.shutdown(progress, ShutdownMode::Hard).await {
|
||||
Ok(()) => {
|
||||
slot_guard.drop_old_value()?;
|
||||
}
|
||||
Err(_barrier) => {
|
||||
slot_guard.revert();
|
||||
// this really should not happen, at all, unless shutdown was already going?
|
||||
anyhow::bail!("Cannot restart Tenant, already shutting down");
|
||||
}
|
||||
}
|
||||
|
||||
let tenant_path = self.conf.tenant_path(&tenant_shard_id);
|
||||
let config = Tenant::load_tenant_config(self.conf, &tenant_shard_id)?;
|
||||
|
||||
let shard_identity = config.shard;
|
||||
let tenant = tenant_spawn(
|
||||
self.conf,
|
||||
tenant_shard_id,
|
||||
&tenant_path,
|
||||
self.resources.clone(),
|
||||
AttachedTenantConf::try_from(config)?,
|
||||
shard_identity,
|
||||
None,
|
||||
SpawnMode::Eager,
|
||||
Some(&attempt),
|
||||
ctx,
|
||||
);
|
||||
|
||||
slot_guard.upsert(TenantSlot::Attached(tenant.clone()))?;
|
||||
tenant
|
||||
} else {
|
||||
tracing::info!("skipping tenant_reset as no changes made required it");
|
||||
tenant
|
||||
};
|
||||
|
||||
if let Some(reparented) = resp.completed() {
|
||||
// finally ask the restarted tenant to complete the detach
|
||||
tenant
|
||||
.ongoing_timeline_detach
|
||||
.complete(attempt, &tenant)
|
||||
.await?;
|
||||
Ok(reparented)
|
||||
} else {
|
||||
// at least the latest versions have now been downloaded and refreshed; be ready to
|
||||
// retry another time.
|
||||
tenant.ongoing_timeline_detach.cancel(attempt);
|
||||
Err(anyhow::anyhow!(
|
||||
"failed to reparent all candidate timelines, please retry"
|
||||
))
|
||||
}
|
||||
|
||||
let tenant_path = self.conf.tenant_path(&tenant_shard_id);
|
||||
let config = Tenant::load_tenant_config(self.conf, &tenant_shard_id)?;
|
||||
|
||||
let shard_identity = config.shard;
|
||||
let tenant = tenant_spawn(
|
||||
self.conf,
|
||||
tenant_shard_id,
|
||||
&tenant_path,
|
||||
self.resources.clone(),
|
||||
AttachedTenantConf::try_from(config)?,
|
||||
shard_identity,
|
||||
None,
|
||||
SpawnMode::Eager,
|
||||
ctx,
|
||||
)?;
|
||||
|
||||
slot_guard.upsert(TenantSlot::Attached(tenant))?;
|
||||
|
||||
Ok(reparented)
|
||||
}
|
||||
|
||||
/// A page service client sends a TenantId, and to look up the correct Tenant we must
|
||||
@@ -2050,6 +2120,7 @@ impl TenantManager {
|
||||
};
|
||||
|
||||
match selector {
|
||||
ShardSelector::First => return ShardResolveResult::Found(tenant.clone()),
|
||||
ShardSelector::Zero if slot.0.shard_number == ShardNumber(0) => {
|
||||
return ShardResolveResult::Found(tenant.clone())
|
||||
}
|
||||
@@ -2131,9 +2202,6 @@ pub(crate) enum GetActiveTenantError {
|
||||
/// never happen.
|
||||
#[error("Tenant is broken: {0}")]
|
||||
Broken(String),
|
||||
|
||||
#[error("reconnect to switch tenant id")]
|
||||
SwitchedTenant,
|
||||
}
|
||||
|
||||
#[derive(Debug, thiserror::Error)]
|
||||
|
||||
@@ -187,7 +187,7 @@ use camino::Utf8Path;
|
||||
use chrono::{NaiveDateTime, Utc};
|
||||
|
||||
pub(crate) use download::download_initdb_tar_zst;
|
||||
use pageserver_api::models::{AuxFilePolicy, TimelineArchivalState};
|
||||
use pageserver_api::models::AuxFilePolicy;
|
||||
use pageserver_api::shard::{ShardIndex, TenantShardId};
|
||||
use scopeguard::ScopeGuard;
|
||||
use tokio_util::sync::CancellationToken;
|
||||
@@ -457,17 +457,6 @@ impl RemoteTimelineClient {
|
||||
.unwrap_or(false)
|
||||
}
|
||||
|
||||
/// Returns whether the timeline is archived.
|
||||
/// Return None if the remote index_part hasn't been downloaded yet.
|
||||
pub(crate) fn is_archived(&self) -> Option<bool> {
|
||||
self.upload_queue
|
||||
.lock()
|
||||
.unwrap()
|
||||
.initialized_mut()
|
||||
.map(|q| q.clean.0.archived_at.is_some())
|
||||
.ok()
|
||||
}
|
||||
|
||||
fn update_remote_physical_size_gauge(&self, current_remote_index_part: Option<&IndexPart>) {
|
||||
let size: u64 = if let Some(current_remote_index_part) = current_remote_index_part {
|
||||
current_remote_index_part
|
||||
@@ -628,7 +617,7 @@ impl RemoteTimelineClient {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Launch an index-file upload operation in the background, with only the `aux_file_policy` flag updated.
|
||||
/// Launch an index-file upload operation in the background, with only aux_file_policy flag updated.
|
||||
pub(crate) fn schedule_index_upload_for_aux_file_policy_update(
|
||||
self: &Arc<Self>,
|
||||
last_aux_file_policy: Option<AuxFilePolicy>,
|
||||
@@ -639,48 +628,6 @@ impl RemoteTimelineClient {
|
||||
self.schedule_index_upload(upload_queue)?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Launch an index-file upload operation in the background, with only the `archived_at` field updated.
|
||||
///
|
||||
/// Returns whether it is required to wait for the queue to be empty to ensure that the change is uploaded,
|
||||
/// so either if the change is already sitting in the queue, but not commited yet, or the change has not
|
||||
/// been in the queue yet.
|
||||
pub(crate) fn schedule_index_upload_for_timeline_archival_state(
|
||||
self: &Arc<Self>,
|
||||
state: TimelineArchivalState,
|
||||
) -> anyhow::Result<bool> {
|
||||
let mut guard = self.upload_queue.lock().unwrap();
|
||||
let upload_queue = guard.initialized_mut()?;
|
||||
|
||||
/// Returns Some(_) if a change is needed, and Some(true) if it's a
|
||||
/// change needed to set archived_at.
|
||||
fn need_change(
|
||||
archived_at: &Option<NaiveDateTime>,
|
||||
state: TimelineArchivalState,
|
||||
) -> Option<bool> {
|
||||
match (archived_at, state) {
|
||||
(Some(_), TimelineArchivalState::Archived)
|
||||
| (None, TimelineArchivalState::Unarchived) => {
|
||||
// Nothing to do
|
||||
tracing::info!("intended state matches present state");
|
||||
None
|
||||
}
|
||||
(None, TimelineArchivalState::Archived) => Some(true),
|
||||
(Some(_), TimelineArchivalState::Unarchived) => Some(false),
|
||||
}
|
||||
}
|
||||
let need_upload_scheduled = need_change(&upload_queue.dirty.archived_at, state);
|
||||
|
||||
if let Some(archived_at_set) = need_upload_scheduled {
|
||||
let intended_archived_at = archived_at_set.then(|| Utc::now().naive_utc());
|
||||
upload_queue.dirty.archived_at = intended_archived_at;
|
||||
self.schedule_index_upload(upload_queue)?;
|
||||
}
|
||||
|
||||
let need_wait = need_change(&upload_queue.clean.0.archived_at, state).is_some();
|
||||
Ok(need_wait)
|
||||
}
|
||||
|
||||
///
|
||||
/// Launch an index-file upload operation in the background, if necessary.
|
||||
///
|
||||
@@ -736,12 +683,13 @@ impl RemoteTimelineClient {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Reparent this timeline to a new parent.
|
||||
///
|
||||
/// A retryable step of timeline ancestor detach.
|
||||
pub(crate) async fn schedule_reparenting_and_wait(
|
||||
self: &Arc<Self>,
|
||||
new_parent: &TimelineId,
|
||||
) -> anyhow::Result<()> {
|
||||
// FIXME: because of how Timeline::schedule_uploads works when called from layer flushing
|
||||
// and reads the in-memory part we cannot do the detaching like this
|
||||
let receiver = {
|
||||
let mut guard = self.upload_queue.lock().unwrap();
|
||||
let upload_queue = guard.initialized_mut()?;
|
||||
@@ -752,17 +700,29 @@ impl RemoteTimelineClient {
|
||||
));
|
||||
};
|
||||
|
||||
upload_queue.dirty.metadata.reparent(new_parent);
|
||||
upload_queue.dirty.lineage.record_previous_ancestor(&prev);
|
||||
let uploaded = &upload_queue.clean.0.metadata;
|
||||
|
||||
self.schedule_index_upload(upload_queue)?;
|
||||
if uploaded.ancestor_timeline().is_none() && !uploaded.ancestor_lsn().is_valid() {
|
||||
// nothing to do
|
||||
None
|
||||
} else {
|
||||
let mut modified = false;
|
||||
|
||||
self.schedule_barrier0(upload_queue)
|
||||
modified |= upload_queue.dirty.metadata.reparent(new_parent);
|
||||
modified |= upload_queue.dirty.lineage.record_previous_ancestor(&prev);
|
||||
|
||||
if modified {
|
||||
self.schedule_index_upload(upload_queue)?;
|
||||
}
|
||||
|
||||
Some(self.schedule_barrier0(upload_queue))
|
||||
}
|
||||
};
|
||||
|
||||
Self::wait_completion0(receiver)
|
||||
.await
|
||||
.context("wait completion")
|
||||
if let Some(receiver) = receiver {
|
||||
Self::wait_completion0(receiver).await?;
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Schedules uploading a new version of `index_part.json` with the given layers added,
|
||||
@@ -778,115 +738,99 @@ impl RemoteTimelineClient {
|
||||
let mut guard = self.upload_queue.lock().unwrap();
|
||||
let upload_queue = guard.initialized_mut()?;
|
||||
|
||||
upload_queue.dirty.metadata.detach_from_ancestor(&adopted);
|
||||
upload_queue.dirty.lineage.record_detaching(&adopted);
|
||||
if upload_queue.clean.0.lineage.detached_previous_ancestor() == Some(adopted) {
|
||||
None
|
||||
} else {
|
||||
let mut modified = false;
|
||||
modified |= upload_queue.dirty.metadata.detach_from_ancestor(&adopted);
|
||||
modified |= upload_queue.dirty.lineage.record_detaching(&adopted);
|
||||
|
||||
for layer in layers {
|
||||
upload_queue
|
||||
.dirty
|
||||
.layer_metadata
|
||||
.insert(layer.layer_desc().layer_name(), layer.metadata());
|
||||
for layer in layers {
|
||||
let prev = upload_queue
|
||||
.dirty
|
||||
.layer_metadata
|
||||
.insert(layer.layer_desc().layer_name(), layer.metadata());
|
||||
modified |= prev.is_none();
|
||||
}
|
||||
|
||||
if modified {
|
||||
self.schedule_index_upload(upload_queue)?;
|
||||
}
|
||||
|
||||
Some(self.schedule_barrier0(upload_queue))
|
||||
}
|
||||
|
||||
self.schedule_index_upload(upload_queue)?;
|
||||
|
||||
let barrier = self.schedule_barrier0(upload_queue);
|
||||
self.launch_queued_tasks(upload_queue);
|
||||
barrier
|
||||
};
|
||||
|
||||
Self::wait_completion0(barrier)
|
||||
.await
|
||||
.context("wait completion")
|
||||
if let Some(barrier) = barrier {
|
||||
Self::wait_completion0(barrier).await?;
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Adds a gc blocking reason for this timeline if one does not exist already.
|
||||
/// Marks timeline detach ancestor started for this timeline if it has not been marked as
|
||||
/// started.
|
||||
///
|
||||
/// A retryable step of timeline detach ancestor.
|
||||
/// A retryable step o ftimeline detach ancestor.
|
||||
///
|
||||
/// Returns a future which waits until the completion of the upload.
|
||||
pub(crate) fn schedule_insert_gc_block_reason(
|
||||
/// Does not overwrite or even error if the set of reparentable timelines differes. Those can
|
||||
/// be inspected later.
|
||||
///
|
||||
/// Waits until the completion of the upload.
|
||||
pub(crate) async fn schedule_started_detach_ancestor_mark_and_wait(
|
||||
self: &Arc<Self>,
|
||||
reason: index::GcBlockingReason,
|
||||
) -> Result<impl std::future::Future<Output = Result<(), WaitCompletionError>>, NotInitialized>
|
||||
{
|
||||
) -> anyhow::Result<()> {
|
||||
let maybe_barrier = {
|
||||
let mut guard = self.upload_queue.lock().unwrap();
|
||||
let upload_queue = guard.initialized_mut()?;
|
||||
|
||||
if let index::GcBlockingReason::DetachAncestor = reason {
|
||||
if upload_queue.dirty.metadata.ancestor_timeline().is_none() {
|
||||
drop(guard);
|
||||
panic!("cannot start detach ancestor if there is nothing to detach from");
|
||||
}
|
||||
fn wanted(x: Option<&index::GcBlocking>) -> bool {
|
||||
x.is_some_and(|b| b.blocked_by_detach_ancestor())
|
||||
}
|
||||
|
||||
let wanted = |x: Option<&index::GcBlocking>| x.is_some_and(|x| x.blocked_by(reason));
|
||||
|
||||
let current = upload_queue.dirty.gc_blocking.as_ref();
|
||||
let uploaded = upload_queue.clean.0.gc_blocking.as_ref();
|
||||
|
||||
match (current, uploaded) {
|
||||
(x, y) if wanted(x) && wanted(y) => None,
|
||||
(x, y) if wanted(x) && !wanted(y) => Some(self.schedule_barrier0(upload_queue)),
|
||||
// Usual case: !wanted(x) && !wanted(y)
|
||||
//
|
||||
// Unusual: !wanted(x) && wanted(y) which means we have two processes waiting to
|
||||
// turn on and off some reason.
|
||||
(x, y) => {
|
||||
if !wanted(x) && wanted(y) {
|
||||
// this could be avoided by having external in-memory synchronization, like
|
||||
// timeline detach ancestor
|
||||
warn!(?reason, op="insert", "unexpected: two racing processes to enable and disable a gc blocking reason");
|
||||
}
|
||||
|
||||
_ => {
|
||||
// at this point, the metadata must always show that there is a parent
|
||||
if upload_queue.dirty.metadata.ancestor_timeline().is_none() {
|
||||
panic!("cannot start detach ancestor if there is nothing to detach from");
|
||||
}
|
||||
upload_queue.dirty.gc_blocking = current
|
||||
.map(|x| x.with_reason(reason))
|
||||
.or_else(|| Some(index::GcBlocking::started_now_for(reason)));
|
||||
.map(|x| x.with_detach_ancestor())
|
||||
.or_else(|| Some(index::GcBlocking::started_now_for_detach_ancestor()));
|
||||
self.schedule_index_upload(upload_queue)?;
|
||||
Some(self.schedule_barrier0(upload_queue))
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
Ok(async move {
|
||||
if let Some(barrier) = maybe_barrier {
|
||||
Self::wait_completion0(barrier).await?;
|
||||
}
|
||||
Ok(())
|
||||
})
|
||||
if let Some(barrier) = maybe_barrier {
|
||||
Self::wait_completion0(barrier).await?;
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Removes a gc blocking reason for this timeline if one exists.
|
||||
/// Marks timeline detach ancestor completed for this timeline if it has not been marked as
|
||||
/// such already.
|
||||
///
|
||||
/// A retryable step of timeline detach ancestor.
|
||||
/// ## Panics
|
||||
///
|
||||
/// Returns a future which waits until the completion of the upload.
|
||||
pub(crate) fn schedule_remove_gc_block_reason(
|
||||
/// If the timeline has not been detached from ancestor already.
|
||||
pub(crate) async fn schedule_completed_detach_ancestor_mark_and_wait(
|
||||
self: &Arc<Self>,
|
||||
reason: index::GcBlockingReason,
|
||||
) -> Result<impl std::future::Future<Output = Result<(), WaitCompletionError>>, NotInitialized>
|
||||
{
|
||||
) -> anyhow::Result<()> {
|
||||
let maybe_barrier = {
|
||||
let mut guard = self.upload_queue.lock().unwrap();
|
||||
let upload_queue = guard.initialized_mut()?;
|
||||
|
||||
if let index::GcBlockingReason::DetachAncestor = reason {
|
||||
if !upload_queue
|
||||
.clean
|
||||
.0
|
||||
.lineage
|
||||
.is_detached_from_original_ancestor()
|
||||
{
|
||||
drop(guard);
|
||||
panic!("cannot complete timeline_ancestor_detach while not detached");
|
||||
}
|
||||
}
|
||||
assert!(upload_queue.clean.0.lineage.is_detached_from_ancestor());
|
||||
|
||||
let wanted = |x: Option<&index::GcBlocking>| {
|
||||
x.is_none() || x.is_some_and(|b| !b.blocked_by(reason))
|
||||
};
|
||||
fn wanted(x: Option<&index::GcBlocking>) -> bool {
|
||||
x.is_none() || x.is_some_and(|b| !b.blocked_by_detach_ancestor())
|
||||
}
|
||||
|
||||
let current = upload_queue.dirty.gc_blocking.as_ref();
|
||||
let uploaded = upload_queue.clean.0.gc_blocking.as_ref();
|
||||
@@ -894,27 +838,21 @@ impl RemoteTimelineClient {
|
||||
match (current, uploaded) {
|
||||
(x, y) if wanted(x) && wanted(y) => None,
|
||||
(x, y) if wanted(x) && !wanted(y) => Some(self.schedule_barrier0(upload_queue)),
|
||||
(x, y) => {
|
||||
if !wanted(x) && wanted(y) {
|
||||
warn!(?reason, op="remove", "unexpected: two racing processes to enable and disable a gc blocking reason (remove)");
|
||||
}
|
||||
|
||||
upload_queue.dirty.gc_blocking =
|
||||
current.as_ref().and_then(|x| x.without_reason(reason));
|
||||
_ => {
|
||||
upload_queue.dirty.gc_blocking = current
|
||||
.expect("has to be Some because of wanted()")
|
||||
.without_detach_ancestor();
|
||||
assert!(wanted(upload_queue.dirty.gc_blocking.as_ref()));
|
||||
// FIXME: bogus ?
|
||||
self.schedule_index_upload(upload_queue)?;
|
||||
Some(self.schedule_barrier0(upload_queue))
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
Ok(async move {
|
||||
if let Some(barrier) = maybe_barrier {
|
||||
Self::wait_completion0(barrier).await?;
|
||||
}
|
||||
Ok(())
|
||||
})
|
||||
if let Some(barrier) = maybe_barrier {
|
||||
Self::wait_completion0(barrier).await?;
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Launch an upload operation in the background; the file is added to be included in next
|
||||
@@ -1495,18 +1433,6 @@ impl RemoteTimelineClient {
|
||||
.dirty
|
||||
.layer_metadata
|
||||
.drain()
|
||||
.filter(|(_file_name, meta)| {
|
||||
// Filter out layers that belonged to an ancestor shard. Since we are deleting the whole timeline from
|
||||
// all shards anyway, we _could_ delete these, but
|
||||
// - it creates a potential race if other shards are still
|
||||
// using the layers while this shard deletes them.
|
||||
// - it means that if we rolled back the shard split, the ancestor shards would be in a state where
|
||||
// these timelines are present but corrupt (their index exists but some layers don't)
|
||||
//
|
||||
// These layers will eventually be cleaned up by the scrubber when it does physical GC.
|
||||
meta.shard.shard_number == self.tenant_shard_id.shard_number
|
||||
&& meta.shard.shard_count == self.tenant_shard_id.shard_count
|
||||
})
|
||||
.map(|(file_name, meta)| {
|
||||
remote_layer_path(
|
||||
&self.tenant_shard_id.tenant_id,
|
||||
|
||||
@@ -32,10 +32,6 @@ pub struct IndexPart {
|
||||
#[serde(skip_serializing_if = "Option::is_none")]
|
||||
pub deleted_at: Option<NaiveDateTime>,
|
||||
|
||||
#[serde(default)]
|
||||
#[serde(skip_serializing_if = "Option::is_none")]
|
||||
pub archived_at: Option<NaiveDateTime>,
|
||||
|
||||
/// Per layer file name metadata, which can be present for a present or missing layer file.
|
||||
///
|
||||
/// Older versions of `IndexPart` will not have this property or have only a part of metadata
|
||||
@@ -87,12 +83,11 @@ impl IndexPart {
|
||||
/// - 5: lineage was added
|
||||
/// - 6: last_aux_file_policy is added.
|
||||
/// - 7: metadata_bytes is no longer written, but still read
|
||||
/// - 8: added `archived_at`
|
||||
/// - 9: +gc_blocking
|
||||
const LATEST_VERSION: usize = 9;
|
||||
/// - 8: +gc_blocking
|
||||
const LATEST_VERSION: usize = 8;
|
||||
|
||||
// Versions we may see when reading from a bucket.
|
||||
pub const KNOWN_VERSIONS: &'static [usize] = &[1, 2, 3, 4, 5, 6, 7, 8, 9];
|
||||
pub const KNOWN_VERSIONS: &'static [usize] = &[1, 2, 3, 4, 5, 6, 7, 8];
|
||||
|
||||
pub const FILE_NAME: &'static str = "index_part.json";
|
||||
|
||||
@@ -103,7 +98,6 @@ impl IndexPart {
|
||||
disk_consistent_lsn: metadata.disk_consistent_lsn(),
|
||||
metadata,
|
||||
deleted_at: None,
|
||||
archived_at: None,
|
||||
lineage: Default::default(),
|
||||
gc_blocking: None,
|
||||
last_aux_file_policy: None,
|
||||
@@ -216,26 +210,45 @@ fn is_false(b: &bool) -> bool {
|
||||
impl Lineage {
|
||||
const REMEMBER_AT_MOST: usize = 100;
|
||||
|
||||
pub(crate) fn record_previous_ancestor(&mut self, old_ancestor: &TimelineId) {
|
||||
pub(crate) fn record_previous_ancestor(&mut self, old_ancestor: &TimelineId) -> bool {
|
||||
if self.reparenting_history.last() == Some(old_ancestor) {
|
||||
// do not re-record it
|
||||
return;
|
||||
}
|
||||
false
|
||||
} else {
|
||||
#[cfg(feature = "testing")]
|
||||
{
|
||||
let existing = self
|
||||
.reparenting_history
|
||||
.iter()
|
||||
.position(|x| x == old_ancestor);
|
||||
assert_eq!(
|
||||
existing, None,
|
||||
"we cannot reparent onto and off and onto the same timeline twice"
|
||||
);
|
||||
}
|
||||
let drop_oldest = self.reparenting_history.len() + 1 >= Self::REMEMBER_AT_MOST;
|
||||
|
||||
let drop_oldest = self.reparenting_history.len() + 1 >= Self::REMEMBER_AT_MOST;
|
||||
|
||||
self.reparenting_history_truncated |= drop_oldest;
|
||||
if drop_oldest {
|
||||
self.reparenting_history.remove(0);
|
||||
self.reparenting_history_truncated |= drop_oldest;
|
||||
if drop_oldest {
|
||||
self.reparenting_history.remove(0);
|
||||
}
|
||||
self.reparenting_history.push(*old_ancestor);
|
||||
true
|
||||
}
|
||||
self.reparenting_history.push(*old_ancestor);
|
||||
}
|
||||
|
||||
pub(crate) fn record_detaching(&mut self, branchpoint: &(TimelineId, Lsn)) {
|
||||
assert!(self.original_ancestor.is_none());
|
||||
|
||||
self.original_ancestor =
|
||||
Some((branchpoint.0, branchpoint.1, chrono::Utc::now().naive_utc()));
|
||||
/// Returns true if anything changed.
|
||||
pub(crate) fn record_detaching(&mut self, branchpoint: &(TimelineId, Lsn)) -> bool {
|
||||
if let Some((id, lsn, _)) = self.original_ancestor {
|
||||
assert_eq!(id, branchpoint.0);
|
||||
assert_eq!(lsn, branchpoint.1);
|
||||
false
|
||||
} else {
|
||||
assert!(self.original_ancestor.is_none());
|
||||
self.original_ancestor =
|
||||
Some((branchpoint.0, branchpoint.1, chrono::Utc::now().naive_utc()));
|
||||
true
|
||||
}
|
||||
}
|
||||
|
||||
/// The queried lsn is most likely the basebackup lsn, and this answers question "is it allowed
|
||||
@@ -247,70 +260,50 @@ impl Lineage {
|
||||
.is_some_and(|(_, ancestor_lsn, _)| ancestor_lsn == lsn)
|
||||
}
|
||||
|
||||
pub(crate) fn is_detached_from_original_ancestor(&self) -> bool {
|
||||
/// Returns true if the timeline originally had an ancestor, and no longer has one.
|
||||
pub(crate) fn is_detached_from_ancestor(&self) -> bool {
|
||||
self.original_ancestor.is_some()
|
||||
}
|
||||
|
||||
/// Returns original ancestor timeline id and lsn that this timeline has been detached from.
|
||||
pub(crate) fn detached_previous_ancestor(&self) -> Option<(TimelineId, Lsn)> {
|
||||
self.original_ancestor.map(|(id, lsn, _)| (id, lsn))
|
||||
}
|
||||
|
||||
pub(crate) fn is_reparented(&self) -> bool {
|
||||
!self.reparenting_history.is_empty()
|
||||
}
|
||||
}
|
||||
|
||||
/// Right now, the only reason to block gc persistently is detach_ancestor. To use gc blocking more
|
||||
/// broadly, a reason set field needs to be added, and the shared state load time building be
|
||||
/// complicated to avoid detach_ancestor clearing out a manually configured gc blocking.
|
||||
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
|
||||
pub(crate) struct GcBlocking {
|
||||
pub(crate) started_at: NaiveDateTime,
|
||||
pub(crate) reasons: enumset::EnumSet<GcBlockingReason>,
|
||||
}
|
||||
|
||||
#[derive(Debug, enumset::EnumSetType, serde::Serialize, serde::Deserialize)]
|
||||
#[enumset(serialize_repr = "list")]
|
||||
pub(crate) enum GcBlockingReason {
|
||||
Manual,
|
||||
DetachAncestor,
|
||||
}
|
||||
|
||||
impl GcBlocking {
|
||||
pub(super) fn started_now_for(reason: GcBlockingReason) -> Self {
|
||||
pub(super) fn started_now_for_detach_ancestor() -> Self {
|
||||
GcBlocking {
|
||||
started_at: chrono::Utc::now().naive_utc(),
|
||||
reasons: enumset::EnumSet::only(reason),
|
||||
}
|
||||
}
|
||||
|
||||
/// Returns true if the given reason is one of the reasons why the gc is blocked.
|
||||
pub(crate) fn blocked_by(&self, reason: GcBlockingReason) -> bool {
|
||||
self.reasons.contains(reason)
|
||||
/// Returns true if detach_ancestor is one of the reasons why the gc is blocked.
|
||||
pub(crate) fn blocked_by_detach_ancestor(&self) -> bool {
|
||||
true
|
||||
}
|
||||
|
||||
/// Returns a version of self with the given reason.
|
||||
pub(super) fn with_reason(&self, reason: GcBlockingReason) -> Self {
|
||||
assert!(!self.blocked_by(reason));
|
||||
let mut reasons = self.reasons;
|
||||
reasons.insert(reason);
|
||||
|
||||
Self {
|
||||
started_at: self.started_at,
|
||||
reasons,
|
||||
}
|
||||
/// Returns a version of self with the reason of detach_ancestor.
|
||||
pub(super) fn with_detach_ancestor(&self) -> Self {
|
||||
self.clone()
|
||||
}
|
||||
|
||||
/// Returns a version of self without the given reason. Assumption is that if
|
||||
/// there are no more reasons, we can unblock the gc by returning `None`.
|
||||
pub(super) fn without_reason(&self, reason: GcBlockingReason) -> Option<Self> {
|
||||
assert!(self.blocked_by(reason));
|
||||
|
||||
if self.reasons.len() == 1 {
|
||||
None
|
||||
} else {
|
||||
let mut reasons = self.reasons;
|
||||
assert!(reasons.remove(reason));
|
||||
assert!(!reasons.is_empty());
|
||||
|
||||
Some(Self {
|
||||
started_at: self.started_at,
|
||||
reasons,
|
||||
})
|
||||
}
|
||||
/// Returns a version of self without the reason of detach_ancestor. Assumption is that if
|
||||
/// there are no more reasons, we can unblock the gc.
|
||||
pub(super) fn without_detach_ancestor(&self) -> Option<Self> {
|
||||
None
|
||||
}
|
||||
}
|
||||
|
||||
@@ -353,7 +346,6 @@ mod tests {
|
||||
disk_consistent_lsn: "0/16960E8".parse::<Lsn>().unwrap(),
|
||||
metadata: TimelineMetadata::from_bytes(&[113,11,159,210,0,54,0,4,0,0,0,0,1,105,96,232,1,0,0,0,0,1,105,96,112,0,0,0,0,0,0,0,0,0,0,0,0,0,1,105,96,112,0,0,0,0,1,105,96,112,0,0,0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0]).unwrap(),
|
||||
deleted_at: None,
|
||||
archived_at: None,
|
||||
lineage: Lineage::default(),
|
||||
gc_blocking: None,
|
||||
last_aux_file_policy: None,
|
||||
@@ -397,7 +389,6 @@ mod tests {
|
||||
disk_consistent_lsn: "0/16960E8".parse::<Lsn>().unwrap(),
|
||||
metadata: TimelineMetadata::from_bytes(&[113,11,159,210,0,54,0,4,0,0,0,0,1,105,96,232,1,0,0,0,0,1,105,96,112,0,0,0,0,0,0,0,0,0,0,0,0,0,1,105,96,112,0,0,0,0,1,105,96,112,0,0,0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0]).unwrap(),
|
||||
deleted_at: None,
|
||||
archived_at: None,
|
||||
lineage: Lineage::default(),
|
||||
gc_blocking: None,
|
||||
last_aux_file_policy: None,
|
||||
@@ -442,7 +433,6 @@ mod tests {
|
||||
disk_consistent_lsn: "0/16960E8".parse::<Lsn>().unwrap(),
|
||||
metadata: TimelineMetadata::from_bytes(&[113,11,159,210,0,54,0,4,0,0,0,0,1,105,96,232,1,0,0,0,0,1,105,96,112,0,0,0,0,0,0,0,0,0,0,0,0,0,1,105,96,112,0,0,0,0,1,105,96,112,0,0,0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0]).unwrap(),
|
||||
deleted_at: Some(parse_naive_datetime("2023-07-31T09:00:00.123000000")),
|
||||
archived_at: None,
|
||||
lineage: Lineage::default(),
|
||||
gc_blocking: None,
|
||||
last_aux_file_policy: None,
|
||||
@@ -490,7 +480,6 @@ mod tests {
|
||||
])
|
||||
.unwrap(),
|
||||
deleted_at: None,
|
||||
archived_at: None,
|
||||
lineage: Lineage::default(),
|
||||
gc_blocking: None,
|
||||
last_aux_file_policy: None,
|
||||
@@ -533,7 +522,6 @@ mod tests {
|
||||
disk_consistent_lsn: "0/16960E8".parse::<Lsn>().unwrap(),
|
||||
metadata: TimelineMetadata::from_bytes(&[113,11,159,210,0,54,0,4,0,0,0,0,1,105,96,232,1,0,0,0,0,1,105,96,112,0,0,0,0,0,0,0,0,0,0,0,0,0,1,105,96,112,0,0,0,0,1,105,96,112,0,0,0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0]).unwrap(),
|
||||
deleted_at: Some(parse_naive_datetime("2023-07-31T09:00:00.123000000")),
|
||||
archived_at: None,
|
||||
lineage: Lineage::default(),
|
||||
gc_blocking: None,
|
||||
last_aux_file_policy: None,
|
||||
@@ -575,7 +563,6 @@ mod tests {
|
||||
disk_consistent_lsn: Lsn::from_str("0/15A7618").unwrap(),
|
||||
metadata: TimelineMetadata::from_bytes(&[226,88,25,241,0,46,0,4,0,0,0,0,1,90,118,24,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,78,244,32,0,0,0,0,1,78,244,32,0,0,0,16,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0]).unwrap(),
|
||||
deleted_at: None,
|
||||
archived_at: None,
|
||||
lineage: Lineage {
|
||||
reparenting_history_truncated: false,
|
||||
reparenting_history: vec![TimelineId::from_str("e1bfd8c633d713d279e6fcd2bcc15b6d").unwrap()],
|
||||
@@ -626,7 +613,6 @@ mod tests {
|
||||
disk_consistent_lsn: "0/16960E8".parse::<Lsn>().unwrap(),
|
||||
metadata: TimelineMetadata::from_bytes(&[113,11,159,210,0,54,0,4,0,0,0,0,1,105,96,232,1,0,0,0,0,1,105,96,112,0,0,0,0,0,0,0,0,0,0,0,0,0,1,105,96,112,0,0,0,0,1,105,96,112,0,0,0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0]).unwrap(),
|
||||
deleted_at: Some(parse_naive_datetime("2023-07-31T09:00:00.123000000")),
|
||||
archived_at: None,
|
||||
lineage: Lineage {
|
||||
reparenting_history_truncated: false,
|
||||
reparenting_history: vec![TimelineId::from_str("e1bfd8c633d713d279e6fcd2bcc15b6d").unwrap()],
|
||||
@@ -686,7 +672,6 @@ mod tests {
|
||||
14,
|
||||
).with_recalculated_checksum().unwrap(),
|
||||
deleted_at: Some(parse_naive_datetime("2023-07-31T09:00:00.123000000")),
|
||||
archived_at: None,
|
||||
lineage: Default::default(),
|
||||
gc_blocking: None,
|
||||
last_aux_file_policy: Default::default(),
|
||||
@@ -714,8 +699,9 @@ mod tests {
|
||||
"initdb_lsn": "0/1696070",
|
||||
"pg_version": 14
|
||||
},
|
||||
"deleted_at": "2023-07-31T09:00:00.123",
|
||||
"archived_at": "2023-04-29T09:00:00.123"
|
||||
"gc_blocking": {
|
||||
"started_at": "2024-07-19T09:00:00.123"
|
||||
}
|
||||
}"#;
|
||||
|
||||
let expected = IndexPart {
|
||||
@@ -742,73 +728,12 @@ mod tests {
|
||||
Lsn::from_str("0/1696070").unwrap(),
|
||||
14,
|
||||
).with_recalculated_checksum().unwrap(),
|
||||
deleted_at: Some(parse_naive_datetime("2023-07-31T09:00:00.123000000")),
|
||||
archived_at: Some(parse_naive_datetime("2023-04-29T09:00:00.123000000")),
|
||||
lineage: Default::default(),
|
||||
gc_blocking: None,
|
||||
last_aux_file_policy: Default::default(),
|
||||
};
|
||||
|
||||
let part = IndexPart::from_s3_bytes(example.as_bytes()).unwrap();
|
||||
assert_eq!(part, expected);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn v9_indexpart_is_parsed() {
|
||||
let example = r#"{
|
||||
"version": 9,
|
||||
"layer_metadata":{
|
||||
"000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9": { "file_size": 25600000 },
|
||||
"000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D8-00000000016B5A51": { "file_size": 9007199254741001 }
|
||||
},
|
||||
"disk_consistent_lsn":"0/16960E8",
|
||||
"metadata": {
|
||||
"disk_consistent_lsn": "0/16960E8",
|
||||
"prev_record_lsn": "0/1696070",
|
||||
"ancestor_timeline": "e45a7f37d3ee2ff17dc14bf4f4e3f52e",
|
||||
"ancestor_lsn": "0/0",
|
||||
"latest_gc_cutoff_lsn": "0/1696070",
|
||||
"initdb_lsn": "0/1696070",
|
||||
"pg_version": 14
|
||||
},
|
||||
"gc_blocking": {
|
||||
"started_at": "2024-07-19T09:00:00.123",
|
||||
"reasons": ["DetachAncestor"]
|
||||
}
|
||||
}"#;
|
||||
|
||||
let expected = IndexPart {
|
||||
version: 9,
|
||||
layer_metadata: HashMap::from([
|
||||
("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9".parse().unwrap(), LayerFileMetadata {
|
||||
file_size: 25600000,
|
||||
generation: Generation::none(),
|
||||
shard: ShardIndex::unsharded()
|
||||
}),
|
||||
("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D8-00000000016B5A51".parse().unwrap(), LayerFileMetadata {
|
||||
file_size: 9007199254741001,
|
||||
generation: Generation::none(),
|
||||
shard: ShardIndex::unsharded()
|
||||
})
|
||||
]),
|
||||
disk_consistent_lsn: "0/16960E8".parse::<Lsn>().unwrap(),
|
||||
metadata: TimelineMetadata::new(
|
||||
Lsn::from_str("0/16960E8").unwrap(),
|
||||
Some(Lsn::from_str("0/1696070").unwrap()),
|
||||
Some(TimelineId::from_str("e45a7f37d3ee2ff17dc14bf4f4e3f52e").unwrap()),
|
||||
Lsn::INVALID,
|
||||
Lsn::from_str("0/1696070").unwrap(),
|
||||
Lsn::from_str("0/1696070").unwrap(),
|
||||
14,
|
||||
).with_recalculated_checksum().unwrap(),
|
||||
deleted_at: None,
|
||||
lineage: Default::default(),
|
||||
gc_blocking: Some(GcBlocking {
|
||||
started_at: parse_naive_datetime("2024-07-19T09:00:00.123000000"),
|
||||
reasons: enumset::EnumSet::from_iter([GcBlockingReason::DetachAncestor]),
|
||||
}),
|
||||
last_aux_file_policy: Default::default(),
|
||||
archived_at: None,
|
||||
};
|
||||
|
||||
let part = IndexPart::from_s3_bytes(example.as_bytes()).unwrap();
|
||||
|
||||
@@ -8,9 +8,6 @@ mod layer_desc;
|
||||
mod layer_name;
|
||||
pub mod merge_iterator;
|
||||
|
||||
#[cfg(test)]
|
||||
pub mod split_writer;
|
||||
|
||||
use crate::context::{AccessStatsBehavior, RequestContext};
|
||||
use crate::repository::Value;
|
||||
use crate::walrecord::NeonWalRecord;
|
||||
@@ -435,18 +432,39 @@ impl ReadableLayer {
|
||||
}
|
||||
}
|
||||
|
||||
/// Return value from [`Layer::get_value_reconstruct_data`]
|
||||
#[derive(Clone, Copy, Debug)]
|
||||
pub enum ValueReconstructResult {
|
||||
/// Got all the data needed to reconstruct the requested page
|
||||
Complete,
|
||||
/// This layer didn't contain all the required data, the caller should look up
|
||||
/// the predecessor layer at the returned LSN and collect more data from there.
|
||||
Continue,
|
||||
|
||||
/// This layer didn't contain data needed to reconstruct the page version at
|
||||
/// the returned LSN. This is usually considered an error, but might be OK
|
||||
/// in some circumstances.
|
||||
Missing,
|
||||
}
|
||||
|
||||
/// Layers contain a hint indicating whether they are likely to be used for reads. This is a hint rather
|
||||
/// than an authoritative value, so that we do not have to update it synchronously when changing the visibility
|
||||
/// of layers (for example when creating a branch that makes some previously covered layers visible). It should
|
||||
/// be used for cache management but not for correctness-critical checks.
|
||||
#[derive(Debug, Clone, PartialEq, Eq)]
|
||||
pub enum LayerVisibilityHint {
|
||||
#[derive(Default, Debug, Clone, PartialEq, Eq)]
|
||||
pub(crate) enum LayerVisibilityHint {
|
||||
/// A Visible layer might be read while serving a read, because there is not an image layer between it
|
||||
/// and a readable LSN (the tip of the branch or a child's branch point)
|
||||
Visible,
|
||||
/// A Covered layer probably won't be read right now, but _can_ be read in future if someone creates
|
||||
/// a branch or ephemeral endpoint at an LSN below the layer that covers this.
|
||||
#[allow(unused)]
|
||||
Covered,
|
||||
/// Calculating layer visibilty requires I/O, so until this has happened layers are loaded
|
||||
/// in this state. Note that newly written layers may be called Visible immediately, this uninitialized
|
||||
/// state is for when existing layers are constructed while loading a timeline.
|
||||
#[default]
|
||||
Uninitialized,
|
||||
}
|
||||
|
||||
pub(crate) struct LayerAccessStats(std::sync::atomic::AtomicU64);
|
||||
@@ -539,25 +557,19 @@ impl LayerAccessStats {
|
||||
self.record_residence_event_at(SystemTime::now())
|
||||
}
|
||||
|
||||
fn record_access_at(&self, now: SystemTime) -> bool {
|
||||
pub(crate) fn record_access_at(&self, now: SystemTime) {
|
||||
let (mut mask, mut value) = Self::to_low_res_timestamp(Self::ATIME_SHIFT, now);
|
||||
|
||||
// A layer which is accessed must be visible.
|
||||
mask |= 0x1 << Self::VISIBILITY_SHIFT;
|
||||
value |= 0x1 << Self::VISIBILITY_SHIFT;
|
||||
|
||||
let old_bits = self.write_bits(mask, value);
|
||||
!matches!(
|
||||
self.decode_visibility(old_bits),
|
||||
LayerVisibilityHint::Visible
|
||||
)
|
||||
self.write_bits(mask, value);
|
||||
}
|
||||
|
||||
/// Returns true if we modified the layer's visibility to set it to Visible implicitly
|
||||
/// as a result of this access
|
||||
pub(crate) fn record_access(&self, ctx: &RequestContext) -> bool {
|
||||
pub(crate) fn record_access(&self, ctx: &RequestContext) {
|
||||
if ctx.access_stats_behavior() == AccessStatsBehavior::Skip {
|
||||
return false;
|
||||
return;
|
||||
}
|
||||
|
||||
self.record_access_at(SystemTime::now())
|
||||
@@ -614,29 +626,22 @@ impl LayerAccessStats {
|
||||
}
|
||||
}
|
||||
|
||||
/// Helper for extracting the visibility hint from the literal value of our inner u64
|
||||
fn decode_visibility(&self, bits: u64) -> LayerVisibilityHint {
|
||||
match (bits >> Self::VISIBILITY_SHIFT) & 0x1 {
|
||||
1 => LayerVisibilityHint::Visible,
|
||||
0 => LayerVisibilityHint::Covered,
|
||||
_ => unreachable!(),
|
||||
}
|
||||
}
|
||||
|
||||
/// Returns the old value which has been replaced
|
||||
pub(crate) fn set_visibility(&self, visibility: LayerVisibilityHint) -> LayerVisibilityHint {
|
||||
pub(crate) fn set_visibility(&self, visibility: LayerVisibilityHint) {
|
||||
let value = match visibility {
|
||||
LayerVisibilityHint::Visible => 0x1 << Self::VISIBILITY_SHIFT,
|
||||
LayerVisibilityHint::Covered => 0x0,
|
||||
LayerVisibilityHint::Covered | LayerVisibilityHint::Uninitialized => 0x0,
|
||||
};
|
||||
|
||||
let old_bits = self.write_bits(0x1 << Self::VISIBILITY_SHIFT, value);
|
||||
self.decode_visibility(old_bits)
|
||||
self.write_bits(0x1 << Self::VISIBILITY_SHIFT, value);
|
||||
}
|
||||
|
||||
pub(crate) fn visibility(&self) -> LayerVisibilityHint {
|
||||
let read = self.0.load(std::sync::atomic::Ordering::Relaxed);
|
||||
self.decode_visibility(read)
|
||||
match (read >> Self::VISIBILITY_SHIFT) & 0x1 {
|
||||
1 => LayerVisibilityHint::Visible,
|
||||
0 => LayerVisibilityHint::Covered,
|
||||
_ => unreachable!(),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -36,12 +36,13 @@ use crate::tenant::block_io::{BlockBuf, BlockCursor, BlockLease, BlockReader, Fi
|
||||
use crate::tenant::disk_btree::{
|
||||
DiskBtreeBuilder, DiskBtreeIterator, DiskBtreeReader, VisitDirection,
|
||||
};
|
||||
use crate::tenant::storage_layer::{Layer, ValueReconstructResult, ValueReconstructState};
|
||||
use crate::tenant::timeline::GetVectoredError;
|
||||
use crate::tenant::vectored_blob_io::{
|
||||
BlobFlag, MaxVectoredReadBytes, StreamingVectoredReadPlanner, VectoredBlobReader, VectoredRead,
|
||||
VectoredReadPlanner,
|
||||
};
|
||||
use crate::tenant::PageReconstructError;
|
||||
use crate::tenant::{PageReconstructError, Timeline};
|
||||
use crate::virtual_file::{self, VirtualFile};
|
||||
use crate::{walrecord, TEMP_FILE_SUFFIX};
|
||||
use crate::{DELTA_FILE_MAGIC, STORAGE_FORMAT_VERSION};
|
||||
@@ -71,7 +72,10 @@ use utils::{
|
||||
lsn::Lsn,
|
||||
};
|
||||
|
||||
use super::{AsLayerDesc, LayerName, PersistentLayerDesc, ValuesReconstructState};
|
||||
use super::{
|
||||
AsLayerDesc, LayerAccessStats, LayerName, PersistentLayerDesc, ResidentLayer,
|
||||
ValuesReconstructState,
|
||||
};
|
||||
|
||||
///
|
||||
/// Header stored in the beginning of the file
|
||||
@@ -196,6 +200,7 @@ impl DeltaKey {
|
||||
pub struct DeltaLayer {
|
||||
path: Utf8PathBuf,
|
||||
pub desc: PersistentLayerDesc,
|
||||
access_stats: LayerAccessStats,
|
||||
inner: OnceCell<Arc<DeltaLayerInner>>,
|
||||
}
|
||||
|
||||
@@ -294,6 +299,7 @@ impl DeltaLayer {
|
||||
/// not loaded already.
|
||||
///
|
||||
async fn load(&self, ctx: &RequestContext) -> Result<&Arc<DeltaLayerInner>> {
|
||||
self.access_stats.record_access(ctx);
|
||||
// Quick exit if already loaded
|
||||
self.inner
|
||||
.get_or_try_init(|| self.load_inner(ctx))
|
||||
@@ -301,10 +307,12 @@ impl DeltaLayer {
|
||||
.with_context(|| format!("Failed to load delta layer {}", self.path()))
|
||||
}
|
||||
|
||||
async fn load_inner(&self, ctx: &RequestContext) -> anyhow::Result<Arc<DeltaLayerInner>> {
|
||||
async fn load_inner(&self, ctx: &RequestContext) -> Result<Arc<DeltaLayerInner>> {
|
||||
let path = self.path();
|
||||
|
||||
let loaded = DeltaLayerInner::load(&path, None, None, ctx).await?;
|
||||
let loaded = DeltaLayerInner::load(&path, None, None, ctx)
|
||||
.await
|
||||
.and_then(|res| res)?;
|
||||
|
||||
// not production code
|
||||
let actual_layer_name = LayerName::from_str(path.file_name().unwrap()).unwrap();
|
||||
@@ -344,6 +352,7 @@ impl DeltaLayer {
|
||||
summary.lsn_range,
|
||||
metadata.len(),
|
||||
),
|
||||
access_stats: Default::default(),
|
||||
inner: OnceCell::new(),
|
||||
})
|
||||
}
|
||||
@@ -366,6 +375,7 @@ impl DeltaLayer {
|
||||
/// 3. Call `finish`.
|
||||
///
|
||||
struct DeltaLayerWriterInner {
|
||||
conf: &'static PageServerConf,
|
||||
pub path: Utf8PathBuf,
|
||||
timeline_id: TimelineId,
|
||||
tenant_shard_id: TenantShardId,
|
||||
@@ -376,9 +386,6 @@ struct DeltaLayerWriterInner {
|
||||
tree: DiskBtreeBuilder<BlockBuf, DELTA_KEY_SIZE>,
|
||||
|
||||
blob_writer: BlobWriter<true>,
|
||||
|
||||
// Number of key-lsns in the layer.
|
||||
num_keys: usize,
|
||||
}
|
||||
|
||||
impl DeltaLayerWriterInner {
|
||||
@@ -412,6 +419,7 @@ impl DeltaLayerWriterInner {
|
||||
let tree_builder = DiskBtreeBuilder::new(block_buf);
|
||||
|
||||
Ok(Self {
|
||||
conf,
|
||||
path,
|
||||
timeline_id,
|
||||
tenant_shard_id,
|
||||
@@ -419,7 +427,6 @@ impl DeltaLayerWriterInner {
|
||||
lsn_range,
|
||||
tree: tree_builder,
|
||||
blob_writer,
|
||||
num_keys: 0,
|
||||
})
|
||||
}
|
||||
|
||||
@@ -462,7 +469,7 @@ impl DeltaLayerWriterInner {
|
||||
.write_blob_maybe_compressed(val, ctx, compression)
|
||||
.await;
|
||||
let off = match res {
|
||||
Ok((off, _)) => off,
|
||||
Ok(off) => off,
|
||||
Err(e) => return (val, Err(anyhow::anyhow!(e))),
|
||||
};
|
||||
|
||||
@@ -470,9 +477,6 @@ impl DeltaLayerWriterInner {
|
||||
|
||||
let delta_key = DeltaKey::from_key_lsn(&key, lsn);
|
||||
let res = self.tree.append(&delta_key.0, blob_ref.0);
|
||||
|
||||
self.num_keys += 1;
|
||||
|
||||
(val, res.map_err(|e| anyhow::anyhow!(e)))
|
||||
}
|
||||
|
||||
@@ -486,10 +490,11 @@ impl DeltaLayerWriterInner {
|
||||
async fn finish(
|
||||
self,
|
||||
key_end: Key,
|
||||
timeline: &Arc<Timeline>,
|
||||
ctx: &RequestContext,
|
||||
) -> anyhow::Result<(PersistentLayerDesc, Utf8PathBuf)> {
|
||||
) -> anyhow::Result<ResidentLayer> {
|
||||
let temp_path = self.path.clone();
|
||||
let result = self.finish0(key_end, ctx).await;
|
||||
let result = self.finish0(key_end, timeline, ctx).await;
|
||||
if result.is_err() {
|
||||
tracing::info!(%temp_path, "cleaning up temporary file after error during writing");
|
||||
if let Err(e) = std::fs::remove_file(&temp_path) {
|
||||
@@ -502,8 +507,9 @@ impl DeltaLayerWriterInner {
|
||||
async fn finish0(
|
||||
self,
|
||||
key_end: Key,
|
||||
timeline: &Arc<Timeline>,
|
||||
ctx: &RequestContext,
|
||||
) -> anyhow::Result<(PersistentLayerDesc, Utf8PathBuf)> {
|
||||
) -> anyhow::Result<ResidentLayer> {
|
||||
let index_start_blk =
|
||||
((self.blob_writer.size() + PAGE_SZ as u64 - 1) / PAGE_SZ as u64) as u32;
|
||||
|
||||
@@ -568,9 +574,11 @@ impl DeltaLayerWriterInner {
|
||||
// fsync the file
|
||||
file.sync_all().await?;
|
||||
|
||||
trace!("created delta layer {}", self.path);
|
||||
let layer = Layer::finish_creating(self.conf, timeline, desc, &self.path)?;
|
||||
|
||||
Ok((desc, self.path))
|
||||
trace!("created delta layer {}", layer.local_path());
|
||||
|
||||
Ok(layer)
|
||||
}
|
||||
}
|
||||
|
||||
@@ -671,20 +679,14 @@ impl DeltaLayerWriter {
|
||||
pub(crate) async fn finish(
|
||||
mut self,
|
||||
key_end: Key,
|
||||
timeline: &Arc<Timeline>,
|
||||
ctx: &RequestContext,
|
||||
) -> anyhow::Result<(PersistentLayerDesc, Utf8PathBuf)> {
|
||||
self.inner.take().unwrap().finish(key_end, ctx).await
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
pub(crate) fn num_keys(&self) -> usize {
|
||||
self.inner.as_ref().unwrap().num_keys
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
pub(crate) fn estimated_size(&self) -> u64 {
|
||||
let inner = self.inner.as_ref().unwrap();
|
||||
inner.blob_writer.size() + inner.tree.borrow_writer().size() + PAGE_SZ as u64
|
||||
) -> anyhow::Result<ResidentLayer> {
|
||||
self.inner
|
||||
.take()
|
||||
.unwrap()
|
||||
.finish(key_end, timeline, ctx)
|
||||
.await
|
||||
}
|
||||
}
|
||||
|
||||
@@ -758,24 +760,27 @@ impl DeltaLayerInner {
|
||||
&self.layer_lsn_range
|
||||
}
|
||||
|
||||
/// Returns nested result following Result<Result<_, OpErr>, Critical>:
|
||||
/// - inner has the success or transient failure
|
||||
/// - outer has the permanent failure
|
||||
pub(super) async fn load(
|
||||
path: &Utf8Path,
|
||||
summary: Option<Summary>,
|
||||
max_vectored_read_bytes: Option<MaxVectoredReadBytes>,
|
||||
ctx: &RequestContext,
|
||||
) -> anyhow::Result<Self> {
|
||||
let file = VirtualFile::open(path, ctx)
|
||||
.await
|
||||
.context("open layer file")?;
|
||||
|
||||
) -> Result<Result<Self, anyhow::Error>, anyhow::Error> {
|
||||
let file = match VirtualFile::open(path, ctx).await {
|
||||
Ok(file) => file,
|
||||
Err(e) => return Ok(Err(anyhow::Error::new(e).context("open layer file"))),
|
||||
};
|
||||
let file_id = page_cache::next_file_id();
|
||||
|
||||
let block_reader = FileBlockReader::new(&file, file_id);
|
||||
|
||||
let summary_blk = block_reader
|
||||
.read_blk(0, ctx)
|
||||
.await
|
||||
.context("read first block")?;
|
||||
let summary_blk = match block_reader.read_blk(0, ctx).await {
|
||||
Ok(blk) => blk,
|
||||
Err(e) => return Ok(Err(anyhow::Error::new(e).context("read first block"))),
|
||||
};
|
||||
|
||||
// TODO: this should be an assertion instead; see ImageLayerInner::load
|
||||
let actual_summary =
|
||||
@@ -797,7 +802,7 @@ impl DeltaLayerInner {
|
||||
}
|
||||
}
|
||||
|
||||
Ok(DeltaLayerInner {
|
||||
Ok(Ok(DeltaLayerInner {
|
||||
file,
|
||||
file_id,
|
||||
index_start_blk: actual_summary.index_start_blk,
|
||||
@@ -805,7 +810,96 @@ impl DeltaLayerInner {
|
||||
max_vectored_read_bytes,
|
||||
layer_key_range: actual_summary.key_range,
|
||||
layer_lsn_range: actual_summary.lsn_range,
|
||||
})
|
||||
}))
|
||||
}
|
||||
|
||||
pub(super) async fn get_value_reconstruct_data(
|
||||
&self,
|
||||
key: Key,
|
||||
lsn_range: Range<Lsn>,
|
||||
reconstruct_state: &mut ValueReconstructState,
|
||||
ctx: &RequestContext,
|
||||
) -> anyhow::Result<ValueReconstructResult> {
|
||||
let mut need_image = true;
|
||||
// Scan the page versions backwards, starting from `lsn`.
|
||||
let block_reader = FileBlockReader::new(&self.file, self.file_id);
|
||||
let tree_reader = DiskBtreeReader::<_, DELTA_KEY_SIZE>::new(
|
||||
self.index_start_blk,
|
||||
self.index_root_blk,
|
||||
&block_reader,
|
||||
);
|
||||
let search_key = DeltaKey::from_key_lsn(&key, Lsn(lsn_range.end.0 - 1));
|
||||
|
||||
let mut offsets: Vec<(Lsn, u64)> = Vec::new();
|
||||
|
||||
tree_reader
|
||||
.visit(
|
||||
&search_key.0,
|
||||
VisitDirection::Backwards,
|
||||
|key, value| {
|
||||
let blob_ref = BlobRef(value);
|
||||
if key[..KEY_SIZE] != search_key.0[..KEY_SIZE] {
|
||||
return false;
|
||||
}
|
||||
let entry_lsn = DeltaKey::extract_lsn_from_buf(key);
|
||||
if entry_lsn < lsn_range.start {
|
||||
return false;
|
||||
}
|
||||
offsets.push((entry_lsn, blob_ref.pos()));
|
||||
|
||||
!blob_ref.will_init()
|
||||
},
|
||||
&RequestContextBuilder::extend(ctx)
|
||||
.page_content_kind(PageContentKind::DeltaLayerBtreeNode)
|
||||
.build(),
|
||||
)
|
||||
.await?;
|
||||
|
||||
let ctx = &RequestContextBuilder::extend(ctx)
|
||||
.page_content_kind(PageContentKind::DeltaLayerValue)
|
||||
.build();
|
||||
|
||||
// Ok, 'offsets' now contains the offsets of all the entries we need to read
|
||||
let cursor = block_reader.block_cursor();
|
||||
let mut buf = Vec::new();
|
||||
for (entry_lsn, pos) in offsets {
|
||||
cursor
|
||||
.read_blob_into_buf(pos, &mut buf, ctx)
|
||||
.await
|
||||
.with_context(|| {
|
||||
format!("Failed to read blob from virtual file {}", self.file.path)
|
||||
})?;
|
||||
let val = Value::des(&buf).with_context(|| {
|
||||
format!(
|
||||
"Failed to deserialize file blob from virtual file {}",
|
||||
self.file.path
|
||||
)
|
||||
})?;
|
||||
match val {
|
||||
Value::Image(img) => {
|
||||
reconstruct_state.img = Some((entry_lsn, img));
|
||||
need_image = false;
|
||||
break;
|
||||
}
|
||||
Value::WalRecord(rec) => {
|
||||
let will_init = rec.will_init();
|
||||
reconstruct_state.records.push((entry_lsn, rec));
|
||||
if will_init {
|
||||
// This WAL record initializes the page, so no need to go further back
|
||||
need_image = false;
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// If an older page image is needed to reconstruct the page, let the
|
||||
// caller know.
|
||||
if need_image {
|
||||
Ok(ValueReconstructResult::Continue)
|
||||
} else {
|
||||
Ok(ValueReconstructResult::Complete)
|
||||
}
|
||||
}
|
||||
|
||||
// Look up the keys in the provided keyspace and update
|
||||
@@ -1580,9 +1674,8 @@ pub(crate) mod test {
|
||||
use super::*;
|
||||
use crate::repository::Value;
|
||||
use crate::tenant::harness::TIMELINE_ID;
|
||||
use crate::tenant::storage_layer::{Layer, ResidentLayer};
|
||||
use crate::tenant::vectored_blob_io::StreamingVectoredReadPlanner;
|
||||
use crate::tenant::{Tenant, Timeline};
|
||||
use crate::tenant::Tenant;
|
||||
use crate::{
|
||||
context::DownloadBehavior,
|
||||
task_mgr::TaskKind,
|
||||
@@ -1876,8 +1969,9 @@ pub(crate) mod test {
|
||||
res?;
|
||||
}
|
||||
|
||||
let (desc, path) = writer.finish(entries_meta.key_range.end, &ctx).await?;
|
||||
let resident = Layer::finish_creating(harness.conf, &timeline, desc, &path)?;
|
||||
let resident = writer
|
||||
.finish(entries_meta.key_range.end, &timeline, &ctx)
|
||||
.await?;
|
||||
|
||||
let inner = resident.get_as_delta(&ctx).await?;
|
||||
|
||||
@@ -1957,7 +2051,6 @@ pub(crate) mod test {
|
||||
.await
|
||||
.likely_resident_layers()
|
||||
.next()
|
||||
.cloned()
|
||||
.unwrap();
|
||||
|
||||
{
|
||||
@@ -2032,8 +2125,7 @@ pub(crate) mod test {
|
||||
.read()
|
||||
.await
|
||||
.likely_resident_layers()
|
||||
.find(|&x| x != &initdb_layer)
|
||||
.cloned()
|
||||
.find(|x| x != &initdb_layer)
|
||||
.unwrap();
|
||||
|
||||
// create a copy for the timeline, so we don't overwrite the file
|
||||
@@ -2068,8 +2160,7 @@ pub(crate) mod test {
|
||||
.await
|
||||
.unwrap();
|
||||
|
||||
let (desc, path) = writer.finish(Key::MAX, ctx).await.unwrap();
|
||||
let copied_layer = Layer::finish_creating(tenant.conf, &branch, desc, &path).unwrap();
|
||||
let copied_layer = writer.finish(Key::MAX, &branch, ctx).await.unwrap();
|
||||
|
||||
copied_layer.get_as_delta(ctx).await.unwrap();
|
||||
|
||||
@@ -2197,9 +2288,7 @@ pub(crate) mod test {
|
||||
for (key, lsn, value) in deltas {
|
||||
writer.put_value(key, lsn, value, ctx).await?;
|
||||
}
|
||||
|
||||
let (desc, path) = writer.finish(key_end, ctx).await?;
|
||||
let delta_layer = Layer::finish_creating(tenant.conf, tline, desc, &path)?;
|
||||
let delta_layer = writer.finish(key_end, tline, ctx).await?;
|
||||
|
||||
Ok::<_, anyhow::Error>(delta_layer)
|
||||
}
|
||||
|
||||
@@ -32,6 +32,9 @@ use crate::tenant::block_io::{BlockBuf, BlockReader, FileBlockReader};
|
||||
use crate::tenant::disk_btree::{
|
||||
DiskBtreeBuilder, DiskBtreeIterator, DiskBtreeReader, VisitDirection,
|
||||
};
|
||||
use crate::tenant::storage_layer::{
|
||||
LayerAccessStats, ValueReconstructResult, ValueReconstructState,
|
||||
};
|
||||
use crate::tenant::timeline::GetVectoredError;
|
||||
use crate::tenant::vectored_blob_io::{
|
||||
BlobFlag, MaxVectoredReadBytes, StreamingVectoredReadPlanner, VectoredBlobReader, VectoredRead,
|
||||
@@ -134,6 +137,7 @@ pub struct ImageLayer {
|
||||
pub desc: PersistentLayerDesc,
|
||||
// This entry contains an image of all pages as of this LSN, should be the same as desc.lsn
|
||||
pub lsn: Lsn,
|
||||
access_stats: LayerAccessStats,
|
||||
inner: OnceCell<ImageLayerInner>,
|
||||
}
|
||||
|
||||
@@ -251,6 +255,7 @@ impl ImageLayer {
|
||||
/// not loaded already.
|
||||
///
|
||||
async fn load(&self, ctx: &RequestContext) -> Result<&ImageLayerInner> {
|
||||
self.access_stats.record_access(ctx);
|
||||
self.inner
|
||||
.get_or_try_init(|| self.load_inner(ctx))
|
||||
.await
|
||||
@@ -260,8 +265,9 @@ impl ImageLayer {
|
||||
async fn load_inner(&self, ctx: &RequestContext) -> Result<ImageLayerInner> {
|
||||
let path = self.path();
|
||||
|
||||
let loaded =
|
||||
ImageLayerInner::load(&path, self.desc.image_layer_lsn(), None, None, ctx).await?;
|
||||
let loaded = ImageLayerInner::load(&path, self.desc.image_layer_lsn(), None, None, ctx)
|
||||
.await
|
||||
.and_then(|res| res)?;
|
||||
|
||||
// not production code
|
||||
let actual_layer_name = LayerName::from_str(path.file_name().unwrap()).unwrap();
|
||||
@@ -301,6 +307,7 @@ impl ImageLayer {
|
||||
metadata.len(),
|
||||
), // Now we assume image layer ALWAYS covers the full range. This may change in the future.
|
||||
lsn: summary.lsn,
|
||||
access_stats: Default::default(),
|
||||
inner: OnceCell::new(),
|
||||
})
|
||||
}
|
||||
@@ -378,16 +385,17 @@ impl ImageLayerInner {
|
||||
summary: Option<Summary>,
|
||||
max_vectored_read_bytes: Option<MaxVectoredReadBytes>,
|
||||
ctx: &RequestContext,
|
||||
) -> anyhow::Result<Self> {
|
||||
let file = VirtualFile::open(path, ctx)
|
||||
.await
|
||||
.context("open layer file")?;
|
||||
) -> Result<Result<Self, anyhow::Error>, anyhow::Error> {
|
||||
let file = match VirtualFile::open(path, ctx).await {
|
||||
Ok(file) => file,
|
||||
Err(e) => return Ok(Err(anyhow::Error::new(e).context("open layer file"))),
|
||||
};
|
||||
let file_id = page_cache::next_file_id();
|
||||
let block_reader = FileBlockReader::new(&file, file_id);
|
||||
let summary_blk = block_reader
|
||||
.read_blk(0, ctx)
|
||||
.await
|
||||
.context("read first block")?;
|
||||
let summary_blk = match block_reader.read_blk(0, ctx).await {
|
||||
Ok(blk) => blk,
|
||||
Err(e) => return Ok(Err(anyhow::Error::new(e).context("read first block"))),
|
||||
};
|
||||
|
||||
// length is the only way how this could fail, so it's not actually likely at all unless
|
||||
// read_blk returns wrong sized block.
|
||||
@@ -412,7 +420,7 @@ impl ImageLayerInner {
|
||||
}
|
||||
}
|
||||
|
||||
Ok(ImageLayerInner {
|
||||
Ok(Ok(ImageLayerInner {
|
||||
index_start_blk: actual_summary.index_start_blk,
|
||||
index_root_blk: actual_summary.index_root_blk,
|
||||
lsn,
|
||||
@@ -420,7 +428,47 @@ impl ImageLayerInner {
|
||||
file_id,
|
||||
max_vectored_read_bytes,
|
||||
key_range: actual_summary.key_range,
|
||||
})
|
||||
}))
|
||||
}
|
||||
|
||||
pub(super) async fn get_value_reconstruct_data(
|
||||
&self,
|
||||
key: Key,
|
||||
reconstruct_state: &mut ValueReconstructState,
|
||||
ctx: &RequestContext,
|
||||
) -> anyhow::Result<ValueReconstructResult> {
|
||||
let block_reader = FileBlockReader::new(&self.file, self.file_id);
|
||||
let tree_reader =
|
||||
DiskBtreeReader::new(self.index_start_blk, self.index_root_blk, &block_reader);
|
||||
|
||||
let mut keybuf: [u8; KEY_SIZE] = [0u8; KEY_SIZE];
|
||||
key.write_to_byte_slice(&mut keybuf);
|
||||
if let Some(offset) = tree_reader
|
||||
.get(
|
||||
&keybuf,
|
||||
&RequestContextBuilder::extend(ctx)
|
||||
.page_content_kind(PageContentKind::ImageLayerBtreeNode)
|
||||
.build(),
|
||||
)
|
||||
.await?
|
||||
{
|
||||
let blob = block_reader
|
||||
.block_cursor()
|
||||
.read_blob(
|
||||
offset,
|
||||
&RequestContextBuilder::extend(ctx)
|
||||
.page_content_kind(PageContentKind::ImageLayerValue)
|
||||
.build(),
|
||||
)
|
||||
.await
|
||||
.with_context(|| format!("failed to read value from offset {}", offset))?;
|
||||
let value = Bytes::from(blob);
|
||||
|
||||
reconstruct_state.img = Some((self.lsn, value));
|
||||
Ok(ValueReconstructResult::Complete)
|
||||
} else {
|
||||
Ok(ValueReconstructResult::Missing)
|
||||
}
|
||||
}
|
||||
|
||||
// Look up the keys in the provided keyspace and update
|
||||
@@ -688,29 +736,11 @@ struct ImageLayerWriterInner {
|
||||
// Total uncompressed bytes passed into put_image
|
||||
uncompressed_bytes: u64,
|
||||
|
||||
// Like `uncompressed_bytes`,
|
||||
// but only of images we might consider for compression
|
||||
uncompressed_bytes_eligible: u64,
|
||||
|
||||
// Like `uncompressed_bytes`, but only of images
|
||||
// where we have chosen their compressed form
|
||||
uncompressed_bytes_chosen: u64,
|
||||
|
||||
// Number of keys in the layer.
|
||||
num_keys: usize,
|
||||
|
||||
blob_writer: BlobWriter<false>,
|
||||
tree: DiskBtreeBuilder<BlockBuf, KEY_SIZE>,
|
||||
|
||||
#[cfg_attr(not(feature = "testing"), allow(dead_code))]
|
||||
last_written_key: Key,
|
||||
}
|
||||
|
||||
impl ImageLayerWriterInner {
|
||||
fn size(&self) -> u64 {
|
||||
self.tree.borrow_writer().size() + self.blob_writer.size()
|
||||
}
|
||||
|
||||
///
|
||||
/// Start building a new image layer.
|
||||
///
|
||||
@@ -762,10 +792,6 @@ impl ImageLayerWriterInner {
|
||||
tree: tree_builder,
|
||||
blob_writer,
|
||||
uncompressed_bytes: 0,
|
||||
uncompressed_bytes_eligible: 0,
|
||||
uncompressed_bytes_chosen: 0,
|
||||
num_keys: 0,
|
||||
last_written_key: Key::MIN,
|
||||
};
|
||||
|
||||
Ok(writer)
|
||||
@@ -784,33 +810,18 @@ impl ImageLayerWriterInner {
|
||||
) -> anyhow::Result<()> {
|
||||
ensure!(self.key_range.contains(&key));
|
||||
let compression = self.conf.image_compression;
|
||||
let uncompressed_len = img.len() as u64;
|
||||
self.uncompressed_bytes += uncompressed_len;
|
||||
self.num_keys += 1;
|
||||
self.uncompressed_bytes += img.len() as u64;
|
||||
let (_img, res) = self
|
||||
.blob_writer
|
||||
.write_blob_maybe_compressed(img, ctx, compression)
|
||||
.await;
|
||||
// TODO: re-use the buffer for `img` further upstack
|
||||
let (off, compression_info) = res?;
|
||||
if compression_info.compressed_size.is_some() {
|
||||
// The image has been considered for compression at least
|
||||
self.uncompressed_bytes_eligible += uncompressed_len;
|
||||
}
|
||||
if compression_info.written_compressed {
|
||||
// The image has been compressed
|
||||
self.uncompressed_bytes_chosen += uncompressed_len;
|
||||
}
|
||||
let off = res?;
|
||||
|
||||
let mut keybuf: [u8; KEY_SIZE] = [0u8; KEY_SIZE];
|
||||
key.write_to_byte_slice(&mut keybuf);
|
||||
self.tree.append(&keybuf, off)?;
|
||||
|
||||
#[cfg(feature = "testing")]
|
||||
{
|
||||
self.last_written_key = key;
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
@@ -821,7 +832,6 @@ impl ImageLayerWriterInner {
|
||||
self,
|
||||
timeline: &Arc<Timeline>,
|
||||
ctx: &RequestContext,
|
||||
end_key: Option<Key>,
|
||||
) -> anyhow::Result<ResidentLayer> {
|
||||
let index_start_blk =
|
||||
((self.blob_writer.size() + PAGE_SZ as u64 - 1) / PAGE_SZ as u64) as u32;
|
||||
@@ -829,9 +839,6 @@ impl ImageLayerWriterInner {
|
||||
// Calculate compression ratio
|
||||
let compressed_size = self.blob_writer.size() - PAGE_SZ as u64; // Subtract PAGE_SZ for header
|
||||
crate::metrics::COMPRESSION_IMAGE_INPUT_BYTES.inc_by(self.uncompressed_bytes);
|
||||
crate::metrics::COMPRESSION_IMAGE_INPUT_BYTES_CONSIDERED
|
||||
.inc_by(self.uncompressed_bytes_eligible);
|
||||
crate::metrics::COMPRESSION_IMAGE_INPUT_BYTES_CHOSEN.inc_by(self.uncompressed_bytes_chosen);
|
||||
crate::metrics::COMPRESSION_IMAGE_OUTPUT_BYTES.inc_by(compressed_size);
|
||||
|
||||
let mut file = self.blob_writer.into_inner();
|
||||
@@ -872,23 +879,11 @@ impl ImageLayerWriterInner {
|
||||
let desc = PersistentLayerDesc::new_img(
|
||||
self.tenant_shard_id,
|
||||
self.timeline_id,
|
||||
if let Some(end_key) = end_key {
|
||||
self.key_range.start..end_key
|
||||
} else {
|
||||
self.key_range.clone()
|
||||
},
|
||||
self.key_range.clone(),
|
||||
self.lsn,
|
||||
metadata.len(),
|
||||
);
|
||||
|
||||
#[cfg(feature = "testing")]
|
||||
if let Some(end_key) = end_key {
|
||||
assert!(
|
||||
self.last_written_key < end_key,
|
||||
"written key violates end_key range"
|
||||
);
|
||||
}
|
||||
|
||||
// Note: Because we open the file in write-only mode, we cannot
|
||||
// reuse the same VirtualFile for reading later. That's why we don't
|
||||
// set inner.file here. The first read will have to re-open it.
|
||||
@@ -965,18 +960,6 @@ impl ImageLayerWriter {
|
||||
self.inner.as_mut().unwrap().put_image(key, img, ctx).await
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
/// Estimated size of the image layer.
|
||||
pub(crate) fn estimated_size(&self) -> u64 {
|
||||
let inner = self.inner.as_ref().unwrap();
|
||||
inner.blob_writer.size() + inner.tree.borrow_writer().size() + PAGE_SZ as u64
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
pub(crate) fn num_keys(&self) -> usize {
|
||||
self.inner.as_ref().unwrap().num_keys
|
||||
}
|
||||
|
||||
///
|
||||
/// Finish writing the image layer.
|
||||
///
|
||||
@@ -985,26 +968,7 @@ impl ImageLayerWriter {
|
||||
timeline: &Arc<Timeline>,
|
||||
ctx: &RequestContext,
|
||||
) -> anyhow::Result<super::ResidentLayer> {
|
||||
self.inner.take().unwrap().finish(timeline, ctx, None).await
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
/// Finish writing the image layer with an end key, used in [`super::split_writer::SplitImageLayerWriter`]. The end key determines the end of the image layer's covered range and is exclusive.
|
||||
pub(super) async fn finish_with_end_key(
|
||||
mut self,
|
||||
timeline: &Arc<Timeline>,
|
||||
end_key: Key,
|
||||
ctx: &RequestContext,
|
||||
) -> anyhow::Result<super::ResidentLayer> {
|
||||
self.inner
|
||||
.take()
|
||||
.unwrap()
|
||||
.finish(timeline, ctx, Some(end_key))
|
||||
.await
|
||||
}
|
||||
|
||||
pub(crate) fn size(&self) -> u64 {
|
||||
self.inner.as_ref().unwrap().size()
|
||||
self.inner.take().unwrap().finish(timeline, ctx).await
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -10,11 +10,11 @@ use crate::page_cache::PAGE_SZ;
|
||||
use crate::repository::{Key, Value};
|
||||
use crate::tenant::block_io::{BlockCursor, BlockReader, BlockReaderRef};
|
||||
use crate::tenant::ephemeral_file::EphemeralFile;
|
||||
use crate::tenant::storage_layer::ValueReconstructResult;
|
||||
use crate::tenant::timeline::GetVectoredError;
|
||||
use crate::tenant::PageReconstructError;
|
||||
use crate::tenant::{PageReconstructError, Timeline};
|
||||
use crate::{l0_flush, page_cache, walrecord};
|
||||
use anyhow::{anyhow, Result};
|
||||
use camino::Utf8PathBuf;
|
||||
use anyhow::{anyhow, ensure, Result};
|
||||
use pageserver_api::keyspace::KeySpace;
|
||||
use pageserver_api::models::InMemoryLayerInfo;
|
||||
use pageserver_api::shard::TenantShardId;
|
||||
@@ -34,7 +34,8 @@ use std::sync::atomic::{AtomicU64, AtomicUsize};
|
||||
use tokio::sync::{RwLock, RwLockWriteGuard};
|
||||
|
||||
use super::{
|
||||
DeltaLayerWriter, PersistentLayerDesc, ValueReconstructSituation, ValuesReconstructState,
|
||||
DeltaLayerWriter, ResidentLayer, ValueReconstructSituation, ValueReconstructState,
|
||||
ValuesReconstructState,
|
||||
};
|
||||
|
||||
#[derive(Debug, PartialEq, Eq, Clone, Copy, Hash)]
|
||||
@@ -54,6 +55,9 @@ pub struct InMemoryLayer {
|
||||
/// Writes are only allowed when this is `None`.
|
||||
pub(crate) end_lsn: OnceLock<Lsn>,
|
||||
|
||||
/// Used for traversal path. Cached representation of the in-memory layer before frozen.
|
||||
local_path_str: Arc<str>,
|
||||
|
||||
/// Used for traversal path. Cached representation of the in-memory layer after frozen.
|
||||
frozen_local_path_str: OnceLock<Arc<str>>,
|
||||
|
||||
@@ -244,6 +248,12 @@ impl InMemoryLayer {
|
||||
self.start_lsn..self.end_lsn_or_max()
|
||||
}
|
||||
|
||||
pub(crate) fn local_path_str(&self) -> &Arc<str> {
|
||||
self.frozen_local_path_str
|
||||
.get()
|
||||
.unwrap_or(&self.local_path_str)
|
||||
}
|
||||
|
||||
/// debugging function to print out the contents of the layer
|
||||
///
|
||||
/// this is likely completly unused
|
||||
@@ -293,6 +303,60 @@ impl InMemoryLayer {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Look up given value in the layer.
|
||||
pub(crate) async fn get_value_reconstruct_data(
|
||||
&self,
|
||||
key: Key,
|
||||
lsn_range: Range<Lsn>,
|
||||
reconstruct_state: &mut ValueReconstructState,
|
||||
ctx: &RequestContext,
|
||||
) -> anyhow::Result<ValueReconstructResult> {
|
||||
ensure!(lsn_range.start >= self.start_lsn);
|
||||
let mut need_image = true;
|
||||
|
||||
let ctx = RequestContextBuilder::extend(ctx)
|
||||
.page_content_kind(PageContentKind::InMemoryLayer)
|
||||
.build();
|
||||
|
||||
let inner = self.inner.read().await;
|
||||
|
||||
let reader = inner.file.block_cursor();
|
||||
|
||||
// Scan the page versions backwards, starting from `lsn`.
|
||||
if let Some(vec_map) = inner.index.get(&key) {
|
||||
let slice = vec_map.slice_range(lsn_range);
|
||||
for (entry_lsn, pos) in slice.iter().rev() {
|
||||
let buf = reader.read_blob(*pos, &ctx).await?;
|
||||
let value = Value::des(&buf)?;
|
||||
match value {
|
||||
Value::Image(img) => {
|
||||
reconstruct_state.img = Some((*entry_lsn, img));
|
||||
return Ok(ValueReconstructResult::Complete);
|
||||
}
|
||||
Value::WalRecord(rec) => {
|
||||
let will_init = rec.will_init();
|
||||
reconstruct_state.records.push((*entry_lsn, rec));
|
||||
if will_init {
|
||||
// This WAL record initializes the page, so no need to go further back
|
||||
need_image = false;
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// release lock on 'inner'
|
||||
|
||||
// If an older page image is needed to reconstruct the page, let the
|
||||
// caller know.
|
||||
if need_image {
|
||||
Ok(ValueReconstructResult::Continue)
|
||||
} else {
|
||||
Ok(ValueReconstructResult::Complete)
|
||||
}
|
||||
}
|
||||
|
||||
// Look up the keys in the provided keyspace and update
|
||||
// the reconstruct state with whatever is found.
|
||||
//
|
||||
@@ -385,17 +449,20 @@ impl InMemoryLayer {
|
||||
timeline_id: TimelineId,
|
||||
tenant_shard_id: TenantShardId,
|
||||
start_lsn: Lsn,
|
||||
gate_guard: utils::sync::gate::GateGuard,
|
||||
ctx: &RequestContext,
|
||||
) -> Result<InMemoryLayer> {
|
||||
trace!("initializing new empty InMemoryLayer for writing on timeline {timeline_id} at {start_lsn}");
|
||||
|
||||
let file =
|
||||
EphemeralFile::create(conf, tenant_shard_id, timeline_id, gate_guard, ctx).await?;
|
||||
let file = EphemeralFile::create(conf, tenant_shard_id, timeline_id, ctx).await?;
|
||||
let key = InMemoryLayerFileId(file.page_cache_file_id());
|
||||
|
||||
Ok(InMemoryLayer {
|
||||
file_id: key,
|
||||
local_path_str: {
|
||||
let mut buf = String::new();
|
||||
inmem_layer_log_display(&mut buf, timeline_id, start_lsn, Lsn::MAX).unwrap();
|
||||
buf.into()
|
||||
},
|
||||
frozen_local_path_str: OnceLock::new(),
|
||||
conf,
|
||||
timeline_id,
|
||||
@@ -415,7 +482,8 @@ impl InMemoryLayer {
|
||||
|
||||
/// Common subroutine of the public put_wal_record() and put_page_image() functions.
|
||||
/// Adds the page version to the in-memory tree
|
||||
pub async fn put_value(
|
||||
|
||||
pub(crate) async fn put_value(
|
||||
&self,
|
||||
key: Key,
|
||||
lsn: Lsn,
|
||||
@@ -480,6 +548,8 @@ impl InMemoryLayer {
|
||||
/// Records the end_lsn for non-dropped layers.
|
||||
/// `end_lsn` is exclusive
|
||||
pub async fn freeze(&self, end_lsn: Lsn) {
|
||||
let inner = self.inner.write().await;
|
||||
|
||||
assert!(
|
||||
self.start_lsn < end_lsn,
|
||||
"{} >= {}",
|
||||
@@ -497,13 +567,9 @@ impl InMemoryLayer {
|
||||
})
|
||||
.expect("frozen_local_path_str set only once");
|
||||
|
||||
#[cfg(debug_assertions)]
|
||||
{
|
||||
let inner = self.inner.write().await;
|
||||
for vec_map in inner.index.values() {
|
||||
for (lsn, _pos) in vec_map.as_slice() {
|
||||
assert!(*lsn < end_lsn);
|
||||
}
|
||||
for vec_map in inner.index.values() {
|
||||
for (lsn, _pos) in vec_map.as_slice() {
|
||||
assert!(*lsn < end_lsn);
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -513,12 +579,12 @@ impl InMemoryLayer {
|
||||
/// if there are no matching keys.
|
||||
///
|
||||
/// Returns a new delta layer with all the same data as this in-memory layer
|
||||
pub async fn write_to_disk(
|
||||
pub(crate) async fn write_to_disk(
|
||||
&self,
|
||||
timeline: &Arc<Timeline>,
|
||||
ctx: &RequestContext,
|
||||
key_range: Option<Range<Key>>,
|
||||
l0_flush_global_state: &l0_flush::Inner,
|
||||
) -> Result<Option<(PersistentLayerDesc, Utf8PathBuf)>> {
|
||||
) -> Result<Option<ResidentLayer>> {
|
||||
// Grab the lock in read-mode. We hold it over the I/O, but because this
|
||||
// layer is not writeable anymore, no one should be trying to acquire the
|
||||
// write lock on it, so we shouldn't block anyone. There's one exception
|
||||
@@ -530,8 +596,9 @@ impl InMemoryLayer {
|
||||
// rare though, so we just accept the potential latency hit for now.
|
||||
let inner = self.inner.read().await;
|
||||
|
||||
let l0_flush_global_state = timeline.l0_flush_global_state.inner().clone();
|
||||
use l0_flush::Inner;
|
||||
let _concurrency_permit = match l0_flush_global_state {
|
||||
let _concurrency_permit = match &*l0_flush_global_state {
|
||||
Inner::PageCached => None,
|
||||
Inner::Direct { semaphore, .. } => Some(semaphore.acquire().await),
|
||||
};
|
||||
@@ -561,7 +628,7 @@ impl InMemoryLayer {
|
||||
)
|
||||
.await?;
|
||||
|
||||
match l0_flush_global_state {
|
||||
match &*l0_flush_global_state {
|
||||
l0_flush::Inner::PageCached => {
|
||||
let ctx = RequestContextBuilder::extend(ctx)
|
||||
.page_content_kind(PageContentKind::InMemoryLayer)
|
||||
@@ -626,7 +693,7 @@ impl InMemoryLayer {
|
||||
}
|
||||
|
||||
// MAX is used here because we identify L0 layers by full key range
|
||||
let (desc, path) = delta_layer_writer.finish(Key::MAX, ctx).await?;
|
||||
let delta_layer = delta_layer_writer.finish(Key::MAX, timeline, ctx).await?;
|
||||
|
||||
// Hold the permit until all the IO is done, including the fsync in `delta_layer_writer.finish()``.
|
||||
//
|
||||
@@ -638,6 +705,6 @@ impl InMemoryLayer {
|
||||
// we dirtied when writing to the filesystem have been flushed and marked !dirty.
|
||||
drop(_concurrency_permit);
|
||||
|
||||
Ok(Some((desc, path)))
|
||||
Ok(Some(delta_layer))
|
||||
}
|
||||
}
|
||||
|
||||
@@ -24,7 +24,7 @@ use super::delta_layer::{self, DeltaEntry};
|
||||
use super::image_layer::{self};
|
||||
use super::{
|
||||
AsLayerDesc, ImageLayerWriter, LayerAccessStats, LayerAccessStatsReset, LayerName,
|
||||
LayerVisibilityHint, PersistentLayerDesc, ValuesReconstructState,
|
||||
PersistentLayerDesc, ValueReconstructResult, ValueReconstructState, ValuesReconstructState,
|
||||
};
|
||||
|
||||
use utils::generation::Generation;
|
||||
@@ -246,7 +246,7 @@ impl Layer {
|
||||
&timeline.generation,
|
||||
);
|
||||
|
||||
LayerInner::new(
|
||||
let layer = LayerInner::new(
|
||||
conf,
|
||||
timeline,
|
||||
local_path,
|
||||
@@ -254,7 +254,14 @@ impl Layer {
|
||||
Some(inner),
|
||||
timeline.generation,
|
||||
timeline.get_shard_index(),
|
||||
)
|
||||
);
|
||||
|
||||
// Newly created layers are marked visible by default: the usual case is that they were created to be read.
|
||||
layer
|
||||
.access_stats
|
||||
.set_visibility(super::LayerVisibilityHint::Visible);
|
||||
|
||||
layer
|
||||
}));
|
||||
|
||||
let downloaded = resident.expect("just initialized");
|
||||
@@ -300,6 +307,42 @@ impl Layer {
|
||||
self.0.delete_on_drop();
|
||||
}
|
||||
|
||||
/// Return data needed to reconstruct given page at LSN.
|
||||
///
|
||||
/// It is up to the caller to collect more data from the previous layer and
|
||||
/// perform WAL redo, if necessary.
|
||||
///
|
||||
/// # Cancellation-Safety
|
||||
///
|
||||
/// This method is cancellation-safe.
|
||||
pub(crate) async fn get_value_reconstruct_data(
|
||||
&self,
|
||||
key: Key,
|
||||
lsn_range: Range<Lsn>,
|
||||
reconstruct_data: &mut ValueReconstructState,
|
||||
ctx: &RequestContext,
|
||||
) -> anyhow::Result<ValueReconstructResult> {
|
||||
use anyhow::ensure;
|
||||
|
||||
let layer = self.0.get_or_maybe_download(true, Some(ctx)).await?;
|
||||
self.0.access_stats.record_access(ctx);
|
||||
|
||||
if self.layer_desc().is_delta {
|
||||
ensure!(lsn_range.start >= self.layer_desc().lsn_range.start);
|
||||
ensure!(self.layer_desc().key_range.contains(&key));
|
||||
} else {
|
||||
ensure!(self.layer_desc().key_range.contains(&key));
|
||||
ensure!(lsn_range.start >= self.layer_desc().image_layer_lsn());
|
||||
ensure!(lsn_range.end >= self.layer_desc().image_layer_lsn());
|
||||
}
|
||||
|
||||
layer
|
||||
.get_value_reconstruct_data(key, lsn_range, reconstruct_data, &self.0, ctx)
|
||||
.instrument(tracing::debug_span!("get_value_reconstruct_data", layer=%self))
|
||||
.await
|
||||
.with_context(|| format!("get_value_reconstruct_data for layer {self}"))
|
||||
}
|
||||
|
||||
pub(crate) async fn get_values_reconstruct_data(
|
||||
&self,
|
||||
keyspace: KeySpace,
|
||||
@@ -316,7 +359,7 @@ impl Layer {
|
||||
other => GetVectoredError::Other(anyhow::anyhow!(other)),
|
||||
})?;
|
||||
|
||||
self.record_access(ctx);
|
||||
self.0.access_stats.record_access(ctx);
|
||||
|
||||
layer
|
||||
.get_values_reconstruct_data(keyspace, lsn_range, reconstruct_data, &self.0, ctx)
|
||||
@@ -396,18 +439,18 @@ impl Layer {
|
||||
self.0.info(reset)
|
||||
}
|
||||
|
||||
pub(crate) fn latest_activity(&self) -> SystemTime {
|
||||
self.0.access_stats.latest_activity()
|
||||
}
|
||||
|
||||
pub(crate) fn visibility(&self) -> LayerVisibilityHint {
|
||||
self.0.access_stats.visibility()
|
||||
pub(crate) fn access_stats(&self) -> &LayerAccessStats {
|
||||
&self.0.access_stats
|
||||
}
|
||||
|
||||
pub(crate) fn local_path(&self) -> &Utf8Path {
|
||||
&self.0.path
|
||||
}
|
||||
|
||||
pub(crate) fn debug_str(&self) -> &Arc<str> {
|
||||
&self.0.debug_str
|
||||
}
|
||||
|
||||
pub(crate) fn metadata(&self) -> LayerFileMetadata {
|
||||
self.0.metadata()
|
||||
}
|
||||
@@ -450,57 +493,13 @@ impl Layer {
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
fn record_access(&self, ctx: &RequestContext) {
|
||||
if self.0.access_stats.record_access(ctx) {
|
||||
// Visibility was modified to Visible
|
||||
tracing::info!(
|
||||
"Layer {} became visible as a result of access",
|
||||
self.0.desc.key()
|
||||
);
|
||||
if let Some(tl) = self.0.timeline.upgrade() {
|
||||
tl.metrics
|
||||
.visible_physical_size_gauge
|
||||
.add(self.0.desc.file_size)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
pub(crate) fn set_visibility(&self, visibility: LayerVisibilityHint) {
|
||||
let old_visibility = self.0.access_stats.set_visibility(visibility.clone());
|
||||
use LayerVisibilityHint::*;
|
||||
match (old_visibility, visibility) {
|
||||
(Visible, Covered) => {
|
||||
// Subtract this layer's contribution to the visible size metric
|
||||
if let Some(tl) = self.0.timeline.upgrade() {
|
||||
debug_assert!(
|
||||
tl.metrics.visible_physical_size_gauge.get() >= self.0.desc.file_size
|
||||
);
|
||||
tl.metrics
|
||||
.visible_physical_size_gauge
|
||||
.sub(self.0.desc.file_size)
|
||||
}
|
||||
}
|
||||
(Covered, Visible) => {
|
||||
// Add this layer's contribution to the visible size metric
|
||||
if let Some(tl) = self.0.timeline.upgrade() {
|
||||
tl.metrics
|
||||
.visible_physical_size_gauge
|
||||
.add(self.0.desc.file_size)
|
||||
}
|
||||
}
|
||||
(Covered, Covered) | (Visible, Visible) => {
|
||||
// no change
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// The download-ness ([`DownloadedLayer`]) can be either resident or wanted evicted.
|
||||
///
|
||||
/// However when we want something evicted, we cannot evict it right away as there might be current
|
||||
/// reads happening on it. For example: it has been searched from [`LayerMap::search`] but not yet
|
||||
/// read with [`Layer::get_values_reconstruct_data`].
|
||||
/// read with [`Layer::get_value_reconstruct_data`].
|
||||
///
|
||||
/// [`LayerMap::search`]: crate::tenant::layer_map::LayerMap::search
|
||||
#[derive(Debug)]
|
||||
@@ -581,6 +580,9 @@ struct LayerInner {
|
||||
/// Full path to the file; unclear if this should exist anymore.
|
||||
path: Utf8PathBuf,
|
||||
|
||||
/// String representation of the layer, used for traversal id.
|
||||
debug_str: Arc<str>,
|
||||
|
||||
desc: PersistentLayerDesc,
|
||||
|
||||
/// Timeline access is needed for remote timeline client and metrics.
|
||||
@@ -691,16 +693,6 @@ impl Drop for LayerInner {
|
||||
timeline.metrics.layer_count_image.dec();
|
||||
timeline.metrics.layer_size_image.sub(self.desc.file_size);
|
||||
}
|
||||
|
||||
if matches!(self.access_stats.visibility(), LayerVisibilityHint::Visible) {
|
||||
debug_assert!(
|
||||
timeline.metrics.visible_physical_size_gauge.get() >= self.desc.file_size
|
||||
);
|
||||
timeline
|
||||
.metrics
|
||||
.visible_physical_size_gauge
|
||||
.sub(self.desc.file_size);
|
||||
}
|
||||
}
|
||||
|
||||
if !*self.wanted_deleted.get_mut() {
|
||||
@@ -809,14 +801,11 @@ impl LayerInner {
|
||||
timeline.metrics.layer_size_image.add(desc.file_size);
|
||||
}
|
||||
|
||||
// New layers are visible by default. This metric is later updated on drop or in set_visibility
|
||||
timeline
|
||||
.metrics
|
||||
.visible_physical_size_gauge
|
||||
.add(desc.file_size);
|
||||
|
||||
LayerInner {
|
||||
conf,
|
||||
debug_str: {
|
||||
format!("timelines/{}/{}", timeline.timeline_id, desc.layer_name()).into()
|
||||
},
|
||||
path: local_path,
|
||||
desc,
|
||||
timeline: Arc::downgrade(timeline),
|
||||
@@ -1662,9 +1651,8 @@ impl Drop for DownloadedLayer {
|
||||
}
|
||||
|
||||
impl DownloadedLayer {
|
||||
/// Initializes the `DeltaLayerInner` or `ImageLayerInner` within [`LayerKind`].
|
||||
/// Failure to load the layer is sticky, i.e., future `get()` calls will return
|
||||
/// the initial load failure immediately.
|
||||
/// Initializes the `DeltaLayerInner` or `ImageLayerInner` within [`LayerKind`], or fails to
|
||||
/// initialize it permanently.
|
||||
///
|
||||
/// `owner` parameter is a strong reference at the same `LayerInner` as the
|
||||
/// `DownloadedLayer::owner` would be when upgraded. Given how this method ends up called,
|
||||
@@ -1695,7 +1683,7 @@ impl DownloadedLayer {
|
||||
ctx,
|
||||
)
|
||||
.await
|
||||
.map(LayerKind::Delta)
|
||||
.map(|res| res.map(LayerKind::Delta))
|
||||
} else {
|
||||
let lsn = owner.desc.image_layer_lsn();
|
||||
let summary = Some(image_layer::Summary::expected(
|
||||
@@ -1712,29 +1700,54 @@ impl DownloadedLayer {
|
||||
ctx,
|
||||
)
|
||||
.await
|
||||
.map(LayerKind::Image)
|
||||
.map(|res| res.map(LayerKind::Image))
|
||||
};
|
||||
|
||||
match res {
|
||||
Ok(layer) => Ok(layer),
|
||||
Err(err) => {
|
||||
Ok(Ok(layer)) => Ok(Ok(layer)),
|
||||
Ok(Err(transient)) => Err(transient),
|
||||
Err(permanent) => {
|
||||
LAYER_IMPL_METRICS.inc_permanent_loading_failures();
|
||||
// We log this message once over the lifetime of `Self`
|
||||
// => Ok and good to log backtrace and path here.
|
||||
tracing::error!(
|
||||
"layer load failed, assuming permanent failure: {}: {err:?}",
|
||||
owner.path
|
||||
);
|
||||
Err(err)
|
||||
// TODO(#5815): we are not logging all errors, so temporarily log them **once**
|
||||
// here as well
|
||||
let permanent = permanent.context("load layer");
|
||||
tracing::error!("layer loading failed permanently: {permanent:#}");
|
||||
Ok(Err(permanent))
|
||||
}
|
||||
}
|
||||
};
|
||||
self.kind
|
||||
.get_or_init(init)
|
||||
.await
|
||||
.get_or_try_init(init)
|
||||
// return transient errors using `?`
|
||||
.await?
|
||||
.as_ref()
|
||||
// We already logged the full backtrace above, once. Don't repeat that here.
|
||||
.map_err(|e| anyhow::anyhow!("layer load failed earlier: {e}"))
|
||||
.map_err(|e| {
|
||||
// errors are not clonabled, cannot but stringify
|
||||
// test_broken_timeline matches this string
|
||||
anyhow::anyhow!("layer loading failed: {e:#}")
|
||||
})
|
||||
}
|
||||
|
||||
async fn get_value_reconstruct_data(
|
||||
&self,
|
||||
key: Key,
|
||||
lsn_range: Range<Lsn>,
|
||||
reconstruct_data: &mut ValueReconstructState,
|
||||
owner: &Arc<LayerInner>,
|
||||
ctx: &RequestContext,
|
||||
) -> anyhow::Result<ValueReconstructResult> {
|
||||
use LayerKind::*;
|
||||
|
||||
match self.get(owner, ctx).await? {
|
||||
Delta(d) => {
|
||||
d.get_value_reconstruct_data(key, lsn_range, reconstruct_data, ctx)
|
||||
.await
|
||||
}
|
||||
Image(i) => {
|
||||
i.get_value_reconstruct_data(key, reconstruct_data, ctx)
|
||||
.await
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
async fn get_values_reconstruct_data(
|
||||
@@ -1747,11 +1760,7 @@ impl DownloadedLayer {
|
||||
) -> Result<(), GetVectoredError> {
|
||||
use LayerKind::*;
|
||||
|
||||
match self
|
||||
.get(owner, ctx)
|
||||
.await
|
||||
.map_err(GetVectoredError::Other)?
|
||||
{
|
||||
match self.get(owner, ctx).await.map_err(GetVectoredError::from)? {
|
||||
Delta(d) => {
|
||||
d.get_values_reconstruct_data(keyspace, lsn_range, reconstruct_data, ctx)
|
||||
.await
|
||||
@@ -1835,7 +1844,7 @@ impl ResidentLayer {
|
||||
// this is valid because the DownloadedLayer::kind is a OnceCell, not a
|
||||
// Mutex<OnceCell>, so we cannot go and deinitialize the value with OnceCell::take
|
||||
// while it's being held.
|
||||
self.owner.record_access(ctx);
|
||||
owner.access_stats.record_access(ctx);
|
||||
|
||||
delta_layer::DeltaLayerInner::load_keys(d, ctx)
|
||||
.await
|
||||
|
||||
@@ -39,7 +39,7 @@ async fn smoke_test() {
|
||||
let layer = {
|
||||
let mut layers = {
|
||||
let layers = timeline.layers.read().await;
|
||||
layers.likely_resident_layers().cloned().collect::<Vec<_>>()
|
||||
layers.likely_resident_layers().collect::<Vec<_>>()
|
||||
};
|
||||
|
||||
assert_eq!(layers.len(), 1);
|
||||
@@ -50,26 +50,13 @@ async fn smoke_test() {
|
||||
// all layers created at pageserver are like `layer`, initialized with strong
|
||||
// Arc<DownloadedLayer>.
|
||||
|
||||
let controlfile_keyspace = KeySpace {
|
||||
ranges: vec![CONTROLFILE_KEY..CONTROLFILE_KEY.next()],
|
||||
};
|
||||
|
||||
let img_before = {
|
||||
let mut data = ValuesReconstructState::default();
|
||||
let mut data = ValueReconstructState::default();
|
||||
layer
|
||||
.get_values_reconstruct_data(
|
||||
controlfile_keyspace.clone(),
|
||||
Lsn(0x10)..Lsn(0x11),
|
||||
&mut data,
|
||||
&ctx,
|
||||
)
|
||||
.get_value_reconstruct_data(CONTROLFILE_KEY, Lsn(0x10)..Lsn(0x11), &mut data, &ctx)
|
||||
.await
|
||||
.unwrap();
|
||||
data.keys
|
||||
.remove(&CONTROLFILE_KEY)
|
||||
.expect("must be present")
|
||||
.expect("should not error")
|
||||
.img
|
||||
data.img
|
||||
.take()
|
||||
.expect("tenant harness writes the control file")
|
||||
};
|
||||
@@ -87,24 +74,13 @@ async fn smoke_test() {
|
||||
|
||||
// on accesses when the layer is evicted, it will automatically be downloaded.
|
||||
let img_after = {
|
||||
let mut data = ValuesReconstructState::default();
|
||||
let mut data = ValueReconstructState::default();
|
||||
layer
|
||||
.get_values_reconstruct_data(
|
||||
controlfile_keyspace.clone(),
|
||||
Lsn(0x10)..Lsn(0x11),
|
||||
&mut data,
|
||||
&ctx,
|
||||
)
|
||||
.get_value_reconstruct_data(CONTROLFILE_KEY, Lsn(0x10)..Lsn(0x11), &mut data, &ctx)
|
||||
.instrument(download_span.clone())
|
||||
.await
|
||||
.unwrap();
|
||||
data.keys
|
||||
.remove(&CONTROLFILE_KEY)
|
||||
.expect("must be present")
|
||||
.expect("should not error")
|
||||
.img
|
||||
.take()
|
||||
.expect("tenant harness writes the control file")
|
||||
data.img.take().unwrap()
|
||||
};
|
||||
|
||||
assert_eq!(img_before, img_after);
|
||||
@@ -176,7 +152,7 @@ async fn smoke_test() {
|
||||
{
|
||||
let layers = &[layer];
|
||||
let mut g = timeline.layers.write().await;
|
||||
g.open_mut().unwrap().finish_gc_timeline(layers);
|
||||
g.finish_gc_timeline(layers);
|
||||
// this just updates the remote_physical_size for demonstration purposes
|
||||
rtc.schedule_gc_update(layers).unwrap();
|
||||
}
|
||||
@@ -216,7 +192,7 @@ async fn evict_and_wait_on_wanted_deleted() {
|
||||
let layer = {
|
||||
let mut layers = {
|
||||
let layers = timeline.layers.read().await;
|
||||
layers.likely_resident_layers().cloned().collect::<Vec<_>>()
|
||||
layers.likely_resident_layers().collect::<Vec<_>>()
|
||||
};
|
||||
|
||||
assert_eq!(layers.len(), 1);
|
||||
@@ -260,7 +236,7 @@ async fn evict_and_wait_on_wanted_deleted() {
|
||||
// the deletion of the layer in remote_storage happens.
|
||||
{
|
||||
let mut layers = timeline.layers.write().await;
|
||||
layers.open_mut().unwrap().finish_gc_timeline(&[layer]);
|
||||
layers.finish_gc_timeline(&[layer]);
|
||||
}
|
||||
|
||||
SpawnBlockingPoolHelper::consume_and_release_all_of_spawn_blocking_threads(&handle).await;
|
||||
@@ -301,7 +277,7 @@ fn read_wins_pending_eviction() {
|
||||
let layer = {
|
||||
let mut layers = {
|
||||
let layers = timeline.layers.read().await;
|
||||
layers.likely_resident_layers().cloned().collect::<Vec<_>>()
|
||||
layers.likely_resident_layers().collect::<Vec<_>>()
|
||||
};
|
||||
|
||||
assert_eq!(layers.len(), 1);
|
||||
@@ -433,7 +409,7 @@ fn multiple_pending_evictions_scenario(name: &'static str, in_order: bool) {
|
||||
let layer = {
|
||||
let mut layers = {
|
||||
let layers = timeline.layers.read().await;
|
||||
layers.likely_resident_layers().cloned().collect::<Vec<_>>()
|
||||
layers.likely_resident_layers().collect::<Vec<_>>()
|
||||
};
|
||||
|
||||
assert_eq!(layers.len(), 1);
|
||||
@@ -602,7 +578,7 @@ async fn cancelled_get_or_maybe_download_does_not_cancel_eviction() {
|
||||
let layer = {
|
||||
let mut layers = {
|
||||
let layers = timeline.layers.read().await;
|
||||
layers.likely_resident_layers().cloned().collect::<Vec<_>>()
|
||||
layers.likely_resident_layers().collect::<Vec<_>>()
|
||||
};
|
||||
|
||||
assert_eq!(layers.len(), 1);
|
||||
@@ -682,7 +658,7 @@ async fn evict_and_wait_does_not_wait_for_download() {
|
||||
let layer = {
|
||||
let mut layers = {
|
||||
let layers = timeline.layers.read().await;
|
||||
layers.likely_resident_layers().cloned().collect::<Vec<_>>()
|
||||
layers.likely_resident_layers().collect::<Vec<_>>()
|
||||
};
|
||||
|
||||
assert_eq!(layers.len(), 1);
|
||||
@@ -801,9 +777,9 @@ async fn eviction_cancellation_on_drop() {
|
||||
let (evicted_layer, not_evicted) = {
|
||||
let mut layers = {
|
||||
let mut guard = timeline.layers.write().await;
|
||||
let layers = guard.likely_resident_layers().cloned().collect::<Vec<_>>();
|
||||
let layers = guard.likely_resident_layers().collect::<Vec<_>>();
|
||||
// remove the layers from layermap
|
||||
guard.open_mut().unwrap().finish_gc_timeline(&layers);
|
||||
guard.finish_gc_timeline(&layers);
|
||||
|
||||
layers
|
||||
};
|
||||
@@ -852,9 +828,9 @@ async fn eviction_cancellation_on_drop() {
|
||||
#[test]
|
||||
#[cfg(target_arch = "x86_64")]
|
||||
fn layer_size() {
|
||||
assert_eq!(size_of::<LayerAccessStats>(), 8);
|
||||
assert_eq!(size_of::<PersistentLayerDesc>(), 104);
|
||||
assert_eq!(size_of::<LayerInner>(), 296);
|
||||
assert_eq!(std::mem::size_of::<LayerAccessStats>(), 8);
|
||||
assert_eq!(std::mem::size_of::<PersistentLayerDesc>(), 104);
|
||||
assert_eq!(std::mem::size_of::<LayerInner>(), 312);
|
||||
// it also has the utf8 path
|
||||
}
|
||||
|
||||
|
||||
@@ -41,20 +41,6 @@ pub struct PersistentLayerKey {
|
||||
pub is_delta: bool,
|
||||
}
|
||||
|
||||
impl std::fmt::Display for PersistentLayerKey {
|
||||
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||
write!(
|
||||
f,
|
||||
"{}..{} {}..{} is_delta={}",
|
||||
self.key_range.start,
|
||||
self.key_range.end,
|
||||
self.lsn_range.start,
|
||||
self.lsn_range.end,
|
||||
self.is_delta
|
||||
)
|
||||
}
|
||||
}
|
||||
|
||||
impl PersistentLayerDesc {
|
||||
pub fn key(&self) -> PersistentLayerKey {
|
||||
PersistentLayerKey {
|
||||
|
||||
@@ -204,11 +204,9 @@ impl<'a> IteratorWrapper<'a> {
|
||||
/// A merge iterator over delta/image layer iterators. When duplicated records are
|
||||
/// found, the iterator will not perform any deduplication, and the caller should handle
|
||||
/// these situation. By saying duplicated records, there are many possibilities:
|
||||
///
|
||||
/// * Two same delta at the same LSN.
|
||||
/// * Two same image at the same LSN.
|
||||
/// * Delta/image at the same LSN where the image has already applied the delta.
|
||||
///
|
||||
/// The iterator will always put the image before the delta.
|
||||
pub struct MergeIterator<'a> {
|
||||
heap: BinaryHeap<IteratorWrapper<'a>>,
|
||||
|
||||
@@ -1,454 +0,0 @@
|
||||
use std::{ops::Range, sync::Arc};
|
||||
|
||||
use bytes::Bytes;
|
||||
use pageserver_api::key::{Key, KEY_SIZE};
|
||||
use utils::{id::TimelineId, lsn::Lsn, shard::TenantShardId};
|
||||
|
||||
use crate::tenant::storage_layer::Layer;
|
||||
use crate::{config::PageServerConf, context::RequestContext, repository::Value, tenant::Timeline};
|
||||
|
||||
use super::{DeltaLayerWriter, ImageLayerWriter, ResidentLayer};
|
||||
|
||||
/// An image writer that takes images and produces multiple image layers. The interface does not
|
||||
/// guarantee atomicity (i.e., if the image layer generation fails, there might be leftover files
|
||||
/// to be cleaned up)
|
||||
#[must_use]
|
||||
pub struct SplitImageLayerWriter {
|
||||
inner: ImageLayerWriter,
|
||||
target_layer_size: u64,
|
||||
generated_layers: Vec<ResidentLayer>,
|
||||
conf: &'static PageServerConf,
|
||||
timeline_id: TimelineId,
|
||||
tenant_shard_id: TenantShardId,
|
||||
lsn: Lsn,
|
||||
}
|
||||
|
||||
impl SplitImageLayerWriter {
|
||||
pub async fn new(
|
||||
conf: &'static PageServerConf,
|
||||
timeline_id: TimelineId,
|
||||
tenant_shard_id: TenantShardId,
|
||||
start_key: Key,
|
||||
lsn: Lsn,
|
||||
target_layer_size: u64,
|
||||
ctx: &RequestContext,
|
||||
) -> anyhow::Result<Self> {
|
||||
Ok(Self {
|
||||
target_layer_size,
|
||||
inner: ImageLayerWriter::new(
|
||||
conf,
|
||||
timeline_id,
|
||||
tenant_shard_id,
|
||||
&(start_key..Key::MAX),
|
||||
lsn,
|
||||
ctx,
|
||||
)
|
||||
.await?,
|
||||
generated_layers: Vec::new(),
|
||||
conf,
|
||||
timeline_id,
|
||||
tenant_shard_id,
|
||||
lsn,
|
||||
})
|
||||
}
|
||||
|
||||
pub async fn put_image(
|
||||
&mut self,
|
||||
key: Key,
|
||||
img: Bytes,
|
||||
tline: &Arc<Timeline>,
|
||||
ctx: &RequestContext,
|
||||
) -> anyhow::Result<()> {
|
||||
// The current estimation is an upper bound of the space that the key/image could take
|
||||
// because we did not consider compression in this estimation. The resulting image layer
|
||||
// could be smaller than the target size.
|
||||
let addition_size_estimation = KEY_SIZE as u64 + img.len() as u64;
|
||||
if self.inner.num_keys() >= 1
|
||||
&& self.inner.estimated_size() + addition_size_estimation >= self.target_layer_size
|
||||
{
|
||||
let next_image_writer = ImageLayerWriter::new(
|
||||
self.conf,
|
||||
self.timeline_id,
|
||||
self.tenant_shard_id,
|
||||
&(key..Key::MAX),
|
||||
self.lsn,
|
||||
ctx,
|
||||
)
|
||||
.await?;
|
||||
let prev_image_writer = std::mem::replace(&mut self.inner, next_image_writer);
|
||||
self.generated_layers.push(
|
||||
prev_image_writer
|
||||
.finish_with_end_key(tline, key, ctx)
|
||||
.await?,
|
||||
);
|
||||
}
|
||||
self.inner.put_image(key, img, ctx).await
|
||||
}
|
||||
|
||||
pub(crate) async fn finish(
|
||||
self,
|
||||
tline: &Arc<Timeline>,
|
||||
ctx: &RequestContext,
|
||||
end_key: Key,
|
||||
) -> anyhow::Result<Vec<ResidentLayer>> {
|
||||
let Self {
|
||||
mut generated_layers,
|
||||
inner,
|
||||
..
|
||||
} = self;
|
||||
generated_layers.push(inner.finish_with_end_key(tline, end_key, ctx).await?);
|
||||
Ok(generated_layers)
|
||||
}
|
||||
|
||||
/// When split writer fails, the caller should call this function and handle partially generated layers.
|
||||
#[allow(dead_code)]
|
||||
pub(crate) async fn take(self) -> anyhow::Result<(Vec<ResidentLayer>, ImageLayerWriter)> {
|
||||
Ok((self.generated_layers, self.inner))
|
||||
}
|
||||
}
|
||||
|
||||
/// A delta writer that takes key-lsn-values and produces multiple delta layers. The interface does not
|
||||
/// guarantee atomicity (i.e., if the delta layer generation fails, there might be leftover files
|
||||
/// to be cleaned up).
|
||||
#[must_use]
|
||||
pub struct SplitDeltaLayerWriter {
|
||||
inner: DeltaLayerWriter,
|
||||
target_layer_size: u64,
|
||||
generated_layers: Vec<ResidentLayer>,
|
||||
conf: &'static PageServerConf,
|
||||
timeline_id: TimelineId,
|
||||
tenant_shard_id: TenantShardId,
|
||||
lsn_range: Range<Lsn>,
|
||||
}
|
||||
|
||||
impl SplitDeltaLayerWriter {
|
||||
pub async fn new(
|
||||
conf: &'static PageServerConf,
|
||||
timeline_id: TimelineId,
|
||||
tenant_shard_id: TenantShardId,
|
||||
start_key: Key,
|
||||
lsn_range: Range<Lsn>,
|
||||
target_layer_size: u64,
|
||||
ctx: &RequestContext,
|
||||
) -> anyhow::Result<Self> {
|
||||
Ok(Self {
|
||||
target_layer_size,
|
||||
inner: DeltaLayerWriter::new(
|
||||
conf,
|
||||
timeline_id,
|
||||
tenant_shard_id,
|
||||
start_key,
|
||||
lsn_range.clone(),
|
||||
ctx,
|
||||
)
|
||||
.await?,
|
||||
generated_layers: Vec::new(),
|
||||
conf,
|
||||
timeline_id,
|
||||
tenant_shard_id,
|
||||
lsn_range,
|
||||
})
|
||||
}
|
||||
|
||||
pub async fn put_value(
|
||||
&mut self,
|
||||
key: Key,
|
||||
lsn: Lsn,
|
||||
val: Value,
|
||||
tline: &Arc<Timeline>,
|
||||
ctx: &RequestContext,
|
||||
) -> anyhow::Result<()> {
|
||||
// The current estimation is key size plus LSN size plus value size estimation. This is not an accurate
|
||||
// number, and therefore the final layer size could be a little bit larger or smaller than the target.
|
||||
let addition_size_estimation = KEY_SIZE as u64 + 8 /* LSN u64 size */ + 80 /* value size estimation */;
|
||||
if self.inner.num_keys() >= 1
|
||||
&& self.inner.estimated_size() + addition_size_estimation >= self.target_layer_size
|
||||
{
|
||||
let next_delta_writer = DeltaLayerWriter::new(
|
||||
self.conf,
|
||||
self.timeline_id,
|
||||
self.tenant_shard_id,
|
||||
key,
|
||||
self.lsn_range.clone(),
|
||||
ctx,
|
||||
)
|
||||
.await?;
|
||||
let prev_delta_writer = std::mem::replace(&mut self.inner, next_delta_writer);
|
||||
let (desc, path) = prev_delta_writer.finish(key, ctx).await?;
|
||||
let delta_layer = Layer::finish_creating(self.conf, tline, desc, &path)?;
|
||||
self.generated_layers.push(delta_layer);
|
||||
}
|
||||
self.inner.put_value(key, lsn, val, ctx).await
|
||||
}
|
||||
|
||||
pub(crate) async fn finish(
|
||||
self,
|
||||
tline: &Arc<Timeline>,
|
||||
ctx: &RequestContext,
|
||||
end_key: Key,
|
||||
) -> anyhow::Result<Vec<ResidentLayer>> {
|
||||
let Self {
|
||||
mut generated_layers,
|
||||
inner,
|
||||
..
|
||||
} = self;
|
||||
|
||||
let (desc, path) = inner.finish(end_key, ctx).await?;
|
||||
let delta_layer = Layer::finish_creating(self.conf, tline, desc, &path)?;
|
||||
generated_layers.push(delta_layer);
|
||||
Ok(generated_layers)
|
||||
}
|
||||
|
||||
/// When split writer fails, the caller should call this function and handle partially generated layers.
|
||||
#[allow(dead_code)]
|
||||
pub(crate) async fn take(self) -> anyhow::Result<(Vec<ResidentLayer>, DeltaLayerWriter)> {
|
||||
Ok((self.generated_layers, self.inner))
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use crate::{
|
||||
tenant::{
|
||||
harness::{TenantHarness, TIMELINE_ID},
|
||||
storage_layer::AsLayerDesc,
|
||||
},
|
||||
DEFAULT_PG_VERSION,
|
||||
};
|
||||
|
||||
use super::*;
|
||||
|
||||
fn get_key(id: u32) -> Key {
|
||||
let mut key = Key::from_hex("000000000033333333444444445500000000").unwrap();
|
||||
key.field6 = id;
|
||||
key
|
||||
}
|
||||
|
||||
fn get_img(id: u32) -> Bytes {
|
||||
format!("{id:064}").into()
|
||||
}
|
||||
|
||||
fn get_large_img() -> Bytes {
|
||||
vec![0; 8192].into()
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn write_one_image() {
|
||||
let harness = TenantHarness::create("split_writer_write_one_image")
|
||||
.await
|
||||
.unwrap();
|
||||
let (tenant, ctx) = harness.load().await;
|
||||
|
||||
let tline = tenant
|
||||
.create_test_timeline(TIMELINE_ID, Lsn(0x10), DEFAULT_PG_VERSION, &ctx)
|
||||
.await
|
||||
.unwrap();
|
||||
|
||||
let mut image_writer = SplitImageLayerWriter::new(
|
||||
tenant.conf,
|
||||
tline.timeline_id,
|
||||
tenant.tenant_shard_id,
|
||||
get_key(0),
|
||||
Lsn(0x18),
|
||||
4 * 1024 * 1024,
|
||||
&ctx,
|
||||
)
|
||||
.await
|
||||
.unwrap();
|
||||
|
||||
let mut delta_writer = SplitDeltaLayerWriter::new(
|
||||
tenant.conf,
|
||||
tline.timeline_id,
|
||||
tenant.tenant_shard_id,
|
||||
get_key(0),
|
||||
Lsn(0x18)..Lsn(0x20),
|
||||
4 * 1024 * 1024,
|
||||
&ctx,
|
||||
)
|
||||
.await
|
||||
.unwrap();
|
||||
|
||||
image_writer
|
||||
.put_image(get_key(0), get_img(0), &tline, &ctx)
|
||||
.await
|
||||
.unwrap();
|
||||
let layers = image_writer
|
||||
.finish(&tline, &ctx, get_key(10))
|
||||
.await
|
||||
.unwrap();
|
||||
assert_eq!(layers.len(), 1);
|
||||
|
||||
delta_writer
|
||||
.put_value(
|
||||
get_key(0),
|
||||
Lsn(0x18),
|
||||
Value::Image(get_img(0)),
|
||||
&tline,
|
||||
&ctx,
|
||||
)
|
||||
.await
|
||||
.unwrap();
|
||||
let layers = delta_writer
|
||||
.finish(&tline, &ctx, get_key(10))
|
||||
.await
|
||||
.unwrap();
|
||||
assert_eq!(layers.len(), 1);
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn write_split() {
|
||||
let harness = TenantHarness::create("split_writer_write_split")
|
||||
.await
|
||||
.unwrap();
|
||||
let (tenant, ctx) = harness.load().await;
|
||||
|
||||
let tline = tenant
|
||||
.create_test_timeline(TIMELINE_ID, Lsn(0x10), DEFAULT_PG_VERSION, &ctx)
|
||||
.await
|
||||
.unwrap();
|
||||
|
||||
let mut image_writer = SplitImageLayerWriter::new(
|
||||
tenant.conf,
|
||||
tline.timeline_id,
|
||||
tenant.tenant_shard_id,
|
||||
get_key(0),
|
||||
Lsn(0x18),
|
||||
4 * 1024 * 1024,
|
||||
&ctx,
|
||||
)
|
||||
.await
|
||||
.unwrap();
|
||||
let mut delta_writer = SplitDeltaLayerWriter::new(
|
||||
tenant.conf,
|
||||
tline.timeline_id,
|
||||
tenant.tenant_shard_id,
|
||||
get_key(0),
|
||||
Lsn(0x18)..Lsn(0x20),
|
||||
4 * 1024 * 1024,
|
||||
&ctx,
|
||||
)
|
||||
.await
|
||||
.unwrap();
|
||||
const N: usize = 2000;
|
||||
for i in 0..N {
|
||||
let i = i as u32;
|
||||
image_writer
|
||||
.put_image(get_key(i), get_large_img(), &tline, &ctx)
|
||||
.await
|
||||
.unwrap();
|
||||
delta_writer
|
||||
.put_value(
|
||||
get_key(i),
|
||||
Lsn(0x20),
|
||||
Value::Image(get_large_img()),
|
||||
&tline,
|
||||
&ctx,
|
||||
)
|
||||
.await
|
||||
.unwrap();
|
||||
}
|
||||
let image_layers = image_writer
|
||||
.finish(&tline, &ctx, get_key(N as u32))
|
||||
.await
|
||||
.unwrap();
|
||||
let delta_layers = delta_writer
|
||||
.finish(&tline, &ctx, get_key(N as u32))
|
||||
.await
|
||||
.unwrap();
|
||||
assert_eq!(image_layers.len(), N / 512 + 1);
|
||||
assert_eq!(delta_layers.len(), N / 512 + 1);
|
||||
for idx in 0..image_layers.len() {
|
||||
assert_ne!(image_layers[idx].layer_desc().key_range.start, Key::MIN);
|
||||
assert_ne!(image_layers[idx].layer_desc().key_range.end, Key::MAX);
|
||||
assert_ne!(delta_layers[idx].layer_desc().key_range.start, Key::MIN);
|
||||
assert_ne!(delta_layers[idx].layer_desc().key_range.end, Key::MAX);
|
||||
if idx > 0 {
|
||||
assert_eq!(
|
||||
image_layers[idx - 1].layer_desc().key_range.end,
|
||||
image_layers[idx].layer_desc().key_range.start
|
||||
);
|
||||
assert_eq!(
|
||||
delta_layers[idx - 1].layer_desc().key_range.end,
|
||||
delta_layers[idx].layer_desc().key_range.start
|
||||
);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn write_large_img() {
|
||||
let harness = TenantHarness::create("split_writer_write_large_img")
|
||||
.await
|
||||
.unwrap();
|
||||
let (tenant, ctx) = harness.load().await;
|
||||
|
||||
let tline = tenant
|
||||
.create_test_timeline(TIMELINE_ID, Lsn(0x10), DEFAULT_PG_VERSION, &ctx)
|
||||
.await
|
||||
.unwrap();
|
||||
|
||||
let mut image_writer = SplitImageLayerWriter::new(
|
||||
tenant.conf,
|
||||
tline.timeline_id,
|
||||
tenant.tenant_shard_id,
|
||||
get_key(0),
|
||||
Lsn(0x18),
|
||||
4 * 1024,
|
||||
&ctx,
|
||||
)
|
||||
.await
|
||||
.unwrap();
|
||||
|
||||
let mut delta_writer = SplitDeltaLayerWriter::new(
|
||||
tenant.conf,
|
||||
tline.timeline_id,
|
||||
tenant.tenant_shard_id,
|
||||
get_key(0),
|
||||
Lsn(0x18)..Lsn(0x20),
|
||||
4 * 1024,
|
||||
&ctx,
|
||||
)
|
||||
.await
|
||||
.unwrap();
|
||||
|
||||
image_writer
|
||||
.put_image(get_key(0), get_img(0), &tline, &ctx)
|
||||
.await
|
||||
.unwrap();
|
||||
image_writer
|
||||
.put_image(get_key(1), get_large_img(), &tline, &ctx)
|
||||
.await
|
||||
.unwrap();
|
||||
let layers = image_writer
|
||||
.finish(&tline, &ctx, get_key(10))
|
||||
.await
|
||||
.unwrap();
|
||||
assert_eq!(layers.len(), 2);
|
||||
|
||||
delta_writer
|
||||
.put_value(
|
||||
get_key(0),
|
||||
Lsn(0x18),
|
||||
Value::Image(get_img(0)),
|
||||
&tline,
|
||||
&ctx,
|
||||
)
|
||||
.await
|
||||
.unwrap();
|
||||
delta_writer
|
||||
.put_value(
|
||||
get_key(1),
|
||||
Lsn(0x1A),
|
||||
Value::Image(get_large_img()),
|
||||
&tline,
|
||||
&ctx,
|
||||
)
|
||||
.await
|
||||
.unwrap();
|
||||
let layers = delta_writer
|
||||
.finish(&tline, &ctx, get_key(10))
|
||||
.await
|
||||
.unwrap();
|
||||
assert_eq!(layers.len(), 2);
|
||||
}
|
||||
}
|
||||
@@ -129,9 +129,11 @@ pub fn start_background_loops(
|
||||
let background_jobs_can_start = background_jobs_can_start.cloned();
|
||||
async move {
|
||||
let cancel = task_mgr::shutdown_token();
|
||||
let can_start = completion::Barrier::maybe_wait(background_jobs_can_start);
|
||||
let can_start = tenant.ongoing_timeline_detach.gc_sleeping_while(can_start);
|
||||
tokio::select! {
|
||||
_ = cancel.cancelled() => { return Ok(()) },
|
||||
_ = completion::Barrier::maybe_wait(background_jobs_can_start) => {}
|
||||
_ = can_start => {}
|
||||
};
|
||||
gc_loop(tenant, cancel)
|
||||
.instrument(info_span!("gc_loop", tenant_id = %tenant_shard_id.tenant_id, shard_id = %tenant_shard_id.shard_slug()))
|
||||
@@ -210,28 +212,24 @@ async fn compaction_loop(tenant: Arc<Tenant>, cancel: CancellationToken) {
|
||||
Duration::from_secs(10)
|
||||
} else {
|
||||
// Run compaction
|
||||
match tenant.compaction_iteration(&cancel, &ctx).await {
|
||||
Err(e) => {
|
||||
let wait_duration = backoff::exponential_backoff_duration_seconds(
|
||||
error_run_count + 1,
|
||||
1.0,
|
||||
MAX_BACKOFF_SECS,
|
||||
);
|
||||
error_run_count += 1;
|
||||
let wait_duration = Duration::from_secs_f64(wait_duration);
|
||||
log_compaction_error(
|
||||
&e,
|
||||
error_run_count,
|
||||
&wait_duration,
|
||||
cancel.is_cancelled(),
|
||||
);
|
||||
wait_duration
|
||||
}
|
||||
Ok(has_pending_task) => {
|
||||
error_run_count = 0;
|
||||
// schedule the next compaction immediately in case there is a pending compaction task
|
||||
if has_pending_task { Duration::from_secs(0) } else { period }
|
||||
}
|
||||
if let Err(e) = tenant.compaction_iteration(&cancel, &ctx).await {
|
||||
let wait_duration = backoff::exponential_backoff_duration_seconds(
|
||||
error_run_count + 1,
|
||||
1.0,
|
||||
MAX_BACKOFF_SECS,
|
||||
);
|
||||
error_run_count += 1;
|
||||
let wait_duration = Duration::from_secs_f64(wait_duration);
|
||||
log_compaction_error(
|
||||
&e,
|
||||
error_run_count,
|
||||
&wait_duration,
|
||||
cancel.is_cancelled(),
|
||||
);
|
||||
wait_duration
|
||||
} else {
|
||||
error_run_count = 0;
|
||||
period
|
||||
}
|
||||
};
|
||||
|
||||
@@ -365,14 +363,13 @@ async fn gc_loop(tenant: Arc<Tenant>, cancel: CancellationToken) {
|
||||
if first {
|
||||
first = false;
|
||||
|
||||
if delay_by_lease_length(tenant.get_lsn_lease_length(), &cancel)
|
||||
.await
|
||||
.is_err()
|
||||
{
|
||||
break;
|
||||
}
|
||||
let delays = async {
|
||||
delay_by_lease_length(tenant.get_lsn_lease_length(), &cancel).await?;
|
||||
random_init_delay(period, &cancel).await?;
|
||||
Ok::<_, Cancelled>(())
|
||||
};
|
||||
|
||||
if random_init_delay(period, &cancel).await.is_err() {
|
||||
if tenant.ongoing_timeline_detach.gc_sleeping_while(delays).await.is_err() {
|
||||
break;
|
||||
}
|
||||
}
|
||||
@@ -407,16 +404,9 @@ async fn gc_loop(tenant: Arc<Tenant>, cancel: CancellationToken) {
|
||||
error_run_count += 1;
|
||||
let wait_duration = Duration::from_secs_f64(wait_duration);
|
||||
|
||||
if matches!(e, crate::tenant::GcError::TimelineCancelled) {
|
||||
// Timeline was cancelled during gc. We might either be in an event
|
||||
// that affects the entire tenant (tenant deletion, pageserver shutdown),
|
||||
// or in one that affects the timeline only (timeline deletion).
|
||||
// Therefore, don't exit the loop.
|
||||
info!("Gc failed {error_run_count} times, retrying in {wait_duration:?}: {e:?}");
|
||||
} else {
|
||||
error!("Gc failed {error_run_count} times, retrying in {wait_duration:?}: {e:?}");
|
||||
}
|
||||
|
||||
error!(
|
||||
"Gc failed {error_run_count} times, retrying in {wait_duration:?}: {e:?}",
|
||||
);
|
||||
wait_duration
|
||||
}
|
||||
}
|
||||
@@ -425,7 +415,9 @@ async fn gc_loop(tenant: Arc<Tenant>, cancel: CancellationToken) {
|
||||
warn_when_period_overrun(started_at.elapsed(), period, BackgroundLoopKind::Gc);
|
||||
|
||||
// Sleep
|
||||
if tokio::time::timeout(sleep_duration, cancel.cancelled())
|
||||
let cancelled = cancel.cancelled();
|
||||
let cancelled = tenant.ongoing_timeline_detach.gc_sleeping_while(cancelled);
|
||||
if tokio::time::timeout(sleep_duration, cancelled)
|
||||
.await
|
||||
.is_ok()
|
||||
{
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
@@ -63,19 +63,10 @@ pub(super) async fn delete_local_timeline_directory(
|
||||
tenant_shard_id: TenantShardId,
|
||||
timeline: &Timeline,
|
||||
) -> anyhow::Result<()> {
|
||||
// Always ensure the lock order is compaction -> gc.
|
||||
let compaction_lock = timeline.compaction_lock.lock();
|
||||
let compaction_lock = crate::timed(
|
||||
compaction_lock,
|
||||
"acquires compaction lock",
|
||||
std::time::Duration::from_secs(5),
|
||||
)
|
||||
.await;
|
||||
|
||||
let gc_lock = timeline.gc_lock.lock();
|
||||
let gc_lock = crate::timed(
|
||||
gc_lock,
|
||||
"acquires gc lock",
|
||||
let guards = async { tokio::join!(timeline.gc_lock.lock(), timeline.compaction_lock.lock()) };
|
||||
let guards = crate::timed(
|
||||
guards,
|
||||
"acquire gc and compaction locks",
|
||||
std::time::Duration::from_secs(5),
|
||||
)
|
||||
.await;
|
||||
@@ -116,8 +107,7 @@ pub(super) async fn delete_local_timeline_directory(
|
||||
.context("fsync_pre_mark_remove")?;
|
||||
|
||||
info!("finished deleting layer files, releasing locks");
|
||||
drop(gc_lock);
|
||||
drop(compaction_lock);
|
||||
drop(guards);
|
||||
|
||||
fail::fail_point!("timeline-delete-after-rm", |_| {
|
||||
Err(anyhow::anyhow!("failpoint: timeline-delete-after-rm"))?
|
||||
@@ -216,10 +206,11 @@ impl DeleteTimelineFlow {
|
||||
// NB: If this fails half-way through, and is retried, the retry will go through
|
||||
// all the same steps again. Make sure the code here is idempotent, and don't
|
||||
// error out if some of the shutdown tasks have already been completed!
|
||||
#[instrument(skip_all)]
|
||||
#[instrument(skip_all, fields(%inplace))]
|
||||
pub async fn run(
|
||||
tenant: &Arc<Tenant>,
|
||||
timeline_id: TimelineId,
|
||||
inplace: bool,
|
||||
) -> Result<(), DeleteTimelineError> {
|
||||
super::debug_assert_current_span_has_tenant_and_timeline_id();
|
||||
|
||||
@@ -230,7 +221,7 @@ impl DeleteTimelineFlow {
|
||||
// Now that the Timeline is in Stopping state, request all the related tasks to shut down.
|
||||
timeline.shutdown(super::ShutdownMode::Hard).await;
|
||||
|
||||
tenant.gc_block.before_delete(&timeline);
|
||||
tenant.ongoing_timeline_detach.on_delete(&timeline);
|
||||
|
||||
fail::fail_point!("timeline-delete-before-index-deleted-at", |_| {
|
||||
Err(anyhow::anyhow!(
|
||||
@@ -246,7 +237,11 @@ impl DeleteTimelineFlow {
|
||||
))?
|
||||
});
|
||||
|
||||
Self::schedule_background(guard, tenant.conf, Arc::clone(tenant), timeline);
|
||||
if inplace {
|
||||
Self::background(guard, tenant.conf, tenant, &timeline).await?
|
||||
} else {
|
||||
Self::schedule_background(guard, tenant.conf, Arc::clone(tenant), timeline);
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
@@ -213,45 +213,51 @@ impl Timeline {
|
||||
let mut js = tokio::task::JoinSet::new();
|
||||
{
|
||||
let guard = self.layers.read().await;
|
||||
let layers = guard.layer_map();
|
||||
for layer in layers.iter_historic_layers() {
|
||||
let layer = guard.get_from_desc(&layer);
|
||||
|
||||
guard
|
||||
.likely_resident_layers()
|
||||
.filter(|layer| {
|
||||
let last_activity_ts = layer.latest_activity();
|
||||
// guard against eviction while we inspect it; it might be that eviction_task and
|
||||
// disk_usage_eviction_task both select the same layers to be evicted, and
|
||||
// seemingly free up double the space. both succeeding is of no consequence.
|
||||
|
||||
let no_activity_for = match now.duration_since(last_activity_ts) {
|
||||
Ok(d) => d,
|
||||
Err(_e) => {
|
||||
// We reach here if `now` < `last_activity_ts`, which can legitimately
|
||||
// happen if there is an access between us getting `now`, and us getting
|
||||
// the access stats from the layer.
|
||||
//
|
||||
// The other reason why it can happen is system clock skew because
|
||||
// SystemTime::now() is not monotonic, so, even if there is no access
|
||||
// to the layer after we get `now` at the beginning of this function,
|
||||
// it could be that `now` < `last_activity_ts`.
|
||||
//
|
||||
// To distinguish the cases, we would need to record `Instant`s in the
|
||||
// access stats (i.e., monotonic timestamps), but then, the timestamps
|
||||
// values in the access stats would need to be `Instant`'s, and hence
|
||||
// they would be meaningless outside of the pageserver process.
|
||||
// At the time of writing, the trade-off is that access stats are more
|
||||
// valuable than detecting clock skew.
|
||||
return false;
|
||||
}
|
||||
};
|
||||
if !layer.is_likely_resident() {
|
||||
continue;
|
||||
}
|
||||
|
||||
no_activity_for > p.threshold
|
||||
})
|
||||
.cloned()
|
||||
.for_each(|layer| {
|
||||
let last_activity_ts = layer.access_stats().latest_activity();
|
||||
|
||||
let no_activity_for = match now.duration_since(last_activity_ts) {
|
||||
Ok(d) => d,
|
||||
Err(_e) => {
|
||||
// We reach here if `now` < `last_activity_ts`, which can legitimately
|
||||
// happen if there is an access between us getting `now`, and us getting
|
||||
// the access stats from the layer.
|
||||
//
|
||||
// The other reason why it can happen is system clock skew because
|
||||
// SystemTime::now() is not monotonic, so, even if there is no access
|
||||
// to the layer after we get `now` at the beginning of this function,
|
||||
// it could be that `now` < `last_activity_ts`.
|
||||
//
|
||||
// To distinguish the cases, we would need to record `Instant`s in the
|
||||
// access stats (i.e., monotonic timestamps), but then, the timestamps
|
||||
// values in the access stats would need to be `Instant`'s, and hence
|
||||
// they would be meaningless outside of the pageserver process.
|
||||
// At the time of writing, the trade-off is that access stats are more
|
||||
// valuable than detecting clock skew.
|
||||
continue;
|
||||
}
|
||||
};
|
||||
|
||||
if no_activity_for > p.threshold {
|
||||
js.spawn(async move {
|
||||
layer
|
||||
.evict_and_wait(std::time::Duration::from_secs(5))
|
||||
.await
|
||||
});
|
||||
stats.candidates += 1;
|
||||
});
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
let join_all = async move {
|
||||
|
||||
@@ -1,967 +0,0 @@
|
||||
//! An efficient way to keep the timeline gate open without preventing
|
||||
//! timeline shutdown for longer than a single call to a timeline method.
|
||||
//!
|
||||
//! # Motivation
|
||||
//!
|
||||
//! On a single page service connection, we're typically serving a single TenantTimelineId.
|
||||
//!
|
||||
//! Without sharding, there is a single Timeline object to which we dispatch
|
||||
//! all requests. For example, a getpage request gets dispatched to the
|
||||
//! Timeline::get method of the Timeline object that represents the
|
||||
//! (tenant,timeline) of that connection.
|
||||
//!
|
||||
//! With sharding, for each request that comes in on the connection,
|
||||
//! we first have to perform shard routing based on the requested key (=~ page number).
|
||||
//! The result of shard routing is a Timeline object.
|
||||
//! We then dispatch the request to that Timeline object.
|
||||
//!
|
||||
//! Regardless of whether the tenant is sharded or not, we want to ensure that
|
||||
//! we hold the Timeline gate open while we're invoking the method on the
|
||||
//! Timeline object.
|
||||
//!
|
||||
//! However, we want to avoid the overhead of entering the gate for every
|
||||
//! method invocation.
|
||||
//!
|
||||
//! Further, for shard routing, we want to avoid calling the tenant manager to
|
||||
//! resolve the shard for every request. Instead, we want to cache the
|
||||
//! routing result so we can bypass the tenant manager for all subsequent requests
|
||||
//! that get routed to that shard.
|
||||
//!
|
||||
//! Regardless of how we accomplish the above, it should not
|
||||
//! prevent the Timeline from shutting down promptly.
|
||||
//!
|
||||
//! # Design
|
||||
//!
|
||||
//! There are three user-facing data structures:
|
||||
//! - `PerTimelineState`: a struct embedded into each Timeline struct. Lifetime == Timeline lifetime.
|
||||
//! - `Cache`: a struct private to each connection handler; Lifetime == connection lifetime.
|
||||
//! - `Handle`: a smart pointer that holds the Timeline gate open and derefs to `&Timeline`.
|
||||
//! Lifetime: for a single request dispatch on the Timeline (i.e., one getpage request)
|
||||
//!
|
||||
//! The `Handle` is just a wrapper around an `Arc<HandleInner>`.
|
||||
//!
|
||||
//! There is one long-lived `Arc<HandleInner>`, which is stored in the `PerTimelineState`.
|
||||
//! The `Cache` stores a `Weak<HandleInner>` for each cached Timeline.
|
||||
//!
|
||||
//! To dispatch a request, the page service connection calls `Cache::get`.
|
||||
//!
|
||||
//! A cache miss means we consult the tenant manager for shard routing,
|
||||
//! resulting in an `Arc<Timeline>`. We enter its gate _once_ and construct an
|
||||
//! `Arc<HandleInner>`. We store a `Weak<HandleInner>` in the cache
|
||||
//! and the `Arc<HandleInner>` in the `PerTimelineState`.
|
||||
//!
|
||||
//! For subsequent requests, `Cache::get` will perform a "fast path" shard routing
|
||||
//! and find the `Weak<HandleInner>` in the cache.
|
||||
//! We upgrade the `Weak<HandleInner>` to an `Arc<HandleInner>` and wrap it in the user-facing `Handle` type.
|
||||
//!
|
||||
//! The request handler dispatches the request to the right `<Handle as Deref<Target = Timeline>>::$request_method`.
|
||||
//! It then drops the `Handle`, which drops the `Arc<HandleInner>`.
|
||||
//!
|
||||
//! # Memory Management / How The Reference Cycle Is Broken
|
||||
//!
|
||||
//! The attentive reader may have noticed the strong reference cycle
|
||||
//! from `Arc<HandleInner>` to `PerTimelineState` to `Arc<Timeline>`.
|
||||
//!
|
||||
//! This cycle is intentional: while it exists, the `Cache` can upgrade its
|
||||
//! `Weak<HandleInner>` to an `Arc<HandleInner>` in a single atomic operation.
|
||||
//!
|
||||
//! The cycle is broken by either
|
||||
//! - `PerTimelineState::shutdown` or
|
||||
//! - dropping the `Cache`.
|
||||
//!
|
||||
//! Concurrently existing `Handle`s will extend the existence of the cycle.
|
||||
//! However, since `Handle`s are short-lived and new `Handle`s are not
|
||||
//! handed out after either `PerTimelineState::shutdown` or `Cache` drop,
|
||||
//! that extension of the cycle is bounded.
|
||||
//!
|
||||
//! # Fast Path for Shard Routing
|
||||
//!
|
||||
//! The `Cache` has a fast path for shard routing to avoid calling into
|
||||
//! the tenant manager for every request.
|
||||
//!
|
||||
//! The `Cache` maintains a hash map of `ShardTimelineId` to `Weak<HandleInner>`.
|
||||
//!
|
||||
//! The current implementation uses the first entry in the hash map
|
||||
//! to determine the `ShardParameters` and derive the correct
|
||||
//! `ShardIndex` for the requested key.
|
||||
//!
|
||||
//! It then looks up the hash map for that `ShardTimelineId := {ShardIndex,TimelineId}`.
|
||||
//!
|
||||
//! If the lookup is successful and the `Weak<HandleInner>` can be upgraded,
|
||||
//! it's a hit.
|
||||
//!
|
||||
//! ## Cache invalidation
|
||||
//!
|
||||
//! The insight is that cache invalidation is sufficient and most efficiently done lazily.
|
||||
//! The only reasons why an entry in the cache can become stale are:
|
||||
//! 1. The `PerTimelineState` / Timeline is shutting down e.g. because the shard is
|
||||
//! being detached, timeline or shard deleted, or pageserver is shutting down.
|
||||
//! 2. We're doing a shard split and new traffic should be routed to the child shards.
|
||||
//!
|
||||
//! Regarding (1), we will eventually fail to upgrade the `Weak<HandleInner>` once the
|
||||
//! timeline has shut down, and when that happens, we remove the entry from the cache.
|
||||
//!
|
||||
//! Regarding (2), the insight is that it is toally fine to keep dispatching requests
|
||||
//! to the parent shard during a shard split. Eventually, the shard split task will
|
||||
//! shut down the parent => case (1).
|
||||
|
||||
use std::collections::hash_map;
|
||||
use std::collections::HashMap;
|
||||
use std::sync::atomic::AtomicBool;
|
||||
use std::sync::atomic::Ordering;
|
||||
use std::sync::Arc;
|
||||
use std::sync::Mutex;
|
||||
use std::sync::Weak;
|
||||
|
||||
use pageserver_api::shard::ShardIdentity;
|
||||
use tracing::instrument;
|
||||
use tracing::trace;
|
||||
use utils::id::TimelineId;
|
||||
use utils::shard::ShardIndex;
|
||||
use utils::shard::ShardNumber;
|
||||
|
||||
use crate::tenant::mgr::ShardSelector;
|
||||
|
||||
/// The requirement for Debug is so that #[derive(Debug)] works in some places.
|
||||
pub(crate) trait Types: Sized + std::fmt::Debug {
|
||||
type TenantManagerError: Sized + std::fmt::Debug;
|
||||
type TenantManager: TenantManager<Self> + Sized;
|
||||
type Timeline: ArcTimeline<Self> + Sized;
|
||||
}
|
||||
|
||||
/// Uniquely identifies a [`Cache`] instance over the lifetime of the process.
|
||||
/// Required so [`Cache::drop`] can take out the handles from the [`PerTimelineState`].
|
||||
/// Alternative to this would be to allocate [`Cache`] in a `Box` and identify it by the pointer.
|
||||
#[derive(Debug, Hash, PartialEq, Eq, Clone, Copy)]
|
||||
struct CacheId(u64);
|
||||
|
||||
impl CacheId {
|
||||
fn next() -> Self {
|
||||
static NEXT_ID: std::sync::atomic::AtomicU64 = std::sync::atomic::AtomicU64::new(1);
|
||||
let id = NEXT_ID.fetch_add(1, std::sync::atomic::Ordering::Relaxed);
|
||||
if id == 0 {
|
||||
panic!("CacheId::new() returned 0, overflow");
|
||||
}
|
||||
Self(id)
|
||||
}
|
||||
}
|
||||
|
||||
/// See module-level comment.
|
||||
pub(crate) struct Cache<T: Types> {
|
||||
id: CacheId,
|
||||
map: Map<T>,
|
||||
}
|
||||
|
||||
type Map<T> = HashMap<ShardTimelineId, Weak<HandleInner<T>>>;
|
||||
|
||||
impl<T: Types> Default for Cache<T> {
|
||||
fn default() -> Self {
|
||||
Self {
|
||||
id: CacheId::next(),
|
||||
map: Default::default(),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(PartialEq, Eq, Debug, Hash, Clone, Copy)]
|
||||
pub(crate) struct ShardTimelineId {
|
||||
pub(crate) shard_index: ShardIndex,
|
||||
pub(crate) timeline_id: TimelineId,
|
||||
}
|
||||
|
||||
/// See module-level comment.
|
||||
pub(crate) struct Handle<T: Types>(Arc<HandleInner<T>>);
|
||||
struct HandleInner<T: Types> {
|
||||
shut_down: AtomicBool,
|
||||
timeline: T::Timeline,
|
||||
// The timeline's gate held open.
|
||||
_gate_guard: utils::sync::gate::GateGuard,
|
||||
}
|
||||
|
||||
/// Embedded in each [`Types::Timeline`] as the anchor for the only long-lived strong ref to `HandleInner`.
|
||||
///
|
||||
/// See module-level comment for details.
|
||||
pub struct PerTimelineState<T: Types> {
|
||||
// None = shutting down
|
||||
handles: Mutex<Option<HashMap<CacheId, Arc<HandleInner<T>>>>>,
|
||||
}
|
||||
|
||||
impl<T: Types> Default for PerTimelineState<T> {
|
||||
fn default() -> Self {
|
||||
Self {
|
||||
handles: Mutex::new(Some(Default::default())),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Abstract view of [`crate::tenant::mgr`], for testability.
|
||||
pub(crate) trait TenantManager<T: Types> {
|
||||
/// Invoked by [`Cache::get`] to resolve a [`ShardTimelineId`] to a [`Types::Timeline`].
|
||||
/// Errors are returned as [`GetError::TenantManager`].
|
||||
async fn resolve(
|
||||
&self,
|
||||
timeline_id: TimelineId,
|
||||
shard_selector: ShardSelector,
|
||||
) -> Result<T::Timeline, T::TenantManagerError>;
|
||||
}
|
||||
|
||||
/// Abstract view of an [`Arc<Timeline>`], for testability.
|
||||
pub(crate) trait ArcTimeline<T: Types>: Clone {
|
||||
fn gate(&self) -> &utils::sync::gate::Gate;
|
||||
fn shard_timeline_id(&self) -> ShardTimelineId;
|
||||
fn get_shard_identity(&self) -> &ShardIdentity;
|
||||
fn per_timeline_state(&self) -> &PerTimelineState<T>;
|
||||
}
|
||||
|
||||
/// Errors returned by [`Cache::get`].
|
||||
#[derive(Debug)]
|
||||
pub(crate) enum GetError<T: Types> {
|
||||
TenantManager(T::TenantManagerError),
|
||||
TimelineGateClosed,
|
||||
PerTimelineStateShutDown,
|
||||
}
|
||||
|
||||
/// Internal type used in [`Cache::get`].
|
||||
enum RoutingResult<T: Types> {
|
||||
FastPath(Handle<T>),
|
||||
SlowPath(ShardTimelineId),
|
||||
NeedConsultTenantManager,
|
||||
}
|
||||
|
||||
impl<T: Types> Cache<T> {
|
||||
/// See module-level comment for details.
|
||||
///
|
||||
/// Does NOT check for the shutdown state of [`Types::Timeline`].
|
||||
/// Instead, the methods of [`Types::Timeline`] that are invoked through
|
||||
/// the [`Handle`] are responsible for checking these conditions
|
||||
/// and if so, return an error that causes the page service to
|
||||
/// close the connection.
|
||||
#[instrument(level = "trace", skip_all)]
|
||||
pub(crate) async fn get(
|
||||
&mut self,
|
||||
timeline_id: TimelineId,
|
||||
shard_selector: ShardSelector,
|
||||
tenant_manager: &T::TenantManager,
|
||||
) -> Result<Handle<T>, GetError<T>> {
|
||||
// terminates because each iteration removes an element from the map
|
||||
loop {
|
||||
let handle = self
|
||||
.get_impl(timeline_id, shard_selector, tenant_manager)
|
||||
.await?;
|
||||
if handle.0.shut_down.load(Ordering::Relaxed) {
|
||||
let removed = self
|
||||
.map
|
||||
.remove(&handle.0.timeline.shard_timeline_id())
|
||||
.expect("invariant of get_impl is that the returned handle is in the map");
|
||||
assert!(
|
||||
Weak::ptr_eq(&removed, &Arc::downgrade(&handle.0)),
|
||||
"shard_timeline_id() incorrect?"
|
||||
);
|
||||
} else {
|
||||
return Ok(handle);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[instrument(level = "trace", skip_all)]
|
||||
async fn get_impl(
|
||||
&mut self,
|
||||
timeline_id: TimelineId,
|
||||
shard_selector: ShardSelector,
|
||||
tenant_manager: &T::TenantManager,
|
||||
) -> Result<Handle<T>, GetError<T>> {
|
||||
let miss: ShardSelector = {
|
||||
let routing_state = self.shard_routing(timeline_id, shard_selector);
|
||||
match routing_state {
|
||||
RoutingResult::FastPath(handle) => return Ok(handle),
|
||||
RoutingResult::SlowPath(key) => match self.map.get(&key) {
|
||||
Some(cached) => match cached.upgrade() {
|
||||
Some(upgraded) => return Ok(Handle(upgraded)),
|
||||
None => {
|
||||
trace!("handle cache stale");
|
||||
self.map.remove(&key).unwrap();
|
||||
ShardSelector::Known(key.shard_index)
|
||||
}
|
||||
},
|
||||
None => ShardSelector::Known(key.shard_index),
|
||||
},
|
||||
RoutingResult::NeedConsultTenantManager => shard_selector,
|
||||
}
|
||||
};
|
||||
self.get_miss(timeline_id, miss, tenant_manager).await
|
||||
}
|
||||
|
||||
#[inline(always)]
|
||||
fn shard_routing(
|
||||
&mut self,
|
||||
timeline_id: TimelineId,
|
||||
shard_selector: ShardSelector,
|
||||
) -> RoutingResult<T> {
|
||||
loop {
|
||||
// terminates because when every iteration we remove an element from the map
|
||||
let Some((first_key, first_handle)) = self.map.iter().next() else {
|
||||
return RoutingResult::NeedConsultTenantManager;
|
||||
};
|
||||
let Some(first_handle) = first_handle.upgrade() else {
|
||||
// TODO: dedup with get()
|
||||
trace!("handle cache stale");
|
||||
let first_key_owned = *first_key;
|
||||
self.map.remove(&first_key_owned).unwrap();
|
||||
continue;
|
||||
};
|
||||
|
||||
let first_handle_shard_identity = first_handle.timeline.get_shard_identity();
|
||||
let make_shard_index = |shard_num: ShardNumber| ShardIndex {
|
||||
shard_number: shard_num,
|
||||
shard_count: first_handle_shard_identity.count,
|
||||
};
|
||||
|
||||
let need_idx = match shard_selector {
|
||||
ShardSelector::Page(key) => {
|
||||
make_shard_index(first_handle_shard_identity.get_shard_number(&key))
|
||||
}
|
||||
ShardSelector::Zero => make_shard_index(ShardNumber(0)),
|
||||
ShardSelector::Known(shard_idx) => shard_idx,
|
||||
};
|
||||
let need_shard_timeline_id = ShardTimelineId {
|
||||
shard_index: need_idx,
|
||||
timeline_id,
|
||||
};
|
||||
let first_handle_shard_timeline_id = ShardTimelineId {
|
||||
shard_index: first_handle_shard_identity.shard_index(),
|
||||
timeline_id: first_handle.timeline.shard_timeline_id().timeline_id,
|
||||
};
|
||||
|
||||
if need_shard_timeline_id == first_handle_shard_timeline_id {
|
||||
return RoutingResult::FastPath(Handle(first_handle));
|
||||
} else {
|
||||
return RoutingResult::SlowPath(need_shard_timeline_id);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[instrument(level = "trace", skip_all)]
|
||||
#[inline(always)]
|
||||
async fn get_miss(
|
||||
&mut self,
|
||||
timeline_id: TimelineId,
|
||||
shard_selector: ShardSelector,
|
||||
tenant_manager: &T::TenantManager,
|
||||
) -> Result<Handle<T>, GetError<T>> {
|
||||
match tenant_manager.resolve(timeline_id, shard_selector).await {
|
||||
Ok(timeline) => {
|
||||
let key = timeline.shard_timeline_id();
|
||||
match &shard_selector {
|
||||
ShardSelector::Zero => assert_eq!(key.shard_index.shard_number, ShardNumber(0)),
|
||||
ShardSelector::Page(_) => (), // gotta trust tenant_manager
|
||||
ShardSelector::Known(idx) => assert_eq!(idx, &key.shard_index),
|
||||
}
|
||||
|
||||
let gate_guard = match timeline.gate().enter() {
|
||||
Ok(guard) => guard,
|
||||
Err(_) => {
|
||||
return Err(GetError::TimelineGateClosed);
|
||||
}
|
||||
};
|
||||
trace!("creating new HandleInner");
|
||||
let handle = Arc::new(
|
||||
// TODO: global metric that keeps track of the number of live HandlerTimeline instances
|
||||
// so we can identify reference cycle bugs.
|
||||
HandleInner {
|
||||
shut_down: AtomicBool::new(false),
|
||||
_gate_guard: gate_guard,
|
||||
timeline: timeline.clone(),
|
||||
},
|
||||
);
|
||||
let handle = {
|
||||
let mut lock_guard = timeline
|
||||
.per_timeline_state()
|
||||
.handles
|
||||
.lock()
|
||||
.expect("mutex poisoned");
|
||||
match &mut *lock_guard {
|
||||
Some(per_timeline_state) => {
|
||||
let replaced = per_timeline_state.insert(self.id, Arc::clone(&handle));
|
||||
assert!(replaced.is_none(), "some earlier code left a stale handle");
|
||||
match self.map.entry(key) {
|
||||
hash_map::Entry::Occupied(_o) => {
|
||||
// This cannot not happen because
|
||||
// 1. we're the _miss_ handle, i.e., `self.map` didn't contain an entry and
|
||||
// 2. we were holding &mut self during .resolve().await above, so, no other thread can have inserted a handle
|
||||
// while we were waiting for the tenant manager.
|
||||
unreachable!()
|
||||
}
|
||||
hash_map::Entry::Vacant(v) => {
|
||||
v.insert(Arc::downgrade(&handle));
|
||||
handle
|
||||
}
|
||||
}
|
||||
}
|
||||
None => {
|
||||
return Err(GetError::PerTimelineStateShutDown);
|
||||
}
|
||||
}
|
||||
};
|
||||
Ok(Handle(handle))
|
||||
}
|
||||
Err(e) => Err(GetError::TenantManager(e)),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl<T: Types> PerTimelineState<T> {
|
||||
/// After this method returns, [`Cache::get`] will never again return a [`Handle`]
|
||||
/// to the [`Types::Timeline`] that embeds this per-timeline state.
|
||||
/// Even if [`TenantManager::resolve`] would still resolve to it.
|
||||
///
|
||||
/// Already-alive [`Handle`]s for will remain open, usable, and keeping the [`ArcTimeline`] alive.
|
||||
/// That's ok because they're short-lived. See module-level comment for details.
|
||||
#[instrument(level = "trace", skip_all)]
|
||||
pub(super) fn shutdown(&self) {
|
||||
let handles = self
|
||||
.handles
|
||||
.lock()
|
||||
.expect("mutex poisoned")
|
||||
// NB: this .take() sets locked to None.
|
||||
// That's what makes future `Cache::get` misses fail.
|
||||
// Cache hits are taken care of below.
|
||||
.take();
|
||||
let Some(handles) = handles else {
|
||||
trace!("already shut down");
|
||||
return;
|
||||
};
|
||||
for handle in handles.values() {
|
||||
// Make hits fail.
|
||||
handle.shut_down.store(true, Ordering::Relaxed);
|
||||
}
|
||||
drop(handles);
|
||||
}
|
||||
}
|
||||
|
||||
impl<T: Types> std::ops::Deref for Handle<T> {
|
||||
type Target = T::Timeline;
|
||||
fn deref(&self) -> &Self::Target {
|
||||
&self.0.timeline
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
impl<T: Types> Drop for HandleInner<T> {
|
||||
fn drop(&mut self) {
|
||||
trace!("HandleInner dropped");
|
||||
}
|
||||
}
|
||||
|
||||
// When dropping a [`Cache`], prune its handles in the [`PerTimelineState`] to break the reference cycle.
|
||||
impl<T: Types> Drop for Cache<T> {
|
||||
fn drop(&mut self) {
|
||||
for (_, weak) in self.map.drain() {
|
||||
if let Some(strong) = weak.upgrade() {
|
||||
// handle is still being kept alive in PerTimelineState
|
||||
let timeline = strong.timeline.per_timeline_state();
|
||||
let mut handles = timeline.handles.lock().expect("mutex poisoned");
|
||||
if let Some(handles) = &mut *handles {
|
||||
let Some(removed) = handles.remove(&self.id) else {
|
||||
// There could have been a shutdown inbetween us upgrading the weak and locking the mutex.
|
||||
continue;
|
||||
};
|
||||
assert!(Arc::ptr_eq(&removed, &strong));
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use pageserver_api::{
|
||||
key::{rel_block_to_key, Key, DBDIR_KEY},
|
||||
models::ShardParameters,
|
||||
reltag::RelTag,
|
||||
shard::ShardStripeSize,
|
||||
};
|
||||
use utils::shard::ShardCount;
|
||||
|
||||
use super::*;
|
||||
|
||||
const FOREVER: std::time::Duration = std::time::Duration::from_secs(u64::MAX);
|
||||
|
||||
#[derive(Debug)]
|
||||
struct TestTypes;
|
||||
impl Types for TestTypes {
|
||||
type TenantManagerError = anyhow::Error;
|
||||
type TenantManager = StubManager;
|
||||
type Timeline = Arc<StubTimeline>;
|
||||
}
|
||||
|
||||
struct StubManager {
|
||||
shards: Vec<Arc<StubTimeline>>,
|
||||
}
|
||||
|
||||
struct StubTimeline {
|
||||
gate: utils::sync::gate::Gate,
|
||||
id: TimelineId,
|
||||
shard: ShardIdentity,
|
||||
per_timeline_state: PerTimelineState<TestTypes>,
|
||||
myself: Weak<StubTimeline>,
|
||||
}
|
||||
|
||||
impl StubTimeline {
|
||||
fn getpage(&self) {
|
||||
// do nothing
|
||||
}
|
||||
}
|
||||
|
||||
impl ArcTimeline<TestTypes> for Arc<StubTimeline> {
|
||||
fn gate(&self) -> &utils::sync::gate::Gate {
|
||||
&self.gate
|
||||
}
|
||||
|
||||
fn shard_timeline_id(&self) -> ShardTimelineId {
|
||||
ShardTimelineId {
|
||||
shard_index: self.shard.shard_index(),
|
||||
timeline_id: self.id,
|
||||
}
|
||||
}
|
||||
|
||||
fn get_shard_identity(&self) -> &ShardIdentity {
|
||||
&self.shard
|
||||
}
|
||||
|
||||
fn per_timeline_state(&self) -> &PerTimelineState<TestTypes> {
|
||||
&self.per_timeline_state
|
||||
}
|
||||
}
|
||||
|
||||
impl TenantManager<TestTypes> for StubManager {
|
||||
async fn resolve(
|
||||
&self,
|
||||
timeline_id: TimelineId,
|
||||
shard_selector: ShardSelector,
|
||||
) -> anyhow::Result<Arc<StubTimeline>> {
|
||||
for timeline in &self.shards {
|
||||
if timeline.id == timeline_id {
|
||||
match &shard_selector {
|
||||
ShardSelector::Zero if timeline.shard.is_shard_zero() => {
|
||||
return Ok(Arc::clone(timeline));
|
||||
}
|
||||
ShardSelector::Zero => continue,
|
||||
ShardSelector::Page(key) if timeline.shard.is_key_local(key) => {
|
||||
return Ok(Arc::clone(timeline));
|
||||
}
|
||||
ShardSelector::Page(_) => continue,
|
||||
ShardSelector::Known(idx) if idx == &timeline.shard.shard_index() => {
|
||||
return Ok(Arc::clone(timeline));
|
||||
}
|
||||
ShardSelector::Known(_) => continue,
|
||||
}
|
||||
}
|
||||
}
|
||||
anyhow::bail!("not found")
|
||||
}
|
||||
}
|
||||
|
||||
#[tokio::test(start_paused = true)]
|
||||
async fn test_timeline_shutdown() {
|
||||
crate::tenant::harness::setup_logging();
|
||||
|
||||
let timeline_id = TimelineId::generate();
|
||||
let shard0 = Arc::new_cyclic(|myself| StubTimeline {
|
||||
gate: Default::default(),
|
||||
id: timeline_id,
|
||||
shard: ShardIdentity::unsharded(),
|
||||
per_timeline_state: PerTimelineState::default(),
|
||||
myself: myself.clone(),
|
||||
});
|
||||
let mgr = StubManager {
|
||||
shards: vec![shard0.clone()],
|
||||
};
|
||||
let key = DBDIR_KEY;
|
||||
|
||||
let mut cache = Cache::<TestTypes>::default();
|
||||
|
||||
//
|
||||
// fill the cache
|
||||
//
|
||||
assert_eq!(
|
||||
(Arc::strong_count(&shard0), Arc::weak_count(&shard0)),
|
||||
(2, 1),
|
||||
"strong: shard0, mgr; weak: myself"
|
||||
);
|
||||
|
||||
let handle: Handle<_> = cache
|
||||
.get(timeline_id, ShardSelector::Page(key), &mgr)
|
||||
.await
|
||||
.expect("we have the timeline");
|
||||
let handle_inner_weak = Arc::downgrade(&handle.0);
|
||||
assert!(Weak::ptr_eq(&handle.myself, &shard0.myself));
|
||||
assert_eq!(
|
||||
(
|
||||
Weak::strong_count(&handle_inner_weak),
|
||||
Weak::weak_count(&handle_inner_weak)
|
||||
),
|
||||
(2, 2),
|
||||
"strong: handle, per_timeline_state, weak: handle_inner_weak, cache"
|
||||
);
|
||||
assert_eq!(cache.map.len(), 1);
|
||||
|
||||
assert_eq!(
|
||||
(Arc::strong_count(&shard0), Arc::weak_count(&shard0)),
|
||||
(3, 1),
|
||||
"strong: handleinner(per_timeline_state), shard0, mgr; weak: myself"
|
||||
);
|
||||
drop(handle);
|
||||
assert_eq!(
|
||||
(Arc::strong_count(&shard0), Arc::weak_count(&shard0)),
|
||||
(3, 1),
|
||||
"strong: handleinner(per_timeline_state), shard0, mgr; weak: myself"
|
||||
);
|
||||
|
||||
//
|
||||
// demonstrate that Handle holds up gate closure
|
||||
// but shutdown prevents new handles from being handed out
|
||||
//
|
||||
|
||||
tokio::select! {
|
||||
_ = shard0.gate.close() => {
|
||||
panic!("cache and per-timeline handler state keep cache open");
|
||||
}
|
||||
_ = tokio::time::sleep(FOREVER) => {
|
||||
// NB: first poll of close() makes it enter closing state
|
||||
}
|
||||
}
|
||||
|
||||
let handle = cache
|
||||
.get(timeline_id, ShardSelector::Page(key), &mgr)
|
||||
.await
|
||||
.expect("we have the timeline");
|
||||
assert!(Weak::ptr_eq(&handle.myself, &shard0.myself));
|
||||
|
||||
// SHUTDOWN
|
||||
shard0.per_timeline_state.shutdown(); // keeping handle alive across shutdown
|
||||
|
||||
assert_eq!(
|
||||
1,
|
||||
Weak::strong_count(&handle_inner_weak),
|
||||
"through local var handle"
|
||||
);
|
||||
assert_eq!(
|
||||
cache.map.len(),
|
||||
1,
|
||||
"this is an implementation detail but worth pointing out: we can't clear the cache from shutdown(), it's cleared on first access after"
|
||||
);
|
||||
assert_eq!(
|
||||
(Arc::strong_count(&shard0), Arc::weak_count(&shard0)),
|
||||
(3, 1),
|
||||
"strong: handleinner(via handle), shard0, mgr; weak: myself"
|
||||
);
|
||||
|
||||
// this handle is perfectly usable
|
||||
handle.getpage();
|
||||
|
||||
cache
|
||||
.get(timeline_id, ShardSelector::Page(key), &mgr)
|
||||
.await
|
||||
.err()
|
||||
.expect("documented behavior: can't get new handle after shutdown, even if there is an alive Handle");
|
||||
assert_eq!(
|
||||
cache.map.len(),
|
||||
0,
|
||||
"first access after shutdown cleans up the Weak's from the cache"
|
||||
);
|
||||
|
||||
tokio::select! {
|
||||
_ = shard0.gate.close() => {
|
||||
panic!("handle is keeping gate open");
|
||||
}
|
||||
_ = tokio::time::sleep(FOREVER) => { }
|
||||
}
|
||||
|
||||
drop(handle);
|
||||
assert_eq!(
|
||||
0,
|
||||
Weak::strong_count(&handle_inner_weak),
|
||||
"the HandleInner destructor already ran"
|
||||
);
|
||||
assert_eq!(
|
||||
(Arc::strong_count(&shard0), Arc::weak_count(&shard0)),
|
||||
(2, 1),
|
||||
"strong: shard0, mgr; weak: myself"
|
||||
);
|
||||
|
||||
// closing gate succeeds after dropping handle
|
||||
tokio::select! {
|
||||
_ = shard0.gate.close() => { }
|
||||
_ = tokio::time::sleep(FOREVER) => {
|
||||
panic!("handle is dropped, no other gate holders exist")
|
||||
}
|
||||
}
|
||||
|
||||
// map gets cleaned on next lookup
|
||||
cache
|
||||
.get(timeline_id, ShardSelector::Page(key), &mgr)
|
||||
.await
|
||||
.err()
|
||||
.expect("documented behavior: can't get new handle after shutdown");
|
||||
assert_eq!(cache.map.len(), 0);
|
||||
|
||||
// ensure all refs to shard0 are gone and we're not leaking anything
|
||||
let myself = Weak::clone(&shard0.myself);
|
||||
drop(shard0);
|
||||
drop(mgr);
|
||||
assert_eq!(Weak::strong_count(&myself), 0);
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_multiple_timelines_and_deletion() {
|
||||
crate::tenant::harness::setup_logging();
|
||||
|
||||
let timeline_a = TimelineId::generate();
|
||||
let timeline_b = TimelineId::generate();
|
||||
assert_ne!(timeline_a, timeline_b);
|
||||
let timeline_a = Arc::new_cyclic(|myself| StubTimeline {
|
||||
gate: Default::default(),
|
||||
id: timeline_a,
|
||||
shard: ShardIdentity::unsharded(),
|
||||
per_timeline_state: PerTimelineState::default(),
|
||||
myself: myself.clone(),
|
||||
});
|
||||
let timeline_b = Arc::new_cyclic(|myself| StubTimeline {
|
||||
gate: Default::default(),
|
||||
id: timeline_b,
|
||||
shard: ShardIdentity::unsharded(),
|
||||
per_timeline_state: PerTimelineState::default(),
|
||||
myself: myself.clone(),
|
||||
});
|
||||
let mut mgr = StubManager {
|
||||
shards: vec![timeline_a.clone(), timeline_b.clone()],
|
||||
};
|
||||
let key = DBDIR_KEY;
|
||||
|
||||
let mut cache = Cache::<TestTypes>::default();
|
||||
|
||||
cache
|
||||
.get(timeline_a.id, ShardSelector::Page(key), &mgr)
|
||||
.await
|
||||
.expect("we have it");
|
||||
cache
|
||||
.get(timeline_b.id, ShardSelector::Page(key), &mgr)
|
||||
.await
|
||||
.expect("we have it");
|
||||
assert_eq!(cache.map.len(), 2);
|
||||
|
||||
// delete timeline A
|
||||
timeline_a.per_timeline_state.shutdown();
|
||||
mgr.shards.retain(|t| t.id != timeline_a.id);
|
||||
assert!(
|
||||
mgr.resolve(timeline_a.id, ShardSelector::Page(key))
|
||||
.await
|
||||
.is_err(),
|
||||
"broken StubManager implementation"
|
||||
);
|
||||
|
||||
assert_eq!(
|
||||
cache.map.len(),
|
||||
2,
|
||||
"cache still has a Weak handle to Timeline A"
|
||||
);
|
||||
cache
|
||||
.get(timeline_a.id, ShardSelector::Page(key), &mgr)
|
||||
.await
|
||||
.err()
|
||||
.expect("documented behavior: can't get new handle after shutdown");
|
||||
assert_eq!(cache.map.len(), 1, "next access cleans up the cache");
|
||||
|
||||
cache
|
||||
.get(timeline_b.id, ShardSelector::Page(key), &mgr)
|
||||
.await
|
||||
.expect("we still have it");
|
||||
}
|
||||
|
||||
fn make_relation_key_for_shard(shard: ShardNumber, params: &ShardParameters) -> Key {
|
||||
rel_block_to_key(
|
||||
RelTag {
|
||||
spcnode: 1663,
|
||||
dbnode: 208101,
|
||||
relnode: 2620,
|
||||
forknum: 0,
|
||||
},
|
||||
shard.0 as u32 * params.stripe_size.0,
|
||||
)
|
||||
}
|
||||
|
||||
#[tokio::test(start_paused = true)]
|
||||
async fn test_shard_split() {
|
||||
crate::tenant::harness::setup_logging();
|
||||
let timeline_id = TimelineId::generate();
|
||||
let parent = Arc::new_cyclic(|myself| StubTimeline {
|
||||
gate: Default::default(),
|
||||
id: timeline_id,
|
||||
shard: ShardIdentity::unsharded(),
|
||||
per_timeline_state: PerTimelineState::default(),
|
||||
myself: myself.clone(),
|
||||
});
|
||||
let child_params = ShardParameters {
|
||||
count: ShardCount(2),
|
||||
stripe_size: ShardStripeSize::default(),
|
||||
};
|
||||
let child0 = Arc::new_cyclic(|myself| StubTimeline {
|
||||
gate: Default::default(),
|
||||
id: timeline_id,
|
||||
shard: ShardIdentity::from_params(ShardNumber(0), &child_params),
|
||||
per_timeline_state: PerTimelineState::default(),
|
||||
myself: myself.clone(),
|
||||
});
|
||||
let child1 = Arc::new_cyclic(|myself| StubTimeline {
|
||||
gate: Default::default(),
|
||||
id: timeline_id,
|
||||
shard: ShardIdentity::from_params(ShardNumber(1), &child_params),
|
||||
per_timeline_state: PerTimelineState::default(),
|
||||
myself: myself.clone(),
|
||||
});
|
||||
let child_shards_by_shard_number = [child0.clone(), child1.clone()];
|
||||
|
||||
let mut cache = Cache::<TestTypes>::default();
|
||||
|
||||
// fill the cache with the parent
|
||||
for i in 0..2 {
|
||||
let handle = cache
|
||||
.get(
|
||||
timeline_id,
|
||||
ShardSelector::Page(make_relation_key_for_shard(ShardNumber(i), &child_params)),
|
||||
&StubManager {
|
||||
shards: vec![parent.clone()],
|
||||
},
|
||||
)
|
||||
.await
|
||||
.expect("we have it");
|
||||
assert!(
|
||||
Weak::ptr_eq(&handle.myself, &parent.myself),
|
||||
"mgr returns parent first"
|
||||
);
|
||||
drop(handle);
|
||||
}
|
||||
|
||||
//
|
||||
// SHARD SPLIT: tenant manager changes, but the cache isn't informed
|
||||
//
|
||||
|
||||
// while we haven't shut down the parent, the cache will return the cached parent, even
|
||||
// if the tenant manager returns the child
|
||||
for i in 0..2 {
|
||||
let handle = cache
|
||||
.get(
|
||||
timeline_id,
|
||||
ShardSelector::Page(make_relation_key_for_shard(ShardNumber(i), &child_params)),
|
||||
&StubManager {
|
||||
shards: vec![], // doesn't matter what's in here, the cache is fully loaded
|
||||
},
|
||||
)
|
||||
.await
|
||||
.expect("we have it");
|
||||
assert!(
|
||||
Weak::ptr_eq(&handle.myself, &parent.myself),
|
||||
"mgr returns parent"
|
||||
);
|
||||
drop(handle);
|
||||
}
|
||||
|
||||
let parent_handle = cache
|
||||
.get(
|
||||
timeline_id,
|
||||
ShardSelector::Page(make_relation_key_for_shard(ShardNumber(0), &child_params)),
|
||||
&StubManager {
|
||||
shards: vec![parent.clone()],
|
||||
},
|
||||
)
|
||||
.await
|
||||
.expect("we have it");
|
||||
assert!(Weak::ptr_eq(&parent_handle.myself, &parent.myself));
|
||||
|
||||
// invalidate the cache
|
||||
parent.per_timeline_state.shutdown();
|
||||
|
||||
// the cache will now return the child, even though the parent handle still exists
|
||||
for i in 0..2 {
|
||||
let handle = cache
|
||||
.get(
|
||||
timeline_id,
|
||||
ShardSelector::Page(make_relation_key_for_shard(ShardNumber(i), &child_params)),
|
||||
&StubManager {
|
||||
shards: vec![child0.clone(), child1.clone()], // <====== this changed compared to previous loop
|
||||
},
|
||||
)
|
||||
.await
|
||||
.expect("we have it");
|
||||
assert!(
|
||||
Weak::ptr_eq(
|
||||
&handle.myself,
|
||||
&child_shards_by_shard_number[i as usize].myself
|
||||
),
|
||||
"mgr returns child"
|
||||
);
|
||||
drop(handle);
|
||||
}
|
||||
|
||||
// all the while the parent handle kept the parent gate open
|
||||
tokio::select! {
|
||||
_ = parent_handle.gate.close() => {
|
||||
panic!("parent handle is keeping gate open");
|
||||
}
|
||||
_ = tokio::time::sleep(FOREVER) => { }
|
||||
}
|
||||
drop(parent_handle);
|
||||
tokio::select! {
|
||||
_ = parent.gate.close() => { }
|
||||
_ = tokio::time::sleep(FOREVER) => {
|
||||
panic!("parent handle is dropped, no other gate holders exist")
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[tokio::test(start_paused = true)]
|
||||
async fn test_connection_handler_exit() {
|
||||
crate::tenant::harness::setup_logging();
|
||||
let timeline_id = TimelineId::generate();
|
||||
let shard0 = Arc::new_cyclic(|myself| StubTimeline {
|
||||
gate: Default::default(),
|
||||
id: timeline_id,
|
||||
shard: ShardIdentity::unsharded(),
|
||||
per_timeline_state: PerTimelineState::default(),
|
||||
myself: myself.clone(),
|
||||
});
|
||||
let mgr = StubManager {
|
||||
shards: vec![shard0.clone()],
|
||||
};
|
||||
let key = DBDIR_KEY;
|
||||
|
||||
// Simulate 10 connections that's opened, used, and closed
|
||||
let mut used_handles = vec![];
|
||||
for _ in 0..10 {
|
||||
let mut cache = Cache::<TestTypes>::default();
|
||||
let handle = {
|
||||
let handle = cache
|
||||
.get(timeline_id, ShardSelector::Page(key), &mgr)
|
||||
.await
|
||||
.expect("we have the timeline");
|
||||
assert!(Weak::ptr_eq(&handle.myself, &shard0.myself));
|
||||
handle
|
||||
};
|
||||
handle.getpage();
|
||||
used_handles.push(Arc::downgrade(&handle.0));
|
||||
}
|
||||
|
||||
// No handles exist, thus gates are closed and don't require shutdown
|
||||
assert!(used_handles
|
||||
.iter()
|
||||
.all(|weak| Weak::strong_count(weak) == 0));
|
||||
|
||||
// ... thus the gate should close immediately, even without shutdown
|
||||
tokio::select! {
|
||||
_ = shard0.gate.close() => { }
|
||||
_ = tokio::time::sleep(FOREVER) => {
|
||||
panic!("handle is dropped, no other gate holders exist")
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -1,4 +1,4 @@
|
||||
use anyhow::{bail, ensure, Context};
|
||||
use anyhow::{bail, ensure, Context, Result};
|
||||
use itertools::Itertools;
|
||||
use pageserver_api::shard::TenantShardId;
|
||||
use std::{collections::HashMap, sync::Arc};
|
||||
@@ -24,142 +24,35 @@ use crate::{
|
||||
use super::TimelineWriterState;
|
||||
|
||||
/// Provides semantic APIs to manipulate the layer map.
|
||||
pub(crate) enum LayerManager {
|
||||
/// Open as in not shutdown layer manager; we still have in-memory layers and we can manipulate
|
||||
/// the layers.
|
||||
Open(OpenLayerManager),
|
||||
/// Shutdown layer manager where there are no more in-memory layers and persistent layers are
|
||||
/// read-only.
|
||||
Closed {
|
||||
layers: HashMap<PersistentLayerKey, Layer>,
|
||||
},
|
||||
}
|
||||
|
||||
impl Default for LayerManager {
|
||||
fn default() -> Self {
|
||||
LayerManager::Open(OpenLayerManager::default())
|
||||
}
|
||||
#[derive(Default)]
|
||||
pub(crate) struct LayerManager {
|
||||
layer_map: LayerMap,
|
||||
layer_fmgr: LayerFileManager<Layer>,
|
||||
}
|
||||
|
||||
impl LayerManager {
|
||||
pub(crate) fn get_from_key(&self, key: &PersistentLayerKey) -> Layer {
|
||||
// The assumption for the `expect()` is that all code maintains the following invariant:
|
||||
// A layer's descriptor is present in the LayerMap => the LayerFileManager contains a layer for the descriptor.
|
||||
self.layers()
|
||||
.get(key)
|
||||
.with_context(|| format!("get layer from key: {key}"))
|
||||
.expect("not found")
|
||||
.clone()
|
||||
}
|
||||
|
||||
pub(crate) fn get_from_desc(&self, desc: &PersistentLayerDesc) -> Layer {
|
||||
self.get_from_key(&desc.key())
|
||||
self.layer_fmgr.get_from_desc(desc)
|
||||
}
|
||||
|
||||
/// Get an immutable reference to the layer map.
|
||||
///
|
||||
/// We expect users only to be able to get an immutable layer map. If users want to make modifications,
|
||||
/// they should use the below semantic APIs. This design makes us step closer to immutable storage state.
|
||||
pub(crate) fn layer_map(&self) -> Result<&LayerMap, Shutdown> {
|
||||
use LayerManager::*;
|
||||
match self {
|
||||
Open(OpenLayerManager { layer_map, .. }) => Ok(layer_map),
|
||||
Closed { .. } => Err(Shutdown),
|
||||
}
|
||||
pub(crate) fn layer_map(&self) -> &LayerMap {
|
||||
&self.layer_map
|
||||
}
|
||||
|
||||
pub(crate) fn open_mut(&mut self) -> Result<&mut OpenLayerManager, Shutdown> {
|
||||
use LayerManager::*;
|
||||
|
||||
match self {
|
||||
Open(open) => Ok(open),
|
||||
Closed { .. } => Err(Shutdown),
|
||||
}
|
||||
}
|
||||
|
||||
/// LayerManager shutdown. The in-memory layers do cleanup on drop, so we must drop them in
|
||||
/// order to allow shutdown to complete.
|
||||
///
|
||||
/// If there was a want to flush in-memory layers, it must have happened earlier.
|
||||
pub(crate) fn shutdown(&mut self, writer_state: &mut Option<TimelineWriterState>) {
|
||||
use LayerManager::*;
|
||||
match self {
|
||||
Open(OpenLayerManager {
|
||||
layer_map,
|
||||
layer_fmgr: LayerFileManager(hashmap),
|
||||
}) => {
|
||||
let open = layer_map.open_layer.take();
|
||||
let frozen = layer_map.frozen_layers.len();
|
||||
let taken_writer_state = writer_state.take();
|
||||
tracing::info!(open = open.is_some(), frozen, "dropped inmemory layers");
|
||||
let layers = std::mem::take(hashmap);
|
||||
*self = Closed { layers };
|
||||
assert_eq!(open.is_some(), taken_writer_state.is_some());
|
||||
}
|
||||
Closed { .. } => {
|
||||
tracing::debug!("ignoring multiple shutdowns on layer manager")
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Sum up the historic layer sizes
|
||||
pub(crate) fn layer_size_sum(&self) -> u64 {
|
||||
self.layers()
|
||||
.values()
|
||||
.map(|l| l.layer_desc().file_size)
|
||||
.sum()
|
||||
}
|
||||
|
||||
pub(crate) fn likely_resident_layers(&self) -> impl Iterator<Item = &'_ Layer> + '_ {
|
||||
self.layers().values().filter(|l| l.is_likely_resident())
|
||||
}
|
||||
|
||||
pub(crate) fn contains(&self, layer: &Layer) -> bool {
|
||||
self.contains_key(&layer.layer_desc().key())
|
||||
}
|
||||
|
||||
pub(crate) fn contains_key(&self, key: &PersistentLayerKey) -> bool {
|
||||
self.layers().contains_key(key)
|
||||
}
|
||||
|
||||
pub(crate) fn all_persistent_layers(&self) -> Vec<PersistentLayerKey> {
|
||||
self.layers().keys().cloned().collect_vec()
|
||||
}
|
||||
|
||||
fn layers(&self) -> &HashMap<PersistentLayerKey, Layer> {
|
||||
use LayerManager::*;
|
||||
match self {
|
||||
Open(OpenLayerManager { layer_fmgr, .. }) => &layer_fmgr.0,
|
||||
Closed { layers } => layers,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Default)]
|
||||
pub(crate) struct OpenLayerManager {
|
||||
layer_map: LayerMap,
|
||||
layer_fmgr: LayerFileManager<Layer>,
|
||||
}
|
||||
|
||||
impl std::fmt::Debug for OpenLayerManager {
|
||||
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||
f.debug_struct("OpenLayerManager")
|
||||
.field("layer_count", &self.layer_fmgr.0.len())
|
||||
.finish()
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Debug, thiserror::Error)]
|
||||
#[error("layer manager has been shutdown")]
|
||||
pub(crate) struct Shutdown;
|
||||
|
||||
impl OpenLayerManager {
|
||||
/// Called from `load_layer_map`. Initialize the layer manager with:
|
||||
/// 1. all on-disk layers
|
||||
/// 2. next open layer (with disk disk_consistent_lsn LSN)
|
||||
pub(crate) fn initialize_local_layers(&mut self, layers: Vec<Layer>, next_open_layer_at: Lsn) {
|
||||
pub(crate) fn initialize_local_layers(
|
||||
&mut self,
|
||||
on_disk_layers: Vec<Layer>,
|
||||
next_open_layer_at: Lsn,
|
||||
) {
|
||||
let mut updates = self.layer_map.batch_update();
|
||||
for layer in layers {
|
||||
for layer in on_disk_layers {
|
||||
Self::insert_historic_layer(layer, &mut updates, &mut self.layer_fmgr);
|
||||
}
|
||||
updates.flush();
|
||||
@@ -171,19 +64,26 @@ impl OpenLayerManager {
|
||||
self.layer_map.next_open_layer_at = Some(next_open_layer_at);
|
||||
}
|
||||
|
||||
/// Open a new writable layer to append data if there is no open layer, otherwise return the
|
||||
/// current open layer, called within `get_layer_for_write`.
|
||||
/// Open a new writable layer to append data if there is no open layer, otherwise return the current open layer,
|
||||
/// called within `get_layer_for_write`.
|
||||
pub(crate) async fn get_layer_for_write(
|
||||
&mut self,
|
||||
lsn: Lsn,
|
||||
last_record_lsn: Lsn,
|
||||
conf: &'static PageServerConf,
|
||||
timeline_id: TimelineId,
|
||||
tenant_shard_id: TenantShardId,
|
||||
gate_guard: utils::sync::gate::GateGuard,
|
||||
ctx: &RequestContext,
|
||||
) -> anyhow::Result<Arc<InMemoryLayer>> {
|
||||
) -> Result<Arc<InMemoryLayer>> {
|
||||
ensure!(lsn.is_aligned());
|
||||
|
||||
ensure!(
|
||||
lsn > last_record_lsn,
|
||||
"cannot modify relation after advancing last_record_lsn (incoming_lsn={}, last_record_lsn={})",
|
||||
lsn,
|
||||
last_record_lsn,
|
||||
);
|
||||
|
||||
// Do we have a layer open for writing already?
|
||||
let layer = if let Some(open_layer) = &self.layer_map.open_layer {
|
||||
if open_layer.get_lsn_range().start > lsn {
|
||||
@@ -209,15 +109,8 @@ impl OpenLayerManager {
|
||||
lsn
|
||||
);
|
||||
|
||||
let new_layer = InMemoryLayer::create(
|
||||
conf,
|
||||
timeline_id,
|
||||
tenant_shard_id,
|
||||
start_lsn,
|
||||
gate_guard,
|
||||
ctx,
|
||||
)
|
||||
.await?;
|
||||
let new_layer =
|
||||
InMemoryLayer::create(conf, timeline_id, tenant_shard_id, start_lsn, ctx).await?;
|
||||
let layer = Arc::new(new_layer);
|
||||
|
||||
self.layer_map.open_layer = Some(layer.clone());
|
||||
@@ -271,7 +164,7 @@ impl OpenLayerManager {
|
||||
froze
|
||||
}
|
||||
|
||||
/// Add image layers to the layer map, called from [`super::Timeline::create_image_layers`].
|
||||
/// Add image layers to the layer map, called from `create_image_layers`.
|
||||
pub(crate) fn track_new_image_layers(
|
||||
&mut self,
|
||||
image_layers: &[ResidentLayer],
|
||||
@@ -344,7 +237,7 @@ impl OpenLayerManager {
|
||||
self.finish_compact_l0(compact_from, compact_to, metrics)
|
||||
}
|
||||
|
||||
/// Called post-compaction when some previous generation image layers were trimmed.
|
||||
/// Called when compaction is completed.
|
||||
pub(crate) fn rewrite_layers(
|
||||
&mut self,
|
||||
rewrite_layers: &[(Layer, ResidentLayer)],
|
||||
@@ -362,10 +255,13 @@ impl OpenLayerManager {
|
||||
new_layer.layer_desc().lsn_range
|
||||
);
|
||||
|
||||
// Transfer visibility hint from old to new layer, since the new layer covers the same key space. This is not guaranteed to
|
||||
// Transfer visibilty hint from old to new layer, since the new layer covers the same key space. This is not guaranteed to
|
||||
// be accurate (as the new layer may cover a different subset of the key range), but is a sensible default, and prevents
|
||||
// always marking rewritten layers as visible.
|
||||
new_layer.as_ref().set_visibility(old_layer.visibility());
|
||||
new_layer
|
||||
.as_ref()
|
||||
.access_stats()
|
||||
.set_visibility(old_layer.access_stats().visibility());
|
||||
|
||||
// Safety: we may never rewrite the same file in-place. Callers are responsible
|
||||
// for ensuring that they only rewrite layers after something changes the path,
|
||||
@@ -433,6 +329,31 @@ impl OpenLayerManager {
|
||||
mapping.remove(layer);
|
||||
layer.delete_on_drop();
|
||||
}
|
||||
|
||||
pub(crate) fn likely_resident_layers(&self) -> impl Iterator<Item = Layer> + '_ {
|
||||
// for small layer maps, we most likely have all resident, but for larger more are likely
|
||||
// to be evicted assuming lots of layers correlated with longer lifespan.
|
||||
|
||||
self.layer_map().iter_historic_layers().filter_map(|desc| {
|
||||
self.layer_fmgr
|
||||
.0
|
||||
.get(&desc.key())
|
||||
.filter(|l| l.is_likely_resident())
|
||||
.cloned()
|
||||
})
|
||||
}
|
||||
|
||||
pub(crate) fn contains(&self, layer: &Layer) -> bool {
|
||||
self.layer_fmgr.contains(layer)
|
||||
}
|
||||
|
||||
pub(crate) fn contains_key(&self, key: &PersistentLayerKey) -> bool {
|
||||
self.layer_fmgr.contains_key(key)
|
||||
}
|
||||
|
||||
pub(crate) fn all_persistent_layers(&self) -> Vec<PersistentLayerKey> {
|
||||
self.layer_fmgr.0.keys().cloned().collect_vec()
|
||||
}
|
||||
}
|
||||
|
||||
pub(crate) struct LayerFileManager<T>(HashMap<PersistentLayerKey, T>);
|
||||
@@ -444,6 +365,20 @@ impl<T> Default for LayerFileManager<T> {
|
||||
}
|
||||
|
||||
impl<T: AsLayerDesc + Clone> LayerFileManager<T> {
|
||||
fn get_from_desc(&self, desc: &PersistentLayerDesc) -> T {
|
||||
// The assumption for the `expect()` is that all code maintains the following invariant:
|
||||
// A layer's descriptor is present in the LayerMap => the LayerFileManager contains a layer for the descriptor.
|
||||
self.0
|
||||
.get(&desc.key())
|
||||
.with_context(|| format!("get layer from desc: {}", desc.layer_name()))
|
||||
.expect("not found")
|
||||
.clone()
|
||||
}
|
||||
|
||||
fn contains_key(&self, key: &PersistentLayerKey) -> bool {
|
||||
self.0.contains_key(key)
|
||||
}
|
||||
|
||||
pub(crate) fn insert(&mut self, layer: T) {
|
||||
let present = self.0.insert(layer.layer_desc().key(), layer.clone());
|
||||
if present.is_some() && cfg!(debug_assertions) {
|
||||
@@ -451,6 +386,10 @@ impl<T: AsLayerDesc + Clone> LayerFileManager<T> {
|
||||
}
|
||||
}
|
||||
|
||||
pub(crate) fn contains(&self, layer: &T) -> bool {
|
||||
self.0.contains_key(&layer.layer_desc().key())
|
||||
}
|
||||
|
||||
pub(crate) fn remove(&mut self, layer: &T) {
|
||||
let present = self.0.remove(&layer.layer_desc().key());
|
||||
if present.is_none() && cfg!(debug_assertions) {
|
||||
|
||||
@@ -122,10 +122,6 @@ impl CurrentLogicalSize {
|
||||
Self::Exact(_) => Accuracy::Exact,
|
||||
}
|
||||
}
|
||||
|
||||
pub(crate) fn is_exact(&self) -> bool {
|
||||
matches!(self, Self::Exact(_))
|
||||
}
|
||||
}
|
||||
|
||||
impl LogicalSize {
|
||||
|
||||
@@ -30,12 +30,10 @@ use tokio::time::Instant;
|
||||
pub use pageserver_api::models::virtual_file as api;
|
||||
pub(crate) mod io_engine;
|
||||
pub use io_engine::feature_test as io_engine_feature_test;
|
||||
pub use io_engine::io_engine_for_bench;
|
||||
pub use io_engine::FeatureTestResult as IoEngineFeatureTestResult;
|
||||
mod metadata;
|
||||
mod open_options;
|
||||
use self::owned_buffers_io::write::OwnedAsyncWriter;
|
||||
pub(crate) use api::DirectIoMode;
|
||||
pub(crate) use io_engine::IoEngineKind;
|
||||
pub(crate) use metadata::Metadata;
|
||||
pub(crate) use open_options::*;
|
||||
|
||||
@@ -328,29 +328,3 @@ pub fn feature_test() -> anyhow::Result<FeatureTestResult> {
|
||||
.join()
|
||||
.unwrap()
|
||||
}
|
||||
|
||||
/// For use in benchmark binaries only.
|
||||
///
|
||||
/// Benchmarks which initialize `virtual_file` need to know what engine to use, but we also
|
||||
/// don't want to silently fall back to slower I/O engines in a benchmark: this could waste
|
||||
/// developer time trying to figure out why it's slow.
|
||||
///
|
||||
/// In practice, this method will either return IoEngineKind::TokioEpollUring, or panic.
|
||||
pub fn io_engine_for_bench() -> IoEngineKind {
|
||||
#[cfg(not(target_os = "linux"))]
|
||||
{
|
||||
panic!("This benchmark does I/O and can only give a representative result on Linux");
|
||||
}
|
||||
#[cfg(target_os = "linux")]
|
||||
{
|
||||
match feature_test().unwrap() {
|
||||
FeatureTestResult::PlatformPreferred(engine) => engine,
|
||||
FeatureTestResult::Worse {
|
||||
engine: _engine,
|
||||
remark,
|
||||
} => {
|
||||
panic!("This benchmark does I/O can requires the preferred I/O engine: {remark}");
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -618,7 +618,7 @@ impl WalIngest {
|
||||
// the offsets array is omitted if XLOG_HEAP_INIT_PAGE is set
|
||||
0
|
||||
} else {
|
||||
size_of::<u16>() * xlrec.ntuples as usize
|
||||
std::mem::size_of::<u16>() * xlrec.ntuples as usize
|
||||
};
|
||||
assert_eq!(offset_array_len, buf.remaining());
|
||||
|
||||
@@ -685,7 +685,7 @@ impl WalIngest {
|
||||
// the offsets array is omitted if XLOG_HEAP_INIT_PAGE is set
|
||||
0
|
||||
} else {
|
||||
size_of::<u16>() * xlrec.ntuples as usize
|
||||
std::mem::size_of::<u16>() * xlrec.ntuples as usize
|
||||
};
|
||||
assert_eq!(offset_array_len, buf.remaining());
|
||||
|
||||
@@ -752,7 +752,7 @@ impl WalIngest {
|
||||
// the offsets array is omitted if XLOG_HEAP_INIT_PAGE is set
|
||||
0
|
||||
} else {
|
||||
size_of::<u16>() * xlrec.ntuples as usize
|
||||
std::mem::size_of::<u16>() * xlrec.ntuples as usize
|
||||
};
|
||||
assert_eq!(offset_array_len, buf.remaining());
|
||||
|
||||
@@ -920,7 +920,7 @@ impl WalIngest {
|
||||
// the offsets array is omitted if XLOG_HEAP_INIT_PAGE is set
|
||||
0
|
||||
} else {
|
||||
size_of::<u16>() * xlrec.ntuples as usize
|
||||
std::mem::size_of::<u16>() * xlrec.ntuples as usize
|
||||
};
|
||||
assert_eq!(offset_array_len, buf.remaining());
|
||||
|
||||
|
||||
@@ -241,9 +241,6 @@ impl PostgresRedoManager {
|
||||
|
||||
/// Shut down the WAL redo manager.
|
||||
///
|
||||
/// Returns `true` if this call was the one that initiated shutdown.
|
||||
/// `true` may be observed by no caller if the first caller stops polling.
|
||||
///
|
||||
/// After this future completes
|
||||
/// - no redo process is running
|
||||
/// - no new redo process will be spawned
|
||||
@@ -253,32 +250,22 @@ impl PostgresRedoManager {
|
||||
/// # Cancel-Safety
|
||||
///
|
||||
/// This method is cancellation-safe.
|
||||
pub async fn shutdown(&self) -> bool {
|
||||
pub async fn shutdown(&self) {
|
||||
// prevent new processes from being spawned
|
||||
let maybe_permit = match self.redo_process.get_or_init_detached().await {
|
||||
let permit = match self.redo_process.get_or_init_detached().await {
|
||||
Ok(guard) => {
|
||||
if matches!(&*guard, ProcessOnceCell::ManagerShutDown) {
|
||||
None
|
||||
} else {
|
||||
let (proc, permit) = guard.take_and_deinit();
|
||||
drop(proc); // this just drops the Arc, its refcount may not be zero yet
|
||||
Some(permit)
|
||||
}
|
||||
let (proc, permit) = guard.take_and_deinit();
|
||||
drop(proc); // this just drops the Arc, its refcount may not be zero yet
|
||||
permit
|
||||
}
|
||||
Err(permit) => Some(permit),
|
||||
};
|
||||
let it_was_us = if let Some(permit) = maybe_permit {
|
||||
self.redo_process
|
||||
.set(ProcessOnceCell::ManagerShutDown, permit);
|
||||
true
|
||||
} else {
|
||||
false
|
||||
Err(permit) => permit,
|
||||
};
|
||||
self.redo_process
|
||||
.set(ProcessOnceCell::ManagerShutDown, permit);
|
||||
// wait for ongoing requests to drain and the refcounts of all Arc<WalRedoProcess> that
|
||||
// we ever launched to drop to zero, which when it happens synchronously kill()s & wait()s
|
||||
// for the underlying process.
|
||||
self.launched_processes.close().await;
|
||||
it_was_us
|
||||
}
|
||||
|
||||
/// This type doesn't have its own background task to check for idleness: we
|
||||
|
||||
@@ -1,7 +0,0 @@
|
||||
|
||||
# This was captured from one shard of a large tenant in staging.
|
||||
|
||||
# It has a mixture of deltas and image layers, >1000 layers in total.
|
||||
|
||||
# This is suitable for general smoke tests that want an index which is not
|
||||
# trivially small, but doesn't contain weird/pathological cases.
|
||||
File diff suppressed because one or more lines are too long
167
poetry.lock
generated
167
poetry.lock
generated
@@ -870,96 +870,6 @@ files = [
|
||||
[package.dependencies]
|
||||
colorama = {version = "*", markers = "platform_system == \"Windows\""}
|
||||
|
||||
[[package]]
|
||||
name = "clickhouse-connect"
|
||||
version = "0.7.17"
|
||||
description = "ClickHouse Database Core Driver for Python, Pandas, and Superset"
|
||||
optional = false
|
||||
python-versions = "~=3.8"
|
||||
files = [
|
||||
{file = "clickhouse-connect-0.7.17.tar.gz", hash = "sha256:854f1f9f3e024e7f89ae5d57cd3289d7a4c3dc91a9f24c4d233014f0ea19cb2d"},
|
||||
{file = "clickhouse_connect-0.7.17-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:aca36f5f28be1ada2981fce87724bbf451f267c918015baec59e527de3c9c882"},
|
||||
{file = "clickhouse_connect-0.7.17-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:66209e4634f457604c263bea176336079d26c284e251e68a8435b0b80c1a25ff"},
|
||||
{file = "clickhouse_connect-0.7.17-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:e4d86c5a561a2a99321c8b4af22257461b8e67142f34cfea6e70f39b45b1f406"},
|
||||
{file = "clickhouse_connect-0.7.17-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d200c9afa2725a96f9f3718221f641276b80c11bf504d8a2fbaafb5a05b2f0d3"},
|
||||
{file = "clickhouse_connect-0.7.17-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:004d867b1005445a46e6742db1054bf2a717a451372663b46e09b5e9e90a31e3"},
|
||||
{file = "clickhouse_connect-0.7.17-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:4ef94a4a8e008882259151833c3c47cfbb9c8f08de0f100aaf3b95c366dcfb24"},
|
||||
{file = "clickhouse_connect-0.7.17-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:ee732c3df50c8b07d16b5836ff85e6b84569922455c03837c3add5cf1388fe1f"},
|
||||
{file = "clickhouse_connect-0.7.17-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:d9dbe1235465bb946e24b90b0ca5b8800b5d645acb2d7d6ee819448c3e2fd959"},
|
||||
{file = "clickhouse_connect-0.7.17-cp310-cp310-win32.whl", hash = "sha256:e5db0d68dfb63db0297d44dc91406bcfd7d333708d7cd55086c8550fbf870b78"},
|
||||
{file = "clickhouse_connect-0.7.17-cp310-cp310-win_amd64.whl", hash = "sha256:800750f568c097ea312887785025006d6098bffd8ed2dd6a57048fb3ced6d778"},
|
||||
{file = "clickhouse_connect-0.7.17-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:4eb390623b3d15dc9cda78f5c68f83ef9ad11743797e70af8fabc384b015a73c"},
|
||||
{file = "clickhouse_connect-0.7.17-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:35f172ca950f218f63072024c81d5b4ff6e5399620c255506c321ccc7b17c9a5"},
|
||||
{file = "clickhouse_connect-0.7.17-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:ae7918f060f7576fc931c692e0122b1b07576fabd81444af22e1f8582300d200"},
|
||||
{file = "clickhouse_connect-0.7.17-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ff2881b93c7a1afb9c99fb59ad5fd666850421325d0931e2b77f3f4ba872303d"},
|
||||
{file = "clickhouse_connect-0.7.17-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:8a4d9b4f97271addf66aadbaf7f154f19a0ad6c22026d575a995c55ebd8576db"},
|
||||
{file = "clickhouse_connect-0.7.17-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:e431469b1ff2d5c3e4c406d55c6afdf7102f5d2524c2ceb5481b94ac24412aa3"},
|
||||
{file = "clickhouse_connect-0.7.17-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:2b6f80115176559f181a6b3ecad11aa3d70ef6014c3d2905b90fcef3f27d25c2"},
|
||||
{file = "clickhouse_connect-0.7.17-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:d8ac694f40dfafc8a3cc877116b4bc73e8877ebf66d4d96ee092484ee4c0b481"},
|
||||
{file = "clickhouse_connect-0.7.17-cp311-cp311-win32.whl", hash = "sha256:78b7a3f6b0fad4eaf8afb5f9a2e855bde53e82ea5804960e9cf779538f4606a1"},
|
||||
{file = "clickhouse_connect-0.7.17-cp311-cp311-win_amd64.whl", hash = "sha256:efd390cc045334ecc3f2a9c18cc07c041d0288b145967805fdcab65abeefa75f"},
|
||||
{file = "clickhouse_connect-0.7.17-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:9228334a17dc0a7842222f54ba5b89fc563532424aad4f66be799df70ab37e9f"},
|
||||
{file = "clickhouse_connect-0.7.17-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:e432a42bb788bda77e88eda2774392a60fbbb5ee2a79cb2881d182d26c45fe49"},
|
||||
{file = "clickhouse_connect-0.7.17-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:c85152ed2879965ee1fa2bd5e31fb27d281fd5f50d6e86a401efd95cd85b29ef"},
|
||||
{file = "clickhouse_connect-0.7.17-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:29a126104aa5e11df570cbd89fca4988784084602ba77d17b2396b334c54fd75"},
|
||||
{file = "clickhouse_connect-0.7.17-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:882d8f9570549258e6eb6a97915fbf64ed29fe395d5e360866ea8d42c8283a35"},
|
||||
{file = "clickhouse_connect-0.7.17-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:06ebf99111171442f462fb8b357364c3e276da3e8f8557b2e8fee9eb55ab37d1"},
|
||||
{file = "clickhouse_connect-0.7.17-cp312-cp312-musllinux_1_1_i686.whl", hash = "sha256:e0cf6f99b2777b0d164bf8b65ec39104cdc0789a56bcb52d98289bbd6f5cc70e"},
|
||||
{file = "clickhouse_connect-0.7.17-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:ee46c508fddfff3b7ac52326788e0c6dd8dfb416b6d7e02e5d30e8110749dac2"},
|
||||
{file = "clickhouse_connect-0.7.17-cp312-cp312-win32.whl", hash = "sha256:eb708b590a37d56b069a6088254ffa55d73b8cb65527339df81ef03fe67ffdec"},
|
||||
{file = "clickhouse_connect-0.7.17-cp312-cp312-win_amd64.whl", hash = "sha256:17f00dccddaeaf43733faa1fa21f7d24641454a73669fda862545ba7c88627f5"},
|
||||
{file = "clickhouse_connect-0.7.17-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:ab5d4b37a6dcc39e94c63beac0f22d9dda914f5eb865d166c64cf04dfadb7d16"},
|
||||
{file = "clickhouse_connect-0.7.17-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:32aa90387f45f34cbc5a984789ed4c12760a3c0056c190ab0123ceafc36b1002"},
|
||||
{file = "clickhouse_connect-0.7.17-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:21277b6bdd6c8ff14170bfcd52125c5c39f442ec4bafbb643ad7d0ca915f0029"},
|
||||
{file = "clickhouse_connect-0.7.17-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ca68d8b7dee3fb4e7229e06152f5b0faaccafb4c87d9c2d48fa5bd117a3cc1c0"},
|
||||
{file = "clickhouse_connect-0.7.17-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:841c56282102b2fba1e0b332bb1c7a0c50992fbc321746af8d3e0e6ca2450e8b"},
|
||||
{file = "clickhouse_connect-0.7.17-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:8d7ffde5a4b95d8fe9ed38e08e504e497310e3d7a17691bd40bf65734648fdfc"},
|
||||
{file = "clickhouse_connect-0.7.17-cp38-cp38-musllinux_1_1_i686.whl", hash = "sha256:055960086b6b92b6e44f5ba04c81c40c10b038588e4b3908b033c99f66125332"},
|
||||
{file = "clickhouse_connect-0.7.17-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:36491fec63ceb8503b6344c23477647030139f346b749dc5ee672c505939dbbe"},
|
||||
{file = "clickhouse_connect-0.7.17-cp38-cp38-win32.whl", hash = "sha256:8779a907e026db32e6bc0bc0c8d5de0e2e3afd166afc2d4adcc0603399af5539"},
|
||||
{file = "clickhouse_connect-0.7.17-cp38-cp38-win_amd64.whl", hash = "sha256:309854fa197885c6278438ddd032ab52e6fec56f162074e343c3635ca7266078"},
|
||||
{file = "clickhouse_connect-0.7.17-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:e8009f94550178dc971aeb4f8787ba7a5b473c22647490428b7229f540a51d2b"},
|
||||
{file = "clickhouse_connect-0.7.17-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:70f8422f407b13a404b3670fd097855abd5adaf890c710d6678d2b46ab61ac48"},
|
||||
{file = "clickhouse_connect-0.7.17-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:082783eb1e8baf7b3465dd045132dc5cb5a91432c899dc4e19891c5f782d8d23"},
|
||||
{file = "clickhouse_connect-0.7.17-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:c1c30aad2a9c7584c4ee19e646a087b3bbd2d4daab3d88a2afeeae1a7f6febf9"},
|
||||
{file = "clickhouse_connect-0.7.17-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:fc8e245a9f4f0dce39f155e626405f60f1d3cf4d1e52dd2c793ea6b603ca111b"},
|
||||
{file = "clickhouse_connect-0.7.17-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:802372cb8a69c9ffdf4260e9f01616c8601ba531825ed6f08834827e0b880cd1"},
|
||||
{file = "clickhouse_connect-0.7.17-cp39-cp39-musllinux_1_1_i686.whl", hash = "sha256:193a60271a3b105cdbde96fb20b40eab8a50fca3bb1f397546f7a18b53d9aa9c"},
|
||||
{file = "clickhouse_connect-0.7.17-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:59d58932916792fdbd09cb961a245a0c2d87b07b8296f9138915b998f4522941"},
|
||||
{file = "clickhouse_connect-0.7.17-cp39-cp39-win32.whl", hash = "sha256:3cfd0edabb589f640636a97ffc38d1b3d760faef208d44e50829cc1ad3f0d3e5"},
|
||||
{file = "clickhouse_connect-0.7.17-cp39-cp39-win_amd64.whl", hash = "sha256:5661b4629aac228481219abf2e149119af1a71d897f191665e182d9d192d7033"},
|
||||
{file = "clickhouse_connect-0.7.17-pp310-pypy310_pp73-macosx_10_9_x86_64.whl", hash = "sha256:7429d309109e7e4a70fd867d69fcfea9ddcb1a1e910caa6b0e2c3776b71f4613"},
|
||||
{file = "clickhouse_connect-0.7.17-pp310-pypy310_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:e5ae619151006da84a0b1585a9bcc81be32459d8061aeb2e116bad5bbaa7d108"},
|
||||
{file = "clickhouse_connect-0.7.17-pp310-pypy310_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ec0c84a0880621cb2389656a89886ef3133f0b3f8dc016eee6f25bbb49ff6f70"},
|
||||
{file = "clickhouse_connect-0.7.17-pp310-pypy310_pp73-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:705464c23f821666b76f8f619cf2870225156276562756b3933aaa24708e0ff8"},
|
||||
{file = "clickhouse_connect-0.7.17-pp310-pypy310_pp73-win_amd64.whl", hash = "sha256:1822016f4b769e89264fe26cefe0bc5e50e4c3ca0747d89bb52d57dc4f1e5ffb"},
|
||||
{file = "clickhouse_connect-0.7.17-pp38-pypy38_pp73-macosx_10_9_x86_64.whl", hash = "sha256:6c92b0c342c1fbfa666010e8175e05026dc570a7ef91d8fa81ce503180f318aa"},
|
||||
{file = "clickhouse_connect-0.7.17-pp38-pypy38_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d2e106536540e906c3c866f8615fcf870a9a77c1bfab9ef4b042febfd2fdb953"},
|
||||
{file = "clickhouse_connect-0.7.17-pp38-pypy38_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:bac9a32e62384b4341ba51a451084eb3b00c6e59aaac1499145dd8b897cb585c"},
|
||||
{file = "clickhouse_connect-0.7.17-pp38-pypy38_pp73-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:0feed93b9912b7862a8c41be1febcd44b68a824a5c1059b19d5c567afdaa6273"},
|
||||
{file = "clickhouse_connect-0.7.17-pp38-pypy38_pp73-win_amd64.whl", hash = "sha256:2e2dd6db52e799f065fd565143fde5a872cfe903de1bee7775bc3a349856a790"},
|
||||
{file = "clickhouse_connect-0.7.17-pp39-pypy39_pp73-macosx_10_9_x86_64.whl", hash = "sha256:ed13add5d579a5960155f3000420544368501c9703d2fb94f103b4a6126081f6"},
|
||||
{file = "clickhouse_connect-0.7.17-pp39-pypy39_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:c257a23ed3bf1858593fb03927d9d073fbbdfa24dc2afee537c3314bd66b4e24"},
|
||||
{file = "clickhouse_connect-0.7.17-pp39-pypy39_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d47866f64cbdc2d5cc4f8a7a8c49e3ee90c9e487091b9eda7c3a3576418e1cbe"},
|
||||
{file = "clickhouse_connect-0.7.17-pp39-pypy39_pp73-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:9b850e2f17e0a0b5a37d996d3fb728050227489d64d271d678d166abea94f26e"},
|
||||
{file = "clickhouse_connect-0.7.17-pp39-pypy39_pp73-win_amd64.whl", hash = "sha256:349682288987dc84ac7695f7cd6b510be8d0ec0eee7c1b72dbf2146b4e9efdb8"},
|
||||
]
|
||||
|
||||
[package.dependencies]
|
||||
certifi = "*"
|
||||
lz4 = "*"
|
||||
pytz = "*"
|
||||
urllib3 = ">=1.26"
|
||||
zstandard = "*"
|
||||
|
||||
[package.extras]
|
||||
arrow = ["pyarrow"]
|
||||
numpy = ["numpy"]
|
||||
orjson = ["orjson"]
|
||||
pandas = ["pandas"]
|
||||
sqlalchemy = ["sqlalchemy (>1.3.21,<2.0)"]
|
||||
tzlocal = ["tzlocal (>=4.0)"]
|
||||
|
||||
[[package]]
|
||||
name = "colorama"
|
||||
version = "0.4.5"
|
||||
@@ -1514,20 +1424,6 @@ files = [
|
||||
[package.dependencies]
|
||||
six = "*"
|
||||
|
||||
[[package]]
|
||||
name = "kafka-python"
|
||||
version = "2.0.2"
|
||||
description = "Pure Python client for Apache Kafka"
|
||||
optional = false
|
||||
python-versions = "*"
|
||||
files = [
|
||||
{file = "kafka-python-2.0.2.tar.gz", hash = "sha256:04dfe7fea2b63726cd6f3e79a2d86e709d608d74406638c5da33a01d45a9d7e3"},
|
||||
{file = "kafka_python-2.0.2-py2.py3-none-any.whl", hash = "sha256:2d92418c7cb1c298fa6c7f0fb3519b520d0d7526ac6cb7ae2a4fc65a51a94b6e"},
|
||||
]
|
||||
|
||||
[package.extras]
|
||||
crc32c = ["crc32c"]
|
||||
|
||||
[[package]]
|
||||
name = "lazy-object-proxy"
|
||||
version = "1.10.0"
|
||||
@@ -1574,56 +1470,6 @@ files = [
|
||||
{file = "lazy_object_proxy-1.10.0-pp310.pp311.pp312.pp38.pp39-none-any.whl", hash = "sha256:80fa48bd89c8f2f456fc0765c11c23bf5af827febacd2f523ca5bc1893fcc09d"},
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "lz4"
|
||||
version = "4.3.3"
|
||||
description = "LZ4 Bindings for Python"
|
||||
optional = false
|
||||
python-versions = ">=3.8"
|
||||
files = [
|
||||
{file = "lz4-4.3.3-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:b891880c187e96339474af2a3b2bfb11a8e4732ff5034be919aa9029484cd201"},
|
||||
{file = "lz4-4.3.3-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:222a7e35137d7539c9c33bb53fcbb26510c5748779364014235afc62b0ec797f"},
|
||||
{file = "lz4-4.3.3-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f76176492ff082657ada0d0f10c794b6da5800249ef1692b35cf49b1e93e8ef7"},
|
||||
{file = "lz4-4.3.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f1d18718f9d78182c6b60f568c9a9cec8a7204d7cb6fad4e511a2ef279e4cb05"},
|
||||
{file = "lz4-4.3.3-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:6cdc60e21ec70266947a48839b437d46025076eb4b12c76bd47f8e5eb8a75dcc"},
|
||||
{file = "lz4-4.3.3-cp310-cp310-win32.whl", hash = "sha256:c81703b12475da73a5d66618856d04b1307e43428a7e59d98cfe5a5d608a74c6"},
|
||||
{file = "lz4-4.3.3-cp310-cp310-win_amd64.whl", hash = "sha256:43cf03059c0f941b772c8aeb42a0813d68d7081c009542301637e5782f8a33e2"},
|
||||
{file = "lz4-4.3.3-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:30e8c20b8857adef7be045c65f47ab1e2c4fabba86a9fa9a997d7674a31ea6b6"},
|
||||
{file = "lz4-4.3.3-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:2f7b1839f795315e480fb87d9bc60b186a98e3e5d17203c6e757611ef7dcef61"},
|
||||
{file = "lz4-4.3.3-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:edfd858985c23523f4e5a7526ca6ee65ff930207a7ec8a8f57a01eae506aaee7"},
|
||||
{file = "lz4-4.3.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0e9c410b11a31dbdc94c05ac3c480cb4b222460faf9231f12538d0074e56c563"},
|
||||
{file = "lz4-4.3.3-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:d2507ee9c99dbddd191c86f0e0c8b724c76d26b0602db9ea23232304382e1f21"},
|
||||
{file = "lz4-4.3.3-cp311-cp311-win32.whl", hash = "sha256:f180904f33bdd1e92967923a43c22899e303906d19b2cf8bb547db6653ea6e7d"},
|
||||
{file = "lz4-4.3.3-cp311-cp311-win_amd64.whl", hash = "sha256:b14d948e6dce389f9a7afc666d60dd1e35fa2138a8ec5306d30cd2e30d36b40c"},
|
||||
{file = "lz4-4.3.3-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:e36cd7b9d4d920d3bfc2369840da506fa68258f7bb176b8743189793c055e43d"},
|
||||
{file = "lz4-4.3.3-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:31ea4be9d0059c00b2572d700bf2c1bc82f241f2c3282034a759c9a4d6ca4dc2"},
|
||||
{file = "lz4-4.3.3-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:33c9a6fd20767ccaf70649982f8f3eeb0884035c150c0b818ea660152cf3c809"},
|
||||
{file = "lz4-4.3.3-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:bca8fccc15e3add173da91be8f34121578dc777711ffd98d399be35487c934bf"},
|
||||
{file = "lz4-4.3.3-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:e7d84b479ddf39fe3ea05387f10b779155fc0990125f4fb35d636114e1c63a2e"},
|
||||
{file = "lz4-4.3.3-cp312-cp312-win32.whl", hash = "sha256:337cb94488a1b060ef1685187d6ad4ba8bc61d26d631d7ba909ee984ea736be1"},
|
||||
{file = "lz4-4.3.3-cp312-cp312-win_amd64.whl", hash = "sha256:5d35533bf2cee56f38ced91f766cd0038b6abf46f438a80d50c52750088be93f"},
|
||||
{file = "lz4-4.3.3-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:363ab65bf31338eb364062a15f302fc0fab0a49426051429866d71c793c23394"},
|
||||
{file = "lz4-4.3.3-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:0a136e44a16fc98b1abc404fbabf7f1fada2bdab6a7e970974fb81cf55b636d0"},
|
||||
{file = "lz4-4.3.3-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:abc197e4aca8b63f5ae200af03eb95fb4b5055a8f990079b5bdf042f568469dd"},
|
||||
{file = "lz4-4.3.3-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:56f4fe9c6327adb97406f27a66420b22ce02d71a5c365c48d6b656b4aaeb7775"},
|
||||
{file = "lz4-4.3.3-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:f0e822cd7644995d9ba248cb4b67859701748a93e2ab7fc9bc18c599a52e4604"},
|
||||
{file = "lz4-4.3.3-cp38-cp38-win32.whl", hash = "sha256:24b3206de56b7a537eda3a8123c644a2b7bf111f0af53bc14bed90ce5562d1aa"},
|
||||
{file = "lz4-4.3.3-cp38-cp38-win_amd64.whl", hash = "sha256:b47839b53956e2737229d70714f1d75f33e8ac26e52c267f0197b3189ca6de24"},
|
||||
{file = "lz4-4.3.3-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:6756212507405f270b66b3ff7f564618de0606395c0fe10a7ae2ffcbbe0b1fba"},
|
||||
{file = "lz4-4.3.3-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:ee9ff50557a942d187ec85462bb0960207e7ec5b19b3b48949263993771c6205"},
|
||||
{file = "lz4-4.3.3-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:2b901c7784caac9a1ded4555258207d9e9697e746cc8532129f150ffe1f6ba0d"},
|
||||
{file = "lz4-4.3.3-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:b6d9ec061b9eca86e4dcc003d93334b95d53909afd5a32c6e4f222157b50c071"},
|
||||
{file = "lz4-4.3.3-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:f4c7bf687303ca47d69f9f0133274958fd672efaa33fb5bcde467862d6c621f0"},
|
||||
{file = "lz4-4.3.3-cp39-cp39-win32.whl", hash = "sha256:054b4631a355606e99a42396f5db4d22046a3397ffc3269a348ec41eaebd69d2"},
|
||||
{file = "lz4-4.3.3-cp39-cp39-win_amd64.whl", hash = "sha256:eac9af361e0d98335a02ff12fb56caeb7ea1196cf1a49dbf6f17828a131da807"},
|
||||
{file = "lz4-4.3.3.tar.gz", hash = "sha256:01fe674ef2889dbb9899d8a67361e0c4a2c833af5aeb37dd505727cf5d2a131e"},
|
||||
]
|
||||
|
||||
[package.extras]
|
||||
docs = ["sphinx (>=1.6.0)", "sphinx-bootstrap-theme"]
|
||||
flake8 = ["flake8"]
|
||||
tests = ["psutil", "pytest (!=3.3.0)", "pytest-cov"]
|
||||
|
||||
[[package]]
|
||||
name = "markupsafe"
|
||||
version = "2.1.1"
|
||||
@@ -2515,17 +2361,6 @@ files = [
|
||||
[package.dependencies]
|
||||
six = ">=1.5"
|
||||
|
||||
[[package]]
|
||||
name = "pytz"
|
||||
version = "2024.1"
|
||||
description = "World timezone definitions, modern and historical"
|
||||
optional = false
|
||||
python-versions = "*"
|
||||
files = [
|
||||
{file = "pytz-2024.1-py2.py3-none-any.whl", hash = "sha256:328171f4e3623139da4983451950b28e95ac706e13f3f2630a879749e7a8b319"},
|
||||
{file = "pytz-2024.1.tar.gz", hash = "sha256:2a29735ea9c18baf14b448846bde5a48030ed267578472d8955cd0e7443a9812"},
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "pywin32"
|
||||
version = "301"
|
||||
@@ -3371,4 +3206,4 @@ cffi = ["cffi (>=1.11)"]
|
||||
[metadata]
|
||||
lock-version = "2.0"
|
||||
python-versions = "^3.9"
|
||||
content-hash = "d569a3593b98baceb0a88e176bdad63cae99d6bfc2a81bf6741663a4abcafd72"
|
||||
content-hash = "16ebd6a46768be7f67dbdb4ee5903b167d94edc9965f29252f038c67e9e907b0"
|
||||
|
||||
@@ -92,7 +92,6 @@ tracing-opentelemetry.workspace = true
|
||||
tracing-subscriber.workspace = true
|
||||
tracing-utils.workspace = true
|
||||
tracing.workspace = true
|
||||
try-lock.workspace = true
|
||||
typed-json.workspace = true
|
||||
url.workspace = true
|
||||
urlencoding.workspace = true
|
||||
|
||||
@@ -218,7 +218,7 @@ impl RateBucketInfo {
|
||||
impl AuthenticationConfig {
|
||||
pub fn check_rate_limit(
|
||||
&self,
|
||||
ctx: &RequestMonitoring,
|
||||
ctx: &mut RequestMonitoring,
|
||||
config: &AuthenticationConfig,
|
||||
secret: AuthSecret,
|
||||
endpoint: &EndpointId,
|
||||
@@ -243,7 +243,7 @@ impl AuthenticationConfig {
|
||||
let limit_not_exceeded = self.rate_limiter.check(
|
||||
(
|
||||
endpoint_int,
|
||||
MaskedIp::new(ctx.peer_addr(), config.rate_limit_ip_subnet),
|
||||
MaskedIp::new(ctx.peer_addr, config.rate_limit_ip_subnet),
|
||||
),
|
||||
password_weight,
|
||||
);
|
||||
@@ -274,7 +274,7 @@ impl AuthenticationConfig {
|
||||
///
|
||||
/// All authentication flows will emit an AuthenticationOk message if successful.
|
||||
async fn auth_quirks(
|
||||
ctx: &RequestMonitoring,
|
||||
ctx: &mut RequestMonitoring,
|
||||
api: &impl console::Api,
|
||||
user_info: ComputeUserInfoMaybeEndpoint,
|
||||
client: &mut stream::PqStream<Stream<impl AsyncRead + AsyncWrite + Unpin>>,
|
||||
@@ -303,8 +303,8 @@ async fn auth_quirks(
|
||||
let (allowed_ips, maybe_secret) = api.get_allowed_ips_and_secret(ctx, &info).await?;
|
||||
|
||||
// check allowed list
|
||||
if !check_peer_addr_is_in_list(&ctx.peer_addr(), &allowed_ips) {
|
||||
return Err(auth::AuthError::ip_address_not_allowed(ctx.peer_addr()));
|
||||
if !check_peer_addr_is_in_list(&ctx.peer_addr, &allowed_ips) {
|
||||
return Err(auth::AuthError::ip_address_not_allowed(ctx.peer_addr));
|
||||
}
|
||||
|
||||
if !endpoint_rate_limiter.check(info.endpoint.clone().into(), 1) {
|
||||
@@ -356,7 +356,7 @@ async fn auth_quirks(
|
||||
}
|
||||
|
||||
async fn authenticate_with_secret(
|
||||
ctx: &RequestMonitoring,
|
||||
ctx: &mut RequestMonitoring,
|
||||
secret: AuthSecret,
|
||||
info: ComputeUserInfo,
|
||||
client: &mut stream::PqStream<Stream<impl AsyncRead + AsyncWrite + Unpin>>,
|
||||
@@ -421,7 +421,7 @@ impl<'a> BackendType<'a, ComputeUserInfoMaybeEndpoint, &()> {
|
||||
#[tracing::instrument(fields(allow_cleartext = allow_cleartext), skip_all)]
|
||||
pub async fn authenticate(
|
||||
self,
|
||||
ctx: &RequestMonitoring,
|
||||
ctx: &mut RequestMonitoring,
|
||||
client: &mut stream::PqStream<Stream<impl AsyncRead + AsyncWrite + Unpin>>,
|
||||
allow_cleartext: bool,
|
||||
config: &'static AuthenticationConfig,
|
||||
@@ -467,7 +467,7 @@ impl<'a> BackendType<'a, ComputeUserInfoMaybeEndpoint, &()> {
|
||||
impl BackendType<'_, ComputeUserInfo, &()> {
|
||||
pub async fn get_role_secret(
|
||||
&self,
|
||||
ctx: &RequestMonitoring,
|
||||
ctx: &mut RequestMonitoring,
|
||||
) -> Result<CachedRoleSecret, GetAuthInfoError> {
|
||||
use BackendType::*;
|
||||
match self {
|
||||
@@ -478,7 +478,7 @@ impl BackendType<'_, ComputeUserInfo, &()> {
|
||||
|
||||
pub async fn get_allowed_ips_and_secret(
|
||||
&self,
|
||||
ctx: &RequestMonitoring,
|
||||
ctx: &mut RequestMonitoring,
|
||||
) -> Result<(CachedAllowedIps, Option<CachedRoleSecret>), GetAuthInfoError> {
|
||||
use BackendType::*;
|
||||
match self {
|
||||
@@ -492,7 +492,7 @@ impl BackendType<'_, ComputeUserInfo, &()> {
|
||||
impl ComputeConnectBackend for BackendType<'_, ComputeCredentials, NodeInfo> {
|
||||
async fn wake_compute(
|
||||
&self,
|
||||
ctx: &RequestMonitoring,
|
||||
ctx: &mut RequestMonitoring,
|
||||
) -> Result<CachedNodeInfo, console::errors::WakeComputeError> {
|
||||
use BackendType::*;
|
||||
|
||||
@@ -514,7 +514,7 @@ impl ComputeConnectBackend for BackendType<'_, ComputeCredentials, NodeInfo> {
|
||||
impl ComputeConnectBackend for BackendType<'_, ComputeCredentials, &()> {
|
||||
async fn wake_compute(
|
||||
&self,
|
||||
ctx: &RequestMonitoring,
|
||||
ctx: &mut RequestMonitoring,
|
||||
) -> Result<CachedNodeInfo, console::errors::WakeComputeError> {
|
||||
use BackendType::*;
|
||||
|
||||
@@ -571,7 +571,7 @@ mod tests {
|
||||
impl console::Api for Auth {
|
||||
async fn get_role_secret(
|
||||
&self,
|
||||
_ctx: &RequestMonitoring,
|
||||
_ctx: &mut RequestMonitoring,
|
||||
_user_info: &super::ComputeUserInfo,
|
||||
) -> Result<CachedRoleSecret, console::errors::GetAuthInfoError> {
|
||||
Ok(CachedRoleSecret::new_uncached(Some(self.secret.clone())))
|
||||
@@ -579,7 +579,7 @@ mod tests {
|
||||
|
||||
async fn get_allowed_ips_and_secret(
|
||||
&self,
|
||||
_ctx: &RequestMonitoring,
|
||||
_ctx: &mut RequestMonitoring,
|
||||
_user_info: &super::ComputeUserInfo,
|
||||
) -> Result<(CachedAllowedIps, Option<CachedRoleSecret>), console::errors::GetAuthInfoError>
|
||||
{
|
||||
@@ -591,7 +591,7 @@ mod tests {
|
||||
|
||||
async fn wake_compute(
|
||||
&self,
|
||||
_ctx: &RequestMonitoring,
|
||||
_ctx: &mut RequestMonitoring,
|
||||
_user_info: &super::ComputeUserInfo,
|
||||
) -> Result<CachedNodeInfo, console::errors::WakeComputeError> {
|
||||
unimplemented!()
|
||||
@@ -665,7 +665,7 @@ mod tests {
|
||||
let (mut client, server) = tokio::io::duplex(1024);
|
||||
let mut stream = PqStream::new(Stream::from_raw(server));
|
||||
|
||||
let ctx = RequestMonitoring::test();
|
||||
let mut ctx = RequestMonitoring::test();
|
||||
let api = Auth {
|
||||
ips: vec![],
|
||||
secret: AuthSecret::Scram(ServerSecret::build("my-secret-password").await.unwrap()),
|
||||
@@ -723,7 +723,7 @@ mod tests {
|
||||
));
|
||||
|
||||
let _creds = auth_quirks(
|
||||
&ctx,
|
||||
&mut ctx,
|
||||
&api,
|
||||
user_info,
|
||||
&mut stream,
|
||||
@@ -742,7 +742,7 @@ mod tests {
|
||||
let (mut client, server) = tokio::io::duplex(1024);
|
||||
let mut stream = PqStream::new(Stream::from_raw(server));
|
||||
|
||||
let ctx = RequestMonitoring::test();
|
||||
let mut ctx = RequestMonitoring::test();
|
||||
let api = Auth {
|
||||
ips: vec![],
|
||||
secret: AuthSecret::Scram(ServerSecret::build("my-secret-password").await.unwrap()),
|
||||
@@ -775,7 +775,7 @@ mod tests {
|
||||
));
|
||||
|
||||
let _creds = auth_quirks(
|
||||
&ctx,
|
||||
&mut ctx,
|
||||
&api,
|
||||
user_info,
|
||||
&mut stream,
|
||||
@@ -794,7 +794,7 @@ mod tests {
|
||||
let (mut client, server) = tokio::io::duplex(1024);
|
||||
let mut stream = PqStream::new(Stream::from_raw(server));
|
||||
|
||||
let ctx = RequestMonitoring::test();
|
||||
let mut ctx = RequestMonitoring::test();
|
||||
let api = Auth {
|
||||
ips: vec![],
|
||||
secret: AuthSecret::Scram(ServerSecret::build("my-secret-password").await.unwrap()),
|
||||
@@ -828,7 +828,7 @@ mod tests {
|
||||
));
|
||||
|
||||
let creds = auth_quirks(
|
||||
&ctx,
|
||||
&mut ctx,
|
||||
&api,
|
||||
user_info,
|
||||
&mut stream,
|
||||
|
||||
@@ -12,7 +12,7 @@ use tokio::io::{AsyncRead, AsyncWrite};
|
||||
use tracing::{info, warn};
|
||||
|
||||
pub(super) async fn authenticate(
|
||||
ctx: &RequestMonitoring,
|
||||
ctx: &mut RequestMonitoring,
|
||||
creds: ComputeUserInfo,
|
||||
client: &mut PqStream<Stream<impl AsyncRead + AsyncWrite + Unpin>>,
|
||||
config: &'static AuthenticationConfig,
|
||||
@@ -27,7 +27,7 @@ pub(super) async fn authenticate(
|
||||
}
|
||||
AuthSecret::Scram(secret) => {
|
||||
info!("auth endpoint chooses SCRAM");
|
||||
let scram = auth::Scram(&secret, ctx);
|
||||
let scram = auth::Scram(&secret, &mut *ctx);
|
||||
|
||||
let auth_outcome = tokio::time::timeout(
|
||||
config.scram_protocol_timeout,
|
||||
|
||||
@@ -18,7 +18,7 @@ use tracing::{info, warn};
|
||||
/// These properties are benefical for serverless JS workers, so we
|
||||
/// use this mechanism for websocket connections.
|
||||
pub async fn authenticate_cleartext(
|
||||
ctx: &RequestMonitoring,
|
||||
ctx: &mut RequestMonitoring,
|
||||
info: ComputeUserInfo,
|
||||
client: &mut stream::PqStream<Stream<impl AsyncRead + AsyncWrite + Unpin>>,
|
||||
secret: AuthSecret,
|
||||
@@ -28,7 +28,7 @@ pub async fn authenticate_cleartext(
|
||||
ctx.set_auth_method(crate::context::AuthMethod::Cleartext);
|
||||
|
||||
// pause the timer while we communicate with the client
|
||||
let paused = ctx.latency_timer_pause(crate::metrics::Waiting::Client);
|
||||
let paused = ctx.latency_timer.pause(crate::metrics::Waiting::Client);
|
||||
|
||||
let ep = EndpointIdInt::from(&info.endpoint);
|
||||
|
||||
@@ -60,7 +60,7 @@ pub async fn authenticate_cleartext(
|
||||
/// Similar to [`authenticate_cleartext`], but there's a specific password format,
|
||||
/// and passwords are not yet validated (we don't know how to validate them!)
|
||||
pub async fn password_hack_no_authentication(
|
||||
ctx: &RequestMonitoring,
|
||||
ctx: &mut RequestMonitoring,
|
||||
info: ComputeUserInfoNoEndpoint,
|
||||
client: &mut stream::PqStream<Stream<impl AsyncRead + AsyncWrite + Unpin>>,
|
||||
) -> auth::Result<ComputeCredentials> {
|
||||
@@ -68,7 +68,7 @@ pub async fn password_hack_no_authentication(
|
||||
ctx.set_auth_method(crate::context::AuthMethod::Cleartext);
|
||||
|
||||
// pause the timer while we communicate with the client
|
||||
let _paused = ctx.latency_timer_pause(crate::metrics::Waiting::Client);
|
||||
let _paused = ctx.latency_timer.pause(crate::metrics::Waiting::Client);
|
||||
|
||||
let payload = AuthFlow::new(client)
|
||||
.begin(auth::PasswordHack)
|
||||
|
||||
@@ -57,7 +57,7 @@ pub fn new_psql_session_id() -> String {
|
||||
}
|
||||
|
||||
pub(super) async fn authenticate(
|
||||
ctx: &RequestMonitoring,
|
||||
ctx: &mut RequestMonitoring,
|
||||
link_uri: &reqwest::Url,
|
||||
client: &mut PqStream<impl AsyncRead + AsyncWrite + Unpin>,
|
||||
) -> auth::Result<NodeInfo> {
|
||||
|
||||
@@ -84,7 +84,7 @@ pub fn endpoint_sni(
|
||||
|
||||
impl ComputeUserInfoMaybeEndpoint {
|
||||
pub fn parse(
|
||||
ctx: &RequestMonitoring,
|
||||
ctx: &mut RequestMonitoring,
|
||||
params: &StartupMessageParams,
|
||||
sni: Option<&str>,
|
||||
common_names: Option<&HashSet<String>>,
|
||||
@@ -249,8 +249,8 @@ mod tests {
|
||||
fn parse_bare_minimum() -> anyhow::Result<()> {
|
||||
// According to postgresql, only `user` should be required.
|
||||
let options = StartupMessageParams::new([("user", "john_doe")]);
|
||||
let ctx = RequestMonitoring::test();
|
||||
let user_info = ComputeUserInfoMaybeEndpoint::parse(&ctx, &options, None, None)?;
|
||||
let mut ctx = RequestMonitoring::test();
|
||||
let user_info = ComputeUserInfoMaybeEndpoint::parse(&mut ctx, &options, None, None)?;
|
||||
assert_eq!(user_info.user, "john_doe");
|
||||
assert_eq!(user_info.endpoint_id, None);
|
||||
|
||||
@@ -264,8 +264,8 @@ mod tests {
|
||||
("database", "world"), // should be ignored
|
||||
("foo", "bar"), // should be ignored
|
||||
]);
|
||||
let ctx = RequestMonitoring::test();
|
||||
let user_info = ComputeUserInfoMaybeEndpoint::parse(&ctx, &options, None, None)?;
|
||||
let mut ctx = RequestMonitoring::test();
|
||||
let user_info = ComputeUserInfoMaybeEndpoint::parse(&mut ctx, &options, None, None)?;
|
||||
assert_eq!(user_info.user, "john_doe");
|
||||
assert_eq!(user_info.endpoint_id, None);
|
||||
|
||||
@@ -279,9 +279,9 @@ mod tests {
|
||||
let sni = Some("foo.localhost");
|
||||
let common_names = Some(["localhost".into()].into());
|
||||
|
||||
let ctx = RequestMonitoring::test();
|
||||
let mut ctx = RequestMonitoring::test();
|
||||
let user_info =
|
||||
ComputeUserInfoMaybeEndpoint::parse(&ctx, &options, sni, common_names.as_ref())?;
|
||||
ComputeUserInfoMaybeEndpoint::parse(&mut ctx, &options, sni, common_names.as_ref())?;
|
||||
assert_eq!(user_info.user, "john_doe");
|
||||
assert_eq!(user_info.endpoint_id.as_deref(), Some("foo"));
|
||||
assert_eq!(user_info.options.get_cache_key("foo"), "foo");
|
||||
@@ -296,8 +296,8 @@ mod tests {
|
||||
("options", "-ckey=1 project=bar -c geqo=off"),
|
||||
]);
|
||||
|
||||
let ctx = RequestMonitoring::test();
|
||||
let user_info = ComputeUserInfoMaybeEndpoint::parse(&ctx, &options, None, None)?;
|
||||
let mut ctx = RequestMonitoring::test();
|
||||
let user_info = ComputeUserInfoMaybeEndpoint::parse(&mut ctx, &options, None, None)?;
|
||||
assert_eq!(user_info.user, "john_doe");
|
||||
assert_eq!(user_info.endpoint_id.as_deref(), Some("bar"));
|
||||
|
||||
@@ -311,8 +311,8 @@ mod tests {
|
||||
("options", "-ckey=1 endpoint=bar -c geqo=off"),
|
||||
]);
|
||||
|
||||
let ctx = RequestMonitoring::test();
|
||||
let user_info = ComputeUserInfoMaybeEndpoint::parse(&ctx, &options, None, None)?;
|
||||
let mut ctx = RequestMonitoring::test();
|
||||
let user_info = ComputeUserInfoMaybeEndpoint::parse(&mut ctx, &options, None, None)?;
|
||||
assert_eq!(user_info.user, "john_doe");
|
||||
assert_eq!(user_info.endpoint_id.as_deref(), Some("bar"));
|
||||
|
||||
@@ -329,8 +329,8 @@ mod tests {
|
||||
),
|
||||
]);
|
||||
|
||||
let ctx = RequestMonitoring::test();
|
||||
let user_info = ComputeUserInfoMaybeEndpoint::parse(&ctx, &options, None, None)?;
|
||||
let mut ctx = RequestMonitoring::test();
|
||||
let user_info = ComputeUserInfoMaybeEndpoint::parse(&mut ctx, &options, None, None)?;
|
||||
assert_eq!(user_info.user, "john_doe");
|
||||
assert!(user_info.endpoint_id.is_none());
|
||||
|
||||
@@ -344,8 +344,8 @@ mod tests {
|
||||
("options", "-ckey=1 endpoint=bar project=foo -c geqo=off"),
|
||||
]);
|
||||
|
||||
let ctx = RequestMonitoring::test();
|
||||
let user_info = ComputeUserInfoMaybeEndpoint::parse(&ctx, &options, None, None)?;
|
||||
let mut ctx = RequestMonitoring::test();
|
||||
let user_info = ComputeUserInfoMaybeEndpoint::parse(&mut ctx, &options, None, None)?;
|
||||
assert_eq!(user_info.user, "john_doe");
|
||||
assert!(user_info.endpoint_id.is_none());
|
||||
|
||||
@@ -359,9 +359,9 @@ mod tests {
|
||||
let sni = Some("baz.localhost");
|
||||
let common_names = Some(["localhost".into()].into());
|
||||
|
||||
let ctx = RequestMonitoring::test();
|
||||
let mut ctx = RequestMonitoring::test();
|
||||
let user_info =
|
||||
ComputeUserInfoMaybeEndpoint::parse(&ctx, &options, sni, common_names.as_ref())?;
|
||||
ComputeUserInfoMaybeEndpoint::parse(&mut ctx, &options, sni, common_names.as_ref())?;
|
||||
assert_eq!(user_info.user, "john_doe");
|
||||
assert_eq!(user_info.endpoint_id.as_deref(), Some("baz"));
|
||||
|
||||
@@ -374,16 +374,16 @@ mod tests {
|
||||
|
||||
let common_names = Some(["a.com".into(), "b.com".into()].into());
|
||||
let sni = Some("p1.a.com");
|
||||
let ctx = RequestMonitoring::test();
|
||||
let mut ctx = RequestMonitoring::test();
|
||||
let user_info =
|
||||
ComputeUserInfoMaybeEndpoint::parse(&ctx, &options, sni, common_names.as_ref())?;
|
||||
ComputeUserInfoMaybeEndpoint::parse(&mut ctx, &options, sni, common_names.as_ref())?;
|
||||
assert_eq!(user_info.endpoint_id.as_deref(), Some("p1"));
|
||||
|
||||
let common_names = Some(["a.com".into(), "b.com".into()].into());
|
||||
let sni = Some("p1.b.com");
|
||||
let ctx = RequestMonitoring::test();
|
||||
let mut ctx = RequestMonitoring::test();
|
||||
let user_info =
|
||||
ComputeUserInfoMaybeEndpoint::parse(&ctx, &options, sni, common_names.as_ref())?;
|
||||
ComputeUserInfoMaybeEndpoint::parse(&mut ctx, &options, sni, common_names.as_ref())?;
|
||||
assert_eq!(user_info.endpoint_id.as_deref(), Some("p1"));
|
||||
|
||||
Ok(())
|
||||
@@ -397,9 +397,10 @@ mod tests {
|
||||
let sni = Some("second.localhost");
|
||||
let common_names = Some(["localhost".into()].into());
|
||||
|
||||
let ctx = RequestMonitoring::test();
|
||||
let err = ComputeUserInfoMaybeEndpoint::parse(&ctx, &options, sni, common_names.as_ref())
|
||||
.expect_err("should fail");
|
||||
let mut ctx = RequestMonitoring::test();
|
||||
let err =
|
||||
ComputeUserInfoMaybeEndpoint::parse(&mut ctx, &options, sni, common_names.as_ref())
|
||||
.expect_err("should fail");
|
||||
match err {
|
||||
InconsistentProjectNames { domain, option } => {
|
||||
assert_eq!(option, "first");
|
||||
@@ -416,9 +417,10 @@ mod tests {
|
||||
let sni = Some("project.localhost");
|
||||
let common_names = Some(["example.com".into()].into());
|
||||
|
||||
let ctx = RequestMonitoring::test();
|
||||
let err = ComputeUserInfoMaybeEndpoint::parse(&ctx, &options, sni, common_names.as_ref())
|
||||
.expect_err("should fail");
|
||||
let mut ctx = RequestMonitoring::test();
|
||||
let err =
|
||||
ComputeUserInfoMaybeEndpoint::parse(&mut ctx, &options, sni, common_names.as_ref())
|
||||
.expect_err("should fail");
|
||||
match err {
|
||||
UnknownCommonName { cn } => {
|
||||
assert_eq!(cn, "localhost");
|
||||
@@ -436,9 +438,9 @@ mod tests {
|
||||
|
||||
let sni = Some("project.localhost");
|
||||
let common_names = Some(["localhost".into()].into());
|
||||
let ctx = RequestMonitoring::test();
|
||||
let mut ctx = RequestMonitoring::test();
|
||||
let user_info =
|
||||
ComputeUserInfoMaybeEndpoint::parse(&ctx, &options, sni, common_names.as_ref())?;
|
||||
ComputeUserInfoMaybeEndpoint::parse(&mut ctx, &options, sni, common_names.as_ref())?;
|
||||
assert_eq!(user_info.endpoint_id.as_deref(), Some("project"));
|
||||
assert_eq!(
|
||||
user_info.options.get_cache_key("project"),
|
||||
|
||||
@@ -27,7 +27,7 @@ pub trait AuthMethod {
|
||||
pub struct Begin;
|
||||
|
||||
/// Use [SCRAM](crate::scram)-based auth in [`AuthFlow`].
|
||||
pub struct Scram<'a>(pub &'a scram::ServerSecret, pub &'a RequestMonitoring);
|
||||
pub struct Scram<'a>(pub &'a scram::ServerSecret, pub &'a mut RequestMonitoring);
|
||||
|
||||
impl AuthMethod for Scram<'_> {
|
||||
#[inline(always)]
|
||||
@@ -155,7 +155,7 @@ impl<S: AsyncRead + AsyncWrite + Unpin> AuthFlow<'_, S, Scram<'_>> {
|
||||
let Scram(secret, ctx) = self.state;
|
||||
|
||||
// pause the timer while we communicate with the client
|
||||
let _paused = ctx.latency_timer_pause(crate::metrics::Waiting::Client);
|
||||
let _paused = ctx.latency_timer.pause(crate::metrics::Waiting::Client);
|
||||
|
||||
// Initial client message contains the chosen auth method's name.
|
||||
let msg = self.stream.read_password_message().await?;
|
||||
@@ -168,8 +168,10 @@ impl<S: AsyncRead + AsyncWrite + Unpin> AuthFlow<'_, S, Scram<'_>> {
|
||||
}
|
||||
|
||||
match sasl.method {
|
||||
SCRAM_SHA_256 => ctx.set_auth_method(crate::context::AuthMethod::ScramSha256),
|
||||
SCRAM_SHA_256_PLUS => ctx.set_auth_method(crate::context::AuthMethod::ScramSha256Plus),
|
||||
SCRAM_SHA_256 => ctx.auth_method = Some(crate::context::AuthMethod::ScramSha256),
|
||||
SCRAM_SHA_256_PLUS => {
|
||||
ctx.auth_method = Some(crate::context::AuthMethod::ScramSha256Plus)
|
||||
}
|
||||
_ => {}
|
||||
}
|
||||
info!("client chooses {}", sasl.method);
|
||||
|
||||
@@ -205,7 +205,7 @@ async fn task_main(
|
||||
const ERR_INSECURE_CONNECTION: &str = "connection is insecure (try using `sslmode=require`)";
|
||||
|
||||
async fn ssl_handshake<S: AsyncRead + AsyncWrite + Unpin>(
|
||||
ctx: &RequestMonitoring,
|
||||
ctx: &mut RequestMonitoring,
|
||||
raw_stream: S,
|
||||
tls_config: Arc<rustls::ServerConfig>,
|
||||
tls_server_end_point: TlsServerEndPoint,
|
||||
@@ -256,13 +256,13 @@ async fn ssl_handshake<S: AsyncRead + AsyncWrite + Unpin>(
|
||||
}
|
||||
|
||||
async fn handle_client(
|
||||
ctx: RequestMonitoring,
|
||||
mut ctx: RequestMonitoring,
|
||||
dest_suffix: Arc<String>,
|
||||
tls_config: Arc<rustls::ServerConfig>,
|
||||
tls_server_end_point: TlsServerEndPoint,
|
||||
stream: impl AsyncRead + AsyncWrite + Unpin,
|
||||
) -> anyhow::Result<()> {
|
||||
let mut tls_stream = ssl_handshake(&ctx, stream, tls_config, tls_server_end_point).await?;
|
||||
let mut tls_stream = ssl_handshake(&mut ctx, stream, tls_config, tls_server_end_point).await?;
|
||||
|
||||
// Cut off first part of the SNI domain
|
||||
// We receive required destination details in the format of
|
||||
|
||||
@@ -5,7 +5,6 @@ use aws_config::meta::region::RegionProviderChain;
|
||||
use aws_config::profile::ProfileFileCredentialsProvider;
|
||||
use aws_config::provider_config::ProviderConfig;
|
||||
use aws_config::web_identity_token::WebIdentityTokenCredentialsProvider;
|
||||
use aws_config::Region;
|
||||
use futures::future::Either;
|
||||
use proxy::auth;
|
||||
use proxy::auth::backend::AuthRateLimiter;
|
||||
@@ -291,10 +290,9 @@ async fn main() -> anyhow::Result<()> {
|
||||
let config = build_config(&args)?;
|
||||
|
||||
info!("Authentication backend: {}", config.auth_backend);
|
||||
info!("Using region: {}", args.aws_region);
|
||||
info!("Using region: {}", config.aws_region);
|
||||
|
||||
let region_provider =
|
||||
RegionProviderChain::default_provider().or_else(Region::new(args.aws_region.clone()));
|
||||
let region_provider = RegionProviderChain::default_provider().or_else(&*config.aws_region); // Replace with your Redis region if needed
|
||||
let provider_conf =
|
||||
ProviderConfig::without_region().with_region(region_provider.region().await);
|
||||
let aws_credentials_provider = {
|
||||
@@ -320,7 +318,7 @@ async fn main() -> anyhow::Result<()> {
|
||||
};
|
||||
let elasticache_credentials_provider = Arc::new(elasticache::CredentialsProvider::new(
|
||||
elasticache::AWSIRSAConfig::new(
|
||||
args.aws_region.clone(),
|
||||
config.aws_region.clone(),
|
||||
args.redis_cluster_name,
|
||||
args.redis_user_id,
|
||||
),
|
||||
@@ -378,14 +376,11 @@ async fn main() -> anyhow::Result<()> {
|
||||
|
||||
let cancel_map = CancelMap::default();
|
||||
|
||||
let redis_rps_limit = Vec::leak(args.redis_rps_limit.clone());
|
||||
RateBucketInfo::validate(redis_rps_limit)?;
|
||||
|
||||
let redis_publisher = match ®ional_redis_client {
|
||||
Some(redis_publisher) => Some(Arc::new(Mutex::new(RedisPublisherClient::new(
|
||||
redis_publisher.clone(),
|
||||
args.region.clone(),
|
||||
redis_rps_limit,
|
||||
&config.redis_rps_limit,
|
||||
)?))),
|
||||
None => None,
|
||||
};
|
||||
@@ -661,6 +656,7 @@ fn build_config(args: &ProxyCliArgs) -> anyhow::Result<&'static ProxyConfig> {
|
||||
)?;
|
||||
|
||||
let http_config = HttpConfig {
|
||||
request_timeout: args.sql_over_http.sql_over_http_timeout,
|
||||
pool_options: GlobalConnPoolOptions {
|
||||
max_conns_per_endpoint: args.sql_over_http.sql_over_http_pool_max_conns_per_endpoint,
|
||||
gc_epoch: args.sql_over_http.sql_over_http_pool_gc_epoch,
|
||||
@@ -680,6 +676,9 @@ fn build_config(args: &ProxyCliArgs) -> anyhow::Result<&'static ProxyConfig> {
|
||||
rate_limit_ip_subnet: args.auth_rate_limit_ip_subnet,
|
||||
};
|
||||
|
||||
let mut redis_rps_limit = args.redis_rps_limit.clone();
|
||||
RateBucketInfo::validate(&mut redis_rps_limit)?;
|
||||
|
||||
let config = Box::leak(Box::new(ProxyConfig {
|
||||
tls_config,
|
||||
auth_backend,
|
||||
@@ -688,8 +687,11 @@ fn build_config(args: &ProxyCliArgs) -> anyhow::Result<&'static ProxyConfig> {
|
||||
http_config,
|
||||
authentication_config,
|
||||
require_client_ip: args.require_client_ip,
|
||||
disable_ip_check_for_http: args.disable_ip_check_for_http,
|
||||
redis_rps_limit,
|
||||
handshake_timeout: args.handshake_timeout,
|
||||
region: args.region.clone(),
|
||||
aws_region: args.aws_region.clone(),
|
||||
wake_compute_retry_config: config::RetryConfig::parse(&args.wake_compute_retry)?,
|
||||
connect_compute_locks,
|
||||
connect_to_compute_retry_config: config::RetryConfig::parse(
|
||||
|
||||
2
proxy/src/cache/endpoints.rs
vendored
2
proxy/src/cache/endpoints.rs
vendored
@@ -68,7 +68,7 @@ impl EndpointsCache {
|
||||
ready: AtomicBool::new(false),
|
||||
}
|
||||
}
|
||||
pub async fn is_valid(&self, ctx: &RequestMonitoring, endpoint: &EndpointId) -> bool {
|
||||
pub async fn is_valid(&self, ctx: &mut RequestMonitoring, endpoint: &EndpointId) -> bool {
|
||||
if !self.ready.load(Ordering::Acquire) {
|
||||
return true;
|
||||
}
|
||||
|
||||
@@ -288,12 +288,12 @@ impl ConnCfg {
|
||||
/// Connect to a corresponding compute node.
|
||||
pub async fn connect(
|
||||
&self,
|
||||
ctx: &RequestMonitoring,
|
||||
ctx: &mut RequestMonitoring,
|
||||
allow_self_signed_compute: bool,
|
||||
aux: MetricsAuxInfo,
|
||||
timeout: Duration,
|
||||
) -> Result<PostgresConnection, ConnectionError> {
|
||||
let pause = ctx.latency_timer_pause(crate::metrics::Waiting::Compute);
|
||||
let pause = ctx.latency_timer.pause(crate::metrics::Waiting::Compute);
|
||||
let (socket_addr, stream, host) = self.connect_raw(timeout).await?;
|
||||
drop(pause);
|
||||
|
||||
@@ -316,14 +316,14 @@ impl ConnCfg {
|
||||
)?;
|
||||
|
||||
// connect_raw() will not use TLS if sslmode is "disable"
|
||||
let pause = ctx.latency_timer_pause(crate::metrics::Waiting::Compute);
|
||||
let pause = ctx.latency_timer.pause(crate::metrics::Waiting::Compute);
|
||||
let (client, connection) = self.0.connect_raw(stream, tls).await?;
|
||||
drop(pause);
|
||||
tracing::Span::current().record("pid", tracing::field::display(client.get_process_id()));
|
||||
let stream = connection.stream.into_inner();
|
||||
|
||||
info!(
|
||||
cold_start_info = ctx.cold_start_info().as_str(),
|
||||
cold_start_info = ctx.cold_start_info.as_str(),
|
||||
"connected to compute node at {host} ({socket_addr}) sslmode={:?}",
|
||||
self.0.get_ssl_mode()
|
||||
);
|
||||
@@ -342,7 +342,7 @@ impl ConnCfg {
|
||||
params,
|
||||
cancel_closure,
|
||||
aux,
|
||||
_guage: Metrics::get().proxy.db_connections.guard(ctx.protocol()),
|
||||
_guage: Metrics::get().proxy.db_connections.guard(ctx.protocol),
|
||||
};
|
||||
|
||||
Ok(connection)
|
||||
|
||||
@@ -31,8 +31,11 @@ pub struct ProxyConfig {
|
||||
pub http_config: HttpConfig,
|
||||
pub authentication_config: AuthenticationConfig,
|
||||
pub require_client_ip: bool,
|
||||
pub disable_ip_check_for_http: bool,
|
||||
pub redis_rps_limit: Vec<RateBucketInfo>,
|
||||
pub region: String,
|
||||
pub handshake_timeout: Duration,
|
||||
pub aws_region: String,
|
||||
pub wake_compute_retry_config: RetryConfig,
|
||||
pub connect_compute_locks: ApiLocks<Host>,
|
||||
pub connect_to_compute_retry_config: RetryConfig,
|
||||
@@ -52,6 +55,7 @@ pub struct TlsConfig {
|
||||
}
|
||||
|
||||
pub struct HttpConfig {
|
||||
pub request_timeout: tokio::time::Duration,
|
||||
pub pool_options: GlobalConnPoolOptions,
|
||||
pub cancel_set: CancelSet,
|
||||
pub client_conn_threshold: u64,
|
||||
|
||||
@@ -292,7 +292,7 @@ pub struct NodeInfo {
|
||||
impl NodeInfo {
|
||||
pub async fn connect(
|
||||
&self,
|
||||
ctx: &RequestMonitoring,
|
||||
ctx: &mut RequestMonitoring,
|
||||
timeout: Duration,
|
||||
) -> Result<compute::PostgresConnection, compute::ConnectionError> {
|
||||
self.config
|
||||
@@ -330,20 +330,20 @@ pub(crate) trait Api {
|
||||
/// We still have to mock the scram to avoid leaking information that user doesn't exist.
|
||||
async fn get_role_secret(
|
||||
&self,
|
||||
ctx: &RequestMonitoring,
|
||||
ctx: &mut RequestMonitoring,
|
||||
user_info: &ComputeUserInfo,
|
||||
) -> Result<CachedRoleSecret, errors::GetAuthInfoError>;
|
||||
|
||||
async fn get_allowed_ips_and_secret(
|
||||
&self,
|
||||
ctx: &RequestMonitoring,
|
||||
ctx: &mut RequestMonitoring,
|
||||
user_info: &ComputeUserInfo,
|
||||
) -> Result<(CachedAllowedIps, Option<CachedRoleSecret>), errors::GetAuthInfoError>;
|
||||
|
||||
/// Wake up the compute node and return the corresponding connection info.
|
||||
async fn wake_compute(
|
||||
&self,
|
||||
ctx: &RequestMonitoring,
|
||||
ctx: &mut RequestMonitoring,
|
||||
user_info: &ComputeUserInfo,
|
||||
) -> Result<CachedNodeInfo, errors::WakeComputeError>;
|
||||
}
|
||||
@@ -363,7 +363,7 @@ pub enum ConsoleBackend {
|
||||
impl Api for ConsoleBackend {
|
||||
async fn get_role_secret(
|
||||
&self,
|
||||
ctx: &RequestMonitoring,
|
||||
ctx: &mut RequestMonitoring,
|
||||
user_info: &ComputeUserInfo,
|
||||
) -> Result<CachedRoleSecret, errors::GetAuthInfoError> {
|
||||
use ConsoleBackend::*;
|
||||
@@ -378,7 +378,7 @@ impl Api for ConsoleBackend {
|
||||
|
||||
async fn get_allowed_ips_and_secret(
|
||||
&self,
|
||||
ctx: &RequestMonitoring,
|
||||
ctx: &mut RequestMonitoring,
|
||||
user_info: &ComputeUserInfo,
|
||||
) -> Result<(CachedAllowedIps, Option<CachedRoleSecret>), errors::GetAuthInfoError> {
|
||||
use ConsoleBackend::*;
|
||||
@@ -393,7 +393,7 @@ impl Api for ConsoleBackend {
|
||||
|
||||
async fn wake_compute(
|
||||
&self,
|
||||
ctx: &RequestMonitoring,
|
||||
ctx: &mut RequestMonitoring,
|
||||
user_info: &ComputeUserInfo,
|
||||
) -> Result<CachedNodeInfo, errors::WakeComputeError> {
|
||||
use ConsoleBackend::*;
|
||||
|
||||
@@ -158,7 +158,7 @@ impl super::Api for Api {
|
||||
#[tracing::instrument(skip_all)]
|
||||
async fn get_role_secret(
|
||||
&self,
|
||||
_ctx: &RequestMonitoring,
|
||||
_ctx: &mut RequestMonitoring,
|
||||
user_info: &ComputeUserInfo,
|
||||
) -> Result<CachedRoleSecret, GetAuthInfoError> {
|
||||
Ok(CachedRoleSecret::new_uncached(
|
||||
@@ -168,7 +168,7 @@ impl super::Api for Api {
|
||||
|
||||
async fn get_allowed_ips_and_secret(
|
||||
&self,
|
||||
_ctx: &RequestMonitoring,
|
||||
_ctx: &mut RequestMonitoring,
|
||||
user_info: &ComputeUserInfo,
|
||||
) -> Result<(CachedAllowedIps, Option<CachedRoleSecret>), GetAuthInfoError> {
|
||||
Ok((
|
||||
@@ -182,7 +182,7 @@ impl super::Api for Api {
|
||||
#[tracing::instrument(skip_all)]
|
||||
async fn wake_compute(
|
||||
&self,
|
||||
_ctx: &RequestMonitoring,
|
||||
_ctx: &mut RequestMonitoring,
|
||||
_user_info: &ComputeUserInfo,
|
||||
) -> Result<CachedNodeInfo, WakeComputeError> {
|
||||
self.do_wake_compute().map_ok(Cached::new_uncached).await
|
||||
|
||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user