Mirror of https://github.com/neondatabase/neon.git
Merge pull request #8451 from neondatabase/rc/2024-07-22
## Storage & Compute release 2024-07-22

This PR has so many commits because the release branch diverged from `main`. Details: https://neondb.slack.com/archives/C033A2WE6BZ/p1721650938949059?thread_ts=1721308848.034069&cid=C033A2WE6BZ

The commit range that is truly new since the last storage release is the set of `main` commits that I cherry-picked using this command:

```
git cherry-pick 8a8b83df27383a07bb7dbba519325c15d2f46357..4e547e6
```
@@ -9,8 +9,8 @@ inputs:
description: 'Region ID, if not set the project will be created in the default region'
default: aws-us-east-2
postgres_version:
description: 'Postgres version; default is 15'
default: '15'
description: 'Postgres version; default is 16'
default: '16'
api_host:
description: 'Neon API host'
default: console-stage.neon.build

.github/workflows/benchmarking.yml (vendored, 97 changed lines)
@@ -57,9 +57,10 @@ jobs:
|
||||
bench:
|
||||
if: ${{ github.event.inputs.run_only_pgvector_tests == 'false' || github.event.inputs.run_only_pgvector_tests == null }}
|
||||
strategy:
|
||||
fail-fast: false
|
||||
matrix:
|
||||
include:
|
||||
- DEFAULT_PG_VERSION: 14
|
||||
- DEFAULT_PG_VERSION: 16
|
||||
PLATFORM: "neon-staging"
|
||||
region_id: ${{ github.event.inputs.region_id || 'aws-us-east-2' }}
|
||||
provisioner: 'k8s-pod'
|
||||
@@ -146,6 +147,7 @@ jobs:
|
||||
SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }}
|
||||
|
||||
replication-tests:
|
||||
if: ${{ github.event.inputs.run_only_pgvector_tests == 'false' || github.event.inputs.run_only_pgvector_tests == null }}
|
||||
env:
|
||||
POSTGRES_DISTRIB_DIR: /tmp/neon/pg_install
|
||||
DEFAULT_PG_VERSION: 14
|
||||
@@ -190,6 +192,7 @@ jobs:
|
||||
run_in_parallel: false
|
||||
save_perf_report: ${{ env.SAVE_PERF_REPORT }}
|
||||
extra_params: -m remote_cluster --timeout 5400
|
||||
pg_version: ${{ env.DEFAULT_PG_VERSION }}
|
||||
env:
|
||||
VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}"
|
||||
PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}"
|
||||
@@ -215,11 +218,14 @@ jobs:
|
||||
# Available platforms:
|
||||
# - neon-captest-new: Freshly created project (1 CU)
|
||||
# - neon-captest-freetier: Use freetier-sized compute (0.25 CU)
|
||||
# - neonvm-captest-azure-new: Freshly created project (1 CU) in azure region
|
||||
# - neonvm-captest-azure-freetier: Use freetier-sized compute (0.25 CU) in azure region
|
||||
# - neon-captest-reuse: Reusing existing project
|
||||
# - rds-aurora: Aurora Postgres Serverless v2 with autoscaling from 0.5 to 2 ACUs
|
||||
# - rds-postgres: RDS Postgres db.m5.large instance (2 vCPU, 8 GiB) with gp3 EBS storage
|
||||
env:
|
||||
RUN_AWS_RDS_AND_AURORA: ${{ github.event.inputs.run_AWS_RDS_AND_AURORA || 'false' }}
|
||||
DEFAULT_REGION_ID: ${{ github.event.inputs.region_id || 'aws-us-east-2' }}
|
||||
runs-on: ubuntu-22.04
|
||||
outputs:
|
||||
pgbench-compare-matrix: ${{ steps.pgbench-compare-matrix.outputs.matrix }}
|
||||
@@ -230,23 +236,33 @@ jobs:
|
||||
- name: Generate matrix for pgbench benchmark
|
||||
id: pgbench-compare-matrix
|
||||
run: |
|
||||
region_id_default=${{ env.DEFAULT_REGION_ID }}
|
||||
matrix='{
|
||||
"pg_version" : [
|
||||
16
|
||||
],
|
||||
"region_id" : [
|
||||
"'"$region_id_default"'"
|
||||
],
|
||||
"platform": [
|
||||
"neon-captest-new",
|
||||
"neon-captest-reuse",
|
||||
"neonvm-captest-new"
|
||||
],
|
||||
"db_size": [ "10gb" ],
|
||||
"include": [{ "platform": "neon-captest-freetier", "db_size": "3gb" },
|
||||
{ "platform": "neon-captest-new", "db_size": "50gb" },
|
||||
{ "platform": "neonvm-captest-freetier", "db_size": "3gb" },
|
||||
{ "platform": "neonvm-captest-new", "db_size": "50gb" },
|
||||
{ "platform": "neonvm-captest-sharding-reuse", "db_size": "50gb" }]
|
||||
"include": [{ "pg_version": 16, "region_id": "'"$region_id_default"'", "platform": "neon-captest-freetier", "db_size": "3gb" },
|
||||
{ "pg_version": 16, "region_id": "'"$region_id_default"'", "platform": "neon-captest-new", "db_size": "50gb" },
|
||||
{ "pg_version": 16, "region_id": "'"$region_id_default"'", "platform": "neonvm-captest-freetier", "db_size": "3gb" },
|
||||
{ "pg_version": 16, "region_id": "'"$region_id_default"'", "platform": "neonvm-captest-new", "db_size": "50gb" },
|
||||
{ "pg_version": 16, "region_id": "azure-eastus2", "platform": "neonvm-azure-captest-freetier", "db_size": "3gb" },
|
||||
{ "pg_version": 16, "region_id": "azure-eastus2", "platform": "neonvm-azure-captest-new", "db_size": "10gb" },
|
||||
{ "pg_version": 16, "region_id": "azure-eastus2", "platform": "neonvm-azure-captest-new", "db_size": "50gb" },
|
||||
{ "pg_version": 16, "region_id": "'"$region_id_default"'", "platform": "neonvm-captest-sharding-reuse", "db_size": "50gb" }]
|
||||
}'
|
||||
|
||||
if [ "$(date +%A)" = "Saturday" ]; then
|
||||
matrix=$(echo "$matrix" | jq '.include += [{ "platform": "rds-postgres", "db_size": "10gb"},
|
||||
{ "platform": "rds-aurora", "db_size": "50gb"}]')
|
||||
matrix=$(echo "$matrix" | jq '.include += [{ "pg_version": 14, "region_id": "'"$region_id_default"'", "platform": "rds-postgres", "db_size": "10gb"},
|
||||
{ "pg_version": 14, "region_id": "'"$region_id_default"'", "platform": "rds-aurora", "db_size": "50gb"}]')
|
||||
fi
|
||||
|
||||
echo "matrix=$(echo "$matrix" | jq --compact-output '.')" >> $GITHUB_OUTPUT
|
||||
@@ -298,7 +314,7 @@ jobs:
|
||||
TEST_PG_BENCH_DURATIONS_MATRIX: "60m"
|
||||
TEST_PG_BENCH_SCALES_MATRIX: ${{ matrix.db_size }}
|
||||
POSTGRES_DISTRIB_DIR: /tmp/neon/pg_install
|
||||
DEFAULT_PG_VERSION: 14
|
||||
DEFAULT_PG_VERSION: ${{ matrix.pg_version }}
|
||||
TEST_OUTPUT: /tmp/test_output
|
||||
BUILD_TYPE: remote
|
||||
SAVE_PERF_REPORT: ${{ github.event.inputs.save_perf_report || ( github.ref_name == 'main' ) }}
|
||||
@@ -323,14 +339,14 @@ jobs:
|
||||
prefix: latest
|
||||
|
||||
- name: Create Neon Project
|
||||
if: contains(fromJson('["neon-captest-new", "neon-captest-freetier", "neonvm-captest-new", "neonvm-captest-freetier"]'), matrix.platform)
|
||||
if: contains(fromJson('["neon-captest-new", "neon-captest-freetier", "neonvm-captest-new", "neonvm-captest-freetier", "neonvm-azure-captest-freetier", "neonvm-azure-captest-new"]'), matrix.platform)
|
||||
id: create-neon-project
|
||||
uses: ./.github/actions/neon-project-create
|
||||
with:
|
||||
region_id: ${{ github.event.inputs.region_id || 'aws-us-east-2' }}
|
||||
region_id: ${{ matrix.region_id }}
|
||||
postgres_version: ${{ env.DEFAULT_PG_VERSION }}
|
||||
api_key: ${{ secrets.NEON_STAGING_API_KEY }}
|
||||
compute_units: ${{ (matrix.platform == 'neon-captest-freetier' && '[0.25, 0.25]') || '[1, 1]' }}
|
||||
compute_units: ${{ (contains(matrix.platform, 'captest-freetier') && '[0.25, 0.25]') || '[1, 1]' }}
|
||||
provisioner: ${{ (contains(matrix.platform, 'neonvm-') && 'k8s-neonvm') || 'k8s-pod' }}
|
||||
|
||||
- name: Set up Connection String
|
||||
@@ -343,7 +359,7 @@ jobs:
|
||||
neonvm-captest-sharding-reuse)
|
||||
CONNSTR=${{ secrets.BENCHMARK_CAPTEST_SHARDING_CONNSTR }}
|
||||
;;
|
||||
neon-captest-new | neon-captest-freetier | neonvm-captest-new | neonvm-captest-freetier)
|
||||
neon-captest-new | neon-captest-freetier | neonvm-captest-new | neonvm-captest-freetier | neonvm-azure-captest-new | neonvm-azure-captest-freetier)
|
||||
CONNSTR=${{ steps.create-neon-project.outputs.dsn }}
|
||||
;;
|
||||
rds-aurora)
|
||||
@@ -368,6 +384,7 @@ jobs:
|
||||
run_in_parallel: false
|
||||
save_perf_report: ${{ env.SAVE_PERF_REPORT }}
|
||||
extra_params: -m remote_cluster --timeout 21600 -k test_pgbench_remote_init
|
||||
pg_version: ${{ env.DEFAULT_PG_VERSION }}
|
||||
env:
|
||||
BENCHMARK_CONNSTR: ${{ steps.set-up-connstr.outputs.connstr }}
|
||||
VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}"
|
||||
@@ -381,6 +398,7 @@ jobs:
|
||||
run_in_parallel: false
|
||||
save_perf_report: ${{ env.SAVE_PERF_REPORT }}
|
||||
extra_params: -m remote_cluster --timeout 21600 -k test_pgbench_remote_simple_update
|
||||
pg_version: ${{ env.DEFAULT_PG_VERSION }}
|
||||
env:
|
||||
BENCHMARK_CONNSTR: ${{ steps.set-up-connstr.outputs.connstr }}
|
||||
VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}"
|
||||
@@ -394,6 +412,7 @@ jobs:
|
||||
run_in_parallel: false
|
||||
save_perf_report: ${{ env.SAVE_PERF_REPORT }}
|
||||
extra_params: -m remote_cluster --timeout 21600 -k test_pgbench_remote_select_only
|
||||
pg_version: ${{ env.DEFAULT_PG_VERSION }}
|
||||
env:
|
||||
BENCHMARK_CONNSTR: ${{ steps.set-up-connstr.outputs.connstr }}
|
||||
VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}"
|
||||
@@ -420,6 +439,13 @@ jobs:
|
||||
SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }}
|
||||
|
||||
pgbench-pgvector:
|
||||
strategy:
|
||||
fail-fast: false
|
||||
matrix:
|
||||
include:
|
||||
- PLATFORM: "neon-captest-pgvector"
|
||||
- PLATFORM: "azure-captest-pgvector"
|
||||
|
||||
env:
|
||||
TEST_PG_BENCH_DURATIONS_MATRIX: "15m"
|
||||
TEST_PG_BENCH_SCALES_MATRIX: "1"
|
||||
@@ -427,8 +453,9 @@ jobs:
|
||||
DEFAULT_PG_VERSION: 16
|
||||
TEST_OUTPUT: /tmp/test_output
|
||||
BUILD_TYPE: remote
|
||||
LD_LIBRARY_PATH: /home/nonroot/pg/usr/lib/x86_64-linux-gnu
|
||||
SAVE_PERF_REPORT: ${{ github.event.inputs.save_perf_report || ( github.ref_name == 'main' ) }}
|
||||
PLATFORM: "neon-captest-pgvector"
|
||||
PLATFORM: ${{ matrix.PLATFORM }}
|
||||
|
||||
runs-on: [ self-hosted, us-east-2, x64 ]
|
||||
container:
|
||||
@@ -438,17 +465,39 @@ jobs:
|
||||
steps:
|
||||
- uses: actions/checkout@v4
|
||||
|
||||
- name: Download Neon artifact
|
||||
uses: ./.github/actions/download
|
||||
with:
|
||||
name: neon-${{ runner.os }}-${{ runner.arch }}-release-artifact
|
||||
path: /tmp/neon/
|
||||
prefix: latest
|
||||
# until https://github.com/neondatabase/neon/issues/8275 is fixed we temporarily install postgresql-16
|
||||
# instead of using Neon artifacts containing pgbench
|
||||
- name: Install postgresql-16 where pytest expects it
|
||||
run: |
|
||||
cd /home/nonroot
|
||||
wget -q https://apt.postgresql.org/pub/repos/apt/pool/main/p/postgresql-16/libpq5_16.3-1.pgdg110%2B1_amd64.deb
|
||||
wget -q https://apt.postgresql.org/pub/repos/apt/pool/main/p/postgresql-16/postgresql-client-16_16.3-1.pgdg110%2B1_amd64.deb
|
||||
wget -q https://apt.postgresql.org/pub/repos/apt/pool/main/p/postgresql-16/postgresql-16_16.3-1.pgdg110%2B1_amd64.deb
|
||||
dpkg -x libpq5_16.3-1.pgdg110+1_amd64.deb pg
|
||||
dpkg -x postgresql-client-16_16.3-1.pgdg110+1_amd64.deb pg
|
||||
dpkg -x postgresql-16_16.3-1.pgdg110+1_amd64.deb pg
|
||||
mkdir -p /tmp/neon/pg_install/v16/bin
|
||||
ln -s /home/nonroot/pg/usr/lib/postgresql/16/bin/pgbench /tmp/neon/pg_install/v16/bin/pgbench
|
||||
ln -s /home/nonroot/pg/usr/lib/postgresql/16/bin/psql /tmp/neon/pg_install/v16/bin/psql
|
||||
ln -s /home/nonroot/pg/usr/lib/x86_64-linux-gnu /tmp/neon/pg_install/v16/lib
|
||||
/tmp/neon/pg_install/v16/bin/pgbench --version
|
||||
/tmp/neon/pg_install/v16/bin/psql --version
|
||||
|
||||
- name: Set up Connection String
|
||||
id: set-up-connstr
|
||||
run: |
|
||||
CONNSTR=${{ secrets.BENCHMARK_PGVECTOR_CONNSTR }}
|
||||
case "${PLATFORM}" in
|
||||
neon-captest-pgvector)
|
||||
CONNSTR=${{ secrets.BENCHMARK_PGVECTOR_CONNSTR }}
|
||||
;;
|
||||
azure-captest-pgvector)
|
||||
CONNSTR=${{ secrets.BENCHMARK_PGVECTOR_CONNSTR_AZURE }}
|
||||
;;
|
||||
*)
|
||||
echo >&2 "Unknown PLATFORM=${PLATFORM}"
|
||||
exit 1
|
||||
;;
|
||||
esac
|
||||
|
||||
echo "connstr=${CONNSTR}" >> $GITHUB_OUTPUT
|
||||
|
||||
@@ -460,6 +509,7 @@ jobs:
|
||||
run_in_parallel: false
|
||||
save_perf_report: ${{ env.SAVE_PERF_REPORT }}
|
||||
extra_params: -m remote_cluster --timeout 21600 -k test_pgvector_indexing
|
||||
pg_version: ${{ env.DEFAULT_PG_VERSION }}
|
||||
env:
|
||||
VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}"
|
||||
PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}"
|
||||
@@ -473,6 +523,7 @@ jobs:
|
||||
run_in_parallel: false
|
||||
save_perf_report: ${{ env.SAVE_PERF_REPORT }}
|
||||
extra_params: -m remote_cluster --timeout 21600
|
||||
pg_version: ${{ env.DEFAULT_PG_VERSION }}
|
||||
env:
|
||||
BENCHMARK_CONNSTR: ${{ steps.set-up-connstr.outputs.connstr }}
|
||||
VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}"
|
||||
@@ -487,11 +538,10 @@ jobs:
|
||||
uses: slackapi/slack-github-action@v1
|
||||
with:
|
||||
channel-id: "C033QLM5P7D" # dev-staging-stream
|
||||
slack-message: "Periodic perf testing neon-captest-pgvector: ${{ job.status }}\n${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}"
|
||||
slack-message: "Periodic perf testing ${PLATFORM}: ${{ job.status }}\n${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}"
|
||||
env:
|
||||
SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }}
|
||||
|
||||
|
||||
clickbench-compare:
|
||||
# ClickBench DB for rds-aurora and rds-Postgres deployed to the same clusters
|
||||
# we use for performance testing in pgbench-compare.
|
||||
@@ -735,6 +785,7 @@ jobs:
|
||||
run_in_parallel: false
|
||||
save_perf_report: ${{ env.SAVE_PERF_REPORT }}
|
||||
extra_params: -m remote_cluster --timeout 21600 -k test_user_examples
|
||||
pg_version: ${{ env.DEFAULT_PG_VERSION }}
|
||||
env:
|
||||
VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}"
|
||||
PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}"
|
||||
|
||||
Cargo.lock (generated, 79 changed lines)
@@ -1368,6 +1368,7 @@ dependencies = [
|
||||
"tracing",
|
||||
"url",
|
||||
"utils",
|
||||
"whoami",
|
||||
"workspace_hack",
|
||||
]
|
||||
|
||||
@@ -3233,16 +3234,6 @@ dependencies = [
|
||||
"winapi",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "nu-ansi-term"
|
||||
version = "0.46.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "77a8165726e8236064dbb45459242600304b42a5ea24ee2948e18e023bf7ba84"
|
||||
dependencies = [
|
||||
"overload",
|
||||
"winapi",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "num"
|
||||
version = "0.4.1"
|
||||
@@ -3538,12 +3529,6 @@ version = "0.5.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "4030760ffd992bef45b0ae3f10ce1aba99e33464c90d14dd7c039884963ddc7a"
|
||||
|
||||
[[package]]
|
||||
name = "overload"
|
||||
version = "0.1.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "b15813163c1d831bf4a13c3610c05c0d03b39feb07f7e09fa234dac9b15aaf39"
|
||||
|
||||
[[package]]
|
||||
name = "p256"
|
||||
version = "0.11.1"
|
||||
@@ -4404,6 +4389,7 @@ dependencies = [
|
||||
"tracing-opentelemetry",
|
||||
"tracing-subscriber",
|
||||
"tracing-utils",
|
||||
"typed-json",
|
||||
"url",
|
||||
"urlencoding",
|
||||
"utils",
|
||||
@@ -4602,6 +4588,15 @@ dependencies = [
|
||||
"bitflags 1.3.2",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "redox_syscall"
|
||||
version = "0.4.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "4722d768eff46b75989dd134e5c353f0d6296e5aaa3132e776cbdb56be7731aa"
|
||||
dependencies = [
|
||||
"bitflags 1.3.2",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "regex"
|
||||
version = "1.10.2"
|
||||
@@ -5811,6 +5806,28 @@ dependencies = [
|
||||
"workspace_hack",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "storage_controller_client"
|
||||
version = "0.1.0"
|
||||
dependencies = [
|
||||
"anyhow",
|
||||
"async-trait",
|
||||
"bytes",
|
||||
"futures",
|
||||
"pageserver_api",
|
||||
"pageserver_client",
|
||||
"postgres",
|
||||
"reqwest 0.12.4",
|
||||
"serde",
|
||||
"thiserror",
|
||||
"tokio",
|
||||
"tokio-postgres",
|
||||
"tokio-stream",
|
||||
"tokio-util",
|
||||
"utils",
|
||||
"workspace_hack",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "storage_scrubber"
|
||||
version = "0.1.0"
|
||||
@@ -5845,6 +5862,7 @@ dependencies = [
|
||||
"serde",
|
||||
"serde_json",
|
||||
"serde_with",
|
||||
"storage_controller_client",
|
||||
"thiserror",
|
||||
"tokio",
|
||||
"tokio-postgres",
|
||||
@@ -5874,6 +5892,7 @@ dependencies = [
|
||||
"reqwest 0.12.4",
|
||||
"serde",
|
||||
"serde_json",
|
||||
"storage_controller_client",
|
||||
"thiserror",
|
||||
"tokio",
|
||||
"tracing",
|
||||
@@ -6600,7 +6619,6 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "30a651bc37f915e81f087d86e62a18eec5f79550c7faff886f7090b4ea757c77"
|
||||
dependencies = [
|
||||
"matchers",
|
||||
"nu-ansi-term",
|
||||
"once_cell",
|
||||
"regex",
|
||||
"serde",
|
||||
@@ -6665,6 +6683,16 @@ dependencies = [
|
||||
"static_assertions",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "typed-json"
|
||||
version = "0.1.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "6024a8d0025400b3f6b189366e9aa92012cf9c4fe1cd2620848dd61425c49eed"
|
||||
dependencies = [
|
||||
"serde",
|
||||
"serde_json",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "typenum"
|
||||
version = "1.16.0"
|
||||
@@ -6961,6 +6989,12 @@ version = "0.11.0+wasi-snapshot-preview1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "9c8d87e72b64a3b4db28d11ce29237c246188f4f51057d65a7eab63b7987e423"
|
||||
|
||||
[[package]]
|
||||
name = "wasite"
|
||||
version = "0.1.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "b8dad83b4f25e74f184f64c43b150b91efe7647395b42289f38e50566d82855b"
|
||||
|
||||
[[package]]
|
||||
name = "wasm-bindgen"
|
||||
version = "0.2.92"
|
||||
@@ -7113,6 +7147,17 @@ dependencies = [
|
||||
"once_cell",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "whoami"
|
||||
version = "1.5.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "a44ab49fad634e88f55bf8f9bb3abd2f27d7204172a112c7c9987e01c1c94ea9"
|
||||
dependencies = [
|
||||
"redox_syscall 0.4.1",
|
||||
"wasite",
|
||||
"web-sys",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "winapi"
|
||||
version = "0.3.9"
|
||||
|
||||
@@ -13,6 +13,7 @@ members = [
|
||||
"safekeeper",
|
||||
"storage_broker",
|
||||
"storage_controller",
|
||||
"storage_controller/client",
|
||||
"storage_scrubber",
|
||||
"workspace_hack",
|
||||
"libs/compute_api",
|
||||
@@ -182,14 +183,16 @@ tower-service = "0.3.2"
|
||||
tracing = "0.1"
|
||||
tracing-error = "0.2.0"
|
||||
tracing-opentelemetry = "0.21.0"
|
||||
tracing-subscriber = { version = "0.3", default-features = false, features = ["smallvec", "fmt", "tracing-log", "std", "env-filter", "json", "ansi"] }
|
||||
tracing-subscriber = { version = "0.3", default-features = false, features = ["smallvec", "fmt", "tracing-log", "std", "env-filter", "json"] }
|
||||
twox-hash = { version = "1.6.3", default-features = false }
|
||||
typed-json = "0.1"
|
||||
url = "2.2"
|
||||
urlencoding = "2.1"
|
||||
uuid = { version = "1.6.1", features = ["v4", "v7", "serde"] }
|
||||
walkdir = "2.3.2"
|
||||
rustls-native-certs = "0.7"
|
||||
x509-parser = "0.15"
|
||||
whoami = "1.5.1"
|
||||
|
||||
## TODO replace this with tracing
|
||||
env_logger = "0.10"
|
||||
@@ -219,6 +222,7 @@ remote_storage = { version = "0.1", path = "./libs/remote_storage/" }
|
||||
safekeeper_api = { version = "0.1", path = "./libs/safekeeper_api" }
|
||||
desim = { version = "0.1", path = "./libs/desim" }
|
||||
storage_broker = { version = "0.1", path = "./storage_broker/" } # Note: main broker code is inside the binary crate, so linking with the library shouldn't be heavy.
|
||||
storage_controller_client = { path = "./storage_controller/client" }
|
||||
tenant_size_model = { version = "0.1", path = "./libs/tenant_size_model/" }
|
||||
tracing-utils = { version = "0.1", path = "./libs/tracing-utils/" }
|
||||
utils = { version = "0.1", path = "./libs/utils/" }
|
||||
|
||||
@@ -311,9 +311,12 @@ RUN wget https://github.com/iCyberon/pg_hashids/archive/refs/tags/v1.2.1.tar.gz
|
||||
FROM build-deps AS rum-pg-build
|
||||
COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
|
||||
|
||||
COPY patches/rum.patch /rum.patch
|
||||
|
||||
RUN wget https://github.com/postgrespro/rum/archive/refs/tags/1.3.13.tar.gz -O rum.tar.gz && \
|
||||
echo "6ab370532c965568df6210bd844ac6ba649f53055e48243525b0b7e5c4d69a7d rum.tar.gz" | sha256sum --check && \
|
||||
mkdir rum-src && cd rum-src && tar xzf ../rum.tar.gz --strip-components=1 -C . && \
|
||||
patch -p1 < /rum.patch && \
|
||||
make -j $(getconf _NPROCESSORS_ONLN) PG_CONFIG=/usr/local/pgsql/bin/pg_config USE_PGXS=1 && \
|
||||
make -j $(getconf _NPROCESSORS_ONLN) install PG_CONFIG=/usr/local/pgsql/bin/pg_config USE_PGXS=1 && \
|
||||
echo 'trusted = true' >> /usr/local/pgsql/share/extension/rum.control
|
||||
|
||||
@@ -9,6 +9,9 @@ pub(crate) struct MigrationRunner<'m> {

impl<'m> MigrationRunner<'m> {
    pub fn new(client: &'m mut Client, migrations: &'m [&'m str]) -> Self {
        // The neon_migration.migration_id::id column is a bigint, which is equivalent to an i64
        assert!(migrations.len() + 1 < i64::MAX as usize);

        Self { client, migrations }
    }

@@ -22,11 +25,8 @@ impl<'m> MigrationRunner<'m> {
        Ok(row.get::<&str, i64>("id"))
    }

    fn update_migration_id(&mut self) -> Result<()> {
        let setval = format!(
            "UPDATE neon_migration.migration_id SET id={}",
            self.migrations.len()
        );
    fn update_migration_id(&mut self, migration_id: i64) -> Result<()> {
        let setval = format!("UPDATE neon_migration.migration_id SET id={}", migration_id);

        self.client
            .simple_query(&setval)
@@ -57,44 +57,49 @@ impl<'m> MigrationRunner<'m> {
    pub fn run_migrations(mut self) -> Result<()> {
        self.prepare_migrations()?;

        let mut current_migration: usize = self.get_migration_id()? as usize;
        let starting_migration_id = current_migration;

        let query = "BEGIN";
        self.client
            .simple_query(query)
            .context("run_migrations begin")?;

        let mut current_migration = self.get_migration_id()? as usize;
        while current_migration < self.migrations.len() {
            macro_rules! migration_id {
                ($cm:expr) => {
                    ($cm + 1) as i64
                };
            }

            let migration = self.migrations[current_migration];

            if migration.starts_with("-- SKIP") {
                info!("Skipping migration id={}", current_migration);
                info!("Skipping migration id={}", migration_id!(current_migration));
            } else {
                info!(
                    "Running migration id={}:\n{}\n",
                    current_migration, migration
                    migration_id!(current_migration),
                    migration
                );

                self.client
                    .simple_query("BEGIN")
                    .context("begin migration")?;

                self.client.simple_query(migration).with_context(|| {
                    format!("run_migration current_migration={}", current_migration)
                    format!(
                        "run_migrations migration id={}",
                        migration_id!(current_migration)
                    )
                })?;

                // Migration IDs start at 1
                self.update_migration_id(migration_id!(current_migration))?;

                self.client
                    .simple_query("COMMIT")
                    .context("commit migration")?;

                info!("Finished migration id={}", migration_id!(current_migration));
            }

            current_migration += 1;
        }

        self.update_migration_id()?;

        let query = "COMMIT";
        self.client
            .simple_query(query)
            .context("run_migrations commit")?;

        info!(
            "Ran {} migrations",
            (self.migrations.len() - starting_migration_id)
        );

        Ok(())
    }
}
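With this change, the value stored in `neon_migration.migration_id` tracks the 1-based id of the last applied migration (the `migration_id!` macro maps array index `cm` to `cm + 1`), and it is updated inside each migration's transaction rather than once at the end. A minimal sketch of how one could inspect that counter, assuming `psql` access to a compute database; `$CONNSTR` is a placeholder, not a value from this PR:

```bash
# Illustrative only: read the migration counter maintained by MigrationRunner.
# 0 means no migrations have been applied; N means migrations 1..=N have been applied.
psql "$CONNSTR" -c 'SELECT id FROM neon_migration.migration_id;'
```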
@@ -777,21 +777,21 @@ pub fn handle_migrations(client: &mut Client) -> Result<()> {

    // Add new migrations in numerical order.
    let migrations = [
        include_str!("./migrations/0000-neon_superuser_bypass_rls.sql"),
        include_str!("./migrations/0001-alter_roles.sql"),
        include_str!("./migrations/0002-grant_pg_create_subscription_to_neon_superuser.sql"),
        include_str!("./migrations/0003-grant_pg_monitor_to_neon_superuser.sql"),
        include_str!("./migrations/0004-grant_all_on_tables_to_neon_superuser.sql"),
        include_str!("./migrations/0005-grant_all_on_sequences_to_neon_superuser.sql"),
        include_str!("./migrations/0001-neon_superuser_bypass_rls.sql"),
        include_str!("./migrations/0002-alter_roles.sql"),
        include_str!("./migrations/0003-grant_pg_create_subscription_to_neon_superuser.sql"),
        include_str!("./migrations/0004-grant_pg_monitor_to_neon_superuser.sql"),
        include_str!("./migrations/0005-grant_all_on_tables_to_neon_superuser.sql"),
        include_str!("./migrations/0006-grant_all_on_sequences_to_neon_superuser.sql"),
        include_str!(
            "./migrations/0006-grant_all_on_tables_to_neon_superuser_with_grant_option.sql"
            "./migrations/0007-grant_all_on_tables_to_neon_superuser_with_grant_option.sql"
        ),
        include_str!(
            "./migrations/0007-grant_all_on_sequences_to_neon_superuser_with_grant_option.sql"
            "./migrations/0008-grant_all_on_sequences_to_neon_superuser_with_grant_option.sql"
        ),
        include_str!("./migrations/0008-revoke_replication_for_previously_allowed_roles.sql"),
        include_str!("./migrations/0009-revoke_replication_for_previously_allowed_roles.sql"),
        include_str!(
            "./migrations/0009-grant_snapshot_synchronization_funcs_to_neon_superuser.sql"
            "./migrations/0010-grant_snapshot_synchronization_funcs_to_neon_superuser.sql"
        ),
    ];
@@ -40,6 +40,7 @@ safekeeper_api.workspace = true
|
||||
postgres_connection.workspace = true
|
||||
storage_broker.workspace = true
|
||||
utils.workspace = true
|
||||
whoami.workspace = true
|
||||
|
||||
compute_api.workspace = true
|
||||
workspace_hack.workspace = true
|
||||
|
||||
@@ -1,9 +1,9 @@
//! Code to manage the storage broker
//!
//! In the local test environment, the data for each safekeeper is stored in
//! In the local test environment, the storage broker stores its data directly in
//!
//! ```text
//! .neon/safekeepers/<safekeeper id>
//! .neon
//! ```
use std::time::Duration;

@@ -1,8 +1,10 @@
//! Code to manage pageservers
//!
//! In the local test environment, the pageserver stores its data directly in
//! In the local test environment, the data for each pageserver is stored in
//!
//! .neon/
//! ```text
//! .neon/pageserver_<pageserver_id>
//! ```
//!
use std::collections::HashMap;
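The two doc-comment fixes above swap descriptions that had ended up on the wrong modules: the storage broker keeps its state directly under `.neon`, each pageserver gets its own `.neon/pageserver_<pageserver_id>` directory, and safekeepers live under `.neon/safekeepers/<safekeeper id>`. A rough sketch of what a local test-environment directory can look like; the entry names below are illustrative, not taken from this PR:

```bash
# Hypothetical layout of the .neon directory created by `cargo neon init`; names are examples.
ls -1 .neon
# config            LocalEnv config file
# pageserver_1/     data for pageserver id 1
# safekeepers/sk1/  data for safekeeper "sk1"
```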
@@ -29,7 +29,6 @@ use utils::{
|
||||
pub struct StorageController {
|
||||
env: LocalEnv,
|
||||
listen: String,
|
||||
path: Utf8PathBuf,
|
||||
private_key: Option<Vec<u8>>,
|
||||
public_key: Option<String>,
|
||||
postgres_port: u16,
|
||||
@@ -41,6 +40,8 @@ const COMMAND: &str = "storage_controller";
|
||||
|
||||
const STORAGE_CONTROLLER_POSTGRES_VERSION: u32 = 16;
|
||||
|
||||
const DB_NAME: &str = "storage_controller";
|
||||
|
||||
#[derive(Serialize, Deserialize)]
|
||||
pub struct AttachHookRequest {
|
||||
pub tenant_shard_id: TenantShardId,
|
||||
@@ -65,10 +66,6 @@ pub struct InspectResponse {
|
||||
|
||||
impl StorageController {
|
||||
pub fn from_env(env: &LocalEnv) -> Self {
|
||||
let path = Utf8PathBuf::from_path_buf(env.base_data_dir.clone())
|
||||
.unwrap()
|
||||
.join("attachments.json");
|
||||
|
||||
// Makes no sense to construct this if pageservers aren't going to use it: assume
|
||||
// pageservers have control plane API set
|
||||
let listen_url = env.control_plane_api.clone().unwrap();
|
||||
@@ -128,7 +125,6 @@ impl StorageController {
|
||||
|
||||
Self {
|
||||
env: env.clone(),
|
||||
path,
|
||||
listen,
|
||||
private_key,
|
||||
public_key,
|
||||
@@ -203,7 +199,6 @@ impl StorageController {
|
||||
///
|
||||
/// Returns the database url
|
||||
pub async fn setup_database(&self) -> anyhow::Result<String> {
|
||||
const DB_NAME: &str = "storage_controller";
|
||||
let database_url = format!("postgresql://localhost:{}/{DB_NAME}", self.postgres_port);
|
||||
|
||||
let pg_bin_dir = self.get_pg_bin_dir().await?;
|
||||
@@ -232,6 +227,30 @@ impl StorageController {
|
||||
Ok(database_url)
|
||||
}
|
||||
|
||||
pub async fn connect_to_database(
|
||||
&self,
|
||||
) -> anyhow::Result<(
|
||||
tokio_postgres::Client,
|
||||
tokio_postgres::Connection<tokio_postgres::Socket, tokio_postgres::tls::NoTlsStream>,
|
||||
)> {
|
||||
tokio_postgres::Config::new()
|
||||
.host("localhost")
|
||||
.port(self.postgres_port)
|
||||
// The user is the ambient operating system user name.
|
||||
// That is an impurity which we want to fix in => TODO https://github.com/neondatabase/neon/issues/8400
|
||||
//
|
||||
// Until we get there, use the ambient operating system user name.
|
||||
// Recent tokio-postgres versions default to this if the user isn't specified.
|
||||
// But tokio-postgres fork doesn't have this upstream commit:
|
||||
// https://github.com/sfackler/rust-postgres/commit/cb609be758f3fb5af537f04b584a2ee0cebd5e79
|
||||
// => we should rebase our fork => TODO https://github.com/neondatabase/neon/issues/8399
|
||||
.user(&whoami::username())
|
||||
.dbname(DB_NAME)
|
||||
.connect(tokio_postgres::NoTls)
|
||||
.await
|
||||
.map_err(anyhow::Error::new)
|
||||
}
|
||||
|
||||
pub async fn start(&self, retry_timeout: &Duration) -> anyhow::Result<()> {
|
||||
// Start a vanilla Postgres process used by the storage controller for persistence.
|
||||
let pg_data_path = Utf8PathBuf::from_path_buf(self.env.base_data_dir.clone())
|
||||
@@ -256,18 +275,21 @@ impl StorageController {
|
||||
if !status.success() {
|
||||
anyhow::bail!("initdb failed with status {status}");
|
||||
}
|
||||
|
||||
// Write a minimal config file:
|
||||
// - Specify the port, since this is chosen dynamically
|
||||
// - Switch off fsync, since we're running on lightweight test environments and when e.g. scale testing
|
||||
// the storage controller we don't want a slow local disk to interfere with that.
|
||||
tokio::fs::write(
|
||||
&pg_data_path.join("postgresql.conf"),
|
||||
format!("port = {}\nfsync=off\n", self.postgres_port),
|
||||
)
|
||||
.await?;
|
||||
};
|
||||
|
||||
// Write a minimal config file:
|
||||
// - Specify the port, since this is chosen dynamically
|
||||
// - Switch off fsync, since we're running on lightweight test environments and when e.g. scale testing
|
||||
// the storage controller we don't want a slow local disk to interfere with that.
|
||||
//
|
||||
// NB: it's important that we rewrite this file on each start command so we propagate changes
|
||||
// from `LocalEnv`'s config file (`.neon/config`).
|
||||
tokio::fs::write(
|
||||
&pg_data_path.join("postgresql.conf"),
|
||||
format!("port = {}\nfsync=off\n", self.postgres_port),
|
||||
)
|
||||
.await?;
|
||||
|
||||
println!("Starting storage controller database...");
|
||||
let db_start_args = [
|
||||
"-w",
|
||||
@@ -296,11 +318,38 @@ impl StorageController {
|
||||
// Run migrations on every startup, in case something changed.
|
||||
let database_url = self.setup_database().await?;
|
||||
|
||||
// We support running a startup SQL script to fiddle with the database before we launch storcon.
|
||||
// This is used by the test suite.
|
||||
let startup_script_path = self
|
||||
.env
|
||||
.base_data_dir
|
||||
.join("storage_controller_db.startup.sql");
|
||||
let startup_script = match tokio::fs::read_to_string(&startup_script_path).await {
|
||||
Ok(script) => {
|
||||
tokio::fs::remove_file(startup_script_path).await?;
|
||||
script
|
||||
}
|
||||
Err(e) => {
|
||||
if e.kind() == std::io::ErrorKind::NotFound {
|
||||
// always run some startup script so that this code path doesn't bit rot
|
||||
"BEGIN; COMMIT;".to_string()
|
||||
} else {
|
||||
anyhow::bail!("Failed to read startup script: {e}")
|
||||
}
|
||||
}
|
||||
};
|
||||
let (mut client, conn) = self.connect_to_database().await?;
|
||||
let conn = tokio::spawn(conn);
|
||||
let tx = client.build_transaction();
|
||||
let tx = tx.start().await?;
|
||||
tx.batch_execute(&startup_script).await?;
|
||||
tx.commit().await?;
|
||||
drop(client);
|
||||
conn.await??;
|
||||
|
||||
let mut args = vec![
|
||||
"-l",
|
||||
&self.listen,
|
||||
"-p",
|
||||
self.path.as_ref(),
|
||||
"--dev",
|
||||
"--database-url",
|
||||
&database_url,
|
||||
|
||||
@@ -17,6 +17,7 @@ pageserver_client.workspace = true
|
||||
reqwest.workspace = true
|
||||
serde.workspace = true
|
||||
serde_json = { workspace = true, features = ["raw_value"] }
|
||||
storage_controller_client.workspace = true
|
||||
thiserror.workspace = true
|
||||
tokio.workspace = true
|
||||
tracing.workspace = true
|
||||
|
||||
@@ -14,15 +14,15 @@ use pageserver_api::{
|
||||
},
|
||||
shard::{ShardStripeSize, TenantShardId},
|
||||
};
|
||||
use pageserver_client::mgmt_api::{self, ResponseErrorMessageExt};
|
||||
use pageserver_client::mgmt_api::{self};
|
||||
use reqwest::{Method, StatusCode, Url};
|
||||
use serde::{de::DeserializeOwned, Serialize};
|
||||
use utils::id::{NodeId, TenantId};
|
||||
|
||||
use pageserver_api::controller_api::{
|
||||
NodeConfigureRequest, NodeRegisterRequest, NodeSchedulingPolicy, PlacementPolicy,
|
||||
TenantShardMigrateRequest, TenantShardMigrateResponse,
|
||||
};
|
||||
use storage_controller_client::control_api::Client;
|
||||
|
||||
#[derive(Subcommand, Debug)]
|
||||
enum Command {
|
||||
@@ -249,64 +249,6 @@ impl FromStr for NodeAvailabilityArg {
|
||||
}
|
||||
}
|
||||
|
||||
struct Client {
|
||||
base_url: Url,
|
||||
jwt_token: Option<String>,
|
||||
client: reqwest::Client,
|
||||
}
|
||||
|
||||
impl Client {
|
||||
fn new(base_url: Url, jwt_token: Option<String>) -> Self {
|
||||
Self {
|
||||
base_url,
|
||||
jwt_token,
|
||||
client: reqwest::ClientBuilder::new()
|
||||
.build()
|
||||
.expect("Failed to construct http client"),
|
||||
}
|
||||
}
|
||||
|
||||
/// Simple HTTP request wrapper for calling into storage controller
|
||||
async fn dispatch<RQ, RS>(
|
||||
&self,
|
||||
method: Method,
|
||||
path: String,
|
||||
body: Option<RQ>,
|
||||
) -> mgmt_api::Result<RS>
|
||||
where
|
||||
RQ: Serialize + Sized,
|
||||
RS: DeserializeOwned + Sized,
|
||||
{
|
||||
// The configured URL has the /upcall path prefix for pageservers to use: we will strip that out
|
||||
// for general purpose API access.
|
||||
let url = Url::from_str(&format!(
|
||||
"http://{}:{}/{path}",
|
||||
self.base_url.host_str().unwrap(),
|
||||
self.base_url.port().unwrap()
|
||||
))
|
||||
.unwrap();
|
||||
|
||||
let mut builder = self.client.request(method, url);
|
||||
if let Some(body) = body {
|
||||
builder = builder.json(&body)
|
||||
}
|
||||
if let Some(jwt_token) = &self.jwt_token {
|
||||
builder = builder.header(
|
||||
reqwest::header::AUTHORIZATION,
|
||||
format!("Bearer {jwt_token}"),
|
||||
);
|
||||
}
|
||||
|
||||
let response = builder.send().await.map_err(mgmt_api::Error::ReceiveBody)?;
|
||||
let response = response.error_from_body().await?;
|
||||
|
||||
response
|
||||
.json()
|
||||
.await
|
||||
.map_err(pageserver_client::mgmt_api::Error::ReceiveBody)
|
||||
}
|
||||
}
|
||||
|
||||
#[tokio::main]
|
||||
async fn main() -> anyhow::Result<()> {
|
||||
let cli = Cli::parse();
|
||||
|
||||
docs/rfcs/034-ancestor-deletion.md (new file, 252 lines)
@@ -0,0 +1,252 @@
# Ancestor Timeline Deletion

Created on: 2024-02-23

Author: John Spray

# Summary

When a tenant creates a new timeline that they will treat as their 'main' history,
it is awkward to permanently retain an 'old main' timeline as its ancestor. Currently
this is necessary because it is forbidden to delete a timeline which has descendants.

A new pageserver API is proposed to 'adopt' data from a parent timeline into
one of its children, such that the link between ancestor and child can be severed,
leaving the parent in a state where it may then be deleted.

# Motivation

Retaining parent timelines currently has two costs:

- Cognitive load on users, who have to remember which is the "real" main timeline.
- Storage capacity cost, as the parent timeline will retain layers up to the
  child's timeline point, even if the child fully covers its keyspace with image
  layers and will never actually read from the parent.

# Solution

A new pageserver API `PUT /v1/tenant/:tenant_id/timeline/:timeline_id/detach_ancestor`
will be added. The `timeline_id` in this URL is that of the _child_ timeline that we
wish to detach from its parent.

On success, this API will leave the following state:

- The detached child timeline will no longer have an ancestor, and will contain all
  the data needed to service reads without recursing into an ancestor.
- Any other children of the parent whose timeline points were at a lower LSN than
  the detached child timeline will be modified to have the child timeline as their
  new parent.
- The parent timeline will still exist, but the child will no longer have it as an
  ancestor. If this was the last timeline that depended on the parent, then the
  parent will become deletable.

This API's implementation will consist of a series of retryable steps, such that
on failures/timeout it can safely be called again to reach the target state.
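To make the call shape concrete, here is an illustrative sketch only: the host, port, and IDs are placeholders, and the response shape follows the `AncestorDetached` type added elsewhere in this PR, which lists the timelines that were re-parented.

```bash
# Hypothetical invocation of the new endpoint against a pageserver's management API.
# TENANT_ID, CHILD_TIMELINE_ID, and the host:port are placeholders, not values from this PR.
curl -sS -X PUT \
  "http://localhost:9898/v1/tenant/${TENANT_ID}/timeline/${CHILD_TIMELINE_ID}/detach_ancestor"
# Expected response shape (from AncestorDetached): {"reparented_timelines": ["<timeline id>", ...]}
```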
## Example

### Before

The user has "rolled back" their project to LSN X, resulting in a "new main"
timeline. The parent "old main" timeline still exists, and they would like
to clean it up.

They have two other timelines A and B. A is from before the rollback point,
and B is from after the rollback point.

```
----"old main" timeline-------X-------------------------------------------->
      |                       |                     |
      |-> child A             |                     |
                              |-> "new main" timeline
                                                     -> child B
```

### After calling detach ancestor API

The "new main" timeline is no longer dependent on old main, and neither
is child A, because it had a branch point before X.

The user may now choose to delete child B and "old main" to get to
a pristine state. Child B is likely to be unwanted since the user
chose to roll back to X, and it branches from after X. However, we
don't assume this in the API; it is up to the user to delete it.

```
|----"old main" timeline---------------------------------------------------->
                                                    |
                                                    |
                                                    |
                                                     -> child B

|----"new main" timeline--------->
              |
              |-> child A
```

### After removing timelines

We end up with a totally clean state that leaves no trace that a rollback
ever happened: there is only one root timeline.

```
|----"new main" timeline----------->
              |
              |-> child A
```

## Caveats

Important things for API users to bear in mind:

- this API does not delete the parent timeline: you must still do that explicitly.
- if there are other child timelines ahead of the branch point of the detached
  child, the parent won't be deletable: you must either delete or detach those
  children.
- do _not_ simply loop over all children and detach them all: this can have an
  extremely high storage cost. The detach ancestor API is intended for use on a single
  timeline to make it the new "main".
- The detach ancestor API should also not be
  exposed directly to the user as a button/API, because they might decide
  to click it for all the children and thereby generate many copies of the
  parent's data -- the detach ancestor API should be used as part
  of a high level "clean up after rollback" feature.

## `detach_ancestor` API implementation

Terms used in the following sections:

- "the child": the timeline whose ID is specified in the detach ancestor API URL, also
  called "new main" in the example.
- "the parent": the parent of "the child". Also called "old main" in the example.
- "the branch point": the ancestor_lsn of "the child"

### Phase 1: write out adopted layers to S3

The child will "adopt" layers from the parent, such that its end state contains
all the parent's history as well as its own.

For all layers in the parent's layer map whose high LSN is below the branch
point, issue S3 CopyObject requests to duplicate them into the child timeline's
prefix. Do not add them to the child's layer map yet.

For delta layers in the parent's layer map which straddle the branch point, read them
and write out only content up to the branch point into new layer objects.

This is a long running operation if the parent has many layers: it should be
implemented in a way that resumes rather than restarting from scratch, if the API
times out and is called again.

As an optimization, if there are no other timelines that will be adopted into
the child, _and_ the child's image layers already fully cover the branch LSN,
then we may skip adopting layers.

### Phase 2: update the child's index

Having written out all needed layers in phase 1, atomically link them all
into the child's IndexPart and upload to S3. This may be done while the
child Timeline is still running.

### Phase 3: modify timelines ancestry

Modify the child's ancestor to None, and upload its IndexPart to persist the change.

For all timelines which have the same parent as the child, and have a branch
point lower than our branch point, switch their ancestor_timeline to the child,
and upload their IndexPart to persist the change.

## Alternatives considered

### Generate full image layer on child, rather than adopting parent deltas

This would work for the case of a single child, but would prevent re-targeting
other timelines that depended on the parent. If we detached many children this
way, the storage cost would become prohibitive (consider a 1TB database with
100 child timelines: it would cost 100TiB if they all generated their own image layers).

### Don't rewrite anything: just fake it in the API

We could add a layer of indirection that let a child "pretend" that it had no
ancestor, when in reality it still had the parent. The pageserver API could
accept deletion of ancestor timelines, and just update child metadata to make
them look like they have no ancestor.

This would not achieve the desired reduction in storage cost, and may well be more
complex to maintain than simply implementing the API described in this RFC.

### Avoid copying objects: enable child index to use parent layers directly

We could teach IndexPart to store a TimelineId for each layer, such that a child
timeline could reference a parent's layers directly, rather than copying them
into the child's prefix.

This would impose a cost for the normal case of indices that only target the
timeline's own layers, add complexity, and break the useful simplifying
invariant that timelines "own" their own path. If child timelines were
referencing layers from the parent, we would have to ensure that the parent
never runs GC/compaction again, which would make the API less flexible (the
proposal in this RFC enables deletion of the parent but doesn't require it.)

## Performance

### Adopting layers

- CopyObject is a relatively cheap operation, but we may need to issue tens of thousands
  of such requests: this can take up to tens of seconds and will compete for RemoteStorage
  semaphore units with other activity on the pageserver.
- If we are running on a storage backend that doesn't implement CopyObject, then
  this part will be much more expensive, as we would stream all layer content
  through the pageserver. This is no different to issuing a lot
  of reads to a timeline that does not have a warm local cache: it will move
  a lot of gigabytes, but that shouldn't break anything.
- Generating truncated layers for deltas that straddle the branch point will
  require streaming read/write of all the layers in question.

### Updating timeline ancestry

The simplest way to update timeline ancestry will probably be to stop and start
all the Timeline objects: this is preferable to the complexity of making their
ancestry mutable at runtime.

There will be a corresponding "stutter" in the availability of the timelines,
of the order of 10-100ms, which is the time taken to upload their IndexPart and
restart the Timeline.

# Interaction with other features

## Concurrent timeline creation

If new historic timelines are created using the parent as an ancestor while the
detach ancestor API is running, they will not be re-parented to the child. This
doesn't break anything, but it leaves the parent in a state where it might not
be possible to delete it.

Since timeline creations are an explicit user action, this is not something we need to
worry about as the storage layer: a user who wants to delete their parent timeline will not create
new children, and if they do, they can choose to delete those children to
enable deleting the parent.

For the least surprise to the user, before starting the detach ancestor branch
operation, the control plane should wait until all branches are created and not
allow any branches to be created before the branch point on the ancestor branch
while the operation is ongoing.

## WAL based disaster recovery

WAL based disaster recovery currently supports only restoring of the main
branch. Enabling WAL based disaster recovery in the future requires that we
keep a record of which timeline generated the WAL and at which LSN a parent was
detached. Keep a list of timeline ids and the LSN at which they were detached in
the `index_part.json`. Limit the size of the list to the first 100 entries, after
which WAL disaster recovery will not be possible.

## Sharded tenants

For sharded tenants, calls to the detach ancestor API will pass through the storage
controller, which will handle them the same as timeline creations: invoke first
on shard zero, and then on all the other shards.
@@ -44,7 +44,7 @@ If you need to modify the database schema, here’s how to create a migration:
- Use `diesel migration generate <name>` to create a new migration
- Populate the SQL files in the `migrations/` subdirectory
- Use `DATABASE_URL=... diesel migration run` to apply the migration you just wrote: this will update the `schema.rs` file automatically.
- This requires a running database: the easiest way to do that is to just run `cargo neon init ; cargo neon start`, which will leave a database available at `postgresql://localhost:1235/attachment_service`
- This requires a running database: the easiest way to do that is to just run `cargo neon init ; cargo neon start`, which will leave a database available at `postgresql://localhost:1235/storage_controller`
- Commit the migration files and the changes to schema.rs
- If you need to iterate, you can rewind migrations with `diesel migration revert -a` and then `diesel migration run` again.
- The migrations are built into the storage controller binary and automatically run at startup after it is deployed, so once you’ve committed a migration no further steps are needed.
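Taken together, the steps above boil down to a short shell session. This is an illustrative sketch: the migration name is hypothetical, and it assumes `diesel_cli` is installed and the local environment from `cargo neon init` is running.

```bash
# Sketch of the migration workflow described above.
cargo neon init && cargo neon start                # provides postgresql://localhost:1235/storage_controller
diesel migration generate add_example_column       # hypothetical migration name
# ... edit the generated up.sql / down.sql files under migrations/ ...
DATABASE_URL=postgresql://localhost:1235/storage_controller diesel migration run
# To iterate: roll everything back and re-apply.
DATABASE_URL=postgresql://localhost:1235/storage_controller diesel migration revert -a
DATABASE_URL=postgresql://localhost:1235/storage_controller diesel migration run
```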
@@ -87,7 +87,7 @@ pub struct TenantLocateResponse {
|
||||
pub shard_params: ShardParameters,
|
||||
}
|
||||
|
||||
#[derive(Serialize, Deserialize)]
|
||||
#[derive(Serialize, Deserialize, Debug)]
|
||||
pub struct TenantDescribeResponse {
|
||||
pub tenant_id: TenantId,
|
||||
pub shards: Vec<TenantDescribeResponseShard>,
|
||||
@@ -110,7 +110,7 @@ pub struct NodeDescribeResponse {
|
||||
pub listen_pg_port: u16,
|
||||
}
|
||||
|
||||
#[derive(Serialize, Deserialize)]
|
||||
#[derive(Serialize, Deserialize, Debug)]
|
||||
pub struct TenantDescribeResponseShard {
|
||||
pub tenant_shard_id: TenantShardId,
|
||||
|
||||
|
||||
@@ -651,6 +651,17 @@ pub struct TenantDetails {
|
||||
pub timelines: Vec<TimelineId>,
|
||||
}
|
||||
|
||||
#[derive(Serialize, Deserialize, PartialEq, Eq, Clone, Copy, Debug)]
|
||||
pub enum TimelineArchivalState {
|
||||
Archived,
|
||||
Unarchived,
|
||||
}
|
||||
|
||||
#[derive(Serialize, Deserialize, PartialEq, Eq, Clone)]
|
||||
pub struct TimelineArchivalConfigRequest {
|
||||
pub state: TimelineArchivalState,
|
||||
}
|
||||
|
||||
/// This represents the output of the "timeline_detail" and "timeline_list" API calls.
|
||||
#[derive(Debug, Serialize, Deserialize, Clone)]
|
||||
pub struct TimelineInfo {
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
use utils::id::TimelineId;
|
||||
|
||||
#[derive(Default, serde::Serialize)]
|
||||
#[derive(Debug, Default, PartialEq, serde::Serialize, serde::Deserialize)]
|
||||
pub struct AncestorDetached {
|
||||
pub reparented_timelines: Vec<TimelineId>,
|
||||
}
|
||||
|
||||
@@ -443,7 +443,7 @@ impl<Other: RemoteStorage> GenericRemoteStorage<Arc<Other>> {
|
||||
}
|
||||
|
||||
impl GenericRemoteStorage {
|
||||
pub fn from_config(storage_config: &RemoteStorageConfig) -> anyhow::Result<Self> {
|
||||
pub async fn from_config(storage_config: &RemoteStorageConfig) -> anyhow::Result<Self> {
|
||||
let timeout = storage_config.timeout;
|
||||
Ok(match &storage_config.storage {
|
||||
RemoteStorageKind::LocalFs { local_path: path } => {
|
||||
@@ -458,7 +458,7 @@ impl GenericRemoteStorage {
|
||||
std::env::var("AWS_ACCESS_KEY_ID").unwrap_or_else(|_| "<none>".into());
|
||||
info!("Using s3 bucket '{}' in region '{}' as a remote storage, prefix in bucket: '{:?}', bucket endpoint: '{:?}', profile: {profile}, access_key_id: {access_key_id}",
|
||||
s3_config.bucket_name, s3_config.bucket_region, s3_config.prefix_in_bucket, s3_config.endpoint);
|
||||
Self::AwsS3(Arc::new(S3Bucket::new(s3_config, timeout)?))
|
||||
Self::AwsS3(Arc::new(S3Bucket::new(s3_config, timeout).await?))
|
||||
}
|
||||
RemoteStorageKind::AzureContainer(azure_config) => {
|
||||
let storage_account = azure_config
|
||||
|
||||
@@ -16,16 +16,10 @@ use std::{
|
||||
|
||||
use anyhow::{anyhow, Context as _};
|
||||
use aws_config::{
|
||||
environment::credentials::EnvironmentVariableCredentialsProvider,
|
||||
imds::credentials::ImdsCredentialsProvider,
|
||||
meta::credentials::CredentialsProviderChain,
|
||||
profile::ProfileFileCredentialsProvider,
|
||||
provider_config::ProviderConfig,
|
||||
default_provider::credentials::DefaultCredentialsChain,
|
||||
retry::{RetryConfigBuilder, RetryMode},
|
||||
web_identity_token::WebIdentityTokenCredentialsProvider,
|
||||
BehaviorVersion,
|
||||
};
|
||||
use aws_credential_types::provider::SharedCredentialsProvider;
|
||||
use aws_sdk_s3::{
|
||||
config::{AsyncSleep, IdentityCache, Region, SharedAsyncSleep},
|
||||
error::SdkError,
|
||||
@@ -76,40 +70,27 @@ struct GetObjectRequest {
|
||||
}
|
||||
impl S3Bucket {
|
||||
/// Creates the S3 storage, errors if incorrect AWS S3 configuration provided.
|
||||
pub fn new(remote_storage_config: &S3Config, timeout: Duration) -> anyhow::Result<Self> {
|
||||
pub async fn new(remote_storage_config: &S3Config, timeout: Duration) -> anyhow::Result<Self> {
|
||||
tracing::debug!(
|
||||
"Creating s3 remote storage for S3 bucket {}",
|
||||
remote_storage_config.bucket_name
|
||||
);
|
||||
|
||||
let region = Some(Region::new(remote_storage_config.bucket_region.clone()));
|
||||
let region = Region::new(remote_storage_config.bucket_region.clone());
|
||||
let region_opt = Some(region.clone());
|
||||
|
||||
let provider_conf = ProviderConfig::without_region().with_region(region.clone());
|
||||
|
||||
let credentials_provider = {
|
||||
// uses "AWS_ACCESS_KEY_ID", "AWS_SECRET_ACCESS_KEY"
|
||||
CredentialsProviderChain::first_try(
|
||||
"env",
|
||||
EnvironmentVariableCredentialsProvider::new(),
|
||||
)
|
||||
// uses "AWS_PROFILE" / `aws sso login --profile <profile>`
|
||||
.or_else(
|
||||
"profile-sso",
|
||||
ProfileFileCredentialsProvider::builder()
|
||||
.configure(&provider_conf)
|
||||
.build(),
|
||||
)
|
||||
// uses "AWS_WEB_IDENTITY_TOKEN_FILE", "AWS_ROLE_ARN", "AWS_ROLE_SESSION_NAME"
|
||||
// needed to access remote extensions bucket
|
||||
.or_else(
|
||||
"token",
|
||||
WebIdentityTokenCredentialsProvider::builder()
|
||||
.configure(&provider_conf)
|
||||
.build(),
|
||||
)
|
||||
// uses imds v2
|
||||
.or_else("imds", ImdsCredentialsProvider::builder().build())
|
||||
};
|
||||
// https://docs.aws.amazon.com/sdkref/latest/guide/standardized-credentials.html
|
||||
// https://docs.rs/aws-config/latest/aws_config/default_provider/credentials/struct.DefaultCredentialsChain.html
|
||||
// Incomplete list of auth methods used by this:
|
||||
// * "AWS_ACCESS_KEY_ID", "AWS_SECRET_ACCESS_KEY"
|
||||
// * "AWS_PROFILE" / `aws sso login --profile <profile>`
|
||||
// * "AWS_WEB_IDENTITY_TOKEN_FILE", "AWS_ROLE_ARN", "AWS_ROLE_SESSION_NAME"
|
||||
// * http (ECS/EKS) container credentials
|
||||
// * imds v2
|
||||
let credentials_provider = DefaultCredentialsChain::builder()
|
||||
.region(region)
|
||||
.build()
|
||||
.await;
|
||||
|
||||
// AWS SDK requires us to specify how the RetryConfig should sleep when it wants to back off
|
||||
let sleep_impl: Arc<dyn AsyncSleep> = Arc::new(TokioSleep::new());
|
||||
@@ -118,9 +99,9 @@ impl S3Bucket {
|
||||
#[allow(deprecated)] /* TODO: https://github.com/neondatabase/neon/issues/7665 */
|
||||
BehaviorVersion::v2023_11_09(),
|
||||
)
|
||||
.region(region)
|
||||
.region(region_opt)
|
||||
.identity_cache(IdentityCache::lazy().build())
|
||||
.credentials_provider(SharedCredentialsProvider::new(credentials_provider))
|
||||
.credentials_provider(credentials_provider)
|
||||
.sleep_impl(SharedAsyncSleep::from(sleep_impl));
|
||||
|
||||
let sdk_config: aws_config::SdkConfig = std::thread::scope(|s| {
|
||||
@@ -1041,8 +1022,8 @@ mod tests {
|
||||
|
||||
use crate::{RemotePath, S3Bucket, S3Config};
|
||||
|
||||
#[test]
|
||||
fn relative_path() {
|
||||
#[tokio::test]
|
||||
async fn relative_path() {
|
||||
let all_paths = ["", "some/path", "some/path/"];
|
||||
let all_paths: Vec<RemotePath> = all_paths
|
||||
.iter()
|
||||
@@ -1085,8 +1066,9 @@ mod tests {
|
||||
max_keys_per_list_response: Some(5),
|
||||
upload_storage_class: None,
|
||||
};
|
||||
let storage =
|
||||
S3Bucket::new(&config, std::time::Duration::ZERO).expect("remote storage init");
|
||||
let storage = S3Bucket::new(&config, std::time::Duration::ZERO)
|
||||
.await
|
||||
.expect("remote storage init");
|
||||
for (test_path_idx, test_path) in all_paths.iter().enumerate() {
|
||||
let result = storage.relative_path_to_s3_object(test_path);
|
||||
let expected = expected_outputs[prefix_idx][test_path_idx];
|
||||
|
||||
@@ -31,6 +31,7 @@ struct EnabledAzure {
|
||||
impl EnabledAzure {
|
||||
async fn setup(max_keys_in_list_response: Option<i32>) -> Self {
|
||||
let client = create_azure_client(max_keys_in_list_response)
|
||||
.await
|
||||
.context("Azure client creation")
|
||||
.expect("Azure client creation failed");
|
||||
|
||||
@@ -187,7 +188,7 @@ impl AsyncTestContext for MaybeEnabledStorageWithSimpleTestBlobs {
|
||||
}
|
||||
}
|
||||
|
||||
fn create_azure_client(
|
||||
async fn create_azure_client(
|
||||
max_keys_per_list_response: Option<i32>,
|
||||
) -> anyhow::Result<Arc<GenericRemoteStorage>> {
|
||||
use rand::Rng;
|
||||
@@ -221,6 +222,8 @@ fn create_azure_client(
|
||||
timeout: Duration::from_secs(120),
|
||||
};
|
||||
Ok(Arc::new(
|
||||
GenericRemoteStorage::from_config(&remote_storage_config).context("remote storage init")?,
|
||||
GenericRemoteStorage::from_config(&remote_storage_config)
|
||||
.await
|
||||
.context("remote storage init")?,
|
||||
))
|
||||
}
|
||||
|
||||
@@ -197,6 +197,7 @@ struct EnabledS3 {
|
||||
impl EnabledS3 {
|
||||
async fn setup(max_keys_in_list_response: Option<i32>) -> Self {
|
||||
let client = create_s3_client(max_keys_in_list_response)
|
||||
.await
|
||||
.context("S3 client creation")
|
||||
.expect("S3 client creation failed");
|
||||
|
||||
@@ -352,7 +353,7 @@ impl AsyncTestContext for MaybeEnabledStorageWithSimpleTestBlobs {
|
||||
}
|
||||
}
|
||||
|
||||
fn create_s3_client(
|
||||
async fn create_s3_client(
|
||||
max_keys_per_list_response: Option<i32>,
|
||||
) -> anyhow::Result<Arc<GenericRemoteStorage>> {
|
||||
use rand::Rng;
|
||||
@@ -385,7 +386,9 @@ fn create_s3_client(
|
||||
timeout: RemoteStorageConfig::DEFAULT_TIMEOUT,
|
||||
};
|
||||
Ok(Arc::new(
|
||||
GenericRemoteStorage::from_config(&remote_storage_config).context("remote storage init")?,
|
||||
GenericRemoteStorage::from_config(&remote_storage_config)
|
||||
.await
|
||||
.context("remote storage init")?,
|
||||
))
|
||||
}
|
||||
|
||||
|
||||
@@ -33,6 +33,10 @@ pub enum Scope {
GenerationsApi,
// Allows access to control plane managment API and some storage controller endpoints.
Admin,

/// Allows access to storage controller APIs used by the scrubber, to interrogate the state
/// of a tenant & post scrub results.
Scrubber,
}

/// JWT payload. See docs/authentication.md for the format
@@ -1,6 +1,7 @@
use std::collections::HashMap;

use bytes::Bytes;
use detach_ancestor::AncestorDetached;
use pageserver_api::{models::*, shard::TenantShardId};
use reqwest::{IntoUrl, Method, StatusCode};
use utils::{
@@ -418,6 +419,23 @@ impl Client {
}
}

pub async fn timeline_detach_ancestor(
&self,
tenant_shard_id: TenantShardId,
timeline_id: TimelineId,
) -> Result<AncestorDetached> {
let uri = format!(
"{}/v1/tenant/{tenant_shard_id}/timeline/{timeline_id}/detach_ancestor",
self.mgmt_api_endpoint
);

self.request(Method::PUT, &uri, ())
.await?
.json()
.await
.map_err(Error::ReceiveBody)
}

pub async fn tenant_reset(&self, tenant_shard_id: TenantShardId) -> Result<()> {
let uri = format!(
"{}/v1/tenant/{}/reset",
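For orientation, a hypothetical call site for the new client method above; `client`, `tenant_shard_id` and `timeline_id` are assumed to already exist in scope, and the error handling is a placeholder rather than code from this PR.

```
// Sketch only: detach a timeline from its ancestor via the management API client.
let detached = client
    .timeline_detach_ancestor(tenant_shard_id, timeline_id)
    .await?;
tracing::info!(
    "detached from ancestor, reparented timelines: {:?}",
    detached.reparented_timelines
);
```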
@@ -179,7 +179,7 @@ async fn main() -> anyhow::Result<()> {
|
||||
.get("remote_storage")
|
||||
.expect("need remote_storage");
|
||||
let config = RemoteStorageConfig::from_toml(toml_item)?;
|
||||
let storage = remote_storage::GenericRemoteStorage::from_config(&config);
|
||||
let storage = remote_storage::GenericRemoteStorage::from_config(&config).await;
|
||||
let cancel = CancellationToken::new();
|
||||
storage
|
||||
.unwrap()
|
||||
|
||||
@@ -14,12 +14,14 @@ pub fn check_permission(claims: &Claims, tenant_id: Option<TenantId>) -> Result<
}
(Scope::PageServerApi, None) => Ok(()), // access to management api for PageServerApi scope
(Scope::PageServerApi, Some(_)) => Ok(()), // access to tenant api using PageServerApi scope
(Scope::Admin | Scope::SafekeeperData | Scope::GenerationsApi, _) => Err(AuthError(
format!(
"JWT scope '{:?}' is ineligible for Pageserver auth",
claims.scope
)
.into(),
)),
(Scope::Admin | Scope::SafekeeperData | Scope::GenerationsApi | Scope::Scrubber, _) => {
Err(AuthError(
format!(
"JWT scope '{:?}' is ineligible for Pageserver auth",
claims.scope
)
.into(),
))
}
}
}
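A minimal sketch of what the widened match arm means in practice: a token carrying the new `Scrubber` scope is still rejected by pageserver auth, while `PageServerApi` remains accepted. The `Claims` struct literal below assumes its fields are `scope` and `tenant_id`; that field layout is an assumption for illustration only.

```
// Sketch only; Claims field names are assumed, not taken from this diff.
let scrubber_claims = Claims { scope: Scope::Scrubber, tenant_id: None };
assert!(check_permission(&scrubber_claims, None).is_err());

let api_claims = Claims { scope: Scope::PageServerApi, tenant_id: None };
assert!(check_permission(&api_claims, None).is_ok());
```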
@@ -385,7 +385,7 @@ fn start_pageserver(
|
||||
let shutdown_pageserver = tokio_util::sync::CancellationToken::new();
|
||||
|
||||
// Set up remote storage client
|
||||
let remote_storage = create_remote_storage_client(conf)?;
|
||||
let remote_storage = BACKGROUND_RUNTIME.block_on(create_remote_storage_client(conf))?;
|
||||
|
||||
// Set up deletion queue
|
||||
let (deletion_queue, deletion_workers) = DeletionQueue::new(
|
||||
@@ -622,7 +622,6 @@ fn start_pageserver(
|
||||
metric_collection_endpoint,
|
||||
&conf.metric_collection_bucket,
|
||||
conf.metric_collection_interval,
|
||||
conf.cached_metric_collection_interval,
|
||||
conf.synthetic_size_calculation_interval,
|
||||
conf.id,
|
||||
local_disk_storage,
|
||||
@@ -702,7 +701,7 @@ fn start_pageserver(
|
||||
}
|
||||
}
|
||||
|
||||
fn create_remote_storage_client(
|
||||
async fn create_remote_storage_client(
|
||||
conf: &'static PageServerConf,
|
||||
) -> anyhow::Result<GenericRemoteStorage> {
|
||||
let config = if let Some(config) = &conf.remote_storage_config {
|
||||
@@ -712,7 +711,7 @@ fn create_remote_storage_client(
|
||||
};
|
||||
|
||||
// Create the client
|
||||
let mut remote_storage = GenericRemoteStorage::from_config(config)?;
|
||||
let mut remote_storage = GenericRemoteStorage::from_config(config).await?;
|
||||
|
||||
// If `test_remote_failures` is non-zero, wrap the client with a
|
||||
// wrapper that simulates failures.
|
||||
|
||||
@@ -68,7 +68,6 @@ pub mod defaults {
|
||||
super::ConfigurableSemaphore::DEFAULT_INITIAL.get();
|
||||
|
||||
pub const DEFAULT_METRIC_COLLECTION_INTERVAL: &str = "10 min";
|
||||
pub const DEFAULT_CACHED_METRIC_COLLECTION_INTERVAL: &str = "0s";
|
||||
pub const DEFAULT_METRIC_COLLECTION_ENDPOINT: Option<reqwest::Url> = None;
|
||||
pub const DEFAULT_SYNTHETIC_SIZE_CALCULATION_INTERVAL: &str = "10 min";
|
||||
pub const DEFAULT_BACKGROUND_TASK_MAXIMUM_DELAY: &str = "10s";
|
||||
@@ -123,7 +122,6 @@ pub mod defaults {
|
||||
#concurrent_tenant_warmup = '{DEFAULT_CONCURRENT_TENANT_WARMUP}'
|
||||
|
||||
#metric_collection_interval = '{DEFAULT_METRIC_COLLECTION_INTERVAL}'
|
||||
#cached_metric_collection_interval = '{DEFAULT_CACHED_METRIC_COLLECTION_INTERVAL}'
|
||||
#synthetic_size_calculation_interval = '{DEFAULT_SYNTHETIC_SIZE_CALCULATION_INTERVAL}'
|
||||
|
||||
#disk_usage_based_eviction = {{ max_usage_pct = .., min_avail_bytes = .., period = "10s"}}
|
||||
@@ -238,7 +236,6 @@ pub struct PageServerConf {
|
||||
// How often to collect metrics and send them to the metrics endpoint.
|
||||
pub metric_collection_interval: Duration,
|
||||
// How often to send unchanged cached metrics to the metrics endpoint.
|
||||
pub cached_metric_collection_interval: Duration,
|
||||
pub metric_collection_endpoint: Option<Url>,
|
||||
pub metric_collection_bucket: Option<RemoteStorageConfig>,
|
||||
pub synthetic_size_calculation_interval: Duration,
|
||||
@@ -370,7 +367,6 @@ struct PageServerConfigBuilder {
|
||||
concurrent_tenant_size_logical_size_queries: BuilderValue<NonZeroUsize>,
|
||||
|
||||
metric_collection_interval: BuilderValue<Duration>,
|
||||
cached_metric_collection_interval: BuilderValue<Duration>,
|
||||
metric_collection_endpoint: BuilderValue<Option<Url>>,
|
||||
synthetic_size_calculation_interval: BuilderValue<Duration>,
|
||||
metric_collection_bucket: BuilderValue<Option<RemoteStorageConfig>>,
|
||||
@@ -454,10 +450,6 @@ impl PageServerConfigBuilder {
|
||||
DEFAULT_METRIC_COLLECTION_INTERVAL,
|
||||
)
|
||||
.expect("cannot parse default metric collection interval")),
|
||||
cached_metric_collection_interval: Set(humantime::parse_duration(
|
||||
DEFAULT_CACHED_METRIC_COLLECTION_INTERVAL,
|
||||
)
|
||||
.expect("cannot parse default cached_metric_collection_interval")),
|
||||
synthetic_size_calculation_interval: Set(humantime::parse_duration(
|
||||
DEFAULT_SYNTHETIC_SIZE_CALCULATION_INTERVAL,
|
||||
)
|
||||
@@ -589,14 +581,6 @@ impl PageServerConfigBuilder {
|
||||
self.metric_collection_interval = BuilderValue::Set(metric_collection_interval)
|
||||
}
|
||||
|
||||
pub fn cached_metric_collection_interval(
|
||||
&mut self,
|
||||
cached_metric_collection_interval: Duration,
|
||||
) {
|
||||
self.cached_metric_collection_interval =
|
||||
BuilderValue::Set(cached_metric_collection_interval)
|
||||
}
|
||||
|
||||
pub fn metric_collection_endpoint(&mut self, metric_collection_endpoint: Option<Url>) {
|
||||
self.metric_collection_endpoint = BuilderValue::Set(metric_collection_endpoint)
|
||||
}
|
||||
@@ -730,7 +714,6 @@ impl PageServerConfigBuilder {
|
||||
broker_keepalive_interval,
|
||||
log_format,
|
||||
metric_collection_interval,
|
||||
cached_metric_collection_interval,
|
||||
metric_collection_endpoint,
|
||||
metric_collection_bucket,
|
||||
synthetic_size_calculation_interval,
|
||||
@@ -947,7 +930,6 @@ impl PageServerConf {
|
||||
NonZeroUsize::new(permits).context("initial semaphore permits out of range: 0, use other configuration to disable a feature")?
|
||||
}),
|
||||
"metric_collection_interval" => builder.metric_collection_interval(parse_toml_duration(key, item)?),
|
||||
"cached_metric_collection_interval" => builder.cached_metric_collection_interval(parse_toml_duration(key, item)?),
|
||||
"metric_collection_endpoint" => {
|
||||
let endpoint = parse_toml_string(key, item)?.parse().context("failed to parse metric_collection_endpoint")?;
|
||||
builder.metric_collection_endpoint(Some(endpoint));
|
||||
@@ -1080,7 +1062,6 @@ impl PageServerConf {
|
||||
eviction_task_immitated_concurrent_logical_size_queries: ConfigurableSemaphore::default(
|
||||
),
|
||||
metric_collection_interval: Duration::from_secs(60),
|
||||
cached_metric_collection_interval: Duration::from_secs(60 * 60),
|
||||
metric_collection_endpoint: defaults::DEFAULT_METRIC_COLLECTION_ENDPOINT,
|
||||
metric_collection_bucket: None,
|
||||
synthetic_size_calculation_interval: Duration::from_secs(60),
|
||||
@@ -1259,7 +1240,6 @@ initial_superuser_name = 'zzzz'
|
||||
id = 10
|
||||
|
||||
metric_collection_interval = '222 s'
|
||||
cached_metric_collection_interval = '22200 s'
|
||||
metric_collection_endpoint = 'http://localhost:80/metrics'
|
||||
synthetic_size_calculation_interval = '333 s'
|
||||
|
||||
@@ -1315,9 +1295,6 @@ background_task_maximum_delay = '334 s'
|
||||
metric_collection_interval: humantime::parse_duration(
|
||||
defaults::DEFAULT_METRIC_COLLECTION_INTERVAL
|
||||
)?,
|
||||
cached_metric_collection_interval: humantime::parse_duration(
|
||||
defaults::DEFAULT_CACHED_METRIC_COLLECTION_INTERVAL
|
||||
)?,
|
||||
metric_collection_endpoint: defaults::DEFAULT_METRIC_COLLECTION_ENDPOINT,
|
||||
metric_collection_bucket: None,
|
||||
synthetic_size_calculation_interval: humantime::parse_duration(
|
||||
@@ -1396,7 +1373,6 @@ background_task_maximum_delay = '334 s'
|
||||
eviction_task_immitated_concurrent_logical_size_queries:
|
||||
ConfigurableSemaphore::default(),
|
||||
metric_collection_interval: Duration::from_secs(222),
|
||||
cached_metric_collection_interval: Duration::from_secs(22200),
|
||||
metric_collection_endpoint: Some(Url::parse("http://localhost:80/metrics")?),
|
||||
metric_collection_bucket: None,
|
||||
synthetic_size_calculation_interval: Duration::from_secs(333),
|
||||
|
||||
@@ -46,19 +46,12 @@ pub async fn collect_metrics(
|
||||
metric_collection_endpoint: &Url,
|
||||
metric_collection_bucket: &Option<RemoteStorageConfig>,
|
||||
metric_collection_interval: Duration,
|
||||
_cached_metric_collection_interval: Duration,
|
||||
synthetic_size_calculation_interval: Duration,
|
||||
node_id: NodeId,
|
||||
local_disk_storage: Utf8PathBuf,
|
||||
cancel: CancellationToken,
|
||||
ctx: RequestContext,
|
||||
) -> anyhow::Result<()> {
|
||||
if _cached_metric_collection_interval != Duration::ZERO {
|
||||
tracing::warn!(
|
||||
"cached_metric_collection_interval is no longer used, please set it to zero."
|
||||
)
|
||||
}
|
||||
|
||||
// spin up background worker that caclulates tenant sizes
|
||||
let worker_ctx =
|
||||
ctx.detached_child(TaskKind::CalculateSyntheticSize, DownloadBehavior::Download);
|
||||
@@ -103,7 +96,7 @@ pub async fn collect_metrics(
|
||||
.expect("Failed to create http client with timeout");
|
||||
|
||||
let bucket_client = if let Some(bucket_config) = metric_collection_bucket {
|
||||
match GenericRemoteStorage::from_config(bucket_config) {
|
||||
match GenericRemoteStorage::from_config(bucket_config).await {
|
||||
Ok(client) => Some(client),
|
||||
Err(e) => {
|
||||
// Non-fatal error: if we were given an invalid config, we will proceed
|
||||
|
||||
@@ -828,9 +828,9 @@ mod test {
|
||||
}
|
||||
}
|
||||
|
||||
fn setup(test_name: &str) -> anyhow::Result<TestSetup> {
|
||||
async fn setup(test_name: &str) -> anyhow::Result<TestSetup> {
|
||||
let test_name = Box::leak(Box::new(format!("deletion_queue__{test_name}")));
|
||||
let harness = TenantHarness::create(test_name)?;
|
||||
let harness = TenantHarness::create(test_name).await?;
|
||||
|
||||
// We do not load() the harness: we only need its config and remote_storage
|
||||
|
||||
@@ -844,7 +844,9 @@ mod test {
|
||||
},
|
||||
timeout: RemoteStorageConfig::DEFAULT_TIMEOUT,
|
||||
};
|
||||
let storage = GenericRemoteStorage::from_config(&storage_config).unwrap();
|
||||
let storage = GenericRemoteStorage::from_config(&storage_config)
|
||||
.await
|
||||
.unwrap();
|
||||
|
||||
let mock_control_plane = MockControlPlane::new();
|
||||
|
||||
@@ -922,7 +924,9 @@ mod test {
|
||||
#[tokio::test]
|
||||
async fn deletion_queue_smoke() -> anyhow::Result<()> {
|
||||
// Basic test that the deletion queue processes the deletions we pass into it
|
||||
let ctx = setup("deletion_queue_smoke").expect("Failed test setup");
|
||||
let ctx = setup("deletion_queue_smoke")
|
||||
.await
|
||||
.expect("Failed test setup");
|
||||
let client = ctx.deletion_queue.new_client();
|
||||
client.recover(HashMap::new())?;
|
||||
|
||||
@@ -992,7 +996,9 @@ mod test {
|
||||
|
||||
#[tokio::test]
|
||||
async fn deletion_queue_validation() -> anyhow::Result<()> {
|
||||
let ctx = setup("deletion_queue_validation").expect("Failed test setup");
|
||||
let ctx = setup("deletion_queue_validation")
|
||||
.await
|
||||
.expect("Failed test setup");
|
||||
let client = ctx.deletion_queue.new_client();
|
||||
client.recover(HashMap::new())?;
|
||||
|
||||
@@ -1051,7 +1057,9 @@ mod test {
|
||||
#[tokio::test]
|
||||
async fn deletion_queue_recovery() -> anyhow::Result<()> {
|
||||
// Basic test that the deletion queue processes the deletions we pass into it
|
||||
let mut ctx = setup("deletion_queue_recovery").expect("Failed test setup");
|
||||
let mut ctx = setup("deletion_queue_recovery")
|
||||
.await
|
||||
.expect("Failed test setup");
|
||||
let client = ctx.deletion_queue.new_client();
|
||||
client.recover(HashMap::new())?;
|
||||
|
||||
|
||||
@@ -377,7 +377,7 @@ paths:
|
||||
schema:
|
||||
$ref: "#/components/schemas/ConflictError"
|
||||
|
||||
/v1/tenant/{tenant_id}/{timeline_id}/preserve_initdb_archive:
|
||||
/v1/tenant/{tenant_id}/timeline/{timeline_id}/preserve_initdb_archive:
|
||||
parameters:
|
||||
- name: tenant_id
|
||||
in: path
|
||||
@@ -397,6 +397,51 @@ paths:
|
||||
"202":
|
||||
description: Tenant scheduled to load successfully
|
||||
|
||||
/v1/tenant/{tenant_shard_id}/timeline/{timeline_id}/archival_config:
parameters:
- name: tenant_shard_id
in: path
required: true
schema:
type: string
- name: timeline_id
in: path
required: true
schema:
type: string
put:
description: |
Either archives or unarchives the given timeline.
An archived timeline may not have any non-archived children.
requestBody:
required: false
content:
application/json:
schema:
$ref: "#/components/schemas/ArchivalConfigRequest"
responses:
"200":
description: Timeline (un)archived successfully
"409":
description: |
The tenant/timeline is already being modified, perhaps by a concurrent call to this API
content:
application/json:
schema:
$ref: "#/components/schemas/ConflictError"
"500":
description: Generic operation error
content:
application/json:
schema:
$ref: "#/components/schemas/Error"
"503":
description: Temporarily unavailable, please retry.
content:
application/json:
schema:
$ref: "#/components/schemas/ServiceUnavailableError"
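To make the new endpoint concrete, here is a hedged sketch of a caller. It assumes only what the OpenAPI fragment above states (a PUT with an `ArchivalConfigRequest` body) plus reqwest built with its `json` feature; the endpoint host and ids are placeholders.

```
use reqwest::Client;
use serde_json::json;

// Sketch: archive a timeline via the archival_config endpoint described above.
async fn archive_timeline(
    mgmt_api_endpoint: &str,
    tenant_shard_id: &str,
    timeline_id: &str,
) -> anyhow::Result<()> {
    let uri = format!(
        "{mgmt_api_endpoint}/v1/tenant/{tenant_shard_id}/timeline/{timeline_id}/archival_config"
    );
    let resp = Client::new()
        .put(&uri)
        .json(&json!({ "state": "Archived" }))
        .send()
        .await?;
    anyhow::ensure!(
        resp.status().is_success(),
        "archival_config failed with status {}",
        resp.status()
    );
    Ok(())
}
```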
/v1/tenant/{tenant_id}/synthetic_size:
|
||||
parameters:
|
||||
- name: tenant_id
|
||||
@@ -429,7 +474,9 @@ paths:
|
||||
schema:
|
||||
$ref: "#/components/schemas/SyntheticSizeResponse"
|
||||
text/html:
|
||||
description: SVG representation of the tenant and it's timelines.
|
||||
schema:
|
||||
type: string
|
||||
description: SVG representation of the tenant and its timelines.
|
||||
"401":
|
||||
description: Unauthorized Error
|
||||
content:
|
||||
@@ -568,7 +615,7 @@ paths:
|
||||
type: string
|
||||
- name: timeline_id
|
||||
in: path
|
||||
ŕequired: true
|
||||
required: true
|
||||
schema:
|
||||
type: string
|
||||
|
||||
@@ -774,15 +821,13 @@ components:
|
||||
TenantCreateRequest:
|
||||
allOf:
|
||||
- $ref: '#/components/schemas/TenantConfig'
|
||||
- $ref: '#/components/schemas/TenantLoadRequest'
|
||||
- type: object
|
||||
required:
|
||||
- new_tenant_id
|
||||
properties:
|
||||
new_tenant_id:
|
||||
type: string
|
||||
generation:
|
||||
type: integer
|
||||
description: Attachment generation number.
|
||||
TenantLoadRequest:
|
||||
type: object
|
||||
properties:
|
||||
@@ -846,6 +891,15 @@ components:
warm:
type: boolean
description: Whether to poll remote storage for layers to download. If false, secondary locations don't download anything.
ArchivalConfigRequest:
type: object
required:
- state
properties:
state:
description: The archival state of a timeline
type: string
enum: ["Archived", "Unarchived"]
TenantConfig:
type: object
properties:
@@ -1106,7 +1160,7 @@ components:
|
||||
reparented_timelines:
|
||||
type: array
|
||||
description: Set of reparented timeline ids
|
||||
properties:
|
||||
items:
|
||||
type: string
|
||||
format: hex
|
||||
description: TimelineId
|
||||
|
||||
@@ -18,14 +18,17 @@ use hyper::StatusCode;
|
||||
use hyper::{Body, Request, Response, Uri};
|
||||
use metrics::launch_timestamp::LaunchTimestamp;
|
||||
use pageserver_api::models::AuxFilePolicy;
|
||||
use pageserver_api::models::DownloadRemoteLayersTaskSpawnRequest;
|
||||
use pageserver_api::models::IngestAuxFilesRequest;
|
||||
use pageserver_api::models::ListAuxFilesRequest;
|
||||
use pageserver_api::models::LocationConfig;
|
||||
use pageserver_api::models::LocationConfigListResponse;
|
||||
use pageserver_api::models::LocationConfigMode;
|
||||
use pageserver_api::models::LsnLease;
|
||||
use pageserver_api::models::LsnLeaseRequest;
|
||||
use pageserver_api::models::ShardParameters;
|
||||
use pageserver_api::models::TenantDetails;
|
||||
use pageserver_api::models::TenantLocationConfigRequest;
|
||||
use pageserver_api::models::TenantLocationConfigResponse;
|
||||
use pageserver_api::models::TenantScanRemoteStorageResponse;
|
||||
use pageserver_api::models::TenantScanRemoteStorageShard;
|
||||
@@ -33,12 +36,10 @@ use pageserver_api::models::TenantShardLocation;
|
||||
use pageserver_api::models::TenantShardSplitRequest;
|
||||
use pageserver_api::models::TenantShardSplitResponse;
|
||||
use pageserver_api::models::TenantSorting;
|
||||
use pageserver_api::models::TimelineArchivalConfigRequest;
|
||||
use pageserver_api::models::TopTenantShardItem;
|
||||
use pageserver_api::models::TopTenantShardsRequest;
|
||||
use pageserver_api::models::TopTenantShardsResponse;
|
||||
use pageserver_api::models::{
|
||||
DownloadRemoteLayersTaskSpawnRequest, LocationConfigMode, TenantLocationConfigRequest,
|
||||
};
|
||||
use pageserver_api::shard::ShardCount;
|
||||
use pageserver_api::shard::TenantShardId;
|
||||
use remote_storage::DownloadError;
|
||||
@@ -664,6 +665,39 @@ async fn timeline_preserve_initdb_handler(
|
||||
json_response(StatusCode::OK, ())
|
||||
}
|
||||
|
||||
async fn timeline_archival_config_handler(
mut request: Request<Body>,
_cancel: CancellationToken,
) -> Result<Response<Body>, ApiError> {
let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?;
let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?;

let request_data: TimelineArchivalConfigRequest = json_request(&mut request).await?;
check_permission(&request, Some(tenant_shard_id.tenant_id))?;
let state = get_state(&request);

async {
let tenant = state
.tenant_manager
.get_attached_tenant_shard(tenant_shard_id)?;

tenant
.apply_timeline_archival_config(timeline_id, request_data.state)
.await
.context("applying archival config")
.map_err(ApiError::InternalServerError)?;
Ok::<_, ApiError>(())
}
.instrument(info_span!("timeline_archival_config",
tenant_id = %tenant_shard_id.tenant_id,
shard_id = %tenant_shard_id.shard_slug(),
state = ?request_data.state,
%timeline_id))
.await?;

json_response(StatusCode::OK, ())
}
async fn timeline_detail_handler(
|
||||
request: Request<Body>,
|
||||
_cancel: CancellationToken,
|
||||
@@ -1721,7 +1755,9 @@ async fn timeline_detach_ancestor_handler(
|
||||
request: Request<Body>,
|
||||
_cancel: CancellationToken,
|
||||
) -> Result<Response<Body>, ApiError> {
|
||||
use crate::tenant::timeline::detach_ancestor::Options;
|
||||
use crate::tenant::timeline::detach_ancestor;
|
||||
use pageserver_api::models::detach_ancestor::AncestorDetached;
|
||||
|
||||
let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?;
|
||||
check_permission(&request, Some(tenant_shard_id.tenant_id))?;
|
||||
let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?;
|
||||
@@ -1729,7 +1765,7 @@ async fn timeline_detach_ancestor_handler(
|
||||
let span = tracing::info_span!("detach_ancestor", tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(), %timeline_id);
|
||||
|
||||
async move {
|
||||
let mut options = Options::default();
|
||||
let mut options = detach_ancestor::Options::default();
|
||||
|
||||
let rewrite_concurrency =
|
||||
parse_query_param::<_, std::num::NonZeroUsize>(&request, "rewrite_concurrency")?;
|
||||
@@ -1757,27 +1793,36 @@ async fn timeline_detach_ancestor_handler(
|
||||
|
||||
let timeline = tenant.get_timeline(timeline_id, true)?;
|
||||
|
||||
let (_guard, prepared) = timeline
|
||||
let progress = timeline
|
||||
.prepare_to_detach_from_ancestor(&tenant, options, ctx)
|
||||
.await?;
|
||||
|
||||
let res = state
|
||||
.tenant_manager
|
||||
.complete_detaching_timeline_ancestor(tenant_shard_id, timeline_id, prepared, ctx)
|
||||
.await;
|
||||
// uncomment to allow early as possible Tenant::drop
|
||||
// drop(tenant);
|
||||
|
||||
match res {
|
||||
Ok(reparented_timelines) => {
|
||||
let resp = pageserver_api::models::detach_ancestor::AncestorDetached {
|
||||
let resp = match progress {
|
||||
detach_ancestor::Progress::Prepared(_guard, prepared) => {
|
||||
// it would be great to tag the guard on to the tenant activation future
|
||||
let reparented_timelines = state
|
||||
.tenant_manager
|
||||
.complete_detaching_timeline_ancestor(
|
||||
tenant_shard_id,
|
||||
timeline_id,
|
||||
prepared,
|
||||
ctx,
|
||||
)
|
||||
.await
|
||||
.context("timeline detach ancestor completion")
|
||||
.map_err(ApiError::InternalServerError)?;
|
||||
|
||||
AncestorDetached {
|
||||
reparented_timelines,
|
||||
};
|
||||
|
||||
json_response(StatusCode::OK, resp)
|
||||
}
|
||||
}
|
||||
Err(e) => Err(ApiError::InternalServerError(
|
||||
e.context("timeline detach completion"),
|
||||
)),
|
||||
}
|
||||
detach_ancestor::Progress::Done(resp) => resp,
|
||||
};
|
||||
|
||||
json_response(StatusCode::OK, resp)
|
||||
}
|
||||
.instrument(span)
|
||||
.await
|
||||
@@ -2778,6 +2823,10 @@ pub fn make_router(
|
||||
"/v1/tenant/:tenant_shard_id/timeline/:timeline_id/preserve_initdb_archive",
|
||||
|r| api_handler(r, timeline_preserve_initdb_handler),
|
||||
)
|
||||
.post(
|
||||
"/v1/tenant/:tenant_shard_id/timeline/:timeline_id/archival_config",
|
||||
|r| api_handler(r, timeline_archival_config_handler),
|
||||
)
|
||||
.get("/v1/tenant/:tenant_shard_id/timeline/:timeline_id", |r| {
|
||||
api_handler(r, timeline_detail_handler)
|
||||
})
|
||||
|
||||
@@ -473,6 +473,31 @@ static PITR_HISTORY_SIZE: Lazy<UIntGaugeVec> = Lazy::new(|| {
|
||||
.expect("failed to define a metric")
|
||||
});
|
||||
|
||||
#[derive(strum_macros::EnumString, strum_macros::Display, strum_macros::IntoStaticStr)]
#[strum(serialize_all = "kebab_case")]
pub(crate) enum MetricLayerKind {
Delta,
Image,
}

static TIMELINE_LAYER_SIZE: Lazy<UIntGaugeVec> = Lazy::new(|| {
register_uint_gauge_vec!(
"pageserver_layer_bytes",
"Sum of layer physical sizes in bytes",
&["tenant_id", "shard_id", "timeline_id", "kind"]
)
.expect("failed to define a metric")
});

static TIMELINE_LAYER_COUNT: Lazy<UIntGaugeVec> = Lazy::new(|| {
register_uint_gauge_vec!(
"pageserver_layer_count",
"Number of layers that exist",
&["tenant_id", "shard_id", "timeline_id", "kind"]
)
.expect("failed to define a metric")
});
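As a rough illustration of how these per-kind gauge vectors are consumed, the sketch below bumps the image-layer handles that the `TimelineMetrics` struct later in this diff resolves from them. The helper function itself is hypothetical and not part of the change.

```
// Hypothetical helper: record a freshly written image layer against the
// per-timeline handles resolved from TIMELINE_LAYER_SIZE / TIMELINE_LAYER_COUNT.
fn on_image_layer_written(metrics: &TimelineMetrics, layer_bytes: u64) {
    metrics.layer_size_image.add(layer_bytes);
    metrics.layer_count_image.inc();
}
```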
static TIMELINE_ARCHIVE_SIZE: Lazy<UIntGaugeVec> = Lazy::new(|| {
|
||||
register_uint_gauge_vec!(
|
||||
"pageserver_archive_size",
|
||||
@@ -585,6 +610,22 @@ pub(crate) static CIRCUIT_BREAKERS_UNBROKEN: Lazy<IntCounter> = Lazy::new(|| {
|
||||
.expect("failed to define a metric")
|
||||
});
|
||||
|
||||
pub(crate) static COMPRESSION_IMAGE_INPUT_BYTES: Lazy<IntCounter> = Lazy::new(|| {
|
||||
register_int_counter!(
|
||||
"pageserver_compression_image_in_bytes_total",
|
||||
"Size of uncompressed data written into image layers"
|
||||
)
|
||||
.expect("failed to define a metric")
|
||||
});
|
||||
|
||||
pub(crate) static COMPRESSION_IMAGE_OUTPUT_BYTES: Lazy<IntCounter> = Lazy::new(|| {
|
||||
register_int_counter!(
|
||||
"pageserver_compression_image_out_bytes_total",
|
||||
"Size of compressed image layer written"
|
||||
)
|
||||
.expect("failed to define a metric")
|
||||
});
|
||||
|
||||
pub(crate) mod initial_logical_size {
|
||||
use metrics::{register_int_counter, register_int_counter_vec, IntCounter, IntCounterVec};
|
||||
use once_cell::sync::Lazy;
|
||||
@@ -1490,7 +1531,6 @@ pub(crate) enum ComputeCommandKind {
|
||||
Basebackup,
|
||||
Fullbackup,
|
||||
LeaseLsn,
|
||||
Show,
|
||||
}
|
||||
|
||||
pub(crate) struct ComputeCommandCounters {
|
||||
@@ -2142,6 +2182,10 @@ pub(crate) struct TimelineMetrics {
|
||||
pub last_record_gauge: IntGauge,
|
||||
pub pitr_history_size: UIntGauge,
|
||||
pub archival_size: UIntGauge,
|
||||
pub(crate) layer_size_image: UIntGauge,
|
||||
pub(crate) layer_count_image: UIntGauge,
|
||||
pub(crate) layer_size_delta: UIntGauge,
|
||||
pub(crate) layer_count_delta: UIntGauge,
|
||||
pub standby_horizon_gauge: IntGauge,
|
||||
pub resident_physical_size_gauge: UIntGauge,
|
||||
/// copy of LayeredTimeline.current_logical_size
|
||||
@@ -2224,6 +2268,42 @@ impl TimelineMetrics {
|
||||
.get_metric_with_label_values(&[&tenant_id, &shard_id, &timeline_id])
|
||||
.unwrap();
|
||||
|
||||
let layer_size_image = TIMELINE_LAYER_SIZE
|
||||
.get_metric_with_label_values(&[
|
||||
&tenant_id,
|
||||
&shard_id,
|
||||
&timeline_id,
|
||||
MetricLayerKind::Image.into(),
|
||||
])
|
||||
.unwrap();
|
||||
|
||||
let layer_count_image = TIMELINE_LAYER_COUNT
|
||||
.get_metric_with_label_values(&[
|
||||
&tenant_id,
|
||||
&shard_id,
|
||||
&timeline_id,
|
||||
MetricLayerKind::Image.into(),
|
||||
])
|
||||
.unwrap();
|
||||
|
||||
let layer_size_delta = TIMELINE_LAYER_SIZE
|
||||
.get_metric_with_label_values(&[
|
||||
&tenant_id,
|
||||
&shard_id,
|
||||
&timeline_id,
|
||||
MetricLayerKind::Delta.into(),
|
||||
])
|
||||
.unwrap();
|
||||
|
||||
let layer_count_delta = TIMELINE_LAYER_COUNT
|
||||
.get_metric_with_label_values(&[
|
||||
&tenant_id,
|
||||
&shard_id,
|
||||
&timeline_id,
|
||||
MetricLayerKind::Delta.into(),
|
||||
])
|
||||
.unwrap();
|
||||
|
||||
let standby_horizon_gauge = STANDBY_HORIZON
|
||||
.get_metric_with_label_values(&[&tenant_id, &shard_id, &timeline_id])
|
||||
.unwrap();
|
||||
@@ -2278,6 +2358,10 @@ impl TimelineMetrics {
|
||||
last_record_gauge,
|
||||
pitr_history_size,
|
||||
archival_size,
|
||||
layer_size_image,
|
||||
layer_count_image,
|
||||
layer_size_delta,
|
||||
layer_count_delta,
|
||||
standby_horizon_gauge,
|
||||
resident_physical_size_gauge,
|
||||
current_logical_size_gauge,
|
||||
@@ -2339,6 +2423,31 @@ impl TimelineMetrics {
|
||||
let _ = TIMELINE_ARCHIVE_SIZE.remove_label_values(&[tenant_id, shard_id, timeline_id]);
|
||||
let _ = PITR_HISTORY_SIZE.remove_label_values(&[tenant_id, shard_id, timeline_id]);
|
||||
|
||||
let _ = TIMELINE_LAYER_SIZE.remove_label_values(&[
|
||||
tenant_id,
|
||||
shard_id,
|
||||
timeline_id,
|
||||
MetricLayerKind::Image.into(),
|
||||
]);
|
||||
let _ = TIMELINE_LAYER_COUNT.remove_label_values(&[
|
||||
tenant_id,
|
||||
shard_id,
|
||||
timeline_id,
|
||||
MetricLayerKind::Image.into(),
|
||||
]);
|
||||
let _ = TIMELINE_LAYER_SIZE.remove_label_values(&[
|
||||
tenant_id,
|
||||
shard_id,
|
||||
timeline_id,
|
||||
MetricLayerKind::Delta.into(),
|
||||
]);
|
||||
let _ = TIMELINE_LAYER_COUNT.remove_label_values(&[
|
||||
tenant_id,
|
||||
shard_id,
|
||||
timeline_id,
|
||||
MetricLayerKind::Delta.into(),
|
||||
]);
|
||||
|
||||
let _ = EVICTIONS.remove_label_values(&[tenant_id, shard_id, timeline_id]);
|
||||
let _ = AUX_FILE_SIZE.remove_label_values(&[tenant_id, shard_id, timeline_id]);
|
||||
let _ = VALID_LSN_LEASE_COUNT.remove_label_values(&[tenant_id, shard_id, timeline_id]);
|
||||
|
||||
@@ -1479,66 +1479,6 @@ where
|
||||
))?
|
||||
}
|
||||
};
|
||||
} else if let Some(params) = parts.strip_prefix(&["show"]) {
|
||||
// show <tenant_id>
|
||||
if params.len() != 1 {
|
||||
return Err(QueryError::Other(anyhow::anyhow!(
|
||||
"invalid param number for config command"
|
||||
)));
|
||||
}
|
||||
let tenant_id = TenantId::from_str(params[0])
|
||||
.with_context(|| format!("Failed to parse tenant id from {}", params[0]))?;
|
||||
|
||||
tracing::Span::current().record("tenant_id", field::display(tenant_id));
|
||||
|
||||
self.check_permission(Some(tenant_id))?;
|
||||
|
||||
COMPUTE_COMMANDS_COUNTERS
|
||||
.for_command(ComputeCommandKind::Show)
|
||||
.inc();
|
||||
|
||||
let tenant = self
|
||||
.get_active_tenant_with_timeout(
|
||||
tenant_id,
|
||||
ShardSelector::Zero,
|
||||
ACTIVE_TENANT_TIMEOUT,
|
||||
)
|
||||
.await?;
|
||||
pgb.write_message_noflush(&BeMessage::RowDescription(&[
|
||||
RowDescriptor::int8_col(b"checkpoint_distance"),
|
||||
RowDescriptor::int8_col(b"checkpoint_timeout"),
|
||||
RowDescriptor::int8_col(b"compaction_target_size"),
|
||||
RowDescriptor::int8_col(b"compaction_period"),
|
||||
RowDescriptor::int8_col(b"compaction_threshold"),
|
||||
RowDescriptor::int8_col(b"gc_horizon"),
|
||||
RowDescriptor::int8_col(b"gc_period"),
|
||||
RowDescriptor::int8_col(b"image_creation_threshold"),
|
||||
RowDescriptor::int8_col(b"pitr_interval"),
|
||||
]))?
|
||||
.write_message_noflush(&BeMessage::DataRow(&[
|
||||
Some(tenant.get_checkpoint_distance().to_string().as_bytes()),
|
||||
Some(
|
||||
tenant
|
||||
.get_checkpoint_timeout()
|
||||
.as_secs()
|
||||
.to_string()
|
||||
.as_bytes(),
|
||||
),
|
||||
Some(tenant.get_compaction_target_size().to_string().as_bytes()),
|
||||
Some(
|
||||
tenant
|
||||
.get_compaction_period()
|
||||
.as_secs()
|
||||
.to_string()
|
||||
.as_bytes(),
|
||||
),
|
||||
Some(tenant.get_compaction_threshold().to_string().as_bytes()),
|
||||
Some(tenant.get_gc_horizon().to_string().as_bytes()),
|
||||
Some(tenant.get_gc_period().as_secs().to_string().as_bytes()),
|
||||
Some(tenant.get_image_creation_threshold().to_string().as_bytes()),
|
||||
Some(tenant.get_pitr_interval().as_secs().to_string().as_bytes()),
|
||||
]))?
|
||||
.write_message_noflush(&BeMessage::CommandComplete(b"SELECT 1"))?;
|
||||
} else {
|
||||
return Err(QueryError::Other(anyhow::anyhow!(
|
||||
"unknown command {query_string}"
|
||||
|
||||
@@ -2031,7 +2031,7 @@ mod tests {
|
||||
#[tokio::test]
|
||||
async fn aux_files_round_trip() -> anyhow::Result<()> {
|
||||
let name = "aux_files_round_trip";
|
||||
let harness = TenantHarness::create(name)?;
|
||||
let harness = TenantHarness::create(name).await?;
|
||||
|
||||
pub const TIMELINE_ID: TimelineId =
|
||||
TimelineId::from_array(hex!("11223344556677881122334455667788"));
|
||||
|
||||
@@ -21,6 +21,7 @@ use futures::FutureExt;
|
||||
use futures::StreamExt;
|
||||
use pageserver_api::models;
|
||||
use pageserver_api::models::AuxFilePolicy;
|
||||
use pageserver_api::models::TimelineArchivalState;
|
||||
use pageserver_api::models::TimelineState;
|
||||
use pageserver_api::models::TopTenantShardItem;
|
||||
use pageserver_api::models::WalRedoManagerStatus;
|
||||
@@ -1228,6 +1229,14 @@ impl Tenant {
|
||||
Ok(timeline_preloads)
|
||||
}
|
||||
|
||||
pub async fn apply_timeline_archival_config(
|
||||
&self,
|
||||
_timeline_id: TimelineId,
|
||||
_config: TimelineArchivalState,
|
||||
) -> anyhow::Result<()> {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub(crate) fn tenant_shard_id(&self) -> TenantShardId {
|
||||
self.tenant_shard_id
|
||||
}
|
||||
@@ -2912,7 +2921,7 @@ impl Tenant {
|
||||
if let Some(ancestor_id) = timeline.get_ancestor_timeline_id() {
|
||||
if let Some(ancestor_gc_cutoffs) = gc_cutoffs.get(&ancestor_id) {
|
||||
target.within_ancestor_pitr =
|
||||
timeline.get_ancestor_lsn() >= ancestor_gc_cutoffs.pitr;
|
||||
timeline.get_ancestor_lsn() >= ancestor_gc_cutoffs.time;
|
||||
}
|
||||
}
|
||||
|
||||
@@ -2928,7 +2937,7 @@ impl Tenant {
|
||||
timeline.metrics.pitr_history_size.set(
|
||||
timeline
|
||||
.get_last_record_lsn()
|
||||
.checked_sub(target.cutoffs.pitr)
|
||||
.checked_sub(target.cutoffs.time)
|
||||
.unwrap_or(Lsn(0))
|
||||
.0,
|
||||
);
|
||||
@@ -3788,7 +3797,7 @@ pub(crate) mod harness {
|
||||
}
|
||||
|
||||
impl TenantHarness {
|
||||
pub fn create_custom(
|
||||
pub async fn create_custom(
|
||||
test_name: &'static str,
|
||||
tenant_conf: TenantConf,
|
||||
tenant_id: TenantId,
|
||||
@@ -3824,7 +3833,7 @@ pub(crate) mod harness {
|
||||
},
|
||||
timeout: RemoteStorageConfig::DEFAULT_TIMEOUT,
|
||||
};
|
||||
let remote_storage = GenericRemoteStorage::from_config(&config).unwrap();
|
||||
let remote_storage = GenericRemoteStorage::from_config(&config).await.unwrap();
|
||||
let deletion_queue = MockDeletionQueue::new(Some(remote_storage.clone()));
|
||||
|
||||
Ok(Self {
|
||||
@@ -3839,7 +3848,7 @@ pub(crate) mod harness {
|
||||
})
|
||||
}
|
||||
|
||||
pub fn create(test_name: &'static str) -> anyhow::Result<Self> {
|
||||
pub async fn create(test_name: &'static str) -> anyhow::Result<Self> {
|
||||
// Disable automatic GC and compaction to make the unit tests more deterministic.
|
||||
// The tests perform them manually if needed.
|
||||
let tenant_conf = TenantConf {
|
||||
@@ -3856,6 +3865,7 @@ pub(crate) mod harness {
|
||||
shard,
|
||||
Generation::new(0xdeadbeef),
|
||||
)
|
||||
.await
|
||||
}
|
||||
|
||||
pub fn span(&self) -> tracing::Span {
|
||||
@@ -3992,7 +4002,7 @@ mod tests {
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_basic() -> anyhow::Result<()> {
|
||||
let (tenant, ctx) = TenantHarness::create("test_basic")?.load().await;
|
||||
let (tenant, ctx) = TenantHarness::create("test_basic").await?.load().await;
|
||||
let tline = tenant
|
||||
.create_test_timeline(TIMELINE_ID, Lsn(0x08), DEFAULT_PG_VERSION, &ctx)
|
||||
.await?;
|
||||
@@ -4039,7 +4049,8 @@ mod tests {
|
||||
|
||||
#[tokio::test]
|
||||
async fn no_duplicate_timelines() -> anyhow::Result<()> {
|
||||
let (tenant, ctx) = TenantHarness::create("no_duplicate_timelines")?
|
||||
let (tenant, ctx) = TenantHarness::create("no_duplicate_timelines")
|
||||
.await?
|
||||
.load()
|
||||
.await;
|
||||
let _ = tenant
|
||||
@@ -4071,7 +4082,7 @@ mod tests {
|
||||
async fn test_branch() -> anyhow::Result<()> {
|
||||
use std::str::from_utf8;
|
||||
|
||||
let (tenant, ctx) = TenantHarness::create("test_branch")?.load().await;
|
||||
let (tenant, ctx) = TenantHarness::create("test_branch").await?.load().await;
|
||||
let tline = tenant
|
||||
.create_test_timeline(TIMELINE_ID, Lsn(0x10), DEFAULT_PG_VERSION, &ctx)
|
||||
.await?;
|
||||
@@ -4193,7 +4204,8 @@ mod tests {
|
||||
#[tokio::test]
|
||||
async fn test_prohibit_branch_creation_on_garbage_collected_data() -> anyhow::Result<()> {
|
||||
let (tenant, ctx) =
|
||||
TenantHarness::create("test_prohibit_branch_creation_on_garbage_collected_data")?
|
||||
TenantHarness::create("test_prohibit_branch_creation_on_garbage_collected_data")
|
||||
.await?
|
||||
.load()
|
||||
.await;
|
||||
let tline = tenant
|
||||
@@ -4240,7 +4252,8 @@ mod tests {
|
||||
#[tokio::test]
|
||||
async fn test_prohibit_branch_creation_on_pre_initdb_lsn() -> anyhow::Result<()> {
|
||||
let (tenant, ctx) =
|
||||
TenantHarness::create("test_prohibit_branch_creation_on_pre_initdb_lsn")?
|
||||
TenantHarness::create("test_prohibit_branch_creation_on_pre_initdb_lsn")
|
||||
.await?
|
||||
.load()
|
||||
.await;
|
||||
|
||||
@@ -4262,7 +4275,7 @@ mod tests {
|
||||
.source()
|
||||
.unwrap()
|
||||
.to_string()
|
||||
.contains("is earlier than latest GC horizon"));
|
||||
.contains("is earlier than latest GC cutoff"));
|
||||
}
|
||||
}
|
||||
|
||||
@@ -4295,7 +4308,8 @@ mod tests {
|
||||
#[tokio::test]
|
||||
async fn test_get_branchpoints_from_an_inactive_timeline() -> anyhow::Result<()> {
|
||||
let (tenant, ctx) =
|
||||
TenantHarness::create("test_get_branchpoints_from_an_inactive_timeline")?
|
||||
TenantHarness::create("test_get_branchpoints_from_an_inactive_timeline")
|
||||
.await?
|
||||
.load()
|
||||
.await;
|
||||
let tline = tenant
|
||||
@@ -4352,7 +4366,8 @@ mod tests {
|
||||
#[tokio::test]
|
||||
async fn test_retain_data_in_parent_which_is_needed_for_child() -> anyhow::Result<()> {
|
||||
let (tenant, ctx) =
|
||||
TenantHarness::create("test_retain_data_in_parent_which_is_needed_for_child")?
|
||||
TenantHarness::create("test_retain_data_in_parent_which_is_needed_for_child")
|
||||
.await?
|
||||
.load()
|
||||
.await;
|
||||
let tline = tenant
|
||||
@@ -4382,10 +4397,10 @@ mod tests {
|
||||
}
|
||||
#[tokio::test]
|
||||
async fn test_parent_keeps_data_forever_after_branching() -> anyhow::Result<()> {
|
||||
let (tenant, ctx) =
|
||||
TenantHarness::create("test_parent_keeps_data_forever_after_branching")?
|
||||
.load()
|
||||
.await;
|
||||
let (tenant, ctx) = TenantHarness::create("test_parent_keeps_data_forever_after_branching")
|
||||
.await?
|
||||
.load()
|
||||
.await;
|
||||
let tline = tenant
|
||||
.create_test_timeline(TIMELINE_ID, Lsn(0x10), DEFAULT_PG_VERSION, &ctx)
|
||||
.await?;
|
||||
@@ -4423,7 +4438,7 @@ mod tests {
|
||||
#[tokio::test]
|
||||
async fn timeline_load() -> anyhow::Result<()> {
|
||||
const TEST_NAME: &str = "timeline_load";
|
||||
let harness = TenantHarness::create(TEST_NAME)?;
|
||||
let harness = TenantHarness::create(TEST_NAME).await?;
|
||||
{
|
||||
let (tenant, ctx) = harness.load().await;
|
||||
let tline = tenant
|
||||
@@ -4450,7 +4465,7 @@ mod tests {
|
||||
#[tokio::test]
|
||||
async fn timeline_load_with_ancestor() -> anyhow::Result<()> {
|
||||
const TEST_NAME: &str = "timeline_load_with_ancestor";
|
||||
let harness = TenantHarness::create(TEST_NAME)?;
|
||||
let harness = TenantHarness::create(TEST_NAME).await?;
|
||||
// create two timelines
|
||||
{
|
||||
let (tenant, ctx) = harness.load().await;
|
||||
@@ -4498,7 +4513,10 @@ mod tests {
|
||||
#[tokio::test]
|
||||
async fn delta_layer_dumping() -> anyhow::Result<()> {
|
||||
use storage_layer::AsLayerDesc;
|
||||
let (tenant, ctx) = TenantHarness::create("test_layer_dumping")?.load().await;
|
||||
let (tenant, ctx) = TenantHarness::create("test_layer_dumping")
|
||||
.await?
|
||||
.load()
|
||||
.await;
|
||||
let tline = tenant
|
||||
.create_test_timeline(TIMELINE_ID, Lsn(0x10), DEFAULT_PG_VERSION, &ctx)
|
||||
.await?;
|
||||
@@ -4525,7 +4543,7 @@ mod tests {
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_images() -> anyhow::Result<()> {
|
||||
let (tenant, ctx) = TenantHarness::create("test_images")?.load().await;
|
||||
let (tenant, ctx) = TenantHarness::create("test_images").await?.load().await;
|
||||
let tline = tenant
|
||||
.create_test_timeline(TIMELINE_ID, Lsn(0x08), DEFAULT_PG_VERSION, &ctx)
|
||||
.await?;
|
||||
@@ -4696,7 +4714,7 @@ mod tests {
|
||||
//
|
||||
#[tokio::test]
|
||||
async fn test_bulk_insert() -> anyhow::Result<()> {
|
||||
let harness = TenantHarness::create("test_bulk_insert")?;
|
||||
let harness = TenantHarness::create("test_bulk_insert").await?;
|
||||
let (tenant, ctx) = harness.load().await;
|
||||
let tline = tenant
|
||||
.create_test_timeline(TIMELINE_ID, Lsn(0x08), DEFAULT_PG_VERSION, &ctx)
|
||||
@@ -4727,7 +4745,7 @@ mod tests {
|
||||
// so the search can stop at the first delta layer and doesn't traverse any deeper.
|
||||
#[tokio::test]
|
||||
async fn test_get_vectored() -> anyhow::Result<()> {
|
||||
let harness = TenantHarness::create("test_get_vectored")?;
|
||||
let harness = TenantHarness::create("test_get_vectored").await?;
|
||||
let (tenant, ctx) = harness.load().await;
|
||||
let tline = tenant
|
||||
.create_test_timeline(TIMELINE_ID, Lsn(0x08), DEFAULT_PG_VERSION, &ctx)
|
||||
@@ -4805,7 +4823,7 @@ mod tests {
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_get_vectored_aux_files() -> anyhow::Result<()> {
|
||||
let harness = TenantHarness::create("test_get_vectored_aux_files")?;
|
||||
let harness = TenantHarness::create("test_get_vectored_aux_files").await?;
|
||||
|
||||
let (tenant, ctx) = harness.load().await;
|
||||
let tline = tenant
|
||||
@@ -4891,7 +4909,8 @@ mod tests {
|
||||
TenantId::generate(),
|
||||
ShardIdentity::unsharded(),
|
||||
Generation::new(0xdeadbeef),
|
||||
)?;
|
||||
)
|
||||
.await?;
|
||||
let (tenant, ctx) = harness.load().await;
|
||||
|
||||
let mut current_key = Key::from_hex("010000000033333333444444445500000000").unwrap();
|
||||
@@ -5034,7 +5053,7 @@ mod tests {
|
||||
// ```
|
||||
#[tokio::test]
|
||||
async fn test_get_vectored_ancestor_descent() -> anyhow::Result<()> {
|
||||
let harness = TenantHarness::create("test_get_vectored_on_lsn_axis")?;
|
||||
let harness = TenantHarness::create("test_get_vectored_on_lsn_axis").await?;
|
||||
let (tenant, ctx) = harness.load().await;
|
||||
|
||||
let start_key = Key::from_hex("010000000033333333444444445500000000").unwrap();
|
||||
@@ -5183,7 +5202,7 @@ mod tests {
|
||||
name: &'static str,
|
||||
compaction_algorithm: CompactionAlgorithm,
|
||||
) -> anyhow::Result<()> {
|
||||
let mut harness = TenantHarness::create(name)?;
|
||||
let mut harness = TenantHarness::create(name).await?;
|
||||
harness.tenant_conf.compaction_algorithm = CompactionAlgorithmSettings {
|
||||
kind: compaction_algorithm,
|
||||
};
|
||||
@@ -5267,7 +5286,8 @@ mod tests {
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_traverse_branches() -> anyhow::Result<()> {
|
||||
let (tenant, ctx) = TenantHarness::create("test_traverse_branches")?
|
||||
let (tenant, ctx) = TenantHarness::create("test_traverse_branches")
|
||||
.await?
|
||||
.load()
|
||||
.await;
|
||||
let mut tline = tenant
|
||||
@@ -5357,7 +5377,8 @@ mod tests {
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_traverse_ancestors() -> anyhow::Result<()> {
|
||||
let (tenant, ctx) = TenantHarness::create("test_traverse_ancestors")?
|
||||
let (tenant, ctx) = TenantHarness::create("test_traverse_ancestors")
|
||||
.await?
|
||||
.load()
|
||||
.await;
|
||||
let mut tline = tenant
|
||||
@@ -5423,7 +5444,8 @@ mod tests {
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_write_at_initdb_lsn_takes_optimization_code_path() -> anyhow::Result<()> {
|
||||
let (tenant, ctx) = TenantHarness::create("test_empty_test_timeline_is_usable")?
|
||||
let (tenant, ctx) = TenantHarness::create("test_empty_test_timeline_is_usable")
|
||||
.await?
|
||||
.load()
|
||||
.await;
|
||||
|
||||
@@ -5492,7 +5514,7 @@ mod tests {
|
||||
#[tokio::test]
|
||||
async fn test_create_guard_crash() -> anyhow::Result<()> {
|
||||
let name = "test_create_guard_crash";
|
||||
let harness = TenantHarness::create(name)?;
|
||||
let harness = TenantHarness::create(name).await?;
|
||||
{
|
||||
let (tenant, ctx) = harness.load().await;
|
||||
let tline = tenant
|
||||
@@ -5545,7 +5567,7 @@ mod tests {
|
||||
name: &'static str,
|
||||
compaction_algorithm: CompactionAlgorithm,
|
||||
) -> anyhow::Result<()> {
|
||||
let mut harness = TenantHarness::create(name)?;
|
||||
let mut harness = TenantHarness::create(name).await?;
|
||||
harness.tenant_conf.compaction_algorithm = CompactionAlgorithmSettings {
|
||||
kind: compaction_algorithm,
|
||||
};
|
||||
@@ -5569,7 +5591,7 @@ mod tests {
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_metadata_scan() -> anyhow::Result<()> {
|
||||
let harness = TenantHarness::create("test_metadata_scan")?;
|
||||
let harness = TenantHarness::create("test_metadata_scan").await?;
|
||||
let (tenant, ctx) = harness.load().await;
|
||||
let tline = tenant
|
||||
.create_test_timeline(TIMELINE_ID, Lsn(0x10), DEFAULT_PG_VERSION, &ctx)
|
||||
@@ -5688,7 +5710,7 @@ mod tests {
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_metadata_compaction_trigger() -> anyhow::Result<()> {
|
||||
let harness = TenantHarness::create("test_metadata_compaction_trigger")?;
|
||||
let harness = TenantHarness::create("test_metadata_compaction_trigger").await?;
|
||||
let (tenant, ctx) = harness.load().await;
|
||||
let tline = tenant
|
||||
.create_test_timeline(TIMELINE_ID, Lsn(0x10), DEFAULT_PG_VERSION, &ctx)
|
||||
@@ -5747,7 +5769,9 @@ mod tests {
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_branch_copies_dirty_aux_file_flag() {
|
||||
let harness = TenantHarness::create("test_branch_copies_dirty_aux_file_flag").unwrap();
|
||||
let harness = TenantHarness::create("test_branch_copies_dirty_aux_file_flag")
|
||||
.await
|
||||
.unwrap();
|
||||
|
||||
// the default aux file policy to switch is v1 if not set by the admins
|
||||
assert_eq!(
|
||||
@@ -5849,7 +5873,9 @@ mod tests {
|
||||
|
||||
#[tokio::test]
|
||||
async fn aux_file_policy_switch() {
|
||||
let mut harness = TenantHarness::create("aux_file_policy_switch").unwrap();
|
||||
let mut harness = TenantHarness::create("aux_file_policy_switch")
|
||||
.await
|
||||
.unwrap();
|
||||
harness.tenant_conf.switch_aux_file_policy = AuxFilePolicy::CrossValidation; // set to cross-validation mode
|
||||
let (tenant, ctx) = harness.load().await;
|
||||
|
||||
@@ -6023,7 +6049,9 @@ mod tests {
|
||||
|
||||
#[tokio::test]
|
||||
async fn aux_file_policy_force_switch() {
|
||||
let mut harness = TenantHarness::create("aux_file_policy_force_switch").unwrap();
|
||||
let mut harness = TenantHarness::create("aux_file_policy_force_switch")
|
||||
.await
|
||||
.unwrap();
|
||||
harness.tenant_conf.switch_aux_file_policy = AuxFilePolicy::V1;
|
||||
let (tenant, ctx) = harness.load().await;
|
||||
|
||||
@@ -6084,7 +6112,9 @@ mod tests {
|
||||
|
||||
#[tokio::test]
|
||||
async fn aux_file_policy_auto_detect() {
|
||||
let mut harness = TenantHarness::create("aux_file_policy_auto_detect").unwrap();
|
||||
let mut harness = TenantHarness::create("aux_file_policy_auto_detect")
|
||||
.await
|
||||
.unwrap();
|
||||
harness.tenant_conf.switch_aux_file_policy = AuxFilePolicy::V2; // set to cross-validation mode
|
||||
let (tenant, ctx) = harness.load().await;
|
||||
|
||||
@@ -6147,7 +6177,7 @@ mod tests {
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_metadata_image_creation() -> anyhow::Result<()> {
|
||||
let harness = TenantHarness::create("test_metadata_image_creation")?;
|
||||
let harness = TenantHarness::create("test_metadata_image_creation").await?;
|
||||
let (tenant, ctx) = harness.load().await;
|
||||
let tline = tenant
|
||||
.create_test_timeline(TIMELINE_ID, Lsn(0x10), DEFAULT_PG_VERSION, &ctx)
|
||||
@@ -6246,7 +6276,7 @@ mod tests {
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_vectored_missing_data_key_reads() -> anyhow::Result<()> {
|
||||
let harness = TenantHarness::create("test_vectored_missing_data_key_reads")?;
|
||||
let harness = TenantHarness::create("test_vectored_missing_data_key_reads").await?;
|
||||
let (tenant, ctx) = harness.load().await;
|
||||
|
||||
let base_key = Key::from_hex("000000000033333333444444445500000000").unwrap();
|
||||
@@ -6318,7 +6348,7 @@ mod tests {
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_vectored_missing_metadata_key_reads() -> anyhow::Result<()> {
|
||||
let harness = TenantHarness::create("test_vectored_missing_metadata_key_reads")?;
|
||||
let harness = TenantHarness::create("test_vectored_missing_metadata_key_reads").await?;
|
||||
let (tenant, ctx) = harness.load().await;
|
||||
|
||||
let base_key = Key::from_hex("620000000033333333444444445500000000").unwrap();
|
||||
@@ -6410,7 +6440,7 @@ mod tests {
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_metadata_tombstone_reads() -> anyhow::Result<()> {
|
||||
let harness = TenantHarness::create("test_metadata_tombstone_reads")?;
|
||||
let harness = TenantHarness::create("test_metadata_tombstone_reads").await?;
|
||||
let (tenant, ctx) = harness.load().await;
|
||||
let key0 = Key::from_hex("620000000033333333444444445500000000").unwrap();
|
||||
let key1 = Key::from_hex("620000000033333333444444445500000001").unwrap();
|
||||
@@ -6490,7 +6520,9 @@ mod tests {
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_metadata_tombstone_image_creation() {
|
||||
let harness = TenantHarness::create("test_metadata_tombstone_image_creation").unwrap();
|
||||
let harness = TenantHarness::create("test_metadata_tombstone_image_creation")
|
||||
.await
|
||||
.unwrap();
|
||||
let (tenant, ctx) = harness.load().await;
|
||||
|
||||
let key0 = Key::from_hex("620000000033333333444444445500000000").unwrap();
|
||||
@@ -6562,8 +6594,9 @@ mod tests {
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_metadata_tombstone_empty_image_creation() {
|
||||
let harness =
|
||||
TenantHarness::create("test_metadata_tombstone_empty_image_creation").unwrap();
|
||||
let harness = TenantHarness::create("test_metadata_tombstone_empty_image_creation")
|
||||
.await
|
||||
.unwrap();
|
||||
let (tenant, ctx) = harness.load().await;
|
||||
|
||||
let key1 = Key::from_hex("620000000033333333444444445500000001").unwrap();
|
||||
@@ -6626,7 +6659,7 @@ mod tests {
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_simple_bottom_most_compaction_images() -> anyhow::Result<()> {
|
||||
let harness = TenantHarness::create("test_simple_bottom_most_compaction_images")?;
|
||||
let harness = TenantHarness::create("test_simple_bottom_most_compaction_images").await?;
|
||||
let (tenant, ctx) = harness.load().await;
|
||||
|
||||
fn get_key(id: u32) -> Key {
|
||||
@@ -6718,8 +6751,8 @@ mod tests {
|
||||
{
|
||||
// Update GC info
|
||||
let mut guard = tline.gc_info.write().unwrap();
|
||||
guard.cutoffs.pitr = Lsn(0x30);
|
||||
guard.cutoffs.horizon = Lsn(0x30);
|
||||
guard.cutoffs.time = Lsn(0x30);
|
||||
guard.cutoffs.space = Lsn(0x30);
|
||||
}
|
||||
|
||||
let expected_result = [
|
||||
@@ -6810,7 +6843,7 @@ mod tests {
|
||||
vec![
|
||||
// Image layer at GC horizon
|
||||
PersistentLayerKey {
|
||||
key_range: Key::MIN..get_key(10),
|
||||
key_range: Key::MIN..Key::MAX,
|
||||
lsn_range: Lsn(0x30)..Lsn(0x31),
|
||||
is_delta: false
|
||||
},
|
||||
@@ -6834,7 +6867,7 @@ mod tests {
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_neon_test_record() -> anyhow::Result<()> {
|
||||
let harness = TenantHarness::create("test_neon_test_record")?;
|
||||
let harness = TenantHarness::create("test_neon_test_record").await?;
|
||||
let (tenant, ctx) = harness.load().await;
|
||||
|
||||
fn get_key(id: u32) -> Key {
|
||||
@@ -6915,7 +6948,7 @@ mod tests {
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_lsn_lease() -> anyhow::Result<()> {
|
||||
let (tenant, ctx) = TenantHarness::create("test_lsn_lease")?.load().await;
|
||||
let (tenant, ctx) = TenantHarness::create("test_lsn_lease").await?.load().await;
|
||||
let key = Key::from_hex("010000000033333333444444445500000000").unwrap();
|
||||
|
||||
let end_lsn = Lsn(0x100);
|
||||
@@ -7004,7 +7037,7 @@ mod tests {
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_simple_bottom_most_compaction_deltas() -> anyhow::Result<()> {
|
||||
let harness = TenantHarness::create("test_simple_bottom_most_compaction_deltas")?;
|
||||
let harness = TenantHarness::create("test_simple_bottom_most_compaction_deltas").await?;
|
||||
let (tenant, ctx) = harness.load().await;
|
||||
|
||||
fn get_key(id: u32) -> Key {
|
||||
@@ -7109,8 +7142,8 @@ mod tests {
|
||||
*guard = GcInfo {
|
||||
retain_lsns: vec![],
|
||||
cutoffs: GcCutoffs {
|
||||
pitr: Lsn(0x30),
|
||||
horizon: Lsn(0x30),
|
||||
time: Lsn(0x30),
|
||||
space: Lsn(0x30),
|
||||
},
|
||||
leases: Default::default(),
|
||||
within_ancestor_pitr: false,
|
||||
|
||||
@@ -262,7 +262,7 @@ where
|
||||
|
||||
pub fn iter<'a>(self, start_key: &'a [u8; L], ctx: &'a RequestContext) -> DiskBtreeIterator<'a>
|
||||
where
|
||||
R: 'a,
|
||||
R: 'a + Send,
|
||||
{
|
||||
DiskBtreeIterator {
|
||||
stream: Box::pin(self.into_stream(start_key, ctx)),
|
||||
@@ -521,7 +521,7 @@ where
|
||||
pub struct DiskBtreeIterator<'a> {
|
||||
#[allow(clippy::type_complexity)]
|
||||
stream: std::pin::Pin<
|
||||
Box<dyn Stream<Item = std::result::Result<(Vec<u8>, u64), DiskBtreeError>> + 'a>,
|
||||
Box<dyn Stream<Item = std::result::Result<(Vec<u8>, u64), DiskBtreeError>> + 'a + Send>,
|
||||
>,
|
||||
}
|
||||
|
||||
|
||||
@@ -2698,7 +2698,9 @@ mod tests {
|
||||
// Test that if an InProgress tenant is in the map during shutdown, the shutdown will gracefully
|
||||
// wait for it to complete before proceeding.
|
||||
|
||||
let h = TenantHarness::create("shutdown_awaits_in_progress_tenant").unwrap();
|
||||
let h = TenantHarness::create("shutdown_awaits_in_progress_tenant")
|
||||
.await
|
||||
.unwrap();
|
||||
let (t, _ctx) = h.load().await;
|
||||
|
||||
// harness loads it to active, which is forced and nothing is running on the tenant
|
||||
|
||||
@@ -241,7 +241,7 @@ use self::index::IndexPart;
|
||||
|
||||
use super::metadata::MetadataUpdate;
|
||||
use super::storage_layer::{Layer, LayerName, ResidentLayer};
|
||||
use super::upload_queue::SetDeletedFlagProgress;
|
||||
use super::upload_queue::{NotInitialized, SetDeletedFlagProgress};
|
||||
use super::Generation;
|
||||
|
||||
pub(crate) use download::{
|
||||
@@ -1930,6 +1930,31 @@ impl RemoteTimelineClient {
}
}
}

/// Returns an accessor which will hold the UploadQueue mutex for accessing the upload queue
/// externally to RemoteTimelineClient.
pub(crate) fn initialized_upload_queue(
&self,
) -> Result<UploadQueueAccessor<'_>, NotInitialized> {
let mut inner = self.upload_queue.lock().unwrap();
inner.initialized_mut()?;
Ok(UploadQueueAccessor { inner })
}
}

pub(crate) struct UploadQueueAccessor<'a> {
inner: std::sync::MutexGuard<'a, UploadQueue>,
}

impl<'a> UploadQueueAccessor<'a> {
pub(crate) fn latest_uploaded_index_part(&self) -> &IndexPart {
match &*self.inner {
UploadQueue::Initialized(x) => &x.clean.0,
UploadQueue::Uninitialized | UploadQueue::Stopped(_) => {
unreachable!("checked before constructing")
}
}
}
}
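A brief, hypothetical illustration of the accessor added above; `remote_client` is assumed to be a `RemoteTimelineClient` whose upload queue has been initialized.

```
// Sketch only: inspect the last uploaded index part while holding the queue lock.
let accessor = remote_client.initialized_upload_queue()?;
let _index_part = accessor.latest_uploaded_index_part();
// The MutexGuard held inside the accessor is released when `accessor` is dropped.
```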
pub fn remote_tenant_path(tenant_shard_id: &TenantShardId) -> RemotePath {
|
||||
@@ -2103,7 +2128,7 @@ mod tests {
|
||||
impl TestSetup {
|
||||
async fn new(test_name: &str) -> anyhow::Result<Self> {
|
||||
let test_name = Box::leak(Box::new(format!("remote_timeline_client__{test_name}")));
|
||||
let harness = TenantHarness::create(test_name)?;
|
||||
let harness = TenantHarness::create(test_name).await?;
|
||||
let (tenant, ctx) = harness.load().await;
|
||||
|
||||
let timeline = tenant
|
||||
|
||||
@@ -176,6 +176,24 @@ pub(crate) struct Lineage {
|
||||
///
|
||||
/// If you are adding support for detaching from a hierarchy, consider changing the ancestry
|
||||
/// into a `Vec<(TimelineId, Lsn)>` to be a path instead.
|
||||
// FIXME: this is insufficient even for a path of two timelines for future WAL recovery
|
||||
// purposes:
|
||||
//
|
||||
// assuming a "old main" which has received most of the WAL, and has a branch "new main",
|
||||
// starting a bit before "old main" last_record_lsn. the current version works fine,
|
||||
// because we will know to replay wal and branch at the recorded Lsn to do wal recovery.
|
||||
//
|
||||
// then assuming "new main" would similarly receive a branch right before its last_record_lsn,
|
||||
// "new new main". the current implementation would just store ("new main", ancestor_lsn, _)
|
||||
// here. however, we cannot recover from WAL using only that information, we would need the
|
||||
// whole ancestry here:
|
||||
//
|
||||
// ```json
|
||||
// [
|
||||
// ["old main", ancestor_lsn("new main"), _],
|
||||
// ["new main", ancestor_lsn("new new main"), _]
|
||||
// ]
|
||||
// ```
|
||||
#[serde(skip_serializing_if = "Option::is_none", default)]
|
||||
original_ancestor: Option<(TimelineId, Lsn, NaiveDateTime)>,
|
||||
}
|
||||
@@ -217,6 +235,14 @@ impl Lineage {
|
||||
self.original_ancestor
|
||||
.is_some_and(|(_, ancestor_lsn, _)| ancestor_lsn == lsn)
|
||||
}
|
||||
|
||||
pub(crate) fn is_detached_from_original_ancestor(&self) -> bool {
|
||||
self.original_ancestor.is_some()
|
||||
}
|
||||
|
||||
pub(crate) fn is_reparented(&self) -> bool {
|
||||
!self.reparenting_history.is_empty()
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
|
||||
@@ -135,11 +135,9 @@ pub struct TimelineInputs {
|
||||
ancestor_lsn: Lsn,
|
||||
last_record: Lsn,
|
||||
latest_gc_cutoff: Lsn,
|
||||
horizon_cutoff: Lsn,
|
||||
pitr_cutoff: Lsn,
|
||||
|
||||
/// Cutoff point based on GC settings
|
||||
next_gc_cutoff: Lsn,
|
||||
next_pitr_cutoff: Lsn,
|
||||
|
||||
/// Cutoff point calculated from the user-supplied 'max_retention_period'
|
||||
retention_param_cutoff: Option<Lsn>,
|
||||
@@ -150,7 +148,7 @@ pub struct TimelineInputs {
|
||||
|
||||
/// Gathers the inputs for the tenant sizing model.
|
||||
///
|
||||
/// Tenant size does not consider the latest state, but only the state until next_gc_cutoff, which
|
||||
/// Tenant size does not consider the latest state, but only the state until next_pitr_cutoff, which
|
||||
/// is updated on-demand, during the start of this calculation and separate from the
|
||||
/// [`TimelineInputs::latest_gc_cutoff`].
|
||||
///
|
||||
@@ -158,11 +156,8 @@ pub struct TimelineInputs {
|
||||
///
|
||||
/// ```text
|
||||
/// 0-----|---------|----|------------| · · · · · |·> lsn
|
||||
/// initdb_lsn branchpoints* next_gc_cutoff latest
|
||||
/// initdb_lsn branchpoints* next_pitr_cutoff latest
|
||||
/// ```
|
||||
///
|
||||
/// Until gc_horizon_cutoff > `Timeline::last_record_lsn` for any of the tenant's timelines, the
|
||||
/// tenant size will be zero.
|
||||
pub(super) async fn gather_inputs(
|
||||
tenant: &Tenant,
|
||||
limit: &Arc<Semaphore>,
|
||||
@@ -172,7 +167,7 @@ pub(super) async fn gather_inputs(
|
||||
cancel: &CancellationToken,
|
||||
ctx: &RequestContext,
|
||||
) -> Result<ModelInputs, CalculateSyntheticSizeError> {
|
||||
// refresh is needed to update gc related pitr_cutoff and horizon_cutoff
|
||||
// refresh is needed to update [`timeline::GcCutoffs`]
|
||||
tenant.refresh_gc_info(cancel, ctx).await?;
|
||||
|
||||
// Collect information about all the timelines
|
||||
@@ -236,20 +231,18 @@ pub(super) async fn gather_inputs(
|
||||
// we don't consider the `Timeline::disk_consistent_lsn` at all, because we are not
|
||||
// actually removing files.
|
||||
//
|
||||
// We only consider [`GcInfo::pitr_cutoff`], and not [`GcInfo::horizon_cutoff`], because from
|
||||
// We only consider [`timeline::GcCutoffs::time`], and not [`timeline::GcCutoffs::space`], because from
|
||||
// a user's perspective they have only requested retention up to the time bound (pitr_cutoff), rather
|
||||
// than a space bound (horizon cutoff). This means that if someone drops a database and waits for their
|
||||
// than our internal space cutoff. This means that if someone drops a database and waits for their
|
||||
// PITR interval, they will see synthetic size decrease, even if we are still storing data inside
|
||||
// horizon_cutoff.
|
||||
let pitr_cutoff = gc_info.cutoffs.pitr;
|
||||
let horizon_cutoff = gc_info.cutoffs.horizon;
|
||||
let mut next_gc_cutoff = pitr_cutoff;
|
||||
// the space cutoff.
|
||||
let mut next_pitr_cutoff = gc_info.cutoffs.time;
|
||||
|
||||
// If the caller provided a shorter retention period, use that instead of the GC cutoff.
|
||||
let retention_param_cutoff = if let Some(max_retention_period) = max_retention_period {
|
||||
let param_cutoff = Lsn(last_record_lsn.0.saturating_sub(max_retention_period));
|
||||
if next_gc_cutoff < param_cutoff {
|
||||
next_gc_cutoff = param_cutoff;
|
||||
if next_pitr_cutoff < param_cutoff {
|
||||
next_pitr_cutoff = param_cutoff;
|
||||
}
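// Illustrative arithmetic with hypothetical numbers (not from the source): if
// last_record_lsn = 0x1000 and max_retention_period = 0x100, then
// param_cutoff = 0x1000 - 0x100 = 0xF00; a GC-derived next_pitr_cutoff of
// 0xE00 would be raised to 0xF00, shrinking the retained history to the
// user-requested window.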
Some(param_cutoff)
|
||||
} else {
|
||||
@@ -263,7 +256,7 @@ pub(super) async fn gather_inputs(
|
||||
.copied()
|
||||
.collect::<Vec<_>>();
|
||||
|
||||
// next_gc_cutoff in parent branch are not of interest (right now at least), nor do we
|
||||
// next_pitr_cutoff in parent branch are not of interest (right now at least), nor do we
|
||||
// want to query any logical size before initdb_lsn.
|
||||
let branch_start_lsn = cmp::max(ancestor_lsn, timeline.initdb_lsn);
|
||||
|
||||
@@ -291,10 +284,10 @@ pub(super) async fn gather_inputs(
|
||||
)
|
||||
}
|
||||
|
||||
// Add a point for the GC cutoff
|
||||
let branch_start_needed = next_gc_cutoff <= branch_start_lsn;
|
||||
// Add a point for the PITR cutoff
|
||||
let branch_start_needed = next_pitr_cutoff <= branch_start_lsn;
|
||||
if !branch_start_needed {
|
||||
lsns.push((next_gc_cutoff, LsnKind::GcCutOff));
|
||||
lsns.push((next_pitr_cutoff, LsnKind::GcCutOff));
|
||||
}
|
||||
|
||||
lsns.sort_unstable();
|
||||
@@ -333,7 +326,7 @@ pub(super) async fn gather_inputs(
|
||||
parent: Some(parent),
|
||||
lsn: lsn.0,
|
||||
size: None,
|
||||
needed: lsn > next_gc_cutoff,
|
||||
needed: lsn > next_pitr_cutoff,
|
||||
},
|
||||
timeline_id: timeline.timeline_id,
|
||||
kind,
|
||||
@@ -357,8 +350,8 @@ pub(super) async fn gather_inputs(
|
||||
segment: Segment {
|
||||
parent: Some(lease_parent),
|
||||
lsn: lsn.0,
|
||||
size: None, // Filled in later, if necessary
|
||||
needed: lsn > next_gc_cutoff, // only needed if the point is within retention.
|
||||
size: None, // Filled in later, if necessary
|
||||
needed: lsn > next_pitr_cutoff, // only needed if the point is within retention.
|
||||
},
|
||||
timeline_id: timeline.timeline_id,
|
||||
kind: LsnKind::LeaseStart,
|
||||
@@ -398,9 +391,7 @@ pub(super) async fn gather_inputs(
|
||||
last_record: last_record_lsn,
|
||||
// this is not used above, because it might not have updated recently enough
|
||||
latest_gc_cutoff: *timeline.get_latest_gc_cutoff_lsn(),
|
||||
horizon_cutoff,
|
||||
pitr_cutoff,
|
||||
next_gc_cutoff,
|
||||
next_pitr_cutoff,
|
||||
retention_param_cutoff,
|
||||
lease_points,
|
||||
});
|
||||
@@ -742,9 +733,7 @@ fn verify_size_for_multiple_branches() {
|
||||
"ancestor_lsn": "0/18D3D98",
|
||||
"last_record": "0/2230CD0",
|
||||
"latest_gc_cutoff": "0/1698C48",
|
||||
"horizon_cutoff": "0/2210CD0",
|
||||
"pitr_cutoff": "0/2210CD0",
|
||||
"next_gc_cutoff": "0/2210CD0",
|
||||
"next_pitr_cutoff": "0/2210CD0",
|
||||
"retention_param_cutoff": null,
|
||||
"lease_points": []
|
||||
},
|
||||
@@ -753,9 +742,7 @@ fn verify_size_for_multiple_branches() {
|
||||
"ancestor_lsn": "0/176D998",
|
||||
"last_record": "0/1837770",
|
||||
"latest_gc_cutoff": "0/1698C48",
|
||||
"horizon_cutoff": "0/1817770",
|
||||
"pitr_cutoff": "0/1817770",
|
||||
"next_gc_cutoff": "0/1817770",
|
||||
"next_pitr_cutoff": "0/1817770",
|
||||
"retention_param_cutoff": null,
|
||||
"lease_points": []
|
||||
},
|
||||
@@ -764,9 +751,7 @@ fn verify_size_for_multiple_branches() {
|
||||
"ancestor_lsn": "0/0",
|
||||
"last_record": "0/18D3D98",
|
||||
"latest_gc_cutoff": "0/1698C48",
|
||||
"horizon_cutoff": "0/18B3D98",
|
||||
"pitr_cutoff": "0/18B3D98",
|
||||
"next_gc_cutoff": "0/18B3D98",
|
||||
"next_pitr_cutoff": "0/18B3D98",
|
||||
"retention_param_cutoff": null,
|
||||
"lease_points": []
|
||||
}
|
||||
@@ -820,9 +805,7 @@ fn verify_size_for_one_branch() {
|
||||
"ancestor_lsn": "0/0",
|
||||
"last_record": "47/280A5860",
|
||||
"latest_gc_cutoff": "47/240A5860",
|
||||
"horizon_cutoff": "47/240A5860",
|
||||
"pitr_cutoff": "47/240A5860",
|
||||
"next_gc_cutoff": "47/240A5860",
|
||||
"next_pitr_cutoff": "47/240A5860",
|
||||
"retention_param_cutoff": "0/0",
|
||||
"lease_points": []
|
||||
}
|
||||
|
||||
@@ -6,8 +6,6 @@ pub(crate) mod inmemory_layer;
|
||||
pub(crate) mod layer;
|
||||
mod layer_desc;
|
||||
mod layer_name;
|
||||
|
||||
#[cfg(test)]
|
||||
pub mod merge_iterator;
|
||||
|
||||
use crate::context::{AccessStatsBehavior, RequestContext};
|
||||
@@ -676,6 +674,26 @@ impl LayerAccessStats {
|
||||
},
|
||||
}
|
||||
}
|
||||
|
||||
/// Whether this layer has been accessed (excluding in [`AccessStatsBehavior::Skip`]).
|
||||
///
|
||||
/// This indicates whether the layer has been used for some purpose that would motivate
|
||||
/// us to keep it on disk, such as for serving a getpage request.
|
||||
fn accessed(&self) -> bool {
|
||||
let locked = self.0.lock().unwrap();
|
||||
let inner = &locked.for_eviction_policy;
|
||||
|
||||
// Consider it accessed if the most recent access is more recent than
|
||||
// the most recent change in residence status.
|
||||
match (
|
||||
inner.last_accesses.recent(),
|
||||
inner.last_residence_changes.recent(),
|
||||
) {
|
||||
(None, _) => false,
|
||||
(Some(_), None) => true,
|
||||
(Some(a), Some(r)) => a.when >= r.timestamp,
|
||||
}
|
||||
}
|
||||
}
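A standalone sketch of the same recency comparison, using plain `SystemTime` values instead of the pageserver's access-stat types:

```rust
use std::time::{Duration, SystemTime};

// A layer counts as "accessed" only if its most recent read is at least as
// recent as its most recent residence change (e.g. the download that brought
// it back onto local disk).
fn accessed(last_access: Option<SystemTime>, last_residence_change: Option<SystemTime>) -> bool {
    match (last_access, last_residence_change) {
        (None, _) => false,
        (Some(_), None) => true,
        (Some(a), Some(r)) => a >= r,
    }
}

fn main() {
    let downloaded_at = SystemTime::now();
    let read_at = downloaded_at + Duration::from_secs(5);
    assert!(accessed(Some(read_at), Some(downloaded_at)));
    assert!(!accessed(None, Some(downloaded_at)));
}
```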
/// Get a layer descriptor from a layer.
|
||||
|
||||
@@ -33,11 +33,14 @@ use crate::page_cache::{self, FileId, PAGE_SZ};
|
||||
use crate::repository::{Key, Value, KEY_SIZE};
|
||||
use crate::tenant::blob_io::BlobWriter;
|
||||
use crate::tenant::block_io::{BlockBuf, BlockCursor, BlockLease, BlockReader, FileBlockReader};
|
||||
use crate::tenant::disk_btree::{DiskBtreeBuilder, DiskBtreeReader, VisitDirection};
|
||||
use crate::tenant::disk_btree::{
|
||||
DiskBtreeBuilder, DiskBtreeIterator, DiskBtreeReader, VisitDirection,
|
||||
};
|
||||
use crate::tenant::storage_layer::{Layer, ValueReconstructResult, ValueReconstructState};
|
||||
use crate::tenant::timeline::GetVectoredError;
|
||||
use crate::tenant::vectored_blob_io::{
|
||||
BlobFlag, MaxVectoredReadBytes, VectoredBlobReader, VectoredRead, VectoredReadPlanner,
|
||||
BlobFlag, MaxVectoredReadBytes, StreamingVectoredReadPlanner, VectoredBlobReader, VectoredRead,
|
||||
VectoredReadPlanner,
|
||||
};
|
||||
use crate::tenant::{PageReconstructError, Timeline};
|
||||
use crate::virtual_file::{self, VirtualFile};
|
||||
@@ -53,6 +56,7 @@ use pageserver_api::models::{ImageCompressionAlgorithm, LayerAccessKind};
|
||||
use pageserver_api::shard::TenantShardId;
|
||||
use rand::{distributions::Alphanumeric, Rng};
|
||||
use serde::{Deserialize, Serialize};
|
||||
use std::collections::VecDeque;
|
||||
use std::fs::File;
|
||||
use std::io::SeekFrom;
|
||||
use std::ops::Range;
|
||||
@@ -747,12 +751,10 @@ impl DeltaLayer {
|
||||
}
|
||||
|
||||
impl DeltaLayerInner {
|
||||
#[cfg(test)]
|
||||
pub(crate) fn key_range(&self) -> &Range<Key> {
|
||||
&self.layer_key_range
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
pub(crate) fn lsn_range(&self) -> &Range<Lsn> {
|
||||
&self.layer_lsn_range
|
||||
}
|
||||
@@ -1180,9 +1182,7 @@ impl DeltaLayerInner {
|
||||
let delta_key = DeltaKey::from_slice(key);
|
||||
let val_ref = ValueRef {
|
||||
blob_ref: BlobRef(value),
|
||||
reader: BlockCursor::new(crate::tenant::block_io::BlockReaderRef::Adapter(
|
||||
Adapter(self),
|
||||
)),
|
||||
layer: self,
|
||||
};
|
||||
let pos = BlobRef(value).pos();
|
||||
if let Some(last) = all_keys.last_mut() {
|
||||
@@ -1426,7 +1426,7 @@ impl DeltaLayerInner {
|
||||
let keys = self.load_keys(ctx).await?;
|
||||
|
||||
async fn dump_blob(val: &ValueRef<'_>, ctx: &RequestContext) -> anyhow::Result<String> {
|
||||
let buf = val.reader.read_blob(val.blob_ref.pos(), ctx).await?;
|
||||
let buf = val.load_raw(ctx).await?;
|
||||
let val = Value::des(&buf)?;
|
||||
let desc = match val {
|
||||
Value::Image(img) => {
|
||||
@@ -1461,8 +1461,7 @@ impl DeltaLayerInner {
|
||||
use pageserver_api::key::CHECKPOINT_KEY;
|
||||
use postgres_ffi::CheckPoint;
|
||||
if key == CHECKPOINT_KEY {
|
||||
let buf = val.reader.read_blob(val.blob_ref.pos(), ctx).await?;
|
||||
let val = Value::des(&buf)?;
|
||||
let val = val.load(ctx).await?;
|
||||
match val {
|
||||
Value::Image(img) => {
|
||||
let checkpoint = CheckPoint::decode(&img)?;
|
||||
@@ -1515,7 +1514,6 @@ impl DeltaLayerInner {
|
||||
offset
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
pub(crate) fn iter<'a>(&'a self, ctx: &'a RequestContext) -> DeltaLayerIterator<'a> {
|
||||
let block_reader = FileBlockReader::new(&self.file, self.file_id);
|
||||
let tree_reader =
|
||||
@@ -1526,7 +1524,7 @@ impl DeltaLayerInner {
|
||||
index_iter: tree_reader.iter(&[0; DELTA_KEY_SIZE], ctx),
|
||||
key_values_batch: std::collections::VecDeque::new(),
|
||||
is_end: false,
|
||||
planner: crate::tenant::vectored_blob_io::StreamingVectoredReadPlanner::new(
|
||||
planner: StreamingVectoredReadPlanner::new(
|
||||
1024 * 8192, // The default value. Unit tests might use a different value. 1024 * 8K = 8MB buffer.
|
||||
1024, // The default value. Unit tests might use a different value
|
||||
),
|
||||
@@ -1547,17 +1545,24 @@ pub struct DeltaEntry<'a> {
|
||||
/// Reference to an on-disk value
|
||||
pub struct ValueRef<'a> {
|
||||
blob_ref: BlobRef,
|
||||
reader: BlockCursor<'a>,
|
||||
layer: &'a DeltaLayerInner,
|
||||
}
|
||||
|
||||
impl<'a> ValueRef<'a> {
|
||||
/// Loads the value from disk
|
||||
pub async fn load(&self, ctx: &RequestContext) -> Result<Value> {
|
||||
// theoretically we *could* record an access time for each, but it does not really matter
|
||||
let buf = self.reader.read_blob(self.blob_ref.pos(), ctx).await?;
|
||||
let buf = self.load_raw(ctx).await?;
|
||||
let val = Value::des(&buf)?;
|
||||
Ok(val)
|
||||
}
|
||||
|
||||
async fn load_raw(&self, ctx: &RequestContext) -> Result<Vec<u8>> {
|
||||
let reader = BlockCursor::new(crate::tenant::block_io::BlockReaderRef::Adapter(Adapter(
|
||||
self.layer,
|
||||
)));
|
||||
let buf = reader.read_blob(self.blob_ref.pos(), ctx).await?;
|
||||
Ok(buf)
|
||||
}
|
||||
}
|
||||
|
||||
pub(crate) struct Adapter<T>(T);
|
||||
@@ -1591,17 +1596,15 @@ impl<'a> pageserver_compaction::interface::CompactionDeltaEntry<'a, Key> for Del
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
pub struct DeltaLayerIterator<'a> {
|
||||
delta_layer: &'a DeltaLayerInner,
|
||||
ctx: &'a RequestContext,
|
||||
planner: crate::tenant::vectored_blob_io::StreamingVectoredReadPlanner,
|
||||
index_iter: crate::tenant::disk_btree::DiskBtreeIterator<'a>,
|
||||
key_values_batch: std::collections::VecDeque<(Key, Lsn, Value)>,
|
||||
planner: StreamingVectoredReadPlanner,
|
||||
index_iter: DiskBtreeIterator<'a>,
|
||||
key_values_batch: VecDeque<(Key, Lsn, Value)>,
|
||||
is_end: bool,
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
impl<'a> DeltaLayerIterator<'a> {
|
||||
/// Retrieve a batch of key-value pairs into the iterator buffer.
|
||||
async fn next_batch(&mut self) -> anyhow::Result<()> {
|
||||
@@ -1668,6 +1671,7 @@ pub(crate) mod test {
|
||||
use rand::RngCore;
|
||||
|
||||
use super::*;
|
||||
use crate::repository::Value;
|
||||
use crate::tenant::harness::TIMELINE_ID;
|
||||
use crate::tenant::vectored_blob_io::StreamingVectoredReadPlanner;
|
||||
use crate::tenant::Tenant;
|
||||
@@ -1677,6 +1681,7 @@ pub(crate) mod test {
|
||||
tenant::{disk_btree::tests::TestDisk, harness::TenantHarness},
|
||||
DEFAULT_PG_VERSION,
|
||||
};
|
||||
use bytes::Bytes;
|
||||
|
||||
/// Construct an index for a fictional delta layer and then
|
||||
/// traverse in order to plan vectored reads for a query. Finally,
|
||||
@@ -1929,7 +1934,7 @@ pub(crate) mod test {
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_delta_layer_vectored_read_end_to_end() -> anyhow::Result<()> {
|
||||
let harness = TenantHarness::create("test_delta_layer_oversized_vectored_read")?;
|
||||
let harness = TenantHarness::create("test_delta_layer_oversized_vectored_read").await?;
|
||||
let (tenant, ctx) = harness.load().await;
|
||||
|
||||
let timeline_id = TimelineId::generate();
|
||||
@@ -2029,7 +2034,9 @@ pub(crate) mod test {
|
||||
use crate::walrecord::NeonWalRecord;
|
||||
use bytes::Bytes;
|
||||
|
||||
let h = crate::tenant::harness::TenantHarness::create("truncate_delta_smoke").unwrap();
|
||||
let h = crate::tenant::harness::TenantHarness::create("truncate_delta_smoke")
|
||||
.await
|
||||
.unwrap();
|
||||
let (tenant, ctx) = h.load().await;
|
||||
let ctx = &ctx;
|
||||
let timeline = tenant
|
||||
@@ -2245,6 +2252,15 @@ pub(crate) mod test {
|
||||
(k1, l1).cmp(&(k2, l2))
|
||||
}
|
||||
|
||||
pub(crate) fn sort_delta_value(
|
||||
(k1, l1, v1): &(Key, Lsn, Value),
|
||||
(k2, l2, v2): &(Key, Lsn, Value),
|
||||
) -> std::cmp::Ordering {
|
||||
let order_1 = if v1.is_image() { 0 } else { 1 };
|
||||
let order_2 = if v2.is_image() { 0 } else { 1 };
|
||||
(k1, l1, order_1).cmp(&(k2, l2, order_2))
|
||||
}
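A standalone sketch of the ordering this helper establishes, with stand-in types: at an identical key and LSN, images rank before WAL records, so a sort (or a k-merge over sorted inputs) yields the image first.

```rust
#[derive(Debug)]
enum Value {
    Image(&'static str),
    WalRecord(&'static str),
}

// Images get the smaller rank, so they sort first at equal (key, lsn).
fn rank(v: &Value) -> u8 {
    match v {
        Value::Image(_) => 0,
        Value::WalRecord(_) => 1,
    }
}

fn main() {
    let mut entries = vec![
        (1u32, 0x10u64, Value::WalRecord("+a")),
        (1, 0x10, Value::Image("a")),
    ];
    entries.sort_by(|(k1, l1, v1), (k2, l2, v2)| (k1, l1, rank(v1)).cmp(&(k2, l2, rank(v2))));
    println!("{entries:?}");
    assert_eq!(rank(&entries[0].2), 0); // image first
}
```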
pub(crate) async fn produce_delta_layer(
|
||||
tenant: &Tenant,
|
||||
tline: &Arc<Timeline>,
|
||||
@@ -2253,7 +2269,7 @@ pub(crate) mod test {
|
||||
) -> anyhow::Result<ResidentLayer> {
|
||||
deltas.sort_by(sort_delta);
|
||||
let (key_start, _, _) = deltas.first().unwrap();
|
||||
let (key_max, _, _) = deltas.first().unwrap();
|
||||
let (key_max, _, _) = deltas.last().unwrap();
|
||||
let lsn_min = deltas.iter().map(|(_, lsn, _)| lsn).min().unwrap();
|
||||
let lsn_max = deltas.iter().map(|(_, lsn, _)| lsn).max().unwrap();
|
||||
let lsn_end = Lsn(lsn_max.0 + 1);
|
||||
@@ -2298,10 +2314,7 @@ pub(crate) mod test {
|
||||
|
||||
#[tokio::test]
|
||||
async fn delta_layer_iterator() {
|
||||
use crate::repository::Value;
|
||||
use bytes::Bytes;
|
||||
|
||||
let harness = TenantHarness::create("delta_layer_iterator").unwrap();
|
||||
let harness = TenantHarness::create("delta_layer_iterator").await.unwrap();
|
||||
let (tenant, ctx) = harness.load().await;
|
||||
|
||||
let tline = tenant
|
||||
|
||||
@@ -29,13 +29,16 @@ use crate::page_cache::{self, FileId, PAGE_SZ};
|
||||
use crate::repository::{Key, Value, KEY_SIZE};
|
||||
use crate::tenant::blob_io::BlobWriter;
|
||||
use crate::tenant::block_io::{BlockBuf, BlockReader, FileBlockReader};
|
||||
use crate::tenant::disk_btree::{DiskBtreeBuilder, DiskBtreeReader, VisitDirection};
|
||||
use crate::tenant::disk_btree::{
|
||||
DiskBtreeBuilder, DiskBtreeIterator, DiskBtreeReader, VisitDirection,
|
||||
};
|
||||
use crate::tenant::storage_layer::{
|
||||
LayerAccessStats, ValueReconstructResult, ValueReconstructState,
|
||||
};
|
||||
use crate::tenant::timeline::GetVectoredError;
|
||||
use crate::tenant::vectored_blob_io::{
|
||||
BlobFlag, MaxVectoredReadBytes, VectoredBlobReader, VectoredRead, VectoredReadPlanner,
|
||||
BlobFlag, MaxVectoredReadBytes, StreamingVectoredReadPlanner, VectoredBlobReader, VectoredRead,
|
||||
VectoredReadPlanner,
|
||||
};
|
||||
use crate::tenant::{PageReconstructError, Timeline};
|
||||
use crate::virtual_file::{self, VirtualFile};
|
||||
@@ -50,6 +53,7 @@ use pageserver_api::models::LayerAccessKind;
|
||||
use pageserver_api::shard::{ShardIdentity, TenantShardId};
|
||||
use rand::{distributions::Alphanumeric, Rng};
|
||||
use serde::{Deserialize, Serialize};
|
||||
use std::collections::VecDeque;
|
||||
use std::fs::File;
|
||||
use std::io::SeekFrom;
|
||||
use std::ops::Range;
|
||||
@@ -369,12 +373,10 @@ impl ImageLayer {
|
||||
}
|
||||
|
||||
impl ImageLayerInner {
|
||||
#[cfg(test)]
|
||||
pub(crate) fn key_range(&self) -> &Range<Key> {
|
||||
&self.key_range
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
pub(crate) fn lsn(&self) -> Lsn {
|
||||
self.lsn
|
||||
}
|
||||
@@ -699,7 +701,6 @@ impl ImageLayerInner {
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
pub(crate) fn iter<'a>(&'a self, ctx: &'a RequestContext) -> ImageLayerIterator<'a> {
|
||||
let block_reader = FileBlockReader::new(&self.file, self.file_id);
|
||||
let tree_reader =
|
||||
@@ -708,9 +709,9 @@ impl ImageLayerInner {
|
||||
image_layer: self,
|
||||
ctx,
|
||||
index_iter: tree_reader.iter(&[0; KEY_SIZE], ctx),
|
||||
key_values_batch: std::collections::VecDeque::new(),
|
||||
key_values_batch: VecDeque::new(),
|
||||
is_end: false,
|
||||
planner: crate::tenant::vectored_blob_io::StreamingVectoredReadPlanner::new(
|
||||
planner: StreamingVectoredReadPlanner::new(
|
||||
1024 * 8192, // The default value. Unit tests might use a different value. 1024 * 8K = 8MB buffer.
|
||||
1024, // The default value. Unit tests might use a different value
|
||||
),
|
||||
@@ -737,6 +738,9 @@ struct ImageLayerWriterInner {
|
||||
key_range: Range<Key>,
|
||||
lsn: Lsn,
|
||||
|
||||
// Total uncompressed bytes passed into put_image
|
||||
uncompressed_bytes: u64,
|
||||
|
||||
blob_writer: BlobWriter<false>,
|
||||
tree: DiskBtreeBuilder<BlockBuf, KEY_SIZE>,
|
||||
}
|
||||
@@ -792,6 +796,7 @@ impl ImageLayerWriterInner {
|
||||
lsn,
|
||||
tree: tree_builder,
|
||||
blob_writer,
|
||||
uncompressed_bytes: 0,
|
||||
};
|
||||
|
||||
Ok(writer)
|
||||
@@ -810,6 +815,7 @@ impl ImageLayerWriterInner {
|
||||
) -> anyhow::Result<()> {
|
||||
ensure!(self.key_range.contains(&key));
|
||||
let compression = self.conf.image_compression;
|
||||
self.uncompressed_bytes += img.len() as u64;
|
||||
let (_img, res) = self
|
||||
.blob_writer
|
||||
.write_blob_maybe_compressed(img, ctx, compression)
|
||||
@@ -835,6 +841,11 @@ impl ImageLayerWriterInner {
|
||||
let index_start_blk =
|
||||
((self.blob_writer.size() + PAGE_SZ as u64 - 1) / PAGE_SZ as u64) as u32;
|
||||
|
||||
// Calculate compression ratio
|
||||
let compressed_size = self.blob_writer.size() - PAGE_SZ as u64; // Subtract PAGE_SZ for header
|
||||
crate::metrics::COMPRESSION_IMAGE_INPUT_BYTES.inc_by(self.uncompressed_bytes);
|
||||
crate::metrics::COMPRESSION_IMAGE_OUTPUT_BYTES.inc_by(compressed_size);
|
||||
|
||||
let mut file = self.blob_writer.into_inner();
|
||||
|
||||
// Write out the index
|
||||
@@ -974,17 +985,15 @@ impl Drop for ImageLayerWriter {
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
pub struct ImageLayerIterator<'a> {
|
||||
image_layer: &'a ImageLayerInner,
|
||||
ctx: &'a RequestContext,
|
||||
planner: crate::tenant::vectored_blob_io::StreamingVectoredReadPlanner,
|
||||
index_iter: crate::tenant::disk_btree::DiskBtreeIterator<'a>,
|
||||
key_values_batch: std::collections::VecDeque<(Key, Lsn, Value)>,
|
||||
planner: StreamingVectoredReadPlanner,
|
||||
index_iter: DiskBtreeIterator<'a>,
|
||||
key_values_batch: VecDeque<(Key, Lsn, Value)>,
|
||||
is_end: bool,
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
impl<'a> ImageLayerIterator<'a> {
|
||||
/// Retrieve a batch of key-value pairs into the iterator buffer.
|
||||
async fn next_batch(&mut self) -> anyhow::Result<()> {
|
||||
@@ -1102,6 +1111,7 @@ mod test {
|
||||
ShardIdentity::unsharded(),
|
||||
get_next_gen(),
|
||||
)
|
||||
.await
|
||||
.unwrap();
|
||||
let (tenant, ctx) = harness.load().await;
|
||||
let timeline = tenant
|
||||
@@ -1168,6 +1178,7 @@ mod test {
|
||||
// But here, all we care about is that the gen number is unique.
|
||||
get_next_gen(),
|
||||
)
|
||||
.await
|
||||
.unwrap();
|
||||
let (tenant, ctx) = harness.load().await;
|
||||
let timeline = tenant
|
||||
@@ -1299,7 +1310,7 @@ mod test {
|
||||
|
||||
#[tokio::test]
|
||||
async fn image_layer_iterator() {
|
||||
let harness = TenantHarness::create("image_layer_iterator").unwrap();
|
||||
let harness = TenantHarness::create("image_layer_iterator").await.unwrap();
|
||||
let (tenant, ctx) = harness.load().await;
|
||||
|
||||
let tline = tenant
|
||||
|
||||
@@ -385,6 +385,7 @@ impl Layer {
|
||||
}
|
||||
|
||||
/// Get all key/values in the layer. Should be replaced with an iterator-based API in the future.
|
||||
#[allow(dead_code)]
|
||||
pub(crate) async fn load_key_values(
|
||||
&self,
|
||||
ctx: &RequestContext,
|
||||
@@ -693,6 +694,18 @@ impl Drop for LayerInner {
|
||||
// and we could be delaying shutdown for nothing.
|
||||
}
|
||||
|
||||
if let Some(timeline) = self.timeline.upgrade() {
|
||||
// Only need to decrement metrics if the timeline still exists: otherwise
|
||||
// it will have already de-registered these metrics via TimelineMetrics::shutdown
|
||||
if self.desc.is_delta() {
|
||||
timeline.metrics.layer_count_delta.dec();
|
||||
timeline.metrics.layer_size_delta.sub(self.desc.file_size);
|
||||
} else {
|
||||
timeline.metrics.layer_count_image.dec();
|
||||
timeline.metrics.layer_size_image.sub(self.desc.file_size);
|
||||
}
|
||||
}
|
||||
|
||||
if !*self.wanted_deleted.get_mut() {
|
||||
return;
|
||||
}
|
||||
@@ -791,6 +804,15 @@ impl LayerInner {
|
||||
(heavier_once_cell::OnceCell::default(), 0, Status::Evicted)
|
||||
};
|
||||
|
||||
// This object acts as a RAII guard on these metrics: increment on construction
|
||||
if desc.is_delta() {
|
||||
timeline.metrics.layer_count_delta.inc();
|
||||
timeline.metrics.layer_size_delta.add(desc.file_size);
|
||||
} else {
|
||||
timeline.metrics.layer_count_image.inc();
|
||||
timeline.metrics.layer_size_image.add(desc.file_size);
|
||||
}
|
||||
|
||||
LayerInner {
|
||||
conf,
|
||||
debug_str: {
|
||||
@@ -1469,14 +1491,22 @@ impl LayerInner {
|
||||
let duration = SystemTime::now().duration_since(local_layer_mtime);
|
||||
match duration {
|
||||
Ok(elapsed) => {
|
||||
timeline
|
||||
.metrics
|
||||
.evictions_with_low_residence_duration
|
||||
.read()
|
||||
.unwrap()
|
||||
.observe(elapsed);
|
||||
let accessed = self.access_stats.accessed();
|
||||
if accessed {
|
||||
// Only layers used for reads contribute to our "low residence" metric that is used
|
||||
// to detect thrashing. Layers promoted for other reasons (e.g. compaction) are allowed
|
||||
// to be rapidly evicted without contributing to this metric.
|
||||
timeline
|
||||
.metrics
|
||||
.evictions_with_low_residence_duration
|
||||
.read()
|
||||
.unwrap()
|
||||
.observe(elapsed);
|
||||
}
|
||||
|
||||
tracing::info!(
|
||||
residence_millis = elapsed.as_millis(),
|
||||
accessed,
|
||||
"evicted layer after known residence period"
|
||||
);
|
||||
}
|
||||
@@ -1889,7 +1919,7 @@ impl ResidentLayer {
|
||||
self.owner.metadata()
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
/// Cast the layer to a delta, return an error if it is an image layer.
|
||||
pub(crate) async fn get_as_delta(
|
||||
&self,
|
||||
ctx: &RequestContext,
|
||||
@@ -1901,7 +1931,7 @@ impl ResidentLayer {
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
/// Cast the layer to an image, return an error if it is a delta layer.
|
||||
pub(crate) async fn get_as_image(
|
||||
&self,
|
||||
ctx: &RequestContext,
|
||||
|
||||
@@ -22,7 +22,7 @@ const FOREVER: std::time::Duration = std::time::Duration::from_secs(ADVANCE.as_s
|
||||
async fn smoke_test() {
|
||||
let handle = tokio::runtime::Handle::current();
|
||||
|
||||
let h = TenantHarness::create("smoke_test").unwrap();
|
||||
let h = TenantHarness::create("smoke_test").await.unwrap();
|
||||
let span = h.span();
|
||||
let download_span = span.in_scope(|| tracing::info_span!("downloading", timeline_id = 1));
|
||||
let (tenant, _) = h.load().await;
|
||||
@@ -176,7 +176,9 @@ async fn evict_and_wait_on_wanted_deleted() {
|
||||
// this is the runtime on which Layer spawns the blocking tasks on
|
||||
let handle = tokio::runtime::Handle::current();
|
||||
|
||||
let h = TenantHarness::create("evict_and_wait_on_wanted_deleted").unwrap();
|
||||
let h = TenantHarness::create("evict_and_wait_on_wanted_deleted")
|
||||
.await
|
||||
.unwrap();
|
||||
utils::logging::replace_panic_hook_with_tracing_panic_hook().forget();
|
||||
let (tenant, ctx) = h.load().await;
|
||||
|
||||
@@ -258,7 +260,9 @@ fn read_wins_pending_eviction() {
|
||||
rt.block_on(async move {
|
||||
// this is the runtime on which Layer spawns the blocking tasks on
|
||||
let handle = tokio::runtime::Handle::current();
|
||||
let h = TenantHarness::create("read_wins_pending_eviction").unwrap();
|
||||
let h = TenantHarness::create("read_wins_pending_eviction")
|
||||
.await
|
||||
.unwrap();
|
||||
let (tenant, ctx) = h.load().await;
|
||||
let span = h.span();
|
||||
let download_span = span.in_scope(|| tracing::info_span!("downloading", timeline_id = 1));
|
||||
@@ -390,7 +394,7 @@ fn multiple_pending_evictions_scenario(name: &'static str, in_order: bool) {
|
||||
rt.block_on(async move {
|
||||
// this is the runtime on which Layer spawns the blocking tasks on
|
||||
let handle = tokio::runtime::Handle::current();
|
||||
let h = TenantHarness::create(name).unwrap();
|
||||
let h = TenantHarness::create(name).await.unwrap();
|
||||
let (tenant, ctx) = h.load().await;
|
||||
let span = h.span();
|
||||
let download_span = span.in_scope(|| tracing::info_span!("downloading", timeline_id = 1));
|
||||
@@ -559,8 +563,9 @@ fn multiple_pending_evictions_scenario(name: &'static str, in_order: bool) {
|
||||
#[tokio::test(start_paused = true)]
|
||||
async fn cancelled_get_or_maybe_download_does_not_cancel_eviction() {
|
||||
let handle = tokio::runtime::Handle::current();
|
||||
let h =
|
||||
TenantHarness::create("cancelled_get_or_maybe_download_does_not_cancel_eviction").unwrap();
|
||||
let h = TenantHarness::create("cancelled_get_or_maybe_download_does_not_cancel_eviction")
|
||||
.await
|
||||
.unwrap();
|
||||
let (tenant, ctx) = h.load().await;
|
||||
|
||||
let timeline = tenant
|
||||
@@ -636,7 +641,9 @@ async fn cancelled_get_or_maybe_download_does_not_cancel_eviction() {
|
||||
#[tokio::test(start_paused = true)]
|
||||
async fn evict_and_wait_does_not_wait_for_download() {
|
||||
// let handle = tokio::runtime::Handle::current();
|
||||
let h = TenantHarness::create("evict_and_wait_does_not_wait_for_download").unwrap();
|
||||
let h = TenantHarness::create("evict_and_wait_does_not_wait_for_download")
|
||||
.await
|
||||
.unwrap();
|
||||
let (tenant, ctx) = h.load().await;
|
||||
let span = h.span();
|
||||
let download_span = span.in_scope(|| tracing::info_span!("downloading", timeline_id = 1));
|
||||
@@ -733,7 +740,9 @@ async fn eviction_cancellation_on_drop() {
|
||||
// this is the runtime on which Layer spawns the blocking tasks on
|
||||
let handle = tokio::runtime::Handle::current();
|
||||
|
||||
let h = TenantHarness::create("eviction_cancellation_on_drop").unwrap();
|
||||
let h = TenantHarness::create("eviction_cancellation_on_drop")
|
||||
.await
|
||||
.unwrap();
|
||||
utils::logging::replace_panic_hook_with_tracing_panic_hook().forget();
|
||||
let (tenant, ctx) = h.load().await;
|
||||
|
||||
|
||||
@@ -96,15 +96,22 @@ impl<'a> std::cmp::PartialOrd for IteratorWrapper<'a> {
|
||||
impl<'a> std::cmp::Ord for IteratorWrapper<'a> {
|
||||
fn cmp(&self, other: &Self) -> std::cmp::Ordering {
|
||||
use std::cmp::Ordering;
|
||||
let a = self.peek_next_key_lsn();
|
||||
let b = other.peek_next_key_lsn();
|
||||
let a = self.peek_next_key_lsn_value();
|
||||
let b = other.peek_next_key_lsn_value();
|
||||
match (a, b) {
|
||||
(Some((k1, l1)), Some((k2, l2))) => {
|
||||
let loaded_1 = if self.is_loaded() { 1 } else { 0 };
|
||||
let loaded_2 = if other.is_loaded() { 1 } else { 0 };
|
||||
(Some((k1, l1, v1)), Some((k2, l2, v2))) => {
|
||||
fn map_value_to_num(val: &Option<&Value>) -> usize {
|
||||
match val {
|
||||
None => 0,
|
||||
Some(Value::Image(_)) => 1,
|
||||
Some(Value::WalRecord(_)) => 2,
|
||||
}
|
||||
}
|
||||
let order_1 = map_value_to_num(&v1);
|
||||
let order_2 = map_value_to_num(&v2);
|
||||
// When key_lsn are the same, entries are further ordered by value kind, so images appear before WAL records.
|
||||
// And note that we do a reverse at the end of the comparison, so it works with the max heap.
|
||||
(k1, l1, loaded_1).cmp(&(k2, l2, loaded_2))
|
||||
(k1, l1, order_1).cmp(&(k2, l2, order_2))
|
||||
}
|
||||
(Some(_), None) => Ordering::Less,
|
||||
(None, Some(_)) => Ordering::Greater,
|
||||
@@ -137,13 +144,16 @@ impl<'a> IteratorWrapper<'a> {
|
||||
}
|
||||
}
|
||||
|
||||
fn peek_next_key_lsn(&self) -> Option<(&Key, Lsn)> {
|
||||
fn peek_next_key_lsn_value(&self) -> Option<(&Key, Lsn, Option<&Value>)> {
|
||||
match self {
|
||||
Self::Loaded { iter } => iter.peek().as_ref().map(|(key, lsn, _)| (key, *lsn)),
|
||||
Self::Loaded { iter } => iter
|
||||
.peek()
|
||||
.as_ref()
|
||||
.map(|(key, lsn, val)| (key, *lsn, Some(val))),
|
||||
Self::NotLoaded {
|
||||
first_key_lower_bound: (key, lsn),
|
||||
..
|
||||
} => Some((key, *lsn)),
|
||||
} => Some((key, *lsn, None)),
|
||||
}
|
||||
}
|
||||
|
||||
@@ -191,6 +201,13 @@ impl<'a> IteratorWrapper<'a> {
|
||||
}
|
||||
}
|
||||
|
||||
/// A merge iterator over delta/image layer iterators. When duplicated records are
|
||||
/// found, the iterator will not perform any deduplication, and the caller should handle
|
||||
/// these situations. Duplicated records can occur in several ways:
/// * Two identical deltas at the same LSN.
/// * Two identical images at the same LSN.
|
||||
/// * Delta/image at the same LSN where the image has already applied the delta.
|
||||
/// The iterator will always put the image before the delta.
|
||||
pub struct MergeIterator<'a> {
|
||||
heap: BinaryHeap<IteratorWrapper<'a>>,
|
||||
}
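A sketch of the caller-side handling that the comment above asks for, with stand-in types: when the merged stream yields the same (key, LSN) position more than once, keep the first entry (the image, which the iterator is documented to emit first) and drop the rest.

```rust
#[derive(Debug)]
enum Value {
    Image(&'static str),
    WalRecord(&'static str),
}

// Keep only the first entry for each (key, lsn) position in an already
// merged, sorted stream of entries.
fn dedup(merged: Vec<(u32, u64, Value)>) -> Vec<(u32, u64, Value)> {
    let mut out: Vec<(u32, u64, Value)> = Vec::new();
    for (key, lsn, value) in merged {
        // Same position as the previously kept entry: a duplicate, drop it.
        let is_dup = matches!(out.last(), Some((k, l, _)) if (*k, *l) == (key, lsn));
        if !is_dup {
            out.push((key, lsn, value));
        }
    }
    out
}

fn main() {
    let merged = vec![
        (0, 0x10, Value::Image("a")),      // the iterator puts the image first
        (0, 0x10, Value::WalRecord("+a")), // duplicate position, dropped
        (0, 0x18, Value::WalRecord("+b")),
    ];
    let kept = dedup(merged);
    println!("{kept:?}");
    assert_eq!(kept.len(), 2);
}
```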
@@ -245,8 +262,9 @@ mod tests {
|
||||
use crate::{
|
||||
tenant::{
|
||||
harness::{TenantHarness, TIMELINE_ID},
|
||||
storage_layer::delta_layer::test::{produce_delta_layer, sort_delta},
|
||||
storage_layer::delta_layer::test::{produce_delta_layer, sort_delta, sort_delta_value},
|
||||
},
|
||||
walrecord::NeonWalRecord,
|
||||
DEFAULT_PG_VERSION,
|
||||
};
|
||||
|
||||
@@ -275,7 +293,9 @@ mod tests {
|
||||
use crate::repository::Value;
|
||||
use bytes::Bytes;
|
||||
|
||||
let harness = TenantHarness::create("merge_iterator_merge_in_between").unwrap();
|
||||
let harness = TenantHarness::create("merge_iterator_merge_in_between")
|
||||
.await
|
||||
.unwrap();
|
||||
let (tenant, ctx) = harness.load().await;
|
||||
|
||||
let tline = tenant
|
||||
@@ -338,7 +358,9 @@ mod tests {
|
||||
use crate::repository::Value;
|
||||
use bytes::Bytes;
|
||||
|
||||
let harness = TenantHarness::create("merge_iterator_delta_merge").unwrap();
|
||||
let harness = TenantHarness::create("merge_iterator_delta_merge")
|
||||
.await
|
||||
.unwrap();
|
||||
let (tenant, ctx) = harness.load().await;
|
||||
|
||||
let tline = tenant
|
||||
@@ -407,6 +429,133 @@ mod tests {
|
||||
// TODO: test layers are loaded only when needed, reducing num of active iterators in k-merge
|
||||
}
|
||||
|
||||
// TODO: image layer merge, delta+image mixed merge
|
||||
// TODO: is it possible to have duplicated delta at same LSN now? we might need to test that
|
||||
#[tokio::test]
|
||||
async fn delta_image_mixed_merge() {
|
||||
use crate::repository::Value;
|
||||
use bytes::Bytes;
|
||||
|
||||
let harness = TenantHarness::create("merge_iterator_delta_image_mixed_merge")
|
||||
.await
|
||||
.unwrap();
|
||||
let (tenant, ctx) = harness.load().await;
|
||||
|
||||
let tline = tenant
|
||||
.create_test_timeline(TIMELINE_ID, Lsn(0x10), DEFAULT_PG_VERSION, &ctx)
|
||||
.await
|
||||
.unwrap();
|
||||
|
||||
fn get_key(id: u32) -> Key {
|
||||
let mut key = Key::from_hex("000000000033333333444444445500000000").unwrap();
|
||||
key.field6 = id;
|
||||
key
|
||||
}
|
||||
// In this test case, we want to test if the iterator still works correctly with multiple copies
|
||||
// of a delta+image at the same LSN, for example, the following sequence a@10=+a, a@10=+a, a@10=ab, a@10=ab.
|
||||
// Duplicated deltas/images are possible for old tenants before the full L0 compaction file name fix.
|
||||
// An incomplete compaction could produce multiple exactly-the-same delta layers. Force image generation
|
||||
// could produce overlapping images. Apart from duplicated deltas/images, in the current storage implementation
|
||||
// one key-lsn could have a delta in the delta layer and one image in the image layer. The iterator should
|
||||
// correctly process these situations and return everything as-is, and the upper layer of the system
|
||||
// will handle duplicated LSNs.
|
||||
let test_deltas1 = vec![
|
||||
(
|
||||
get_key(0),
|
||||
Lsn(0x10),
|
||||
Value::WalRecord(NeonWalRecord::wal_init()),
|
||||
),
|
||||
(
|
||||
get_key(0),
|
||||
Lsn(0x18),
|
||||
Value::WalRecord(NeonWalRecord::wal_append("a")),
|
||||
),
|
||||
(
|
||||
get_key(5),
|
||||
Lsn(0x10),
|
||||
Value::WalRecord(NeonWalRecord::wal_init()),
|
||||
),
|
||||
(
|
||||
get_key(5),
|
||||
Lsn(0x18),
|
||||
Value::WalRecord(NeonWalRecord::wal_append("b")),
|
||||
),
|
||||
];
|
||||
let resident_layer_1 = produce_delta_layer(&tenant, &tline, test_deltas1.clone(), &ctx)
|
||||
.await
|
||||
.unwrap();
|
||||
let mut test_deltas2 = test_deltas1.clone();
|
||||
test_deltas2.push((
|
||||
get_key(10),
|
||||
Lsn(0x20),
|
||||
Value::Image(Bytes::copy_from_slice(b"test")),
|
||||
));
|
||||
let resident_layer_2 = produce_delta_layer(&tenant, &tline, test_deltas2.clone(), &ctx)
|
||||
.await
|
||||
.unwrap();
|
||||
let test_deltas3 = vec![
|
||||
(
|
||||
get_key(0),
|
||||
Lsn(0x10),
|
||||
Value::Image(Bytes::copy_from_slice(b"")),
|
||||
),
|
||||
(
|
||||
get_key(5),
|
||||
Lsn(0x18),
|
||||
Value::Image(Bytes::copy_from_slice(b"b")),
|
||||
),
|
||||
(
|
||||
get_key(15),
|
||||
Lsn(0x20),
|
||||
Value::Image(Bytes::copy_from_slice(b"test")),
|
||||
),
|
||||
];
|
||||
let resident_layer_3 = produce_delta_layer(&tenant, &tline, test_deltas3.clone(), &ctx)
|
||||
.await
|
||||
.unwrap();
|
||||
let mut test_deltas4 = test_deltas3.clone();
|
||||
test_deltas4.push((
|
||||
get_key(20),
|
||||
Lsn(0x20),
|
||||
Value::Image(Bytes::copy_from_slice(b"test")),
|
||||
));
|
||||
let resident_layer_4 = produce_delta_layer(&tenant, &tline, test_deltas4.clone(), &ctx)
|
||||
.await
|
||||
.unwrap();
|
||||
let mut expect = Vec::new();
|
||||
expect.extend(test_deltas1);
|
||||
expect.extend(test_deltas2);
|
||||
expect.extend(test_deltas3);
|
||||
expect.extend(test_deltas4);
|
||||
expect.sort_by(sort_delta_value);
|
||||
|
||||
// Test with different layer order for MergeIterator::create to ensure the order
|
||||
// is stable.
|
||||
|
||||
let mut merge_iter = MergeIterator::create(
|
||||
&[
|
||||
resident_layer_4.get_as_delta(&ctx).await.unwrap(),
|
||||
resident_layer_1.get_as_delta(&ctx).await.unwrap(),
|
||||
resident_layer_3.get_as_delta(&ctx).await.unwrap(),
|
||||
resident_layer_2.get_as_delta(&ctx).await.unwrap(),
|
||||
],
|
||||
&[],
|
||||
&ctx,
|
||||
);
|
||||
assert_merge_iter_equal(&mut merge_iter, &expect).await;
|
||||
|
||||
let mut merge_iter = MergeIterator::create(
|
||||
&[
|
||||
resident_layer_1.get_as_delta(&ctx).await.unwrap(),
|
||||
resident_layer_4.get_as_delta(&ctx).await.unwrap(),
|
||||
resident_layer_3.get_as_delta(&ctx).await.unwrap(),
|
||||
resident_layer_2.get_as_delta(&ctx).await.unwrap(),
|
||||
],
|
||||
&[],
|
||||
&ctx,
|
||||
);
|
||||
assert_merge_iter_equal(&mut merge_iter, &expect).await;
|
||||
|
||||
is_send(merge_iter);
|
||||
}
|
||||
|
||||
fn is_send(_: impl Send) {}
|
||||
}
|
||||
|
||||
@@ -69,6 +69,7 @@ use std::{
|
||||
use crate::{
|
||||
aux_file::AuxFileSizeEstimator,
|
||||
tenant::{
|
||||
config::defaults::DEFAULT_PITR_INTERVAL,
|
||||
layer_map::{LayerMap, SearchResult},
|
||||
metadata::TimelineMetadata,
|
||||
storage_layer::PersistentLayerDesc,
|
||||
@@ -197,7 +198,7 @@ impl PartialOrd for Hole {
|
||||
|
||||
/// Temporary function for immutable storage state refactor, ensures we are dropping mutex guard instead of other things.
|
||||
/// Can be removed after all refactors are done.
|
||||
fn drop_rlock<T>(rlock: tokio::sync::OwnedRwLockReadGuard<T>) {
|
||||
fn drop_rlock<T>(rlock: tokio::sync::RwLockReadGuard<T>) {
|
||||
drop(rlock)
|
||||
}
|
||||
|
||||
@@ -270,7 +271,7 @@ pub struct Timeline {
|
||||
///
|
||||
/// In the future, we'll be able to split up the tuple of LayerMap and `LayerFileManager`,
|
||||
/// so that e.g. on-demand-download/eviction, and layer spreading, can operate just on `LayerFileManager`.
|
||||
pub(crate) layers: Arc<tokio::sync::RwLock<LayerManager>>,
|
||||
pub(crate) layers: tokio::sync::RwLock<LayerManager>,
|
||||
|
||||
last_freeze_at: AtomicLsn,
|
||||
// Atomic would be more appropriate here.
|
||||
@@ -477,37 +478,32 @@ impl GcInfo {
|
||||
}
|
||||
}
|
||||
|
||||
/// The `GcInfo` component describing which Lsns need to be retained.
|
||||
/// The `GcInfo` component describing which Lsns need to be retained. Functionally, this
|
||||
/// is a single number (the oldest LSN which we must retain), but it internally distinguishes
|
||||
/// between time-based and space-based retention for observability and consumption metrics purposes.
|
||||
#[derive(Debug)]
|
||||
pub(crate) struct GcCutoffs {
|
||||
/// Keep everything newer than this point.
|
||||
///
|
||||
/// This is calculated by subtracting 'gc_horizon' setting from
|
||||
/// last-record LSN
|
||||
///
|
||||
/// FIXME: is this inclusive or exclusive?
|
||||
pub(crate) horizon: Lsn,
|
||||
/// Calculated from the [`TenantConf::gc_horizon`], this LSN indicates how much
|
||||
/// history we must keep to retain a specified number of bytes of WAL.
|
||||
pub(crate) space: Lsn,
|
||||
|
||||
/// In addition to 'retain_lsns' and 'horizon_cutoff', keep everything newer than this
|
||||
/// point.
|
||||
///
|
||||
/// This is calculated by finding a number such that a record is needed for PITR
|
||||
/// if only if its LSN is larger than 'pitr_cutoff'.
|
||||
pub(crate) pitr: Lsn,
|
||||
/// Calculated from [`TenantConf::pitr_interval`], this LSN indicates how much
|
||||
/// history we must keep to enable reading back at least the PITR interval duration.
|
||||
pub(crate) time: Lsn,
|
||||
}
|
||||
|
||||
impl Default for GcCutoffs {
|
||||
fn default() -> Self {
|
||||
Self {
|
||||
horizon: Lsn::INVALID,
|
||||
pitr: Lsn::INVALID,
|
||||
space: Lsn::INVALID,
|
||||
time: Lsn::INVALID,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl GcCutoffs {
|
||||
fn select_min(&self) -> Lsn {
|
||||
std::cmp::min(self.horizon, self.pitr)
|
||||
std::cmp::min(self.space, self.time)
|
||||
}
|
||||
}
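A standalone model (with a stand-in `Lsn` type, not the pageserver one) of how the two cutoffs combine: GC must keep everything newer than the older of the two points, which is what `select_min` returns.

```rust
// Illustrative LSN newtype; ordering follows the numeric value.
#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord)]
struct Lsn(u64);

struct GcCutoffs {
    space: Lsn,
    time: Lsn,
}

impl GcCutoffs {
    fn select_min(&self) -> Lsn {
        std::cmp::min(self.space, self.time)
    }
}

fn main() {
    let cutoffs = GcCutoffs {
        space: Lsn(0x40),
        time: Lsn(0x30),
    };
    // Time-based retention reaches further back here, so it wins.
    assert_eq!(cutoffs.select_min(), Lsn(0x30));
}
```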
@@ -866,7 +862,7 @@ impl Timeline {
|
||||
let gc_info = self.gc_info.read().unwrap();
|
||||
let history = self
|
||||
.get_last_record_lsn()
|
||||
.checked_sub(gc_info.cutoffs.pitr)
|
||||
.checked_sub(gc_info.cutoffs.time)
|
||||
.unwrap_or(Lsn(0))
|
||||
.0;
|
||||
(history, gc_info.within_ancestor_pitr)
|
||||
@@ -1565,7 +1561,7 @@ impl Timeline {
|
||||
) -> anyhow::Result<()> {
|
||||
ensure!(
|
||||
lsn >= **latest_gc_cutoff_lsn,
|
||||
"LSN {} is earlier than latest GC horizon {} (we might've already garbage collected needed data)",
|
||||
"LSN {} is earlier than latest GC cutoff {} (we might've already garbage collected needed data)",
|
||||
lsn,
|
||||
**latest_gc_cutoff_lsn,
|
||||
);
|
||||
@@ -3408,6 +3404,7 @@ impl Timeline {
|
||||
}
|
||||
}
|
||||
|
||||
#[allow(unknown_lints)] // doc_lazy_continuation is still a new lint
|
||||
#[allow(clippy::doc_lazy_continuation)]
|
||||
/// Get the data needed to reconstruct all keys in the provided keyspace
|
||||
///
|
||||
@@ -4732,13 +4729,7 @@ impl Timeline {
|
||||
tenant: &crate::tenant::Tenant,
|
||||
options: detach_ancestor::Options,
|
||||
ctx: &RequestContext,
|
||||
) -> Result<
|
||||
(
|
||||
completion::Completion,
|
||||
detach_ancestor::PreparedTimelineDetach,
|
||||
),
|
||||
detach_ancestor::Error,
|
||||
> {
|
||||
) -> Result<detach_ancestor::Progress, detach_ancestor::Error> {
|
||||
detach_ancestor::prepare(self, tenant, options, ctx).await
|
||||
}
|
||||
|
||||
@@ -4945,24 +4936,21 @@ impl Timeline {
|
||||
}
|
||||
|
||||
/// Find the Lsns above which layer files need to be retained on
|
||||
/// garbage collection. This is separate from actually performing the GC,
|
||||
/// and is updated more frequently, so that compaction can remove obsolete
|
||||
/// page versions more aggressively.
|
||||
/// garbage collection.
|
||||
///
|
||||
/// TODO: that's wishful thinking, compaction doesn't actually do that
|
||||
/// currently.
|
||||
/// We calculate two cutoffs, one based on time and one based on WAL size. `pitr`
|
||||
/// controls the time cutoff (or ZERO to disable time-based retention), and `space_cutoff` controls
|
||||
/// the space-based retention.
|
||||
///
|
||||
/// The 'cutoff_horizon' point is used to retain recent versions that might still be
|
||||
/// needed by read-only nodes. (As of this writing, the caller just passes
|
||||
/// the latest LSN subtracted by a constant, and doesn't do anything smart
|
||||
/// to figure out what read-only nodes might actually need.)
|
||||
///
|
||||
/// The 'pitr' duration is used to calculate a 'pitr_cutoff', which can be used to determine
|
||||
/// whether a record is needed for PITR.
|
||||
/// This function doesn't simply calculate time- and space-based retention: it treats time-based
|
||||
/// retention as authoritative if enabled, and falls back to space-based retention if calculating
|
||||
/// the LSN for a time point isn't possible. Therefore the GcCutoffs::space in the response might
|
||||
/// be different from the `space_cutoff` input. Callers should treat the min() of the two cutoffs
|
||||
/// in the response as the GC cutoff point for the timeline.
|
||||
#[instrument(skip_all, fields(timeline_id=%self.timeline_id))]
|
||||
pub(super) async fn find_gc_cutoffs(
|
||||
&self,
|
||||
cutoff_horizon: Lsn,
|
||||
space_cutoff: Lsn,
|
||||
pitr: Duration,
|
||||
cancel: &CancellationToken,
|
||||
ctx: &RequestContext,
|
||||
@@ -4975,58 +4963,87 @@ impl Timeline {
|
||||
|
||||
pausable_failpoint!("Timeline::find_gc_cutoffs-pausable");
|
||||
|
||||
// First, calculate pitr_cutoff_timestamp and then convert it to LSN.
|
||||
//
|
||||
// Some unit tests depend on garbage-collection working even when
|
||||
// CLOG data is missing, so that find_lsn_for_timestamp() doesn't
|
||||
// work, so avoid calling it altogether if time-based retention is not
|
||||
// configured. It would be pointless anyway.
|
||||
let pitr_cutoff = if pitr != Duration::ZERO {
|
||||
let now = SystemTime::now();
|
||||
if let Some(pitr_cutoff_timestamp) = now.checked_sub(pitr) {
|
||||
let pitr_timestamp = to_pg_timestamp(pitr_cutoff_timestamp);
|
||||
|
||||
match self
|
||||
.find_lsn_for_timestamp(pitr_timestamp, cancel, ctx)
|
||||
.await?
|
||||
{
|
||||
LsnForTimestamp::Present(lsn) => lsn,
|
||||
LsnForTimestamp::Future(lsn) => {
|
||||
// The timestamp is in the future. That sounds impossible,
|
||||
// but what it really means is that there hasn't been
|
||||
// any commits since the cutoff timestamp.
|
||||
//
|
||||
// In this case we should use the LSN of the most recent commit,
|
||||
// which is implicitly the last LSN in the log.
|
||||
debug!("future({})", lsn);
|
||||
self.get_last_record_lsn()
|
||||
}
|
||||
LsnForTimestamp::Past(lsn) => {
|
||||
debug!("past({})", lsn);
|
||||
// conservative, safe default is to remove nothing, when we
|
||||
// have no commit timestamp data available
|
||||
*self.get_latest_gc_cutoff_lsn()
|
||||
}
|
||||
LsnForTimestamp::NoData(lsn) => {
|
||||
debug!("nodata({})", lsn);
|
||||
// conservative, safe default is to remove nothing, when we
|
||||
// have no commit timestamp data available
|
||||
*self.get_latest_gc_cutoff_lsn()
|
||||
}
|
||||
}
|
||||
} else {
|
||||
// If we don't have enough data to convert to LSN,
|
||||
// play safe and don't remove any layers.
|
||||
*self.get_latest_gc_cutoff_lsn()
|
||||
if cfg!(test) {
|
||||
// Unit tests which specify zero PITR interval expect to avoid doing any I/O for timestamp lookup
|
||||
if pitr == Duration::ZERO {
|
||||
return Ok(GcCutoffs {
|
||||
time: self.get_last_record_lsn(),
|
||||
space: space_cutoff,
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
// Calculate a time-based limit on how much to retain:
|
||||
// - if PITR interval is set, then this is our cutoff.
|
||||
// - if PITR interval is not set, then we do a lookup
|
||||
// based on DEFAULT_PITR_INTERVAL, so that size-based retention does not result in keeping history around permanently on idle databases.
|
||||
let time_cutoff = {
|
||||
let now = SystemTime::now();
|
||||
let time_range = if pitr == Duration::ZERO {
|
||||
humantime::parse_duration(DEFAULT_PITR_INTERVAL).expect("constant is invalid")
|
||||
} else {
|
||||
pitr
|
||||
};
|
||||
|
||||
// If PITR is so large or `now` is so small that this underflows, we will retain no history (highly unexpected case)
|
||||
let time_cutoff = now.checked_sub(time_range).unwrap_or(now);
|
||||
let timestamp = to_pg_timestamp(time_cutoff);
|
||||
|
||||
match self.find_lsn_for_timestamp(timestamp, cancel, ctx).await? {
|
||||
LsnForTimestamp::Present(lsn) => Some(lsn),
|
||||
LsnForTimestamp::Future(lsn) => {
|
||||
// The timestamp is in the future. That sounds impossible,
|
||||
// but what it really means is that there hasn't been
|
||||
// any commits since the cutoff timestamp.
|
||||
//
|
||||
// In this case we should use the LSN of the most recent commit,
|
||||
// which is implicitly the last LSN in the log.
|
||||
debug!("future({})", lsn);
|
||||
Some(self.get_last_record_lsn())
|
||||
}
|
||||
LsnForTimestamp::Past(lsn) => {
|
||||
debug!("past({})", lsn);
|
||||
None
|
||||
}
|
||||
LsnForTimestamp::NoData(lsn) => {
|
||||
debug!("nodata({})", lsn);
|
||||
None
|
||||
}
|
||||
}
|
||||
} else {
|
||||
// No time-based retention was configured. Interpret this as "keep no history".
|
||||
self.get_last_record_lsn()
|
||||
};
|
||||
|
||||
Ok(GcCutoffs {
|
||||
horizon: cutoff_horizon,
|
||||
pitr: pitr_cutoff,
|
||||
Ok(match (pitr, time_cutoff) {
|
||||
(Duration::ZERO, Some(time_cutoff)) => {
|
||||
// PITR is not set. Retain the size-based limit, or the default time retention,
|
||||
// whichever requires less data.
|
||||
GcCutoffs {
|
||||
time: self.get_last_record_lsn(),
|
||||
space: std::cmp::max(time_cutoff, space_cutoff),
|
||||
}
|
||||
}
|
||||
(Duration::ZERO, None) => {
|
||||
// PITR is not set, and time lookup failed
|
||||
GcCutoffs {
|
||||
time: self.get_last_record_lsn(),
|
||||
space: space_cutoff,
|
||||
}
|
||||
}
|
||||
(_, None) => {
|
||||
// PITR interval is set & we didn't look up a timestamp successfully. Conservatively assume PITR
|
||||
// cannot advance beyond what was already GC'd, and respect space-based retention
|
||||
GcCutoffs {
|
||||
time: *self.get_latest_gc_cutoff_lsn(),
|
||||
space: space_cutoff,
|
||||
}
|
||||
}
|
||||
(_, Some(time_cutoff)) => {
|
||||
// PITR interval is set and we looked up timestamp successfully. Ignore
|
||||
// size based retention and make time cutoff authoritative
|
||||
GcCutoffs {
|
||||
time: time_cutoff,
|
||||
space: time_cutoff,
|
||||
}
|
||||
}
|
||||
})
|
||||
}
|
@@ -5051,11 +5068,11 @@ impl Timeline {
|
||||
return Err(GcError::TimelineCancelled);
|
||||
}
|
||||
|
||||
let (horizon_cutoff, pitr_cutoff, retain_lsns, max_lsn_with_valid_lease) = {
|
||||
let (space_cutoff, time_cutoff, retain_lsns, max_lsn_with_valid_lease) = {
|
||||
let gc_info = self.gc_info.read().unwrap();
|
||||
|
||||
let horizon_cutoff = min(gc_info.cutoffs.horizon, self.get_disk_consistent_lsn());
|
||||
let pitr_cutoff = gc_info.cutoffs.pitr;
|
||||
let space_cutoff = min(gc_info.cutoffs.space, self.get_disk_consistent_lsn());
|
||||
let time_cutoff = gc_info.cutoffs.time;
|
||||
let retain_lsns = gc_info.retain_lsns.clone();
|
||||
|
||||
// Gets the maximum LSN that holds the valid lease.
|
||||
@@ -5065,14 +5082,14 @@ impl Timeline {
|
||||
let max_lsn_with_valid_lease = gc_info.leases.last_key_value().map(|(lsn, _)| *lsn);
|
||||
|
||||
(
|
||||
horizon_cutoff,
|
||||
pitr_cutoff,
|
||||
space_cutoff,
|
||||
time_cutoff,
|
||||
retain_lsns,
|
||||
max_lsn_with_valid_lease,
|
||||
)
|
||||
};
|
||||
|
||||
let mut new_gc_cutoff = Lsn::min(horizon_cutoff, pitr_cutoff);
|
||||
let mut new_gc_cutoff = Lsn::min(space_cutoff, time_cutoff);
|
||||
let standby_horizon = self.standby_horizon.load();
|
||||
// Hold GC for the standby, but as a safety guard do it only within some
|
||||
// reasonable lag.
|
||||
@@ -5101,8 +5118,8 @@ impl Timeline {
|
||||
|
||||
let res = self
|
||||
.gc_timeline(
|
||||
horizon_cutoff,
|
||||
pitr_cutoff,
|
||||
space_cutoff,
|
||||
time_cutoff,
|
||||
retain_lsns,
|
||||
max_lsn_with_valid_lease,
|
||||
new_gc_cutoff,
|
||||
@@ -5120,8 +5137,8 @@ impl Timeline {
|
||||
|
||||
async fn gc_timeline(
|
||||
&self,
|
||||
horizon_cutoff: Lsn,
|
||||
pitr_cutoff: Lsn,
|
||||
space_cutoff: Lsn,
|
||||
time_cutoff: Lsn,
|
||||
retain_lsns: Vec<Lsn>,
|
||||
max_lsn_with_valid_lease: Option<Lsn>,
|
||||
new_gc_cutoff: Lsn,
|
||||
@@ -5182,22 +5199,22 @@ impl Timeline {
|
||||
result.layers_total += 1;
|
||||
|
||||
// 1. Is it newer than the space (GC horizon) cutoff point?
|
||||
if l.get_lsn_range().end > horizon_cutoff {
|
||||
if l.get_lsn_range().end > space_cutoff {
|
||||
debug!(
|
||||
"keeping {} because it's newer than horizon_cutoff {}",
|
||||
"keeping {} because it's newer than space_cutoff {}",
|
||||
l.layer_name(),
|
||||
horizon_cutoff,
|
||||
space_cutoff,
|
||||
);
|
||||
result.layers_needed_by_cutoff += 1;
|
||||
continue 'outer;
|
||||
}
|
||||
|
||||
// 2. Is it newer than the time (PITR) cutoff point?
|
||||
if l.get_lsn_range().end > pitr_cutoff {
|
||||
if l.get_lsn_range().end > time_cutoff {
|
||||
debug!(
|
||||
"keeping {} because it's newer than pitr_cutoff {}",
|
||||
"keeping {} because it's newer than time_cutoff {}",
|
||||
l.layer_name(),
|
||||
pitr_cutoff,
|
||||
time_cutoff,
|
||||
);
|
||||
result.layers_needed_by_pitr += 1;
|
||||
continue 'outer;
|
||||
@@ -6029,8 +6046,9 @@ mod tests {
|
||||
|
||||
#[tokio::test]
|
||||
async fn two_layer_eviction_attempts_at_the_same_time() {
|
||||
let harness =
|
||||
TenantHarness::create("two_layer_eviction_attempts_at_the_same_time").unwrap();
|
||||
let harness = TenantHarness::create("two_layer_eviction_attempts_at_the_same_time")
|
||||
.await
|
||||
.unwrap();
|
||||
|
||||
let (tenant, ctx) = harness.load().await;
|
||||
let timeline = tenant
|
||||
|
||||
@@ -26,9 +26,11 @@ use utils::id::TimelineId;
|
||||
|
||||
use crate::context::{AccessStatsBehavior, RequestContext, RequestContextBuilder};
|
||||
use crate::page_cache;
|
||||
use crate::tenant::config::defaults::{DEFAULT_CHECKPOINT_DISTANCE, DEFAULT_COMPACTION_THRESHOLD};
|
||||
use crate::tenant::storage_layer::merge_iterator::MergeIterator;
|
||||
use crate::tenant::storage_layer::{AsLayerDesc, PersistentLayerDesc};
|
||||
use crate::tenant::timeline::{drop_rlock, Hole, ImageLayerCreationOutcome};
|
||||
use crate::tenant::timeline::{DeltaLayerWriter, ImageLayerWriter};
|
||||
use crate::tenant::timeline::{drop_rlock, DeltaLayerWriter, ImageLayerWriter};
|
||||
use crate::tenant::timeline::{Hole, ImageLayerCreationOutcome};
|
||||
use crate::tenant::timeline::{Layer, ResidentLayer};
|
||||
use crate::tenant::DeltaLayer;
|
||||
use crate::virtual_file::{MaybeFatalIo, VirtualFile};
|
||||
@@ -195,7 +197,7 @@ impl Timeline {
|
||||
tracing::info!(
|
||||
"latest_gc_cutoff: {}, pitr cutoff {}",
|
||||
*latest_gc_cutoff,
|
||||
self.gc_info.read().unwrap().cutoffs.pitr
|
||||
self.gc_info.read().unwrap().cutoffs.time
|
||||
);
|
||||
|
||||
let layers = self.layers.read().await;
|
||||
@@ -379,7 +381,7 @@ impl Timeline {
|
||||
};
|
||||
|
||||
let begin = tokio::time::Instant::now();
|
||||
let phase1_layers_locked = Arc::clone(&self.layers).read_owned().await;
|
||||
let phase1_layers_locked = self.layers.read().await;
|
||||
let now = tokio::time::Instant::now();
|
||||
stats.read_lock_acquisition_micros =
|
||||
DurationRecorder::Recorded(RecordedDuration(now - begin), now);
|
||||
@@ -399,9 +401,9 @@ impl Timeline {
|
||||
}
|
||||
|
||||
/// Level0 files first phase of compaction, explained in the [`Self::compact_legacy`] comment.
|
||||
async fn compact_level0_phase1(
|
||||
self: &Arc<Self>,
|
||||
guard: tokio::sync::OwnedRwLockReadGuard<LayerManager>,
|
||||
async fn compact_level0_phase1<'a>(
|
||||
self: &'a Arc<Self>,
|
||||
guard: tokio::sync::RwLockReadGuard<'a, LayerManager>,
|
||||
mut stats: CompactLevel0Phase1StatsBuilder,
|
||||
target_file_size: u64,
|
||||
ctx: &RequestContext,
|
||||
@@ -415,6 +417,7 @@ impl Timeline {
|
||||
.map(|x| guard.get_from_desc(&x))
|
||||
.collect_vec();
|
||||
stats.level0_deltas_count = Some(level0_deltas.len());
|
||||
|
||||
// Only compact if enough layers have accumulated.
|
||||
let threshold = self.get_compaction_threshold();
|
||||
if level0_deltas.is_empty() || level0_deltas.len() < threshold {
|
||||
@@ -445,6 +448,22 @@ impl Timeline {
|
||||
let mut prev_lsn_end = first_level0_delta.layer_desc().lsn_range.end;
|
||||
let mut deltas_to_compact = Vec::with_capacity(level0_deltas.len());
|
||||
|
||||
// Accumulate the size of layers in `deltas_to_compact`
|
||||
let mut deltas_to_compact_bytes = 0;
|
||||
|
||||
// Under normal circumstances, we will accumulate up to compaction_interval L0s of size
|
||||
// checkpoint_distance each. To avoid edge cases using extra system resources, bound our
|
||||
// work in this function to only operate on this much delta data at once.
|
||||
//
|
||||
// Take the max of the configured value & the default, so that tests that configure tiny values
|
||||
// can still use a sensible amount of memory, but if a deployed system configures bigger values we
|
||||
// still let them compact a full stack of L0s in one go.
|
||||
let delta_size_limit = std::cmp::max(
|
||||
self.get_compaction_threshold(),
|
||||
DEFAULT_COMPACTION_THRESHOLD,
|
||||
) as u64
|
||||
* std::cmp::max(self.get_checkpoint_distance(), DEFAULT_CHECKPOINT_DISTANCE);
|
||||
|
||||
deltas_to_compact.push(first_level0_delta.download_and_keep_resident().await?);
|
||||
for l in level0_deltas_iter {
|
||||
let lsn_range = &l.layer_desc().lsn_range;
|
||||
@@ -453,7 +472,20 @@ impl Timeline {
|
||||
break;
|
||||
}
|
||||
deltas_to_compact.push(l.download_and_keep_resident().await?);
|
||||
deltas_to_compact_bytes += l.metadata().file_size;
|
||||
prev_lsn_end = lsn_range.end;
|
||||
|
||||
if deltas_to_compact_bytes >= delta_size_limit {
|
||||
info!(
|
||||
l0_deltas_selected = deltas_to_compact.len(),
|
||||
l0_deltas_total = level0_deltas.len(),
|
||||
"L0 compaction picker hit max delta layer size limit: {}",
|
||||
delta_size_limit
|
||||
);
|
||||
|
||||
// Proceed with compaction, but only a subset of L0s
|
||||
break;
|
||||
}
|
||||
}
|
||||
let lsn_range = Range {
|
||||
start: deltas_to_compact
|
||||
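The new L0 selection rule above bounds how much delta data a single compaction pass takes on. A hedged sketch of that selection loop with stand-in types; the default constants below are assumptions, not the pageserver's configured values:

```
fn pick_l0_deltas(layer_sizes: &[u64], compaction_threshold: usize, checkpoint_distance: u64) -> usize {
    // Assumed defaults, stand-ins for the real pageserver constants.
    const DEFAULT_COMPACTION_THRESHOLD: usize = 10;
    const DEFAULT_CHECKPOINT_DISTANCE: u64 = 256 * 1024 * 1024;

    // Take the max of the configured value and the default so tiny test
    // configs still get a sensible bound.
    let delta_size_limit = std::cmp::max(compaction_threshold, DEFAULT_COMPACTION_THRESHOLD) as u64
        * std::cmp::max(checkpoint_distance, DEFAULT_CHECKPOINT_DISTANCE);

    let mut picked = 0;
    let mut deltas_to_compact_bytes = 0;
    for &size in layer_sizes {
        picked += 1;
        deltas_to_compact_bytes += size;
        if deltas_to_compact_bytes >= delta_size_limit {
            // Proceed with compaction, but only this subset of L0s.
            break;
        }
    }
    picked
}
```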
@@ -990,7 +1022,7 @@ impl Timeline {
|
||||
"enhanced legacy compaction currently does not support retain_lsns (branches)"
|
||||
)));
|
||||
}
|
||||
let gc_cutoff = Lsn::min(gc_info.cutoffs.horizon, gc_info.cutoffs.pitr);
|
||||
let gc_cutoff = gc_info.cutoffs.select_min();
|
||||
let mut selected_layers = Vec::new();
|
||||
// TODO: consider retain_lsns
|
||||
drop(gc_info);
|
||||
@@ -1008,10 +1040,12 @@ impl Timeline {
|
||||
);
|
||||
// Step 1: (In the future) construct a k-merge iterator over all layers. For now, simply collect all keys + LSNs.
|
||||
// Also, collect the layer information to decide when to split the new delta layers.
|
||||
let mut all_key_values = Vec::new();
|
||||
let mut downloaded_layers = Vec::new();
|
||||
let mut delta_split_points = BTreeSet::new();
|
||||
for layer in &layer_selection {
|
||||
all_key_values.extend(layer.load_key_values(ctx).await?);
|
||||
let resident_layer = layer.download_and_keep_resident().await?;
|
||||
downloaded_layers.push(resident_layer);
|
||||
|
||||
let desc = layer.layer_desc();
|
||||
if desc.is_delta() {
|
||||
// TODO: is it correct to only record split points for deltas intersecting with the GC horizon? (exclude those below/above the horizon)
|
||||
@@ -1021,44 +1055,28 @@ impl Timeline {
|
||||
delta_split_points.insert(key_range.end);
|
||||
}
|
||||
}
|
||||
// Key small to large, LSN low to high, if the same LSN has both image and delta due to the merge of delta layers and
|
||||
// image layers, make the image appear before the delta.
|
||||
struct ValueWrapper<'a>(&'a crate::repository::Value);
|
||||
impl Ord for ValueWrapper<'_> {
|
||||
fn cmp(&self, other: &Self) -> std::cmp::Ordering {
|
||||
use crate::repository::Value;
|
||||
use std::cmp::Ordering;
|
||||
match (self.0, other.0) {
|
||||
(Value::Image(_), Value::WalRecord(_)) => Ordering::Less,
|
||||
(Value::WalRecord(_), Value::Image(_)) => Ordering::Greater,
|
||||
_ => Ordering::Equal,
|
||||
}
|
||||
let mut delta_layers = Vec::new();
|
||||
let mut image_layers = Vec::new();
|
||||
for resident_layer in &downloaded_layers {
|
||||
if resident_layer.layer_desc().is_delta() {
|
||||
let layer = resident_layer.get_as_delta(ctx).await?;
|
||||
delta_layers.push(layer);
|
||||
} else {
|
||||
let layer = resident_layer.get_as_image(ctx).await?;
|
||||
image_layers.push(layer);
|
||||
}
|
||||
}
|
||||
impl PartialOrd for ValueWrapper<'_> {
|
||||
fn partial_cmp(&self, other: &Self) -> Option<std::cmp::Ordering> {
|
||||
Some(self.cmp(other))
|
||||
}
|
||||
}
|
||||
impl PartialEq for ValueWrapper<'_> {
|
||||
fn eq(&self, other: &Self) -> bool {
|
||||
self.cmp(other) == std::cmp::Ordering::Equal
|
||||
}
|
||||
}
|
||||
impl Eq for ValueWrapper<'_> {}
|
||||
all_key_values.sort_by(|(k1, l1, v1), (k2, l2, v2)| {
|
||||
(k1, l1, ValueWrapper(v1)).cmp(&(k2, l2, ValueWrapper(v2)))
|
||||
});
|
||||
let mut merge_iter = MergeIterator::create(&delta_layers, &image_layers, ctx);
|
||||
// Step 2: Produce images+deltas. TODO: ensure newly-produced delta does not overlap with other deltas.
|
||||
// Data of the same key.
|
||||
let mut accumulated_values = Vec::new();
|
||||
let mut last_key = all_key_values.first().unwrap().0; // TODO: assert all_key_values not empty
|
||||
let mut last_key: Option<Key> = None;
|
||||
|
||||
/// Take a list of images and deltas, produce an image at the GC horizon, and a list of deltas above the GC horizon.
|
||||
async fn flush_accumulated_states(
|
||||
tline: &Arc<Timeline>,
|
||||
key: Key,
|
||||
accumulated_values: &[&(Key, Lsn, crate::repository::Value)],
|
||||
accumulated_values: &[(Key, Lsn, crate::repository::Value)],
|
||||
horizon: Lsn,
|
||||
) -> anyhow::Result<(Vec<(Key, Lsn, crate::repository::Value)>, bytes::Bytes)> {
|
||||
let mut base_image = None;
|
||||
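The removed `ValueWrapper` encoded one invariant: at the same key and LSN, an image sorts before a WAL record. The replacement `MergeIterator` is expected to yield the same order while streaming, instead of sorting all key/value pairs in memory. A toy sketch of the ordering rule, with stand-in types:

```
use std::cmp::Ordering;

enum Value {
    Image(Vec<u8>),
    WalRecord(Vec<u8>),
}

// Images sort before WAL records when key and LSN are equal.
fn value_order(a: &Value, b: &Value) -> Ordering {
    match (a, b) {
        (Value::Image(_), Value::WalRecord(_)) => Ordering::Less,
        (Value::WalRecord(_), Value::Image(_)) => Ordering::Greater,
        _ => Ordering::Equal,
    }
}

fn sort_key_values(kvs: &mut [(u64 /* key */, u64 /* lsn */, Value)]) {
    kvs.sort_by(|(k1, l1, v1), (k2, l2, v2)| {
        (k1, l1).cmp(&(k2, l2)).then_with(|| value_order(v1, v2))
    });
}
```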
@@ -1159,7 +1177,7 @@ impl Timeline {
|
||||
self.conf,
|
||||
self.timeline_id,
|
||||
self.tenant_shard_id,
|
||||
&(all_key_values.first().unwrap().0..all_key_values.last().unwrap().0.next()),
|
||||
&(Key::MIN..Key::MAX), // covers the full key range
|
||||
gc_cutoff,
|
||||
ctx,
|
||||
)
|
||||
@@ -1169,20 +1187,24 @@ impl Timeline {
|
||||
let delta_split_points = delta_split_points.into_iter().collect_vec();
|
||||
let mut current_delta_split_point = 0;
|
||||
let mut delta_layers = Vec::new();
|
||||
for item @ (key, _, _) in &all_key_values {
|
||||
if &last_key == key {
|
||||
accumulated_values.push(item);
|
||||
while let Some((key, lsn, val)) = merge_iter.next().await? {
|
||||
if last_key.is_none() || last_key.as_ref() == Some(&key) {
|
||||
if last_key.is_none() {
|
||||
last_key = Some(key);
|
||||
}
|
||||
accumulated_values.push((key, lsn, val));
|
||||
} else {
|
||||
let last_key = last_key.as_mut().unwrap();
|
||||
let (deltas, image) =
|
||||
flush_accumulated_states(self, last_key, &accumulated_values, gc_cutoff)
|
||||
flush_accumulated_states(self, *last_key, &accumulated_values, gc_cutoff)
|
||||
.await?;
|
||||
// Put the image into the image layer. Currently we have a single big layer for the compaction.
|
||||
image_layer_writer.put_image(last_key, image, ctx).await?;
|
||||
image_layer_writer.put_image(*last_key, image, ctx).await?;
|
||||
delta_values.extend(deltas);
|
||||
delta_layers.extend(
|
||||
flush_deltas(
|
||||
&mut delta_values,
|
||||
last_key,
|
||||
*last_key,
|
||||
&delta_split_points,
|
||||
&mut current_delta_split_point,
|
||||
self,
|
||||
@@ -1192,11 +1214,12 @@ impl Timeline {
|
||||
.await?,
|
||||
);
|
||||
accumulated_values.clear();
|
||||
accumulated_values.push(item);
|
||||
last_key = *key;
|
||||
*last_key = key;
|
||||
accumulated_values.push((key, lsn, val));
|
||||
}
|
||||
}
|
||||
|
||||
let last_key = last_key.expect("no keys produced during compaction");
|
||||
// TODO: move this part to the loop body
|
||||
let (deltas, image) =
|
||||
flush_accumulated_states(self, last_key, &accumulated_values, gc_cutoff).await?;
|
||||
|
||||
@@ -10,6 +10,7 @@ use crate::{
|
||||
},
|
||||
virtual_file::{MaybeFatalIo, VirtualFile},
|
||||
};
|
||||
use pageserver_api::models::detach_ancestor::AncestorDetached;
|
||||
use tokio_util::sync::CancellationToken;
|
||||
use tracing::Instrument;
|
||||
use utils::{completion, generation::Generation, http::error::ApiError, id::TimelineId, lsn::Lsn};
|
||||
@@ -39,6 +40,9 @@ pub(crate) enum Error {
|
||||
|
||||
#[error("unexpected error")]
|
||||
Unexpected(#[source] anyhow::Error),
|
||||
|
||||
#[error("failpoint: {}", .0)]
|
||||
Failpoint(&'static str),
|
||||
}
|
||||
|
||||
impl From<Error> for ApiError {
|
||||
@@ -57,11 +61,41 @@ impl From<Error> for ApiError {
|
||||
| e @ Error::CopyDeltaPrefix(_)
|
||||
| e @ Error::UploadRewritten(_)
|
||||
| e @ Error::CopyFailed(_)
|
||||
| e @ Error::Unexpected(_) => ApiError::InternalServerError(e.into()),
|
||||
| e @ Error::Unexpected(_)
|
||||
| e @ Error::Failpoint(_) => ApiError::InternalServerError(e.into()),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl From<crate::tenant::upload_queue::NotInitialized> for Error {
|
||||
fn from(_: crate::tenant::upload_queue::NotInitialized) -> Self {
|
||||
// treat all as shutting down signals, even though that is not entirely correct
|
||||
// (uninitialized state)
|
||||
Error::ShuttingDown
|
||||
}
|
||||
}
|
||||
|
||||
impl From<FlushLayerError> for Error {
|
||||
fn from(value: FlushLayerError) -> Self {
|
||||
match value {
|
||||
FlushLayerError::Cancelled => Error::ShuttingDown,
|
||||
FlushLayerError::NotRunning(_) => {
|
||||
// FIXME(#6424): technically statically unreachable right now, given that we never
|
||||
// drop the sender
|
||||
Error::ShuttingDown
|
||||
}
|
||||
FlushLayerError::CreateImageLayersError(_) | FlushLayerError::Other(_) => {
|
||||
Error::FlushAncestor(value)
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
pub(crate) enum Progress {
|
||||
Prepared(completion::Completion, PreparedTimelineDetach),
|
||||
Done(AncestorDetached),
|
||||
}
|
||||
|
||||
pub(crate) struct PreparedTimelineDetach {
|
||||
layers: Vec<Layer>,
|
||||
}
|
||||
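The new `From` impls let `prepare()` use `?` on the upload-queue accessor and on flush errors, mapping every shutdown-like state to `Error::ShuttingDown`. A self-contained toy sketch of that conversion pattern (all types here are stand-ins):

```
#[derive(Debug)]
enum NotInitialized {
    Uninitialized,
    ShuttingDown,
    Stopped,
}

#[derive(Debug)]
enum Error {
    ShuttingDown,
}

impl From<NotInitialized> for Error {
    fn from(_: NotInitialized) -> Self {
        // Treat every not-yet-initialized state as a shutdown signal, as the
        // comment in the diff explains.
        Error::ShuttingDown
    }
}

fn initialized_upload_queue(ready: bool) -> Result<u32, NotInitialized> {
    if ready { Ok(42) } else { Err(NotInitialized::Uninitialized) }
}

fn prepare(ready: bool) -> Result<u32, Error> {
    // `?` uses the `From` impl above, so shutdown-like failures short-circuit.
    let accessor = initialized_upload_queue(ready)?;
    Ok(accessor)
}
```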
@@ -88,7 +122,7 @@ pub(super) async fn prepare(
|
||||
tenant: &Tenant,
|
||||
options: Options,
|
||||
ctx: &RequestContext,
|
||||
) -> Result<(completion::Completion, PreparedTimelineDetach), Error> {
|
||||
) -> Result<Progress, Error> {
|
||||
use Error::*;
|
||||
|
||||
let Some((ancestor, ancestor_lsn)) = detached
|
||||
@@ -96,15 +130,67 @@ pub(super) async fn prepare(
|
||||
.as_ref()
|
||||
.map(|tl| (tl.clone(), detached.ancestor_lsn))
|
||||
else {
|
||||
// TODO: check if we have already been detached; for this we need to read the stored data
|
||||
// on remote client, for that we need a follow-up which makes uploads cheaper and maintains
|
||||
// a projection of the committed data.
|
||||
{
|
||||
let accessor = detached.remote_client.initialized_upload_queue()?;
|
||||
|
||||
// we are safe to inspect the latest uploaded, because we can only witness this after
|
||||
// restart is complete and ancestor is no more.
|
||||
let latest = accessor.latest_uploaded_index_part();
|
||||
if !latest.lineage.is_detached_from_original_ancestor() {
|
||||
return Err(NoAncestor);
|
||||
}
|
||||
}
|
||||
|
||||
// detached has previously been detached; let's inspect each of the current timelines and
|
||||
// report back the timelines which have been reparented by our detach
|
||||
let mut all_direct_children = tenant
|
||||
.timelines
|
||||
.lock()
|
||||
.unwrap()
|
||||
.values()
|
||||
.filter(|tl| matches!(tl.ancestor_timeline.as_ref(), Some(ancestor) if Arc::ptr_eq(ancestor, detached)))
|
||||
.map(|tl| (tl.ancestor_lsn, tl.clone()))
|
||||
.collect::<Vec<_>>();
|
||||
|
||||
let mut any_shutdown = false;
|
||||
|
||||
all_direct_children.retain(
|
||||
|(_, tl)| match tl.remote_client.initialized_upload_queue() {
|
||||
Ok(accessor) => accessor
|
||||
.latest_uploaded_index_part()
|
||||
.lineage
|
||||
.is_reparented(),
|
||||
Err(_shutdownalike) => {
|
||||
// not 100% a shutdown, but let's bail early not to give inconsistent results in
|
||||
// sharded environment.
|
||||
any_shutdown = true;
|
||||
true
|
||||
}
|
||||
},
|
||||
);
|
||||
|
||||
if any_shutdown {
|
||||
// it could be one or many being deleted; have client retry
|
||||
return Err(Error::ShuttingDown);
|
||||
}
|
||||
|
||||
let mut reparented = all_direct_children;
|
||||
// why this instead of hashset? there is a reason, but I've forgotten it many times.
|
||||
//
|
||||
// the error is wrong per openapi
|
||||
return Err(NoAncestor);
|
||||
// maybe if this was a hashset we would not be able to distinguish some race condition.
|
||||
reparented.sort_unstable_by_key(|(lsn, tl)| (*lsn, tl.timeline_id));
|
||||
|
||||
return Ok(Progress::Done(AncestorDetached {
|
||||
reparented_timelines: reparented
|
||||
.into_iter()
|
||||
.map(|(_, tl)| tl.timeline_id)
|
||||
.collect(),
|
||||
}));
|
||||
};
|
||||
|
||||
if !ancestor_lsn.is_valid() {
|
||||
// rare case, probably wouldn't even load
|
||||
tracing::error!("ancestor is set, but ancestor_lsn is invalid, this timeline needs fixing");
|
||||
return Err(NoAncestor);
|
||||
}
|
||||
|
||||
@@ -131,6 +217,15 @@ pub(super) async fn prepare(
|
||||
|
||||
let _gate_entered = detached.gate.enter().map_err(|_| ShuttingDown)?;
|
||||
|
||||
utils::pausable_failpoint!("timeline-detach-ancestor::before_starting_after_locking_pausable");
|
||||
|
||||
fail::fail_point!(
|
||||
"timeline-detach-ancestor::before_starting_after_locking",
|
||||
|_| Err(Error::Failpoint(
|
||||
"timeline-detach-ancestor::before_starting_after_locking"
|
||||
))
|
||||
);
|
||||
|
||||
if ancestor_lsn >= ancestor.get_disk_consistent_lsn() {
|
||||
let span =
|
||||
tracing::info_span!("freeze_and_flush", ancestor_timeline_id=%ancestor.timeline_id);
|
||||
@@ -151,7 +246,7 @@ pub(super) async fn prepare(
|
||||
}
|
||||
};
|
||||
|
||||
res.map_err(FlushAncestor)?;
|
||||
res?;
|
||||
|
||||
// we do not need to wait for uploads to complete but we do need `struct Layer`,
|
||||
// copying delta prefix is unsupported currently for `InMemoryLayer`.
|
||||
@@ -159,7 +254,7 @@ pub(super) async fn prepare(
|
||||
elapsed_ms = started_at.elapsed().as_millis(),
|
||||
"froze and flushed the ancestor"
|
||||
);
|
||||
Ok(())
|
||||
Ok::<_, Error>(())
|
||||
}
|
||||
.instrument(span)
|
||||
.await?;
|
||||
@@ -283,7 +378,7 @@ pub(super) async fn prepare(
|
||||
|
||||
let prepared = PreparedTimelineDetach { layers: new_layers };
|
||||
|
||||
Ok((guard, prepared))
|
||||
Ok(Progress::Prepared(guard, prepared))
|
||||
}
|
||||
|
||||
fn partition_work(
|
||||
@@ -350,7 +445,11 @@ async fn copy_lsn_prefix(
|
||||
target_timeline: &Arc<Timeline>,
|
||||
ctx: &RequestContext,
|
||||
) -> Result<Option<ResidentLayer>, Error> {
|
||||
use Error::{CopyDeltaPrefix, RewrittenDeltaDownloadFailed};
|
||||
use Error::{CopyDeltaPrefix, RewrittenDeltaDownloadFailed, ShuttingDown};
|
||||
|
||||
if target_timeline.cancel.is_cancelled() {
|
||||
return Err(ShuttingDown);
|
||||
}
|
||||
|
||||
tracing::debug!(%layer, %end_lsn, "copying lsn prefix");
|
||||
|
||||
@@ -529,7 +628,7 @@ pub(super) async fn complete(
|
||||
match res {
|
||||
Ok(Some(timeline)) => {
|
||||
tracing::info!(reparented=%timeline.timeline_id, "reparenting done");
|
||||
reparented.push(timeline.timeline_id);
|
||||
reparented.push((timeline.ancestor_lsn, timeline.timeline_id));
|
||||
}
|
||||
Ok(None) => {
|
||||
// let's just ignore this for now. one or all reparented timelines could have
|
||||
@@ -551,5 +650,12 @@ pub(super) async fn complete(
|
||||
tracing::info!("failed to reparent some candidates");
|
||||
}
|
||||
|
||||
reparented.sort_unstable();
|
||||
|
||||
let reparented = reparented
|
||||
.into_iter()
|
||||
.map(|(_, timeline_id)| timeline_id)
|
||||
.collect();
|
||||
|
||||
Ok(reparented)
|
||||
}
|
||||
|
||||
@@ -1118,7 +1118,7 @@ mod tests {
|
||||
|
||||
#[tokio::test]
|
||||
async fn no_connection_no_candidate() -> anyhow::Result<()> {
|
||||
let harness = TenantHarness::create("no_connection_no_candidate")?;
|
||||
let harness = TenantHarness::create("no_connection_no_candidate").await?;
|
||||
let mut state = dummy_state(&harness).await;
|
||||
let now = Utc::now().naive_utc();
|
||||
|
||||
@@ -1151,7 +1151,7 @@ mod tests {
|
||||
|
||||
#[tokio::test]
|
||||
async fn connection_no_candidate() -> anyhow::Result<()> {
|
||||
let harness = TenantHarness::create("connection_no_candidate")?;
|
||||
let harness = TenantHarness::create("connection_no_candidate").await?;
|
||||
let mut state = dummy_state(&harness).await;
|
||||
let now = Utc::now().naive_utc();
|
||||
|
||||
@@ -1216,7 +1216,7 @@ mod tests {
|
||||
|
||||
#[tokio::test]
|
||||
async fn no_connection_candidate() -> anyhow::Result<()> {
|
||||
let harness = TenantHarness::create("no_connection_candidate")?;
|
||||
let harness = TenantHarness::create("no_connection_candidate").await?;
|
||||
let mut state = dummy_state(&harness).await;
|
||||
let now = Utc::now().naive_utc();
|
||||
|
||||
@@ -1279,7 +1279,7 @@ mod tests {
|
||||
|
||||
#[tokio::test]
|
||||
async fn candidate_with_many_connection_failures() -> anyhow::Result<()> {
|
||||
let harness = TenantHarness::create("candidate_with_many_connection_failures")?;
|
||||
let harness = TenantHarness::create("candidate_with_many_connection_failures").await?;
|
||||
let mut state = dummy_state(&harness).await;
|
||||
let now = Utc::now().naive_utc();
|
||||
|
||||
@@ -1319,7 +1319,7 @@ mod tests {
|
||||
|
||||
#[tokio::test]
|
||||
async fn lsn_wal_over_threshold_current_candidate() -> anyhow::Result<()> {
|
||||
let harness = TenantHarness::create("lsn_wal_over_threshcurrent_candidate")?;
|
||||
let harness = TenantHarness::create("lsn_wal_over_threshcurrent_candidate").await?;
|
||||
let mut state = dummy_state(&harness).await;
|
||||
let current_lsn = Lsn(100_000).align();
|
||||
let now = Utc::now().naive_utc();
|
||||
@@ -1385,7 +1385,8 @@ mod tests {
|
||||
|
||||
#[tokio::test]
|
||||
async fn timeout_connection_threshold_current_candidate() -> anyhow::Result<()> {
|
||||
let harness = TenantHarness::create("timeout_connection_threshold_current_candidate")?;
|
||||
let harness =
|
||||
TenantHarness::create("timeout_connection_threshold_current_candidate").await?;
|
||||
let mut state = dummy_state(&harness).await;
|
||||
let current_lsn = Lsn(100_000).align();
|
||||
let now = Utc::now().naive_utc();
|
||||
@@ -1448,7 +1449,7 @@ mod tests {
|
||||
|
||||
#[tokio::test]
|
||||
async fn timeout_wal_over_threshold_current_candidate() -> anyhow::Result<()> {
|
||||
let harness = TenantHarness::create("timeout_wal_over_threshold_current_candidate")?;
|
||||
let harness = TenantHarness::create("timeout_wal_over_threshold_current_candidate").await?;
|
||||
let mut state = dummy_state(&harness).await;
|
||||
let current_lsn = Lsn(100_000).align();
|
||||
let new_lsn = Lsn(100_100).align();
|
||||
@@ -1550,7 +1551,7 @@ mod tests {
|
||||
// and pageserver should prefer to connect to it.
|
||||
let test_az = Some("test_az".to_owned());
|
||||
|
||||
let harness = TenantHarness::create("switch_to_same_availability_zone")?;
|
||||
let harness = TenantHarness::create("switch_to_same_availability_zone").await?;
|
||||
let mut state = dummy_state(&harness).await;
|
||||
state.conf.availability_zone.clone_from(&test_az);
|
||||
let current_lsn = Lsn(100_000).align();
|
||||
|
||||
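`TenantHarness::create` is now async, so every test call site gains an `.await` before `?` or `.unwrap()`, as in the hunks above. A toy sketch of the calling-convention change, assuming `tokio` and `anyhow` as dev-dependencies; `Harness` is a stand-in, not the real fixture:

```
struct Harness;

impl Harness {
    // Stand-in for `TenantHarness::create`, which is now async and fallible.
    async fn create(_name: &'static str) -> anyhow::Result<Self> {
        Ok(Harness)
    }
}

#[tokio::test]
async fn example_usage() -> anyhow::Result<()> {
    // Call sites change from `create("...")?` to `create("...").await?`.
    let _harness = Harness::create("example_usage").await?;
    Ok(())
}
```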
@@ -228,18 +228,20 @@ impl UploadQueue {
|
||||
Ok(self.initialized_mut().expect("we just set it"))
|
||||
}
|
||||
|
||||
pub(crate) fn initialized_mut(&mut self) -> anyhow::Result<&mut UploadQueueInitialized> {
|
||||
pub(crate) fn initialized_mut(
|
||||
&mut self,
|
||||
) -> Result<&mut UploadQueueInitialized, NotInitialized> {
|
||||
use UploadQueue::*;
|
||||
match self {
|
||||
Uninitialized => Err(NotInitialized::Uninitialized.into()),
|
||||
Uninitialized => Err(NotInitialized::Uninitialized),
|
||||
Initialized(x) => {
|
||||
if x.shutting_down {
|
||||
Err(NotInitialized::ShuttingDown.into())
|
||||
Err(NotInitialized::ShuttingDown)
|
||||
} else {
|
||||
Ok(x)
|
||||
}
|
||||
}
|
||||
Stopped(_) => Err(NotInitialized::Stopped.into()),
|
||||
Stopped(_) => Err(NotInitialized::Stopped),
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -396,7 +396,6 @@ impl<'a> VectoredBlobReader<'a> {
|
||||
/// Read planner used in [`crate::tenant::storage_layer::image_layer::ImageLayerIterator`]. It provides a streaming API for
|
||||
/// getting read blobs. It returns a batch when `handle` gets called and when the current key would just exceed the read_size and
|
||||
/// max_cnt constraints.
|
||||
#[cfg(test)]
|
||||
pub struct StreamingVectoredReadPlanner {
|
||||
read_builder: Option<VectoredReadBuilder>,
|
||||
// Arguments for previous blob passed into [`StreamingVectoredReadPlanner::handle`]
|
||||
@@ -410,7 +409,6 @@ pub struct StreamingVectoredReadPlanner {
|
||||
cnt: usize,
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
impl StreamingVectoredReadPlanner {
|
||||
pub fn new(max_read_size: u64, max_cnt: usize) -> Self {
|
||||
assert!(max_cnt > 0);
|
||||
|
||||
@@ -1754,7 +1754,7 @@ mod tests {
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_relsize() -> Result<()> {
|
||||
let (tenant, ctx) = TenantHarness::create("test_relsize")?.load().await;
|
||||
let (tenant, ctx) = TenantHarness::create("test_relsize").await?.load().await;
|
||||
let tline = tenant
|
||||
.create_test_timeline(TIMELINE_ID, Lsn(8), DEFAULT_PG_VERSION, &ctx)
|
||||
.await?;
|
||||
@@ -1975,7 +1975,10 @@ mod tests {
|
||||
// and then created it again within the same layer.
|
||||
#[tokio::test]
|
||||
async fn test_drop_extend() -> Result<()> {
|
||||
let (tenant, ctx) = TenantHarness::create("test_drop_extend")?.load().await;
|
||||
let (tenant, ctx) = TenantHarness::create("test_drop_extend")
|
||||
.await?
|
||||
.load()
|
||||
.await;
|
||||
let tline = tenant
|
||||
.create_test_timeline(TIMELINE_ID, Lsn(8), DEFAULT_PG_VERSION, &ctx)
|
||||
.await?;
|
||||
@@ -2046,7 +2049,10 @@ mod tests {
|
||||
// and then extended it again within the same layer.
|
||||
#[tokio::test]
|
||||
async fn test_truncate_extend() -> Result<()> {
|
||||
let (tenant, ctx) = TenantHarness::create("test_truncate_extend")?.load().await;
|
||||
let (tenant, ctx) = TenantHarness::create("test_truncate_extend")
|
||||
.await?
|
||||
.load()
|
||||
.await;
|
||||
let tline = tenant
|
||||
.create_test_timeline(TIMELINE_ID, Lsn(8), DEFAULT_PG_VERSION, &ctx)
|
||||
.await?;
|
||||
@@ -2188,7 +2194,7 @@ mod tests {
|
||||
/// split into multiple 1 GB segments in Postgres.
|
||||
#[tokio::test]
|
||||
async fn test_large_rel() -> Result<()> {
|
||||
let (tenant, ctx) = TenantHarness::create("test_large_rel")?.load().await;
|
||||
let (tenant, ctx) = TenantHarness::create("test_large_rel").await?.load().await;
|
||||
let tline = tenant
|
||||
.create_test_timeline(TIMELINE_ID, Lsn(8), DEFAULT_PG_VERSION, &ctx)
|
||||
.await?;
|
||||
@@ -2296,7 +2302,7 @@ mod tests {
|
||||
let startpoint = Lsn::from_hex("14AEC08").unwrap();
|
||||
let _endpoint = Lsn::from_hex("1FFFF98").unwrap();
|
||||
|
||||
let harness = TenantHarness::create("test_ingest_real_wal").unwrap();
|
||||
let harness = TenantHarness::create("test_ingest_real_wal").await.unwrap();
|
||||
let (tenant, ctx) = harness.load().await;
|
||||
|
||||
let remote_initdb_path =
|
||||
|
||||
patches/rum.patch (new file, 54 lines)
@@ -0,0 +1,54 @@
|
||||
commit 68f3b3b0d594f08aacc4a082ee210749ed5677eb
|
||||
Author: Anastasia Lubennikova <anastasia@neon.tech>
|
||||
Date: Mon Jul 15 12:31:56 2024 +0100
|
||||
|
||||
Neon: fix unlogged index build patch
|
||||
|
||||
diff --git a/src/ruminsert.c b/src/ruminsert.c
|
||||
index e8b209d..e89bf2a 100644
|
||||
--- a/src/ruminsert.c
|
||||
+++ b/src/ruminsert.c
|
||||
@@ -628,6 +628,10 @@ rumbuild(Relation heap, Relation index, struct IndexInfo *indexInfo)
|
||||
elog(ERROR, "index \"%s\" already contains data",
|
||||
RelationGetRelationName(index));
|
||||
|
||||
+#ifdef NEON_SMGR
|
||||
+ smgr_start_unlogged_build(index->rd_smgr);
|
||||
+#endif
|
||||
+
|
||||
initRumState(&buildstate.rumstate, index);
|
||||
buildstate.rumstate.isBuild = true;
|
||||
buildstate.indtuples = 0;
|
||||
@@ -693,6 +697,10 @@ rumbuild(Relation heap, Relation index, struct IndexInfo *indexInfo)
|
||||
buildstate.buildStats.nTotalPages = RelationGetNumberOfBlocks(index);
|
||||
rumUpdateStats(index, &buildstate.buildStats, buildstate.rumstate.isBuild);
|
||||
|
||||
+#ifdef NEON_SMGR
|
||||
+ smgr_finish_unlogged_build_phase_1(index->rd_smgr);
|
||||
+#endif
|
||||
+
|
||||
/*
|
||||
* Write index to xlog
|
||||
*/
|
||||
@@ -713,6 +721,21 @@ rumbuild(Relation heap, Relation index, struct IndexInfo *indexInfo)
|
||||
UnlockReleaseBuffer(buffer);
|
||||
}
|
||||
|
||||
+#ifdef NEON_SMGR
|
||||
+ {
|
||||
+#if PG_VERSION_NUM >= 160000
|
||||
+ RelFileLocator rlocator = RelationGetSmgr(index)->smgr_rlocator.locator;
|
||||
+#else
|
||||
+ RelFileNode rlocator = RelationGetSmgr(index)->smgr_rnode.node;
|
||||
+#endif
|
||||
+
|
||||
+ SetLastWrittenLSNForBlockRange(XactLastRecEnd, rlocator, MAIN_FORKNUM, 0, RelationGetNumberOfBlocks(index));
|
||||
+ SetLastWrittenLSNForRelation(XactLastRecEnd, rlocator, MAIN_FORKNUM);
|
||||
+
|
||||
+ smgr_end_unlogged_build(index->rd_smgr);
|
||||
+ }
|
||||
+#endif
|
||||
+
|
||||
/*
|
||||
* Return statistics
|
||||
*/
|
||||
poetry.lock (generated, 13 lines changed)
@@ -2641,19 +2641,18 @@ pbr = "*"
|
||||
|
||||
[[package]]
|
||||
name = "setuptools"
|
||||
version = "65.5.1"
|
||||
version = "70.0.0"
|
||||
description = "Easily download, build, install, upgrade, and uninstall Python packages"
|
||||
optional = false
|
||||
python-versions = ">=3.7"
|
||||
python-versions = ">=3.8"
|
||||
files = [
|
||||
{file = "setuptools-65.5.1-py3-none-any.whl", hash = "sha256:d0b9a8433464d5800cbe05094acf5c6d52a91bfac9b52bcfc4d41382be5d5d31"},
|
||||
{file = "setuptools-65.5.1.tar.gz", hash = "sha256:e197a19aa8ec9722928f2206f8de752def0e4c9fc6953527360d1c36d94ddb2f"},
|
||||
{file = "setuptools-70.0.0-py3-none-any.whl", hash = "sha256:54faa7f2e8d2d11bcd2c07bed282eef1046b5c080d1c32add737d7b5817b1ad4"},
|
||||
{file = "setuptools-70.0.0.tar.gz", hash = "sha256:f211a66637b8fa059bb28183da127d4e86396c991a942b028c6650d4319c3fd0"},
|
||||
]
|
||||
|
||||
[package.extras]
|
||||
docs = ["furo", "jaraco.packaging (>=9)", "jaraco.tidelift (>=1.4)", "pygments-github-lexers (==0.0.5)", "rst.linker (>=1.9)", "sphinx (>=3.5)", "sphinx-favicon", "sphinx-hoverxref (<2)", "sphinx-inline-tabs", "sphinx-notfound-page (==0.8.3)", "sphinx-reredirects", "sphinxcontrib-towncrier"]
|
||||
testing = ["build[virtualenv]", "filelock (>=3.4.0)", "flake8 (<5)", "flake8-2020", "ini2toml[lite] (>=0.9)", "jaraco.envs (>=2.2)", "jaraco.path (>=3.2.0)", "pip (>=19.1)", "pip-run (>=8.8)", "pytest (>=6)", "pytest-black (>=0.3.7)", "pytest-checkdocs (>=2.4)", "pytest-cov", "pytest-enabler (>=1.3)", "pytest-flake8", "pytest-mypy (>=0.9.1)", "pytest-perf", "pytest-timeout", "pytest-xdist", "tomli-w (>=1.0.0)", "virtualenv (>=13.0.0)", "wheel"]
|
||||
testing-integration = ["build[virtualenv]", "filelock (>=3.4.0)", "jaraco.envs (>=2.2)", "jaraco.path (>=3.2.0)", "pytest", "pytest-enabler", "pytest-xdist", "tomli", "virtualenv (>=13.0.0)", "wheel"]
|
||||
docs = ["furo", "jaraco.packaging (>=9.3)", "jaraco.tidelift (>=1.4)", "pygments-github-lexers (==0.0.5)", "pyproject-hooks (!=1.1)", "rst.linker (>=1.9)", "sphinx (>=3.5)", "sphinx-favicon", "sphinx-inline-tabs", "sphinx-lint", "sphinx-notfound-page (>=1,<2)", "sphinx-reredirects", "sphinxcontrib-towncrier"]
|
||||
testing = ["build[virtualenv] (>=1.0.3)", "filelock (>=3.4.0)", "importlib-metadata", "ini2toml[lite] (>=0.14)", "jaraco.develop (>=7.21)", "jaraco.envs (>=2.2)", "jaraco.path (>=3.2.0)", "mypy (==1.9)", "packaging (>=23.2)", "pip (>=19.1)", "pyproject-hooks (!=1.1)", "pytest (>=6,!=8.1.1)", "pytest-checkdocs (>=2.4)", "pytest-cov", "pytest-enabler (>=2.2)", "pytest-home (>=0.5)", "pytest-mypy", "pytest-perf", "pytest-ruff (>=0.2.1)", "pytest-subprocess", "pytest-timeout", "pytest-xdist (>=3)", "tomli", "tomli-w (>=1.0.0)", "virtualenv (>=13.0.0)", "wheel"]
|
||||
|
||||
[[package]]
|
||||
name = "six"
|
||||
|
||||
@@ -92,6 +92,7 @@ tracing-opentelemetry.workspace = true
|
||||
tracing-subscriber.workspace = true
|
||||
tracing-utils.workspace = true
|
||||
tracing.workspace = true
|
||||
typed-json.workspace = true
|
||||
url.workspace = true
|
||||
urlencoding.workspace = true
|
||||
utils.workspace = true
|
||||
|
||||
@@ -181,8 +181,9 @@ pub async fn worker(
|
||||
let rx = futures::stream::poll_fn(move |cx| rx.poll_recv(cx));
|
||||
let rx = rx.map(RequestData::from);
|
||||
|
||||
let storage =
|
||||
GenericRemoteStorage::from_config(&remote_storage_config).context("remote storage init")?;
|
||||
let storage = GenericRemoteStorage::from_config(&remote_storage_config)
|
||||
.await
|
||||
.context("remote storage init")?;
|
||||
|
||||
let properties = WriterProperties::builder()
|
||||
.set_data_page_size_limit(config.parquet_upload_page_size)
|
||||
@@ -217,6 +218,7 @@ pub async fn worker(
|
||||
|
||||
let storage_disconnect =
|
||||
GenericRemoteStorage::from_config(&disconnect_events_storage_config)
|
||||
.await
|
||||
.context("remote storage for disconnect events init")?;
|
||||
let parquet_config_disconnect = parquet_config.clone();
|
||||
tokio::try_join!(
|
||||
@@ -545,7 +547,9 @@ mod tests {
|
||||
},
|
||||
timeout: std::time::Duration::from_secs(120),
|
||||
};
|
||||
let storage = GenericRemoteStorage::from_config(&remote_storage_config).unwrap();
|
||||
let storage = GenericRemoteStorage::from_config(&remote_storage_config)
|
||||
.await
|
||||
.unwrap();
|
||||
|
||||
worker_inner(storage, rx, config).await.unwrap();
|
||||
|
||||
|
||||
@@ -18,7 +18,7 @@ use hyper1::Response;
|
||||
use hyper1::StatusCode;
|
||||
use hyper1::{HeaderMap, Request};
|
||||
use pq_proto::StartupMessageParamsBuilder;
|
||||
use serde_json::json;
|
||||
use serde::Serialize;
|
||||
use serde_json::Value;
|
||||
use tokio::time;
|
||||
use tokio_postgres::error::DbError;
|
||||
@@ -32,6 +32,7 @@ use tokio_postgres::Transaction;
|
||||
use tokio_util::sync::CancellationToken;
|
||||
use tracing::error;
|
||||
use tracing::info;
|
||||
use typed_json::json;
|
||||
use url::Url;
|
||||
use utils::http::error::ApiError;
|
||||
|
||||
@@ -263,13 +264,8 @@ pub async fn handle(
|
||||
| SqlOverHttpError::Postgres(e) => e.as_db_error(),
|
||||
_ => None,
|
||||
};
|
||||
fn get<'a, T: serde::Serialize>(
|
||||
db: Option<&'a DbError>,
|
||||
x: impl FnOnce(&'a DbError) -> T,
|
||||
) -> Value {
|
||||
db.map(x)
|
||||
.and_then(|t| serde_json::to_value(t).ok())
|
||||
.unwrap_or_default()
|
||||
fn get<'a, T: Default>(db: Option<&'a DbError>, x: impl FnOnce(&'a DbError) -> T) -> T {
|
||||
db.map(x).unwrap_or_default()
|
||||
}
|
||||
|
||||
if let Some(db_error) = db_error {
|
||||
@@ -278,17 +274,11 @@ pub async fn handle(
|
||||
|
||||
let position = db_error.and_then(|db| db.position());
|
||||
let (position, internal_position, internal_query) = match position {
|
||||
Some(ErrorPosition::Original(position)) => (
|
||||
Value::String(position.to_string()),
|
||||
Value::Null,
|
||||
Value::Null,
|
||||
),
|
||||
Some(ErrorPosition::Internal { position, query }) => (
|
||||
Value::Null,
|
||||
Value::String(position.to_string()),
|
||||
Value::String(query.clone()),
|
||||
),
|
||||
None => (Value::Null, Value::Null, Value::Null),
|
||||
Some(ErrorPosition::Original(position)) => (Some(position.to_string()), None, None),
|
||||
Some(ErrorPosition::Internal { position, query }) => {
|
||||
(None, Some(position.to_string()), Some(query.clone()))
|
||||
}
|
||||
None => (None, None, None),
|
||||
};
|
||||
|
||||
let code = get(db_error, |db| db.code().code());
|
||||
@@ -578,10 +568,8 @@ async fn handle_inner(
|
||||
.status(StatusCode::OK)
|
||||
.header(header::CONTENT_TYPE, "application/json");
|
||||
|
||||
//
|
||||
// Now execute the query and return the result
|
||||
//
|
||||
let result = match payload {
|
||||
// Now execute the query and return the result.
|
||||
let json_output = match payload {
|
||||
Payload::Single(stmt) => stmt.process(cancel, &mut client, parsed_headers).await?,
|
||||
Payload::Batch(statements) => {
|
||||
if parsed_headers.txn_read_only {
|
||||
@@ -605,11 +593,9 @@ async fn handle_inner(
|
||||
|
||||
let metrics = client.metrics();
|
||||
|
||||
// how could this possibly fail
|
||||
let body = serde_json::to_string(&result).expect("json serialization should not fail");
|
||||
let len = body.len();
|
||||
let len = json_output.len();
|
||||
let response = response
|
||||
.body(Full::new(Bytes::from(body)))
|
||||
.body(Full::new(Bytes::from(json_output)))
|
||||
// only fails if invalid status code or invalid header/values are given.
|
||||
// these are not user configurable so it cannot fail dynamically
|
||||
.expect("building response payload should not fail");
|
||||
@@ -631,7 +617,7 @@ impl QueryData {
|
||||
cancel: CancellationToken,
|
||||
client: &mut Client<tokio_postgres::Client>,
|
||||
parsed_headers: HttpHeaders,
|
||||
) -> Result<Value, SqlOverHttpError> {
|
||||
) -> Result<String, SqlOverHttpError> {
|
||||
let (inner, mut discard) = client.inner();
|
||||
let cancel_token = inner.cancel_token();
|
||||
|
||||
@@ -644,7 +630,10 @@ impl QueryData {
|
||||
// The query successfully completed.
|
||||
Either::Left((Ok((status, results)), __not_yet_cancelled)) => {
|
||||
discard.check_idle(status);
|
||||
Ok(results)
|
||||
|
||||
let json_output =
|
||||
serde_json::to_string(&results).expect("json serialization should not fail");
|
||||
Ok(json_output)
|
||||
}
|
||||
// The query failed with an error
|
||||
Either::Left((Err(e), __not_yet_cancelled)) => {
|
||||
@@ -662,7 +651,10 @@ impl QueryData {
|
||||
// query succeeded before it was cancelled.
|
||||
Ok(Ok((status, results))) => {
|
||||
discard.check_idle(status);
|
||||
Ok(results)
|
||||
|
||||
let json_output = serde_json::to_string(&results)
|
||||
.expect("json serialization should not fail");
|
||||
Ok(json_output)
|
||||
}
|
||||
// query failed or was cancelled.
|
||||
Ok(Err(error)) => {
|
||||
@@ -696,7 +688,7 @@ impl BatchQueryData {
|
||||
cancel: CancellationToken,
|
||||
client: &mut Client<tokio_postgres::Client>,
|
||||
parsed_headers: HttpHeaders,
|
||||
) -> Result<Value, SqlOverHttpError> {
|
||||
) -> Result<String, SqlOverHttpError> {
|
||||
info!("starting transaction");
|
||||
let (inner, mut discard) = client.inner();
|
||||
let cancel_token = inner.cancel_token();
|
||||
@@ -718,9 +710,9 @@ impl BatchQueryData {
|
||||
e
|
||||
})?;
|
||||
|
||||
let results =
|
||||
let json_output =
|
||||
match query_batch(cancel.child_token(), &transaction, self, parsed_headers).await {
|
||||
Ok(results) => {
|
||||
Ok(json_output) => {
|
||||
info!("commit");
|
||||
let status = transaction.commit().await.map_err(|e| {
|
||||
// if we cannot commit - for now don't return connection to pool
|
||||
@@ -729,7 +721,7 @@ impl BatchQueryData {
|
||||
e
|
||||
})?;
|
||||
discard.check_idle(status);
|
||||
results
|
||||
json_output
|
||||
}
|
||||
Err(SqlOverHttpError::Cancelled(_)) => {
|
||||
if let Err(err) = cancel_token.cancel_query(NoTls).await {
|
||||
@@ -753,7 +745,7 @@ impl BatchQueryData {
|
||||
}
|
||||
};
|
||||
|
||||
Ok(json!({ "results": results }))
|
||||
Ok(json_output)
|
||||
}
|
||||
}
|
||||
|
||||
@@ -762,7 +754,7 @@ async fn query_batch(
|
||||
transaction: &Transaction<'_>,
|
||||
queries: BatchQueryData,
|
||||
parsed_headers: HttpHeaders,
|
||||
) -> Result<Vec<Value>, SqlOverHttpError> {
|
||||
) -> Result<String, SqlOverHttpError> {
|
||||
let mut results = Vec::with_capacity(queries.queries.len());
|
||||
let mut current_size = 0;
|
||||
for stmt in queries.queries {
|
||||
@@ -787,7 +779,11 @@ async fn query_batch(
|
||||
}
|
||||
}
|
||||
}
|
||||
Ok(results)
|
||||
|
||||
let results = json!({ "results": results });
|
||||
let json_output = serde_json::to_string(&results).expect("json serialization should not fail");
|
||||
|
||||
Ok(json_output)
|
||||
}
|
||||
|
||||
async fn query_to_json<T: GenericClient>(
|
||||
@@ -795,7 +791,7 @@ async fn query_to_json<T: GenericClient>(
|
||||
data: QueryData,
|
||||
current_size: &mut usize,
|
||||
parsed_headers: HttpHeaders,
|
||||
) -> Result<(ReadyForQueryStatus, Value), SqlOverHttpError> {
|
||||
) -> Result<(ReadyForQueryStatus, impl Serialize), SqlOverHttpError> {
|
||||
info!("executing query");
|
||||
let query_params = data.params;
|
||||
let mut row_stream = std::pin::pin!(client.query_raw_txt(&data.query, query_params).await?);
|
||||
@@ -844,8 +840,8 @@ async fn query_to_json<T: GenericClient>(
|
||||
|
||||
for c in row_stream.columns() {
|
||||
fields.push(json!({
|
||||
"name": Value::String(c.name().to_owned()),
|
||||
"dataTypeID": Value::Number(c.type_().oid().into()),
|
||||
"name": c.name().to_owned(),
|
||||
"dataTypeID": c.type_().oid(),
|
||||
"tableID": c.table_oid(),
|
||||
"columnID": c.column_id(),
|
||||
"dataTypeSize": c.type_size(),
|
||||
@@ -863,15 +859,14 @@ async fn query_to_json<T: GenericClient>(
|
||||
.map(|row| pg_text_row_to_json(row, &columns, parsed_headers.raw_output, array_mode))
|
||||
.collect::<Result<Vec<_>, _>>()?;
|
||||
|
||||
// resulting JSON format is based on the format of node-postgres result
|
||||
Ok((
|
||||
ready,
|
||||
json!({
|
||||
"command": command_tag_name,
|
||||
"rowCount": command_tag_count,
|
||||
"rows": rows,
|
||||
"fields": fields,
|
||||
"rowAsArray": array_mode,
|
||||
}),
|
||||
))
|
||||
// Resulting JSON format is based on the format of node-postgres result.
|
||||
let results = json!({
|
||||
"command": command_tag_name.to_string(),
|
||||
"rowCount": command_tag_count,
|
||||
"rows": rows,
|
||||
"fields": fields,
|
||||
"rowAsArray": array_mode,
|
||||
});
|
||||
|
||||
Ok((ready, results))
|
||||
}
|
||||
|
||||
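The SQL-over-HTTP handler now carries a pre-serialized JSON `String` instead of a `serde_json::Value` tree, serializing results as soon as they are produced. A minimal sketch of that rendering step, with an illustrative result struct (field set abbreviated):

```
use serde::Serialize;

#[derive(Serialize)]
struct QueryResult {
    command: String,
    #[serde(rename = "rowCount")]
    row_count: u64,
    #[serde(rename = "rowAsArray")]
    row_as_array: bool,
}

fn render_body(results: Vec<QueryResult>) -> String {
    // Corresponds to `json!({ "results": results })` followed by
    // `serde_json::to_string`; the resulting String is handed straight to the
    // HTTP response body.
    serde_json::to_string(&serde_json::json!({ "results": results }))
        .expect("json serialization should not fail")
}
```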
@@ -357,11 +357,15 @@ pub async fn task_backup(
|
||||
info!("metrics backup has shut down");
|
||||
}
|
||||
// Even if the remote storage is not configured, we still want to clear the metrics.
|
||||
let storage = backup_config
|
||||
.remote_storage_config
|
||||
.as_ref()
|
||||
.map(|config| GenericRemoteStorage::from_config(config).context("remote storage init"))
|
||||
.transpose()?;
|
||||
let storage = if let Some(config) = backup_config.remote_storage_config.as_ref() {
|
||||
Some(
|
||||
GenericRemoteStorage::from_config(config)
|
||||
.await
|
||||
.context("remote storage init")?,
|
||||
)
|
||||
} else {
|
||||
None
|
||||
};
|
||||
let mut ticker = tokio::time::interval(backup_config.interval);
|
||||
let mut prev = Utc::now();
|
||||
let hostname = hostname::get()?.as_os_str().to_string_lossy().into_owned();
|
||||
|
||||
@@ -12,13 +12,15 @@ pub fn check_permission(claims: &Claims, tenant_id: Option<TenantId>) -> Result<
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
(Scope::Admin | Scope::PageServerApi | Scope::GenerationsApi, _) => Err(AuthError(
|
||||
format!(
|
||||
"JWT scope '{:?}' is ineligible for Safekeeper auth",
|
||||
claims.scope
|
||||
)
|
||||
.into(),
|
||||
)),
|
||||
(Scope::Admin | Scope::PageServerApi | Scope::GenerationsApi | Scope::Scrubber, _) => {
|
||||
Err(AuthError(
|
||||
format!(
|
||||
"JWT scope '{:?}' is ineligible for Safekeeper auth",
|
||||
claims.scope
|
||||
)
|
||||
.into(),
|
||||
))
|
||||
}
|
||||
(Scope::SafekeeperData, _) => Ok(()),
|
||||
}
|
||||
}
|
||||
|
||||
@@ -418,7 +418,7 @@ async fn start_safekeeper(conf: SafeKeeperConf) -> Result<()> {
|
||||
let timeline_collector = safekeeper::metrics::TimelineCollector::new();
|
||||
metrics::register_internal(Box::new(timeline_collector))?;
|
||||
|
||||
wal_backup::init_remote_storage(&conf);
|
||||
wal_backup::init_remote_storage(&conf).await;
|
||||
|
||||
// Keep handles to main tasks to die if any of them disappears.
|
||||
let mut tasks_handles: FuturesUnordered<BoxFuture<(String, JoinTaskRes)>> =
|
||||
|
||||
@@ -74,10 +74,16 @@ pub async fn handle_request(request: Request) -> Result<()> {
|
||||
assert!(flush_lsn >= start_lsn);
|
||||
|
||||
if request.until_lsn > flush_lsn {
|
||||
bail!("requested LSN is beyond the end of the timeline");
|
||||
bail!(format!(
|
||||
"requested LSN {} is beyond the end of the timeline {}",
|
||||
request.until_lsn, flush_lsn
|
||||
));
|
||||
}
|
||||
if request.until_lsn < start_lsn {
|
||||
bail!("requested LSN is before the start of the timeline");
|
||||
bail!(format!(
|
||||
"requested LSN {} is before the start of the timeline {}",
|
||||
request.until_lsn, start_lsn
|
||||
));
|
||||
}
|
||||
|
||||
if request.until_lsn > commit_lsn {
|
||||
|
||||
@@ -173,15 +173,6 @@ pub static BROKER_RUNTIME: Lazy<Runtime> = Lazy::new(|| {
|
||||
.expect("Failed to create broker runtime")
|
||||
});
|
||||
|
||||
pub static WAL_REMOVER_RUNTIME: Lazy<Runtime> = Lazy::new(|| {
|
||||
tokio::runtime::Builder::new_multi_thread()
|
||||
.thread_name("WAL remover")
|
||||
.worker_threads(1)
|
||||
.enable_all()
|
||||
.build()
|
||||
.expect("Failed to create broker runtime")
|
||||
});
|
||||
|
||||
pub static WAL_BACKUP_RUNTIME: Lazy<Runtime> = Lazy::new(|| {
|
||||
tokio::runtime::Builder::new_multi_thread()
|
||||
.thread_name("WAL backup worker")
|
||||
@@ -189,12 +180,3 @@ pub static WAL_BACKUP_RUNTIME: Lazy<Runtime> = Lazy::new(|| {
|
||||
.build()
|
||||
.expect("Failed to create WAL backup runtime")
|
||||
});
|
||||
|
||||
pub static METRICS_SHIFTER_RUNTIME: Lazy<Runtime> = Lazy::new(|| {
|
||||
tokio::runtime::Builder::new_multi_thread()
|
||||
.thread_name("metric shifter")
|
||||
.worker_threads(1)
|
||||
.enable_all()
|
||||
.build()
|
||||
.expect("Failed to create broker runtime")
|
||||
});
|
||||
|
||||
@@ -199,10 +199,7 @@ async fn redownload_partial_segment(
|
||||
file.flush().await?;
|
||||
|
||||
let final_path = local_segment_path(mgr, partial);
|
||||
info!(
|
||||
"downloaded {} bytes, renaming to {}",
|
||||
final_path, final_path,
|
||||
);
|
||||
info!("downloaded {actual_len} bytes, renaming to {final_path}");
|
||||
if let Err(e) = durable_rename(&tmp_file, &final_path, !mgr.conf.no_sync).await {
|
||||
// Probably rename succeeded, but fsync of it failed. Remove
|
||||
// the file then to avoid using it.
|
||||
|
||||
@@ -4,7 +4,7 @@
|
||||
|
||||
use std::collections::HashSet;
|
||||
|
||||
use tracing::{debug, warn};
|
||||
use tracing::debug;
|
||||
|
||||
use crate::timeline_manager::ManagerCtlMessage;
|
||||
|
||||
@@ -23,7 +23,7 @@ impl Drop for ResidenceGuard {
|
||||
.manager_tx
|
||||
.send(ManagerCtlMessage::GuardDrop(self.guard_id));
|
||||
if let Err(e) = res {
|
||||
warn!("failed to send GuardDrop message: {:?}", e);
|
||||
debug!("failed to send GuardDrop message: {:?}", e);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -22,7 +22,7 @@ use tokio::fs::File;
|
||||
|
||||
use tokio::select;
|
||||
use tokio::sync::mpsc::{self, Receiver, Sender};
|
||||
use tokio::sync::watch;
|
||||
use tokio::sync::{watch, OnceCell};
|
||||
use tokio::time::sleep;
|
||||
use tracing::*;
|
||||
|
||||
@@ -33,8 +33,6 @@ use crate::timeline::{PeerInfo, WalResidentTimeline};
|
||||
use crate::timeline_manager::{Manager, StateSnapshot};
|
||||
use crate::{SafeKeeperConf, WAL_BACKUP_RUNTIME};
|
||||
|
||||
use once_cell::sync::OnceCell;
|
||||
|
||||
const UPLOAD_FAILURE_RETRY_MIN_MS: u64 = 10;
|
||||
const UPLOAD_FAILURE_RETRY_MAX_MS: u64 = 5000;
|
||||
|
||||
@@ -167,7 +165,7 @@ fn determine_offloader(
|
||||
}
|
||||
}
|
||||
|
||||
static REMOTE_STORAGE: OnceCell<Option<GenericRemoteStorage>> = OnceCell::new();
|
||||
static REMOTE_STORAGE: OnceCell<Option<GenericRemoteStorage>> = OnceCell::const_new();
|
||||
|
||||
// Storage must be configured and initialized when this is called.
|
||||
fn get_configured_remote_storage() -> &'static GenericRemoteStorage {
|
||||
@@ -178,14 +176,22 @@ fn get_configured_remote_storage() -> &'static GenericRemoteStorage {
|
||||
.unwrap()
|
||||
}
|
||||
|
||||
pub fn init_remote_storage(conf: &SafeKeeperConf) {
|
||||
pub async fn init_remote_storage(conf: &SafeKeeperConf) {
|
||||
// TODO: refactor REMOTE_STORAGE to avoid using global variables, and provide
|
||||
// dependencies to all tasks instead.
|
||||
REMOTE_STORAGE.get_or_init(|| {
|
||||
conf.remote_storage
|
||||
.as_ref()
|
||||
.map(|c| GenericRemoteStorage::from_config(c).expect("failed to create remote storage"))
|
||||
});
|
||||
REMOTE_STORAGE
|
||||
.get_or_init(|| async {
|
||||
if let Some(conf) = conf.remote_storage.as_ref() {
|
||||
Some(
|
||||
GenericRemoteStorage::from_config(conf)
|
||||
.await
|
||||
.expect("failed to create remote storage"),
|
||||
)
|
||||
} else {
|
||||
None
|
||||
}
|
||||
})
|
||||
.await;
|
||||
}
|
||||
|
||||
struct WalBackupTask {
|
||||
|
||||
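With `GenericRemoteStorage::from_config` becoming async, the safekeeper switches from `once_cell::sync::OnceCell` to `tokio::sync::OnceCell`, whose `get_or_init` accepts an async initializer. A toy sketch of the pattern (the storage type is a stand-in):

```
use tokio::sync::OnceCell;

// Stand-in for `GenericRemoteStorage`; the real type lives in the
// `remote_storage` crate.
type Storage = String;

static REMOTE_STORAGE: OnceCell<Option<Storage>> = OnceCell::const_new();

async fn build_storage(config: Option<&str>) -> Option<Storage> {
    // Stand-in for `GenericRemoteStorage::from_config(conf).await`.
    config.map(|c| format!("storage for {c}"))
}

async fn init_remote_storage(config: Option<&str>) {
    REMOTE_STORAGE
        .get_or_init(|| async { build_storage(config).await })
        .await;
}
```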
@@ -289,6 +289,18 @@ impl PartialBackup {
|
||||
})
|
||||
.collect();
|
||||
|
||||
if new_segments.len() == 1 {
|
||||
// we have an uploaded segment, it must not be deleted from remote storage
|
||||
segments_to_delete.retain(|name| name != &new_segments[0].name);
|
||||
} else {
|
||||
// there should always be zero or one uploaded segment
|
||||
assert!(
|
||||
new_segments.is_empty(),
|
||||
"too many uploaded segments: {:?}",
|
||||
new_segments
|
||||
);
|
||||
}
|
||||
|
||||
info!("deleting objects: {:?}", segments_to_delete);
|
||||
let mut objects_to_delete = vec![];
|
||||
for seg in segments_to_delete.iter() {
|
||||
|
||||
storage_controller/client/Cargo.toml (new file, 23 lines)
@@ -0,0 +1,23 @@
|
||||
[package]
|
||||
name = "storage_controller_client"
|
||||
version = "0.1.0"
|
||||
edition.workspace = true
|
||||
license.workspace = true
|
||||
|
||||
[dependencies]
|
||||
pageserver_api.workspace = true
|
||||
pageserver_client.workspace = true
|
||||
thiserror.workspace = true
|
||||
async-trait.workspace = true
|
||||
reqwest.workspace = true
|
||||
utils.workspace = true
|
||||
serde.workspace = true
|
||||
workspace_hack = { version = "0.1", path = "../../workspace_hack" }
|
||||
tokio-postgres.workspace = true
|
||||
tokio-stream.workspace = true
|
||||
tokio.workspace = true
|
||||
futures.workspace = true
|
||||
tokio-util.workspace = true
|
||||
anyhow.workspace = true
|
||||
postgres.workspace = true
|
||||
bytes.workspace = true
|
||||
storage_controller/client/src/control_api.rs (new file, 62 lines)
@@ -0,0 +1,62 @@
|
||||
use pageserver_client::mgmt_api::{self, ResponseErrorMessageExt};
|
||||
use reqwest::{Method, Url};
|
||||
use serde::{de::DeserializeOwned, Serialize};
|
||||
use std::str::FromStr;
|
||||
|
||||
pub struct Client {
|
||||
base_url: Url,
|
||||
jwt_token: Option<String>,
|
||||
client: reqwest::Client,
|
||||
}
|
||||
|
||||
impl Client {
|
||||
pub fn new(base_url: Url, jwt_token: Option<String>) -> Self {
|
||||
Self {
|
||||
base_url,
|
||||
jwt_token,
|
||||
client: reqwest::ClientBuilder::new()
|
||||
.build()
|
||||
.expect("Failed to construct http client"),
|
||||
}
|
||||
}
|
||||
|
||||
/// Simple HTTP request wrapper for calling into storage controller
|
||||
pub async fn dispatch<RQ, RS>(
|
||||
&self,
|
||||
method: Method,
|
||||
path: String,
|
||||
body: Option<RQ>,
|
||||
) -> mgmt_api::Result<RS>
|
||||
where
|
||||
RQ: Serialize + Sized,
|
||||
RS: DeserializeOwned + Sized,
|
||||
{
|
||||
// The configured URL has the /upcall path prefix for pageservers to use: we will strip that out
|
||||
// for general purpose API access.
|
||||
let url = Url::from_str(&format!(
|
||||
"http://{}:{}/{path}",
|
||||
self.base_url.host_str().unwrap(),
|
||||
self.base_url.port().unwrap()
|
||||
))
|
||||
.unwrap();
|
||||
|
||||
let mut builder = self.client.request(method, url);
|
||||
if let Some(body) = body {
|
||||
builder = builder.json(&body)
|
||||
}
|
||||
if let Some(jwt_token) = &self.jwt_token {
|
||||
builder = builder.header(
|
||||
reqwest::header::AUTHORIZATION,
|
||||
format!("Bearer {jwt_token}"),
|
||||
);
|
||||
}
|
||||
|
||||
let response = builder.send().await.map_err(mgmt_api::Error::ReceiveBody)?;
|
||||
let response = response.error_from_body().await?;
|
||||
|
||||
response
|
||||
.json()
|
||||
.await
|
||||
.map_err(pageserver_client::mgmt_api::Error::ReceiveBody)
|
||||
}
|
||||
}
|
||||
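A hedged usage sketch for the new `control_api::Client::dispatch` wrapper; the base URL, port, JWT, and response shape are placeholders, and the path reuses the `/v1/tenant/:tenant_id` passthrough route registered elsewhere in this diff:

```
use reqwest::{Method, Url};
use storage_controller_client::control_api::Client;

async fn describe_tenant(tenant_id: &str) -> anyhow::Result<serde_json::Value> {
    // dispatch() keeps only host+port from the base URL and appends the path.
    let base = Url::parse("http://127.0.0.1:1234/upcall/v1/")?; // placeholder
    let client = Client::new(base, Some("jwt-token".to_string())); // placeholder token
    let described: serde_json::Value = client
        .dispatch::<(), _>(Method::GET, format!("v1/tenant/{tenant_id}"), None)
        .await
        .map_err(|e| anyhow::anyhow!("storage controller request failed: {e}"))?;
    Ok(described)
}
```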
storage_controller/client/src/lib.rs (new file, 1 line)
@@ -0,0 +1 @@
|
||||
pub mod control_api;
|
||||
@@ -330,6 +330,22 @@ async fn handle_tenant_timeline_delete(
|
||||
.await
|
||||
}
|
||||
|
||||
async fn handle_tenant_timeline_detach_ancestor(
|
||||
service: Arc<Service>,
|
||||
req: Request<Body>,
|
||||
) -> Result<Response<Body>, ApiError> {
|
||||
let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?;
|
||||
check_permissions(&req, Scope::PageServerApi)?;
|
||||
|
||||
let timeline_id: TimelineId = parse_request_param(&req, "timeline_id")?;
|
||||
|
||||
let res = service
|
||||
.tenant_timeline_detach_ancestor(tenant_id, timeline_id)
|
||||
.await?;
|
||||
|
||||
json_response(StatusCode::OK, res)
|
||||
}
|
||||
|
||||
async fn handle_tenant_timeline_passthrough(
|
||||
service: Arc<Service>,
|
||||
req: Request<Body>,
|
||||
@@ -414,7 +430,7 @@ async fn handle_tenant_describe(
|
||||
service: Arc<Service>,
|
||||
req: Request<Body>,
|
||||
) -> Result<Response<Body>, ApiError> {
|
||||
check_permissions(&req, Scope::Admin)?;
|
||||
check_permissions(&req, Scope::Scrubber)?;
|
||||
|
||||
let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?;
|
||||
json_response(StatusCode::OK, service.tenant_describe(tenant_id)?)
|
||||
@@ -1006,6 +1022,16 @@ pub fn make_router(
|
||||
RequestName("v1_tenant_timeline"),
|
||||
)
|
||||
})
|
||||
.put(
|
||||
"/v1/tenant/:tenant_id/timeline/:timeline_id/detach_ancestor",
|
||||
|r| {
|
||||
tenant_service_handler(
|
||||
r,
|
||||
handle_tenant_timeline_detach_ancestor,
|
||||
RequestName("v1_tenant_timeline_detach_ancestor"),
|
||||
)
|
||||
},
|
||||
)
|
||||
// Tenant detail GET passthrough to shard zero:
|
||||
.get("/v1/tenant/:tenant_id", |r| {
|
||||
tenant_service_handler(
|
||||
|
||||
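The newly registered `detach_ancestor` route can be exercised with a plain HTTP PUT. A hedged sketch using `reqwest`; the host, tenant/timeline IDs, and the JWT are placeholders:

```
use reqwest::Client;

async fn detach_ancestor(base: &str, tenant_id: &str, timeline_id: &str, jwt: &str) -> anyhow::Result<()> {
    // Route added in this release; requires a scope that passes the
    // PageServerApi permission check.
    let url = format!("{base}/v1/tenant/{tenant_id}/timeline/{timeline_id}/detach_ancestor");
    let resp = Client::new().put(url).bearer_auth(jwt).send().await?;
    // On success the controller responds with the `AncestorDetached` body,
    // listing the timelines reparented onto the detached timeline.
    println!("detach_ancestor: {}", resp.status());
    Ok(())
}
```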
@@ -1,11 +1,11 @@
|
||||
use anyhow::{anyhow, Context};
|
||||
use camino::Utf8PathBuf;
|
||||
use clap::Parser;
|
||||
use diesel::Connection;
|
||||
use metrics::launch_timestamp::LaunchTimestamp;
|
||||
use metrics::BuildInfo;
|
||||
use std::path::PathBuf;
|
||||
use std::sync::Arc;
|
||||
use std::time::Duration;
|
||||
use storage_controller::http::make_router;
|
||||
use storage_controller::metrics::preinitialize_metrics;
|
||||
use storage_controller::persistence::Persistence;
|
||||
@@ -51,10 +51,6 @@ struct Cli {
|
||||
#[arg(long)]
|
||||
compute_hook_url: Option<String>,
|
||||
|
||||
/// Path to the .json file to store state (will be created if it doesn't exist)
|
||||
#[arg(short, long)]
|
||||
path: Option<Utf8PathBuf>,
|
||||
|
||||
/// URL to connect to postgres, like postgresql://localhost:1234/storage_controller
|
||||
#[arg(long)]
|
||||
database_url: Option<String>,
|
||||
@@ -206,11 +202,10 @@ async fn async_main() -> anyhow::Result<()> {
|
||||
|
||||
let args = Cli::parse();
|
||||
tracing::info!(
|
||||
"version: {}, launch_timestamp: {}, build_tag {}, state at {}, listening on {}",
|
||||
"version: {}, launch_timestamp: {}, build_tag {}, listening on {}",
|
||||
GIT_VERSION,
|
||||
launch_ts.to_string(),
|
||||
BUILD_TAG,
|
||||
args.path.as_ref().unwrap_or(&Utf8PathBuf::from("<none>")),
|
||||
args.listen
|
||||
);
|
||||
|
||||
@@ -277,8 +272,7 @@ async fn async_main() -> anyhow::Result<()> {
|
||||
.await
|
||||
.context("Running database migrations")?;
|
||||
|
||||
let json_path = args.path;
|
||||
let persistence = Arc::new(Persistence::new(secrets.database_url, json_path.clone()));
|
||||
let persistence = Arc::new(Persistence::new(secrets.database_url));
|
||||
|
||||
let service = Service::spawn(config, persistence.clone()).await?;
|
||||
|
||||
@@ -316,22 +310,23 @@ async fn async_main() -> anyhow::Result<()> {
|
||||
}
|
||||
tracing::info!("Terminating on signal");
|
||||
|
||||
if json_path.is_some() {
|
||||
// Write out a JSON dump on shutdown: this is used in compat tests to avoid passing
|
||||
// full postgres dumps around.
|
||||
if let Err(e) = persistence.write_tenants_json().await {
|
||||
tracing::error!("Failed to write JSON on shutdown: {e}")
|
||||
// Stop HTTP server first, so that we don't have to service requests
|
||||
// while shutting down Service.
|
||||
server_shutdown.cancel();
|
||||
match tokio::time::timeout(Duration::from_secs(5), server_task).await {
|
||||
Ok(Ok(_)) => {
|
||||
tracing::info!("Joined HTTP server task");
|
||||
}
|
||||
Ok(Err(e)) => {
|
||||
tracing::error!("Error joining HTTP server task: {e}")
|
||||
}
|
||||
Err(_) => {
|
||||
tracing::warn!("Timed out joining HTTP server task");
|
||||
// We will fall through and shut down the service anyway, any request handlers
|
||||
// in flight will experience cancellation & their clients will see a torn connection.
|
||||
}
|
||||
}
|
||||
|
||||
// Stop HTTP server first, so that we don't have to service requests
|
||||
// while shutting down Service
|
||||
server_shutdown.cancel();
|
||||
if let Err(e) = server_task.await {
|
||||
tracing::error!("Error joining HTTP server task: {e}")
|
||||
}
|
||||
tracing::info!("Joined HTTP server task");
|
||||
|
||||
service.shutdown().await;
|
||||
tracing::info!("Service shutdown complete");
|
||||
|
||||
|
||||
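The storage controller shutdown now cancels the HTTP server first and joins its task under a 5-second timeout, so shutdown cannot hang on in-flight requests. A small self-contained sketch of that sequence:

```
use std::time::Duration;
use tokio::task::JoinHandle;
use tokio_util::sync::CancellationToken;

async fn shutdown_http_server(server_shutdown: CancellationToken, server_task: JoinHandle<()>) {
    // Stop the HTTP server first so we don't have to service requests while
    // shutting down, then bound the join so shutdown cannot hang on it.
    server_shutdown.cancel();
    match tokio::time::timeout(Duration::from_secs(5), server_task).await {
        Ok(Ok(())) => tracing::info!("Joined HTTP server task"),
        Ok(Err(e)) => tracing::error!("Error joining HTTP server task: {e}"),
        Err(_) => tracing::warn!("Timed out joining HTTP server task"),
    }
}
```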
@@ -1,8 +1,9 @@
|
||||
use pageserver_api::{
|
||||
models::{
|
||||
LocationConfig, LocationConfigListResponse, PageserverUtilization, SecondaryProgress,
|
||||
TenantScanRemoteStorageResponse, TenantShardSplitRequest, TenantShardSplitResponse,
|
||||
TimelineCreateRequest, TimelineInfo, TopTenantShardsRequest, TopTenantShardsResponse,
|
||||
detach_ancestor::AncestorDetached, LocationConfig, LocationConfigListResponse,
|
||||
PageserverUtilization, SecondaryProgress, TenantScanRemoteStorageResponse,
|
||||
TenantShardSplitRequest, TenantShardSplitResponse, TimelineCreateRequest, TimelineInfo,
|
||||
TopTenantShardsRequest, TopTenantShardsResponse,
|
||||
},
|
||||
shard::TenantShardId,
|
||||
};
|
||||
@@ -226,6 +227,21 @@ impl PageserverClient {
|
||||
)
|
||||
}
|
||||
|
||||
pub(crate) async fn timeline_detach_ancestor(
|
||||
&self,
|
||||
tenant_shard_id: TenantShardId,
|
||||
timeline_id: TimelineId,
|
||||
) -> Result<AncestorDetached> {
|
||||
measured_request!(
|
||||
"timeline_detach_ancestor",
|
||||
crate::metrics::Method::Put,
|
||||
&self.node_id_label,
|
||||
self.inner
|
||||
.timeline_detach_ancestor(tenant_shard_id, timeline_id)
|
||||
.await
|
||||
)
|
||||
}
|
||||
|
||||
pub(crate) async fn get_utilization(&self) -> Result<PageserverUtilization> {
|
||||
measured_request!(
|
||||
"utilization",
|
||||
|
||||
@@ -5,8 +5,6 @@ use std::time::Duration;
use std::time::Instant;

use self::split_state::SplitState;
use camino::Utf8Path;
use camino::Utf8PathBuf;
use diesel::pg::PgConnection;
use diesel::prelude::*;
use diesel::Connection;
@@ -55,11 +53,6 @@ use crate::node::Node;
/// we can UPDATE a node's scheduling mode reasonably quickly to mark a bad node offline.
pub struct Persistence {
    connection_pool: diesel::r2d2::Pool<diesel::r2d2::ConnectionManager<PgConnection>>,

    // In test environments, we support loading+saving a JSON file. This is temporary, for the benefit of
    // test_compatibility.py, so that we don't have to commit to making the database contents fully backward/forward
    // compatible just yet.
    json_path: Option<Utf8PathBuf>,
}

/// Legacy format, for use in JSON compat objects in test environment
@@ -124,7 +117,7 @@ impl Persistence {
    const IDLE_CONNECTION_TIMEOUT: Duration = Duration::from_secs(10);
    const MAX_CONNECTION_LIFETIME: Duration = Duration::from_secs(60);

    pub fn new(database_url: String, json_path: Option<Utf8PathBuf>) -> Self {
    pub fn new(database_url: String) -> Self {
        let manager = diesel::r2d2::ConnectionManager::<PgConnection>::new(database_url);

        // We will use a connection pool: this is primarily to _limit_ our connection count, rather than to optimize time
@@ -139,10 +132,7 @@ impl Persistence {
            .build(manager)
            .expect("Could not build connection pool");

        Self {
            connection_pool,
            json_path,
        }
        Self { connection_pool }
    }

    /// A helper for use during startup, where we would like to tolerate concurrent restarts of the
@@ -302,85 +292,13 @@ impl Persistence {
    /// At startup, load the high level state for shards, such as their config + policy. This will
    /// be enriched at runtime with state discovered on pageservers.
    pub(crate) async fn list_tenant_shards(&self) -> DatabaseResult<Vec<TenantShardPersistence>> {
        let loaded = self
            .with_measured_conn(
                DatabaseOperation::ListTenantShards,
                move |conn| -> DatabaseResult<_> {
                    Ok(crate::schema::tenant_shards::table.load::<TenantShardPersistence>(conn)?)
                },
            )
            .await?;

        if loaded.is_empty() {
            if let Some(path) = &self.json_path {
                if tokio::fs::try_exists(path)
                    .await
                    .map_err(|e| DatabaseError::Logical(format!("Error stat'ing JSON file: {e}")))?
                {
                    tracing::info!("Importing from legacy JSON format at {path}");
                    return self.list_tenant_shards_json(path).await;
                }
            }
        }
        Ok(loaded)
    }

    /// Shim for automated compatibility tests: load tenants from a JSON file instead of database
    pub(crate) async fn list_tenant_shards_json(
        &self,
        path: &Utf8Path,
    ) -> DatabaseResult<Vec<TenantShardPersistence>> {
        let bytes = tokio::fs::read(path)
            .await
            .map_err(|e| DatabaseError::Logical(format!("Failed to load JSON: {e}")))?;

        let mut decoded = serde_json::from_slice::<JsonPersistence>(&bytes)
            .map_err(|e| DatabaseError::Logical(format!("Deserialization error: {e}")))?;
        for shard in decoded.tenants.values_mut() {
            if shard.placement_policy == "\"Single\"" {
                // Backward compat for test data after PR https://github.com/neondatabase/neon/pull/7165
                shard.placement_policy = "{\"Attached\":0}".to_string();
            }

            if shard.scheduling_policy.is_empty() {
                shard.scheduling_policy =
                    serde_json::to_string(&ShardSchedulingPolicy::default()).unwrap();
            }
        }

        let tenants: Vec<TenantShardPersistence> = decoded.tenants.into_values().collect();

        // Synchronize database with what is in the JSON file
        self.insert_tenant_shards(tenants.clone()).await?;

        Ok(tenants)
    }

    /// For use in testing environments, where we dump out JSON on shutdown.
    pub async fn write_tenants_json(&self) -> anyhow::Result<()> {
        let Some(path) = &self.json_path else {
            anyhow::bail!("Cannot write JSON if path isn't set (test environment bug)");
        };
        tracing::info!("Writing state to {path}...");
        let tenants = self.list_tenant_shards().await?;
        let mut tenants_map = HashMap::new();
        for tsp in tenants {
            let tenant_shard_id = TenantShardId {
                tenant_id: TenantId::from_str(tsp.tenant_id.as_str())?,
                shard_number: ShardNumber(tsp.shard_number as u8),
                shard_count: ShardCount::new(tsp.shard_count as u8),
            };

            tenants_map.insert(tenant_shard_id, tsp);
        }
        let json = serde_json::to_string(&JsonPersistence {
            tenants: tenants_map,
        })?;

        tokio::fs::write(path, &json).await?;
        tracing::info!("Wrote {} bytes to {path}...", json.len());

        Ok(())
        self.with_measured_conn(
            DatabaseOperation::ListTenantShards,
            move |conn| -> DatabaseResult<_> {
                Ok(crate::schema::tenant_shards::table.load::<TenantShardPersistence>(conn)?)
            },
        )
        .await
    }

    /// Tenants must be persisted before we schedule them for the first time. This enables us

@@ -117,6 +117,7 @@ enum TenantOperations {
    TimelineCreate,
    TimelineDelete,
    AttachHook,
    TimelineDetachAncestor,
}

#[derive(Clone, strum_macros::Display)]
@@ -2376,18 +2377,18 @@ impl Service {
                tracing::info!("Doing time travel recovery for shard {tenant_shard_id}",);

                client
                    .tenant_time_travel_remote_storage(
                        tenant_shard_id,
                        &timestamp,
                        &done_if_after,
                    )
                    .await
                    .map_err(|e| {
                        ApiError::InternalServerError(anyhow::anyhow!(
                            "Error doing time travel recovery for shard {tenant_shard_id} on node {}: {e}",
                            node
                        ))
                    })?;
                    .tenant_time_travel_remote_storage(
                        tenant_shard_id,
                        &timestamp,
                        &done_if_after,
                    )
                    .await
                    .map_err(|e| {
                        ApiError::InternalServerError(anyhow::anyhow!(
                            "Error doing time travel recovery for shard {tenant_shard_id} on node {}: {e}",
                            node
                        ))
                    })?;
            }
        }
        Ok(())
@@ -2757,7 +2758,7 @@ impl Service {
        // Create timeline on remaining shards with number >0
        if !targets.is_empty() {
            // If we had multiple shards, issue requests for the remainder now.
            let jwt = self.config.jwt_token.clone();
            let jwt = &self.config.jwt_token;
            self.tenant_for_shards(targets, |tenant_shard_id: TenantShardId, node: Node| {
                let create_req = create_req.clone();
                Box::pin(create_one(tenant_shard_id, node, jwt.clone(), create_req))
@@ -2768,6 +2769,114 @@ impl Service {
        Ok(timeline_info)
    }
|
||||
|
||||
pub(crate) async fn tenant_timeline_detach_ancestor(
|
||||
&self,
|
||||
tenant_id: TenantId,
|
||||
timeline_id: TimelineId,
|
||||
) -> Result<models::detach_ancestor::AncestorDetached, ApiError> {
|
||||
tracing::info!("Detaching timeline {tenant_id}/{timeline_id}",);
|
||||
|
||||
let _tenant_lock = trace_shared_lock(
|
||||
&self.tenant_op_locks,
|
||||
tenant_id,
|
||||
TenantOperations::TimelineDetachAncestor,
|
||||
)
|
||||
.await;
|
||||
|
||||
self.ensure_attached_wait(tenant_id).await?;
|
||||
|
||||
let targets = {
|
||||
let locked = self.inner.read().unwrap();
|
||||
let mut targets = Vec::new();
|
||||
|
||||
for (tenant_shard_id, shard) in
|
||||
locked.tenants.range(TenantShardId::tenant_range(tenant_id))
|
||||
{
|
||||
let node_id = shard.intent.get_attached().ok_or_else(|| {
|
||||
ApiError::InternalServerError(anyhow::anyhow!("Shard not scheduled"))
|
||||
})?;
|
||||
let node = locked
|
||||
.nodes
|
||||
.get(&node_id)
|
||||
.expect("Pageservers may not be deleted while referenced");
|
||||
|
||||
targets.push((*tenant_shard_id, node.clone()));
|
||||
}
|
||||
targets
|
||||
};
|
||||
|
||||
if targets.is_empty() {
|
||||
return Err(ApiError::NotFound(
|
||||
anyhow::anyhow!("Tenant not found").into(),
|
||||
));
|
||||
}
|
||||
|
||||
async fn detach_one(
|
||||
tenant_shard_id: TenantShardId,
|
||||
timeline_id: TimelineId,
|
||||
node: Node,
|
||||
jwt: Option<String>,
|
||||
) -> Result<(ShardNumber, models::detach_ancestor::AncestorDetached), ApiError> {
|
||||
tracing::info!(
|
||||
"Detaching timeline on shard {tenant_shard_id}/{timeline_id}, attached to node {node}",
|
||||
);
|
||||
|
||||
let client = PageserverClient::new(node.get_id(), node.base_url(), jwt.as_deref());
|
||||
client
|
||||
.timeline_detach_ancestor(tenant_shard_id, timeline_id)
|
||||
.await
|
||||
.map_err(|e| {
|
||||
use mgmt_api::Error;
|
||||
|
||||
match e {
|
||||
// no ancestor (ever)
|
||||
Error::ApiError(StatusCode::CONFLICT, msg) => ApiError::Conflict(format!(
|
||||
"{node}: {}",
|
||||
msg.strip_prefix("Conflict: ").unwrap_or(&msg)
|
||||
)),
|
||||
// too many ancestors
|
||||
Error::ApiError(StatusCode::BAD_REQUEST, msg) => {
|
||||
ApiError::BadRequest(anyhow::anyhow!("{node}: {msg}"))
|
||||
}
|
||||
// rest can be mapped
|
||||
other => passthrough_api_error(&node, other),
|
||||
}
|
||||
})
|
||||
.map(|res| (tenant_shard_id.shard_number, res))
|
||||
}
|
||||
|
||||
// no shard needs to go first/last; the operation should be idempotent
|
||||
// TODO: it would be great to ensure that all shards return the same error
|
||||
let mut results = self
|
||||
.tenant_for_shards(targets, |tenant_shard_id, node| {
|
||||
futures::FutureExt::boxed(detach_one(
|
||||
tenant_shard_id,
|
||||
timeline_id,
|
||||
node,
|
||||
self.config.jwt_token.clone(),
|
||||
))
|
||||
})
|
||||
.await?;
|
||||
|
||||
let any = results.pop().expect("we must have at least one response");
|
||||
|
||||
let mismatching = results
|
||||
.iter()
|
||||
.filter(|(_, res)| res != &any.1)
|
||||
.collect::<Vec<_>>();
|
||||
if !mismatching.is_empty() {
|
||||
let matching = results.len() - mismatching.len();
|
||||
tracing::error!(
|
||||
matching,
|
||||
compared_against=?any,
|
||||
?mismatching,
|
||||
"shards returned different results"
|
||||
);
|
||||
}
|
||||
|
||||
Ok(any.1)
|
||||
}
|
||||
|
||||
/// Helper for concurrently calling a pageserver API on a number of shards, such as timeline creation.
|
||||
///
|
||||
/// On success, the returned vector contains exactly the same number of elements as the input `locations`.
|
||||
@@ -2894,8 +3003,8 @@ impl Service {
|
||||
.await
|
||||
.map_err(|e| {
|
||||
ApiError::InternalServerError(anyhow::anyhow!(
|
||||
"Error deleting timeline {timeline_id} on {tenant_shard_id} on node {node}: {e}",
|
||||
))
|
||||
"Error deleting timeline {timeline_id} on {tenant_shard_id} on node {node}: {e}",
|
||||
))
|
||||
})
|
||||
}
|
||||
|
||||
@@ -3847,6 +3956,8 @@ impl Service {
|
||||
"failpoint".to_string()
|
||||
)));
|
||||
|
||||
failpoint_support::sleep_millis_async!("shard-split-post-remote-sleep", &self.cancel);
|
||||
|
||||
tracing::info!(
|
||||
"Split {} into {}",
|
||||
parent_id,
|
||||
|
||||
@@ -34,6 +34,7 @@ camino.workspace = true
|
||||
rustls.workspace = true
|
||||
rustls-native-certs.workspace = true
|
||||
once_cell.workspace = true
|
||||
storage_controller_client.workspace = true
|
||||
|
||||
tokio = { workspace = true, features = ["macros", "rt-multi-thread"] }
|
||||
chrono = { workspace = true, default-features = false, features = ["clock", "serde"] }
|
||||
|
||||
@@ -24,6 +24,7 @@ use camino::{Utf8Path, Utf8PathBuf};
|
||||
use clap::ValueEnum;
|
||||
use pageserver::tenant::TENANTS_SEGMENT_NAME;
|
||||
use pageserver_api::shard::TenantShardId;
|
||||
use remote_storage::RemotePath;
|
||||
use reqwest::Url;
|
||||
use serde::{Deserialize, Serialize};
|
||||
use tokio::io::AsyncReadExt;
|
||||
@@ -31,7 +32,7 @@ use tracing::error;
|
||||
use tracing_appender::non_blocking::WorkerGuard;
|
||||
use tracing_subscriber::{fmt, prelude::*, EnvFilter};
|
||||
use utils::fs_ext;
|
||||
use utils::id::{TenantId, TimelineId};
|
||||
use utils::id::{TenantId, TenantTimelineId, TimelineId};
|
||||
|
||||
const MAX_RETRIES: usize = 20;
|
||||
const CLOUD_ADMIN_API_TOKEN_ENV_VAR: &str = "CLOUD_ADMIN_API_TOKEN";
|
||||
@@ -54,7 +55,7 @@ pub struct S3Target {
/// in the pageserver, as all timeline objects existing in the scope of a particular
/// tenant: the scrubber is different in that it handles collections of data referring to many
/// TenantShardTimelineIds in one place.
#[derive(Serialize, Deserialize, Debug, Clone, Copy, Hash, PartialEq, Eq)]
#[derive(Serialize, Deserialize, Debug, Clone, Copy, Hash, PartialEq, Eq, PartialOrd, Ord)]
pub struct TenantShardTimelineId {
    tenant_shard_id: TenantShardId,
    timeline_id: TimelineId,
@@ -67,6 +68,10 @@ impl TenantShardTimelineId {
            timeline_id,
        }
    }

    fn as_tenant_timeline_id(&self) -> TenantTimelineId {
        TenantTimelineId::new(self.tenant_shard_id.tenant_id, self.timeline_id)
    }
}

impl Display for TenantShardTimelineId {
|
||||
@@ -179,6 +184,22 @@ impl RootTarget {
|
||||
.with_sub_segment(&id.timeline_id.to_string())
|
||||
}
|
||||
|
||||
/// Given RemotePath "tenants/foo/timelines/bar/layerxyz", prefix it to a literal
|
||||
/// key in the S3 bucket.
|
||||
pub fn absolute_key(&self, key: &RemotePath) -> String {
|
||||
let root = match self {
|
||||
Self::Pageserver(root) => root,
|
||||
Self::Safekeeper(root) => root,
|
||||
};
|
||||
|
||||
let prefix = &root.prefix_in_bucket;
|
||||
if prefix.ends_with('/') {
|
||||
format!("{prefix}{key}")
|
||||
} else {
|
||||
format!("{prefix}/{key}")
|
||||
}
|
||||
}
|
||||
|
||||
pub fn bucket_name(&self) -> &str {
|
||||
match self {
|
||||
Self::Pageserver(root) => &root.bucket_name,
|
||||
@@ -216,6 +237,14 @@ impl BucketConfig {
|
||||
}
|
||||
}
|
||||
|
||||
pub struct ControllerClientConfig {
|
||||
/// URL to storage controller. e.g. http://127.0.0.1:1234 when using `neon_local`
|
||||
pub controller_api: Url,
|
||||
|
||||
/// JWT token for authenticating with storage controller. Requires scope 'scrubber' or 'admin'.
|
||||
pub controller_jwt: String,
|
||||
}
|
||||
|
||||
pub struct ConsoleConfig {
|
||||
pub token: String,
|
||||
pub base_url: Url,
|
||||
|
||||
@@ -1,11 +1,12 @@
|
||||
use anyhow::bail;
|
||||
use anyhow::{anyhow, bail};
|
||||
use camino::Utf8PathBuf;
|
||||
use pageserver_api::shard::TenantShardId;
|
||||
use storage_scrubber::find_large_objects;
|
||||
use reqwest::Url;
|
||||
use storage_scrubber::garbage::{find_garbage, purge_garbage, PurgeMode};
|
||||
use storage_scrubber::pageserver_physical_gc::GcMode;
|
||||
use storage_scrubber::scan_pageserver_metadata::scan_metadata;
|
||||
use storage_scrubber::tenant_snapshot::SnapshotDownloader;
|
||||
use storage_scrubber::{find_large_objects, ControllerClientConfig};
|
||||
use storage_scrubber::{
|
||||
init_logging, pageserver_physical_gc::pageserver_physical_gc,
|
||||
scan_safekeeper_metadata::scan_safekeeper_metadata, BucketConfig, ConsoleConfig, NodeKind,
|
||||
@@ -24,6 +25,14 @@ struct Cli {
|
||||
|
||||
#[arg(short, long, default_value_t = false)]
|
||||
delete: bool,
|
||||
|
||||
#[arg(long)]
|
||||
/// URL to storage controller. e.g. http://127.0.0.1:1234 when using `neon_local`
|
||||
controller_api: Option<Url>,
|
||||
|
||||
#[arg(long)]
|
||||
/// JWT token for authenticating with storage controller. Requires scope 'scrubber' or 'admin'.
|
||||
controller_jwt: Option<String>,
|
||||
}
|
||||
|
||||
#[derive(Subcommand, Debug)]
|
||||
@@ -204,8 +213,37 @@ async fn main() -> anyhow::Result<()> {
|
||||
min_age,
|
||||
mode,
|
||||
} => {
|
||||
let summary =
|
||||
pageserver_physical_gc(bucket_config, tenant_ids, min_age.into(), mode).await?;
|
||||
let controller_client_conf = cli.controller_api.map(|controller_api| {
|
||||
ControllerClientConfig {
|
||||
controller_api,
|
||||
// Default to no key: this is a convenience when working in a development environment
|
||||
controller_jwt: cli.controller_jwt.unwrap_or("".to_owned()),
|
||||
}
|
||||
});
|
||||
|
||||
match (&controller_client_conf, mode) {
|
||||
(Some(_), _) => {
|
||||
// Any mode may run when controller API is set
|
||||
}
|
||||
(None, GcMode::Full) => {
|
||||
// The part of physical GC where we erase ancestor layers cannot be done safely without
|
||||
// confirming the most recent complete shard split with the controller. Refuse to run, rather
|
||||
// than doing it unsafely.
|
||||
return Err(anyhow!("Full physical GC requires `--controller-api` and `--controller-jwt` to run"));
|
||||
}
|
||||
(None, GcMode::DryRun | GcMode::IndicesOnly) => {
|
||||
// These GcModes do not require the controller to run.
|
||||
}
|
||||
}
|
||||
|
||||
let summary = pageserver_physical_gc(
|
||||
bucket_config,
|
||||
controller_client_conf,
|
||||
tenant_ids,
|
||||
min_age.into(),
|
||||
mode,
|
||||
)
|
||||
.await?;
|
||||
println!("{}", serde_json::to_string(&summary).unwrap());
|
||||
Ok(())
|
||||
}
|
||||
|
||||
@@ -1,22 +1,50 @@
|
||||
use std::time::{Duration, UNIX_EPOCH};
|
||||
use std::collections::{BTreeMap, HashMap};
|
||||
use std::sync::Arc;
|
||||
use std::time::{Duration, SystemTime};
|
||||
|
||||
use crate::checks::{list_timeline_blobs, BlobDataParseResult};
|
||||
use crate::metadata_stream::{stream_tenant_timelines, stream_tenants};
|
||||
use crate::{init_remote, BucketConfig, NodeKind, RootTarget, TenantShardTimelineId};
|
||||
use crate::{
|
||||
init_remote, BucketConfig, ControllerClientConfig, NodeKind, RootTarget, TenantShardTimelineId,
|
||||
};
|
||||
use aws_sdk_s3::Client;
|
||||
use futures_util::{StreamExt, TryStreamExt};
|
||||
use pageserver::tenant::remote_timeline_client::parse_remote_index_path;
|
||||
use pageserver::tenant::remote_timeline_client::index::LayerFileMetadata;
|
||||
use pageserver::tenant::remote_timeline_client::{parse_remote_index_path, remote_layer_path};
|
||||
use pageserver::tenant::storage_layer::LayerName;
|
||||
use pageserver::tenant::IndexPart;
|
||||
use pageserver_api::shard::TenantShardId;
|
||||
use pageserver_api::controller_api::TenantDescribeResponse;
|
||||
use pageserver_api::shard::{ShardIndex, TenantShardId};
|
||||
use remote_storage::RemotePath;
|
||||
use reqwest::Method;
|
||||
use serde::Serialize;
|
||||
use storage_controller_client::control_api;
|
||||
use tracing::{info_span, Instrument};
|
||||
use utils::generation::Generation;
|
||||
use utils::id::{TenantId, TenantTimelineId};
|
||||
|
||||
#[derive(Serialize, Default)]
|
||||
pub struct GcSummary {
|
||||
indices_deleted: usize,
|
||||
remote_storage_errors: usize,
|
||||
controller_api_errors: usize,
|
||||
ancestor_layers_deleted: usize,
|
||||
}
|
||||
|
||||
impl GcSummary {
|
||||
fn merge(&mut self, other: Self) {
|
||||
let Self {
|
||||
indices_deleted,
|
||||
remote_storage_errors,
|
||||
ancestor_layers_deleted,
|
||||
controller_api_errors,
|
||||
} = other;
|
||||
|
||||
self.indices_deleted += indices_deleted;
|
||||
self.remote_storage_errors += remote_storage_errors;
|
||||
self.ancestor_layers_deleted += ancestor_layers_deleted;
|
||||
self.controller_api_errors += controller_api_errors;
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(clap::ValueEnum, Debug, Clone, Copy)]
|
||||
@@ -26,9 +54,9 @@ pub enum GcMode {
|
||||
|
||||
// Enable only removing old-generation indices
|
||||
IndicesOnly,
|
||||
|
||||
// Enable all forms of GC
|
||||
// TODO: this will be used when shard split ancestor layer deletion is added
|
||||
// All,
|
||||
Full,
|
||||
}
|
||||
|
||||
impl std::fmt::Display for GcMode {
|
||||
@@ -36,10 +64,232 @@ impl std::fmt::Display for GcMode {
|
||||
match self {
|
||||
GcMode::DryRun => write!(f, "dry-run"),
|
||||
GcMode::IndicesOnly => write!(f, "indices-only"),
|
||||
GcMode::Full => write!(f, "full"),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
mod refs {
    use super::*;
    // Map of cross-shard layer references, giving a refcount for each layer in each shard that is referenced by some other
    // shard in the same tenant. This is sparse! The vast majority of timelines will have no cross-shard refs, and those that
    // do have cross shard refs should eventually drop most of them via compaction.
    //
    // In our inner map type, the TTID in the key is shard-agnostic, and the ShardIndex in the value refers to the _ancestor
    // which is referenced_.
    #[derive(Default)]
    pub(super) struct AncestorRefs(
        BTreeMap<TenantTimelineId, HashMap<(ShardIndex, LayerName), usize>>,
    );

    impl AncestorRefs {
        /// Insert references for layers discovered in a particular shard-timeline that refer to an ancestral shard-timeline.
        pub(super) fn update(
            &mut self,
            ttid: TenantShardTimelineId,
            layers: Vec<(LayerName, LayerFileMetadata)>,
        ) {
            let ttid_refs = self.0.entry(ttid.as_tenant_timeline_id()).or_default();
            for (layer_name, layer_metadata) in layers {
                // Increment refcount of this layer in the ancestor shard
                *(ttid_refs
                    .entry((layer_metadata.shard, layer_name))
                    .or_default()) += 1;
            }
        }

        /// For a particular TTID, return the map of all ancestor layers referenced by a descendent to their refcount
        ///
        /// The `ShardIndex` in the result's key is the index of the _ancestor_, not the descendent.
        pub(super) fn get_ttid_refcounts(
            &self,
            ttid: &TenantTimelineId,
        ) -> Option<&HashMap<(ShardIndex, LayerName), usize>> {
            self.0.get(ttid)
        }
    }
}

use refs::AncestorRefs;
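As a rough illustration of the refcounting above, here is a self-contained sketch of the same sparse map shape, using plain `String` stand-ins for `TenantTimelineId`, `ShardIndex` and `LayerName` (the stand-in types and values are hypothetical, not the real ones):

```
use std::collections::{BTreeMap, HashMap};

// Sparse map: timeline -> ((ancestor shard, layer name) -> refcount). Timelines with
// no cross-shard refs simply never get an entry.
#[derive(Default)]
struct AncestorRefsSketch(BTreeMap<String, HashMap<(String, String), usize>>);

impl AncestorRefsSketch {
    // Record that `timeline` references `layers` that live in ancestor shards.
    fn update(&mut self, timeline: &str, layers: &[(String, String)]) {
        let refs = self.0.entry(timeline.to_string()).or_default();
        for (ancestor_shard, layer_name) in layers {
            *refs.entry((ancestor_shard.clone(), layer_name.clone())).or_default() += 1;
        }
    }

    // None means "no cross-shard refs at all" for this timeline.
    fn refcounts(&self, timeline: &str) -> Option<&HashMap<(String, String), usize>> {
        self.0.get(timeline)
    }
}

fn main() {
    let mut refs = AncestorRefsSketch::default();
    refs.update(
        "tenant-a/timeline-1",
        &[("0002".to_string(), "layer-x".to_string())],
    );
    let count = refs
        .refcounts("tenant-a/timeline-1")
        .and_then(|m| m.get(&("0002".to_string(), "layer-x".to_string())));
    assert_eq!(count, Some(&1));
}
```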

// As we see shards for a tenant, accumulate knowledge needed for cross-shard GC:
// - Are there any ancestor shards?
// - Are there any refs to ancestor shards' layers?
#[derive(Default)]
struct TenantRefAccumulator {
    shards_seen: HashMap<TenantId, Vec<ShardIndex>>,

    // For each shard that has refs to an ancestor's layers, the set of ancestor layers referred to
    ancestor_ref_shards: AncestorRefs,
}
|
||||
|
||||
impl TenantRefAccumulator {
|
||||
fn update(&mut self, ttid: TenantShardTimelineId, index_part: &IndexPart) {
|
||||
let this_shard_idx = ttid.tenant_shard_id.to_index();
|
||||
(*self
|
||||
.shards_seen
|
||||
.entry(ttid.tenant_shard_id.tenant_id)
|
||||
.or_default())
|
||||
.push(this_shard_idx);
|
||||
|
||||
let mut ancestor_refs = Vec::new();
|
||||
for (layer_name, layer_metadata) in &index_part.layer_metadata {
|
||||
if layer_metadata.shard != this_shard_idx {
|
||||
// This is a reference from this shard to a layer in an ancestor shard: we must track this
|
||||
// as a marker to not GC this layer from the parent.
|
||||
ancestor_refs.push((layer_name.clone(), layer_metadata.clone()));
|
||||
}
|
||||
}
|
||||
|
||||
if !ancestor_refs.is_empty() {
|
||||
tracing::info!(%ttid, "Found {} ancestor refs", ancestor_refs.len());
|
||||
self.ancestor_ref_shards.update(ttid, ancestor_refs);
|
||||
}
|
||||
}
|
||||
|
||||
/// Consume Self and return a vector of ancestor tenant shards that should be GC'd, and map of referenced ancestor layers to preserve
|
||||
async fn into_gc_ancestors(
|
||||
self,
|
||||
controller_client: &control_api::Client,
|
||||
summary: &mut GcSummary,
|
||||
) -> (Vec<TenantShardId>, AncestorRefs) {
|
||||
let mut ancestors_to_gc = Vec::new();
|
||||
for (tenant_id, mut shard_indices) in self.shards_seen {
|
||||
// Find the highest shard count
|
||||
let latest_count = shard_indices
|
||||
.iter()
|
||||
.map(|i| i.shard_count)
|
||||
.max()
|
||||
.expect("Always at least one shard");
|
||||
|
||||
let (mut latest_shards, ancestor_shards) = {
|
||||
let at =
|
||||
itertools::partition(&mut shard_indices, |i| i.shard_count == latest_count);
|
||||
(shard_indices[0..at].to_owned(), &shard_indices[at..])
|
||||
};
|
||||
// Sort shards, as we will later compare them with a sorted list from the controller
|
||||
latest_shards.sort();
|
||||
|
||||
// Check that we have a complete view of the latest shard count: this should always be the case unless we happened
|
||||
// to scan the S3 bucket halfway through a shard split.
|
||||
if latest_shards.len() != latest_count.count() as usize {
|
||||
// This should be extremely rare, so we warn on it.
|
||||
tracing::warn!(%tenant_id, "Missed some shards at count {:?}", latest_count);
|
||||
continue;
|
||||
}
|
||||
|
||||
// Check if we have any non-latest-count shards
|
||||
if ancestor_shards.is_empty() {
|
||||
tracing::debug!(%tenant_id, "No ancestor shards to clean up");
|
||||
continue;
|
||||
}
|
||||
|
||||
// Based on S3 view, this tenant looks like it might have some ancestor shard work to do. We
|
||||
// must only do this work if the tenant is not currently being split: otherwise, it is not safe
|
||||
// to GC ancestors, because if the split fails then the controller will try to attach ancestor
|
||||
// shards again.
|
||||
match controller_client
|
||||
.dispatch::<(), TenantDescribeResponse>(
|
||||
Method::GET,
|
||||
format!("control/v1/tenant/{tenant_id}"),
|
||||
None,
|
||||
)
|
||||
.await
|
||||
{
|
||||
Err(e) => {
|
||||
// We were not able to learn the latest shard split state from the controller, so we will not
|
||||
// do ancestor GC on this tenant.
|
||||
tracing::warn!(%tenant_id, "Failed to query storage controller, will not do ancestor GC: {e}");
|
||||
summary.controller_api_errors += 1;
|
||||
continue;
|
||||
}
|
||||
Ok(desc) => {
|
||||
// We expect to see that the latest shard count matches the one we saw in S3, and that none
|
||||
// of the shards indicate splitting in progress.
|
||||
|
||||
let controller_indices: Vec<ShardIndex> = desc
|
||||
.shards
|
||||
.iter()
|
||||
.map(|s| s.tenant_shard_id.to_index())
|
||||
.collect();
|
||||
if controller_indices != latest_shards {
|
||||
tracing::info!(%tenant_id, "Latest shards seen in S3 ({latest_shards:?}) don't match controller state ({controller_indices:?})");
|
||||
continue;
|
||||
}
|
||||
|
||||
if desc.shards.iter().any(|s| s.is_splitting) {
|
||||
tracing::info!(%tenant_id, "One or more shards is currently splitting");
|
||||
continue;
|
||||
}
|
||||
|
||||
// This shouldn't be too noisy, because we only log this for tenants that have some ancestral refs.
|
||||
tracing::info!(%tenant_id, "Validated state with controller: {desc:?}");
|
||||
}
|
||||
}
|
||||
|
||||
// GC ancestor shards
|
||||
for ancestor_shard in ancestor_shards.iter().map(|idx| TenantShardId {
|
||||
tenant_id,
|
||||
shard_count: idx.shard_count,
|
||||
shard_number: idx.shard_number,
|
||||
}) {
|
||||
ancestors_to_gc.push(ancestor_shard);
|
||||
}
|
||||
}
|
||||
|
||||
(ancestors_to_gc, self.ancestor_ref_shards)
|
||||
}
|
||||
}
|
||||
|
||||
async fn is_old_enough(
|
||||
s3_client: &Client,
|
||||
bucket_config: &BucketConfig,
|
||||
min_age: &Duration,
|
||||
key: &str,
|
||||
summary: &mut GcSummary,
|
||||
) -> bool {
|
||||
// Validation: we will only GC indices & layers after a time threshold (e.g. one week) so that during an incident
|
||||
// it is easier to read old data for analysis, and easier to roll back shard splits without having to un-delete any objects.
|
||||
let age: Duration = match s3_client
|
||||
.head_object()
|
||||
.bucket(&bucket_config.bucket)
|
||||
.key(key)
|
||||
.send()
|
||||
.await
|
||||
{
|
||||
Ok(response) => match response.last_modified {
|
||||
None => {
|
||||
tracing::warn!("Missing last_modified");
|
||||
summary.remote_storage_errors += 1;
|
||||
return false;
|
||||
}
|
||||
Some(last_modified) => match SystemTime::try_from(last_modified).map(|t| t.elapsed()) {
|
||||
Ok(Ok(e)) => e,
|
||||
Err(_) | Ok(Err(_)) => {
|
||||
tracing::warn!("Bad last_modified time: {last_modified:?}");
|
||||
return false;
|
||||
}
|
||||
},
|
||||
},
|
||||
Err(e) => {
|
||||
tracing::warn!("Failed to HEAD {key}: {e}");
|
||||
summary.remote_storage_errors += 1;
|
||||
return false;
|
||||
}
|
||||
};
|
||||
let old_enough = &age > min_age;
|
||||
|
||||
if !old_enough {
|
||||
tracing::info!(
|
||||
"Skipping young object {} < {}",
|
||||
humantime::format_duration(age),
|
||||
humantime::format_duration(*min_age)
|
||||
);
|
||||
}
|
||||
|
||||
old_enough
|
||||
}
|
||||
|
||||
async fn maybe_delete_index(
|
||||
s3_client: &Client,
|
||||
bucket_config: &BucketConfig,
|
||||
@@ -79,45 +329,7 @@ async fn maybe_delete_index(
|
||||
return;
|
||||
}
|
||||
|
||||
// Validation: we will only delete indices after one week, so that during incidents we will have
|
||||
// easy access to recent indices.
|
||||
let age: Duration = match s3_client
|
||||
.head_object()
|
||||
.bucket(&bucket_config.bucket)
|
||||
.key(key)
|
||||
.send()
|
||||
.await
|
||||
{
|
||||
Ok(response) => match response.last_modified {
|
||||
None => {
|
||||
tracing::warn!("Missing last_modified");
|
||||
summary.remote_storage_errors += 1;
|
||||
return;
|
||||
}
|
||||
Some(last_modified) => {
|
||||
let last_modified =
|
||||
UNIX_EPOCH + Duration::from_secs_f64(last_modified.as_secs_f64());
|
||||
match last_modified.elapsed() {
|
||||
Ok(e) => e,
|
||||
Err(_) => {
|
||||
tracing::warn!("Bad last_modified time: {last_modified:?}");
|
||||
return;
|
||||
}
|
||||
}
|
||||
}
|
||||
},
|
||||
Err(e) => {
|
||||
tracing::warn!("Failed to HEAD {key}: {e}");
|
||||
summary.remote_storage_errors += 1;
|
||||
return;
|
||||
}
|
||||
};
|
||||
if &age < min_age {
|
||||
tracing::info!(
|
||||
"Skipping young object {} < {}",
|
||||
age.as_secs_f64(),
|
||||
min_age.as_secs_f64()
|
||||
);
|
||||
if !is_old_enough(s3_client, bucket_config, min_age, key, summary).await {
|
||||
return;
|
||||
}
|
||||
|
||||
@@ -145,6 +357,108 @@ async fn maybe_delete_index(
|
||||
}
|
||||
}
|
||||
|
||||
#[allow(clippy::too_many_arguments)]
|
||||
async fn gc_ancestor(
|
||||
s3_client: &Client,
|
||||
bucket_config: &BucketConfig,
|
||||
root_target: &RootTarget,
|
||||
min_age: &Duration,
|
||||
ancestor: TenantShardId,
|
||||
refs: &AncestorRefs,
|
||||
mode: GcMode,
|
||||
summary: &mut GcSummary,
|
||||
) -> anyhow::Result<()> {
|
||||
// Scan timelines in the ancestor
|
||||
let timelines = stream_tenant_timelines(s3_client, root_target, ancestor).await?;
|
||||
let mut timelines = std::pin::pin!(timelines);
|
||||
|
||||
// Build a list of keys to retain
|
||||
|
||||
while let Some(ttid) = timelines.next().await {
|
||||
let ttid = ttid?;
|
||||
|
||||
let data = list_timeline_blobs(s3_client, ttid, root_target).await?;
|
||||
|
||||
let s3_layers = match data.blob_data {
|
||||
BlobDataParseResult::Parsed {
|
||||
index_part: _,
|
||||
index_part_generation: _,
|
||||
s3_layers,
|
||||
} => s3_layers,
|
||||
BlobDataParseResult::Relic => {
|
||||
// Post-deletion tenant location: don't try and GC it.
|
||||
continue;
|
||||
}
|
||||
BlobDataParseResult::Incorrect(reasons) => {
|
||||
// Our primary purpose isn't to report on bad data, but log this rather than skipping silently
|
||||
tracing::warn!(
|
||||
"Skipping ancestor GC for timeline {ttid}, bad metadata: {reasons:?}"
|
||||
);
|
||||
continue;
|
||||
}
|
||||
};
|
||||
|
||||
let ttid_refs = refs.get_ttid_refcounts(&ttid.as_tenant_timeline_id());
|
||||
let ancestor_shard_index = ttid.tenant_shard_id.to_index();
|
||||
|
||||
for (layer_name, layer_gen) in s3_layers {
|
||||
let ref_count = ttid_refs
|
||||
.and_then(|m| m.get(&(ancestor_shard_index, layer_name.clone())))
|
||||
.copied()
|
||||
.unwrap_or(0);
|
||||
|
||||
if ref_count > 0 {
|
||||
tracing::debug!(%ttid, "Ancestor layer {layer_name} has {ref_count} refs");
|
||||
continue;
|
||||
}
|
||||
|
||||
tracing::info!(%ttid, "Ancestor layer {layer_name} is not referenced");
|
||||
|
||||
// Build the key for the layer we are considering deleting
|
||||
let key = root_target.absolute_key(&remote_layer_path(
|
||||
&ttid.tenant_shard_id.tenant_id,
|
||||
&ttid.timeline_id,
|
||||
ancestor_shard_index,
|
||||
&layer_name,
|
||||
layer_gen,
|
||||
));
|
||||
|
||||
// We apply a time threshold to GCing objects that are un-referenced: this preserves our ability
|
||||
// to roll back a shard split if we have to, by avoiding deleting ancestor layers right away
|
||||
if !is_old_enough(s3_client, bucket_config, min_age, &key, summary).await {
|
||||
continue;
|
||||
}
|
||||
|
||||
if !matches!(mode, GcMode::Full) {
|
||||
tracing::info!("Dry run: would delete key {key}");
|
||||
continue;
|
||||
}
|
||||
|
||||
// All validations passed: erase the object
|
||||
match s3_client
|
||||
.delete_object()
|
||||
.bucket(&bucket_config.bucket)
|
||||
.key(&key)
|
||||
.send()
|
||||
.await
|
||||
{
|
||||
Ok(_) => {
|
||||
tracing::info!("Successfully deleted unreferenced ancestor layer {key}");
|
||||
summary.ancestor_layers_deleted += 1;
|
||||
}
|
||||
Err(e) => {
|
||||
tracing::warn!("Failed to delete layer {key}: {e}");
|
||||
summary.remote_storage_errors += 1;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// TODO: if all the layers are gone, clean up the whole timeline dir (remove index)
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Physical garbage collection: removing unused S3 objects. This is distinct from the garbage collection
|
||||
/// done inside the pageserver, which operates at a higher level (keys, layers). This type of garbage collection
|
||||
/// is about removing:
|
||||
@@ -156,22 +470,26 @@ async fn maybe_delete_index(
|
||||
/// make sure that object listings don't get slowed down by large numbers of garbage objects.
|
||||
pub async fn pageserver_physical_gc(
|
||||
bucket_config: BucketConfig,
|
||||
tenant_ids: Vec<TenantShardId>,
|
||||
controller_client_conf: Option<ControllerClientConfig>,
|
||||
tenant_shard_ids: Vec<TenantShardId>,
|
||||
min_age: Duration,
|
||||
mode: GcMode,
|
||||
) -> anyhow::Result<GcSummary> {
|
||||
let (s3_client, target) = init_remote(bucket_config.clone(), NodeKind::Pageserver).await?;
|
||||
|
||||
let tenants = if tenant_ids.is_empty() {
|
||||
let tenants = if tenant_shard_ids.is_empty() {
|
||||
futures::future::Either::Left(stream_tenants(&s3_client, &target))
|
||||
} else {
|
||||
futures::future::Either::Right(futures::stream::iter(tenant_ids.into_iter().map(Ok)))
|
||||
futures::future::Either::Right(futures::stream::iter(tenant_shard_ids.into_iter().map(Ok)))
|
||||
};
|
||||
|
||||
// How many tenants to process in parallel. We need to be mindful of pageservers
|
||||
// accessing the same per tenant prefixes, so use a lower setting than pageservers.
|
||||
const CONCURRENCY: usize = 32;
|
||||
|
||||
// Accumulate information about each tenant for cross-shard GC step we'll do at the end
|
||||
let accumulator = Arc::new(std::sync::Mutex::new(TenantRefAccumulator::default()));
|
||||
|
||||
// Generate a stream of TenantTimelineId
|
||||
let timelines = tenants.map_ok(|t| stream_tenant_timelines(&s3_client, &target, t));
|
||||
let timelines = timelines.try_buffered(CONCURRENCY);
|
||||
@@ -185,16 +503,17 @@ pub async fn pageserver_physical_gc(
|
||||
target: &RootTarget,
|
||||
mode: GcMode,
|
||||
ttid: TenantShardTimelineId,
|
||||
accumulator: &Arc<std::sync::Mutex<TenantRefAccumulator>>,
|
||||
) -> anyhow::Result<GcSummary> {
|
||||
let mut summary = GcSummary::default();
|
||||
let data = list_timeline_blobs(s3_client, ttid, target).await?;
|
||||
|
||||
let (latest_gen, candidates) = match &data.blob_data {
|
||||
let (index_part, latest_gen, candidates) = match &data.blob_data {
|
||||
BlobDataParseResult::Parsed {
|
||||
index_part: _index_part,
|
||||
index_part,
|
||||
index_part_generation,
|
||||
s3_layers: _s3_layers,
|
||||
} => (*index_part_generation, data.unused_index_keys),
|
||||
} => (index_part, *index_part_generation, data.unused_index_keys),
|
||||
BlobDataParseResult::Relic => {
|
||||
// Post-deletion tenant location: don't try and GC it.
|
||||
return Ok(summary);
|
||||
@@ -206,6 +525,8 @@ pub async fn pageserver_physical_gc(
|
||||
}
|
||||
};
|
||||
|
||||
accumulator.lock().unwrap().update(ttid, index_part);
|
||||
|
||||
for key in candidates {
|
||||
maybe_delete_index(
|
||||
s3_client,
|
||||
@@ -222,17 +543,61 @@ pub async fn pageserver_physical_gc(
|
||||
|
||||
Ok(summary)
|
||||
}
|
||||
let timelines = timelines
|
||||
.map_ok(|ttid| gc_timeline(&s3_client, &bucket_config, &min_age, &target, mode, ttid));
|
||||
let mut timelines = std::pin::pin!(timelines.try_buffered(CONCURRENCY));
|
||||
|
||||
let mut summary = GcSummary::default();
|
||||
|
||||
while let Some(i) = timelines.next().await {
|
||||
let tl_summary = i?;
|
||||
// Drain futures for per-shard GC, populating accumulator as a side effect
|
||||
{
|
||||
let timelines = timelines.map_ok(|ttid| {
|
||||
gc_timeline(
|
||||
&s3_client,
|
||||
&bucket_config,
|
||||
&min_age,
|
||||
&target,
|
||||
mode,
|
||||
ttid,
|
||||
&accumulator,
|
||||
)
|
||||
});
|
||||
let mut timelines = std::pin::pin!(timelines.try_buffered(CONCURRENCY));
|
||||
|
||||
summary.indices_deleted += tl_summary.indices_deleted;
|
||||
summary.remote_storage_errors += tl_summary.remote_storage_errors;
|
||||
while let Some(i) = timelines.next().await {
|
||||
summary.merge(i?);
|
||||
}
|
||||
}
|
||||
|
||||
// Execute cross-shard GC, using the accumulator's full view of all the shards built in the per-shard GC
|
||||
let Some(controller_client) = controller_client_conf.as_ref().map(|c| {
|
||||
let ControllerClientConfig {
|
||||
controller_api,
|
||||
controller_jwt,
|
||||
} = c;
|
||||
control_api::Client::new(controller_api.clone(), Some(controller_jwt.clone()))
|
||||
}) else {
|
||||
tracing::info!("Skipping ancestor layer GC, because no `--controller-api` was specified");
|
||||
return Ok(summary);
|
||||
};
|
||||
|
||||
let (ancestor_shards, ancestor_refs) = Arc::into_inner(accumulator)
|
||||
.unwrap()
|
||||
.into_inner()
|
||||
.unwrap()
|
||||
.into_gc_ancestors(&controller_client, &mut summary)
|
||||
.await;
|
||||
|
||||
for ancestor_shard in ancestor_shards {
|
||||
gc_ancestor(
|
||||
&s3_client,
|
||||
&bucket_config,
|
||||
&target,
|
||||
&min_age,
|
||||
ancestor_shard,
|
||||
&ancestor_refs,
|
||||
mode,
|
||||
&mut summary,
|
||||
)
|
||||
.instrument(info_span!("gc_ancestor", %ancestor_shard))
|
||||
.await?;
|
||||
}
|
||||
|
||||
Ok(summary)
|
||||
|
||||
@@ -146,6 +146,8 @@ PAGESERVER_PER_TENANT_METRICS: Tuple[str, ...] = (
|
||||
"pageserver_smgr_query_seconds_sum",
|
||||
"pageserver_archive_size",
|
||||
"pageserver_pitr_history_size",
|
||||
"pageserver_layer_bytes",
|
||||
"pageserver_layer_count",
|
||||
"pageserver_storage_operations_seconds_count_total",
|
||||
"pageserver_storage_operations_seconds_sum_total",
|
||||
"pageserver_evictions_total",
|
||||
|
||||
@@ -261,3 +261,47 @@ class NeonAPI:
|
||||
if op["status"] in {"scheduling", "running", "cancelling"}:
|
||||
has_running = True
|
||||
time.sleep(0.5)
|
||||
|
||||
|
||||
class NeonApiEndpoint:
|
||||
def __init__(self, neon_api: NeonAPI, pg_version: PgVersion, project_id: Optional[str]):
|
||||
self.neon_api = neon_api
|
||||
if project_id is None:
|
||||
project = neon_api.create_project(pg_version)
|
||||
neon_api.wait_for_operation_to_finish(project["project"]["id"])
|
||||
self.project_id = project["project"]["id"]
|
||||
self.endpoint_id = project["endpoints"][0]["id"]
|
||||
self.connstr = project["connection_uris"][0]["connection_uri"]
|
||||
self.pgbench_env = connection_parameters_to_env(
|
||||
project["connection_uris"][0]["connection_parameters"]
|
||||
)
|
||||
self.is_new = True
|
||||
else:
|
||||
project = neon_api.get_project_details(project_id)
|
||||
if int(project["project"]["pg_version"]) != int(pg_version):
|
||||
raise Exception(
|
||||
f"A project with the provided ID exists, but it's not of the specified version (expected {pg_version}, got {project['project']['pg_version']})"
|
||||
)
|
||||
self.project_id = project_id
|
||||
eps = neon_api.get_endpoints(project_id)["endpoints"]
|
||||
self.endpoint_id = eps[0]["id"]
|
||||
self.connstr = neon_api.get_connection_uri(project_id, endpoint_id=self.endpoint_id)[
|
||||
"uri"
|
||||
]
|
||||
pw = self.connstr.split("@")[0].split(":")[-1]
|
||||
self.pgbench_env = {
|
||||
"PGHOST": eps[0]["host"],
|
||||
"PGDATABASE": "neondb",
|
||||
"PGUSER": "neondb_owner",
|
||||
"PGPASSWORD": pw,
|
||||
}
|
||||
self.is_new = False
|
||||
|
||||
def restart(self):
|
||||
self.neon_api.restart_endpoint(self.project_id, self.endpoint_id)
|
||||
self.neon_api.wait_for_operation_to_finish(self.project_id)
|
||||
|
||||
def get_synthetic_storage_size(self) -> int:
|
||||
return int(
|
||||
self.neon_api.get_project_details(self.project_id)["project"]["synthetic_storage_size"]
|
||||
)
|
||||
|
||||
@@ -31,6 +31,7 @@ import backoff
|
||||
import httpx
|
||||
import jwt
|
||||
import psycopg2
|
||||
import psycopg2.sql
|
||||
import pytest
|
||||
import requests
|
||||
import toml
|
||||
@@ -87,7 +88,7 @@ from fixtures.utils import (
|
||||
)
|
||||
from fixtures.utils import AuxFileStore as AuxFileStore # reexport
|
||||
|
||||
from .neon_api import NeonAPI
|
||||
from .neon_api import NeonAPI, NeonApiEndpoint
|
||||
|
||||
"""
|
||||
This file contains pytest fixtures. A fixture is a test resource that can be
|
||||
@@ -727,8 +728,30 @@ class NeonEnvBuilder:
|
||||
self.repo_dir / "local_fs_remote_storage",
|
||||
)
|
||||
|
||||
if (attachments_json := Path(repo_dir / "attachments.json")).exists():
|
||||
shutil.copyfile(attachments_json, self.repo_dir / attachments_json.name)
|
||||
# restore storage controller (the db is small, don't bother with overlayfs)
|
||||
storcon_db_from_dir = repo_dir / "storage_controller_db"
|
||||
storcon_db_to_dir = self.repo_dir / "storage_controller_db"
|
||||
log.info(f"Copying storage_controller_db from {storcon_db_from_dir} to {storcon_db_to_dir}")
|
||||
assert storcon_db_from_dir.is_dir()
|
||||
assert not storcon_db_to_dir.exists()
|
||||
|
||||
def ignore_postgres_log(path: str, _names):
|
||||
if Path(path) == storcon_db_from_dir:
|
||||
return {"postgres.log"}
|
||||
return set()
|
||||
|
||||
shutil.copytree(storcon_db_from_dir, storcon_db_to_dir, ignore=ignore_postgres_log)
|
||||
assert not (storcon_db_to_dir / "postgres.log").exists()
|
||||
# NB: neon_local rewrites postgresql.conf on each start based on neon_local config. No need to patch it.
|
||||
# However, in this new NeonEnv, the pageservers listen on different ports, and the storage controller
|
||||
# will currently reject re-attach requests from them because the NodeMetadata isn't identical.
|
||||
# So, from_repo_dir patches up the storcon database.
|
||||
patch_script_path = self.repo_dir / "storage_controller_db.startup.sql"
|
||||
assert not patch_script_path.exists()
|
||||
patch_script = ""
|
||||
for ps in self.env.pageservers:
|
||||
patch_script += f"UPDATE nodes SET listen_http_port={ps.service_port.http}, listen_pg_port={ps.service_port.pg} WHERE node_id = '{ps.id}';"
|
||||
patch_script_path.write_text(patch_script)
|
||||
|
||||
# Update the config with info about tenants and timelines
|
||||
with (self.repo_dir / "config").open("r") as f:
|
||||
@@ -974,7 +997,7 @@ class NeonEnvBuilder:
|
||||
|
||||
if self.scrub_on_exit:
|
||||
try:
|
||||
StorageScrubber(self).scan_metadata()
|
||||
self.env.storage_scrubber.scan_metadata()
|
||||
except Exception as e:
|
||||
log.error(f"Error during remote storage scrub: {e}")
|
||||
cleanup_error = e
|
||||
@@ -1135,6 +1158,7 @@ class NeonEnv:
|
||||
"listen_http_addr": f"localhost:{pageserver_port.http}",
|
||||
"pg_auth_type": pg_auth_type,
|
||||
"http_auth_type": http_auth_type,
|
||||
"image_compression": "zstd",
|
||||
}
|
||||
if self.pageserver_virtual_file_io_engine is not None:
|
||||
ps_cfg["virtual_file_io_engine"] = self.pageserver_virtual_file_io_engine
|
||||
@@ -1201,6 +1225,9 @@ class NeonEnv:
|
||||
)
|
||||
cfg["safekeepers"].append(sk_cfg)
|
||||
|
||||
# Scrubber instance for tests that use it, and for use during teardown checks
|
||||
self.storage_scrubber = StorageScrubber(self, log_dir=config.test_output_dir)
|
||||
|
||||
log.info(f"Config: {cfg}")
|
||||
self.neon_cli.init(
|
||||
cfg,
|
||||
@@ -2400,7 +2427,7 @@ class NeonStorageController(MetricsGetter, LogUtils):
|
||||
|
||||
def locate(self, tenant_id: TenantId) -> list[dict[str, Any]]:
|
||||
"""
|
||||
:return: list of {"shard_id": "", "node_id": int, "listen_pg_addr": str, "listen_pg_port": int, "listen_http_addr: str, "listen_http_port: int}
|
||||
:return: list of {"shard_id": "", "node_id": int, "listen_pg_addr": str, "listen_pg_port": int, "listen_http_addr": str, "listen_http_port": int}
|
||||
"""
|
||||
response = self.request(
|
||||
"GET",
|
||||
@@ -2786,8 +2813,8 @@ class NeonPageserver(PgProtocol, LogUtils):
|
||||
)
|
||||
return client.tenant_attach(
|
||||
tenant_id,
|
||||
generation,
|
||||
config,
|
||||
generation=generation,
|
||||
)
|
||||
|
||||
def tenant_detach(self, tenant_id: TenantId):
|
||||
@@ -3158,6 +3185,18 @@ class RemotePostgres(PgProtocol):
|
||||
pass
|
||||
|
||||
|
||||
@pytest.fixture(scope="function")
|
||||
def benchmark_project_pub(neon_api: NeonAPI, pg_version: PgVersion) -> NeonApiEndpoint:
|
||||
project_id = os.getenv("BENCHMARK_PROJECT_ID_PUB")
|
||||
return NeonApiEndpoint(neon_api, pg_version, project_id)
|
||||
|
||||
|
||||
@pytest.fixture(scope="function")
|
||||
def benchmark_project_sub(neon_api: NeonAPI, pg_version: PgVersion) -> NeonApiEndpoint:
|
||||
project_id = os.getenv("BENCHMARK_PROJECT_ID_SUB")
|
||||
return NeonApiEndpoint(neon_api, pg_version, project_id)
|
||||
|
||||
|
||||
@pytest.fixture(scope="function")
|
||||
def remote_pg(
|
||||
test_output_dir: Path, pg_distrib_dir: Path, pg_version: PgVersion
|
||||
@@ -3773,12 +3812,12 @@ class Endpoint(PgProtocol, LogUtils):
|
||||
self.endpoint_id, self.tenant_id, pageserver_id, self.active_safekeepers
|
||||
)
|
||||
|
||||
def respec(self, **kwargs):
|
||||
def respec(self, **kwargs: Any) -> None:
|
||||
"""Update the endpoint.json file used by control_plane."""
|
||||
# Read config
|
||||
config_path = os.path.join(self.endpoint_path(), "endpoint.json")
|
||||
with open(config_path, "r") as f:
|
||||
data_dict = json.load(f)
|
||||
data_dict: dict[str, Any] = json.load(f)
|
||||
|
||||
# Write it back updated
|
||||
with open(config_path, "w") as file:
|
||||
@@ -3786,13 +3825,13 @@ class Endpoint(PgProtocol, LogUtils):
|
||||
json.dump(dict(data_dict, **kwargs), file, indent=4)
|
||||
|
||||
# Please note: Migrations only run if pg_skip_catalog_updates is false
|
||||
def wait_for_migrations(self):
|
||||
def wait_for_migrations(self, num_migrations: int = 10):
|
||||
with self.cursor() as cur:
|
||||
|
||||
def check_migrations_done():
|
||||
cur.execute("SELECT id FROM neon_migration.migration_id")
|
||||
migration_id = cur.fetchall()[0][0]
|
||||
assert migration_id != 0
|
||||
migration_id: int = cur.fetchall()[0][0]
|
||||
assert migration_id >= num_migrations
|
||||
|
||||
wait_until(20, 0.5, check_migrations_done)
|
||||
|
||||
@@ -4042,6 +4081,22 @@ class Safekeeper(LogUtils):
|
||||
self.id = id
|
||||
self.running = running
|
||||
self.logfile = Path(self.data_dir) / f"safekeeper-{id}.log"
|
||||
|
||||
if extra_opts is None:
|
||||
# Testing defaults: enable everything, and set short timeouts so that background
|
||||
# work will happen during short tests.
|
||||
# **Note**: Any test that explicitly sets extra_opts will not get these defaults.
|
||||
extra_opts = [
|
||||
"--enable-offload",
|
||||
"--delete-offloaded-wal",
|
||||
"--partial-backup-timeout",
|
||||
"10s",
|
||||
"--control-file-save-interval",
|
||||
"1s",
|
||||
"--eviction-min-resident",
|
||||
"10s",
|
||||
]
|
||||
|
||||
self.extra_opts = extra_opts
|
||||
|
||||
def start(
|
||||
@@ -4213,9 +4268,9 @@ class Safekeeper(LogUtils):
|
||||
|
||||
|
||||
class StorageScrubber:
|
||||
def __init__(self, env: NeonEnvBuilder, log_dir: Optional[Path] = None):
|
||||
def __init__(self, env: NeonEnv, log_dir: Path):
|
||||
self.env = env
|
||||
self.log_dir = log_dir or env.test_output_dir
|
||||
self.log_dir = log_dir
|
||||
|
||||
def scrubber_cli(self, args: list[str], timeout) -> str:
|
||||
assert isinstance(self.env.pageserver_remote_storage, S3Storage)
|
||||
@@ -4232,11 +4287,14 @@ class StorageScrubber:
|
||||
if s3_storage.endpoint is not None:
|
||||
env.update({"AWS_ENDPOINT_URL": s3_storage.endpoint})
|
||||
|
||||
base_args = [str(self.env.neon_binpath / "storage_scrubber")]
|
||||
base_args = [
|
||||
str(self.env.neon_binpath / "storage_scrubber"),
|
||||
f"--controller-api={self.env.storage_controller_api}",
|
||||
]
|
||||
args = base_args + args
|
||||
|
||||
(output_path, stdout, status_code) = subprocess_capture(
|
||||
self.env.test_output_dir,
|
||||
self.log_dir,
|
||||
args,
|
||||
echo_stderr=True,
|
||||
echo_stdout=True,
|
||||
@@ -4275,7 +4333,10 @@ class StorageScrubber:
|
||||
log.info(f"tenant-snapshot output: {stdout}")
|
||||
|
||||
def pageserver_physical_gc(
|
||||
self, min_age_secs: int, tenant_ids: Optional[list[TenantId]] = None
|
||||
self,
|
||||
min_age_secs: int,
|
||||
tenant_ids: Optional[list[TenantId]] = None,
|
||||
mode: Optional[str] = None,
|
||||
):
|
||||
args = ["pageserver-physical-gc", "--min-age", f"{min_age_secs}s"]
|
||||
|
||||
@@ -4285,6 +4346,9 @@ class StorageScrubber:
|
||||
for tenant_id in tenant_ids:
|
||||
args.extend(["--tenant-id", str(tenant_id)])
|
||||
|
||||
if mode is not None:
|
||||
args.extend(["--mode", mode])
|
||||
|
||||
stdout = self.scrubber_cli(
|
||||
args,
|
||||
timeout=30,
|
||||
|
||||
@@ -117,6 +117,9 @@ class LayerMapInfo:
|
||||
def image_layers(self) -> List[HistoricLayerInfo]:
|
||||
return [x for x in self.historic_layers if x.kind == "Image"]
|
||||
|
||||
def delta_l0_layers(self) -> List[HistoricLayerInfo]:
|
||||
return [x for x in self.historic_layers if x.kind == "Delta" and x.l0]
|
||||
|
||||
def historic_by_name(self) -> Set[str]:
|
||||
return set(x.layer_file_name for x in self.historic_layers)
|
||||
|
||||
@@ -172,6 +175,21 @@ class PageserverHttpClient(requests.Session, MetricsGetter):
|
||||
if auth_token is not None:
|
||||
self.headers["Authorization"] = f"Bearer {auth_token}"
|
||||
|
||||
def without_status_retrying(self) -> PageserverHttpClient:
|
||||
retries = Retry(
|
||||
status=0,
|
||||
connect=5,
|
||||
read=False,
|
||||
backoff_factor=0.2,
|
||||
status_forcelist=[],
|
||||
allowed_methods=None,
|
||||
remove_headers_on_redirect=[],
|
||||
)
|
||||
|
||||
return PageserverHttpClient(
|
||||
self.port, self.is_testing_enabled_or_skip, self.auth_token, retries
|
||||
)
|
||||
|
||||
@property
|
||||
def base_url(self) -> str:
|
||||
return f"http://localhost:{self.port}"
|
||||
@@ -223,8 +241,8 @@ class PageserverHttpClient(requests.Session, MetricsGetter):
|
||||
def tenant_attach(
|
||||
self,
|
||||
tenant_id: Union[TenantId, TenantShardId],
|
||||
generation: int,
|
||||
config: None | Dict[str, Any] = None,
|
||||
generation: Optional[int] = None,
|
||||
):
|
||||
config = config or {}
|
||||
|
||||
@@ -814,17 +832,19 @@ class PageserverHttpClient(requests.Session, MetricsGetter):
|
||||
tenant_id: Union[TenantId, TenantShardId],
|
||||
timeline_id: TimelineId,
|
||||
batch_size: int | None = None,
|
||||
) -> Set[TimelineId]:
|
||||
**kwargs,
|
||||
) -> List[TimelineId]:
|
||||
params = {}
|
||||
if batch_size is not None:
|
||||
params["batch_size"] = batch_size
|
||||
res = self.put(
|
||||
f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}/detach_ancestor",
|
||||
params=params,
|
||||
**kwargs,
|
||||
)
|
||||
self.verbose_error(res)
|
||||
json = res.json()
|
||||
return set(map(TimelineId, json["reparented_timelines"]))
|
||||
return list(map(TimelineId, json["reparented_timelines"]))
|
||||
|
||||
def evict_layer(
|
||||
self, tenant_id: Union[TenantId, TenantShardId], timeline_id: TimelineId, layer_name: str
|
||||
|
||||
@@ -255,11 +255,3 @@ def run_pagebench_benchmark(
|
||||
unit="ms",
|
||||
report=MetricReport.LOWER_IS_BETTER,
|
||||
)
|
||||
|
||||
env.storage_controller.allowed_errors.append(
|
||||
# The test setup swaps NeonEnv instances, hence different
|
||||
# pg instances are used for the storage controller db. This means
|
||||
# the storage controller doesn't know about the nodes mentioned
|
||||
# in attachments.json at start-up.
|
||||
".* Scheduler missing node 1",
|
||||
)
|
||||
|
||||
@@ -2,6 +2,7 @@ from contextlib import closing
|
||||
|
||||
import pytest
|
||||
from fixtures.compare_fixtures import NeonCompare
|
||||
from fixtures.log_helper import log
|
||||
from fixtures.neon_fixtures import wait_for_last_flush_lsn
|
||||
|
||||
|
||||
@@ -56,3 +57,98 @@ def test_compaction(neon_compare: NeonCompare):
|
||||
pageserver_http.timeline_compact(tenant_id, timeline_id)
|
||||
|
||||
neon_compare.report_size()
|
||||
|
||||
|
||||
def test_compaction_l0_memory(neon_compare: NeonCompare):
|
||||
"""
|
||||
Generate a large stack of L0s pending compaction into L1s, and
|
||||
measure the pageserver's peak RSS while doing so
|
||||
"""
|
||||
|
||||
env = neon_compare.env
|
||||
pageserver_http = env.pageserver.http_client()
|
||||
|
||||
tenant_id, timeline_id = env.neon_cli.create_tenant(
|
||||
conf={
|
||||
# Initially disable compaction so that we will build up a stack of L0s
|
||||
"compaction_period": "0s",
|
||||
"gc_period": "0s",
|
||||
}
|
||||
)
|
||||
neon_compare.tenant = tenant_id
|
||||
neon_compare.timeline = timeline_id
|
||||
|
||||
endpoint = env.endpoints.create_start(
|
||||
"main", tenant_id=tenant_id, config_lines=["shared_buffers=512MB"]
|
||||
)
|
||||
|
||||
# Read tenant effective config and assert on checkpoint_distance and compaction_threshold,
|
||||
# as we do want to test with defaults (to be the same as in the field), but this test's workload size makes assumptions about them.
|
||||
#
|
||||
# If these assertions fail, it probably means we changed the default.
|
||||
tenant_conf = pageserver_http.tenant_config(tenant_id)
|
||||
assert tenant_conf.effective_config["checkpoint_distance"] == 256 * 1024 * 1024
|
||||
assert tenant_conf.effective_config["compaction_threshold"] == 10
|
||||
|
||||
# Aim to write about 20 L0s, so that we will hit the limit on how many
|
||||
# to compact at once
|
||||
with closing(endpoint.connect()) as conn:
|
||||
with conn.cursor() as cur:
|
||||
for i in range(200):
|
||||
cur.execute(f"create table tbl{i} (i int, j int);")
|
||||
cur.execute(f"insert into tbl{i} values (generate_series(1, 1000), 0);")
|
||||
for j in range(100):
|
||||
cur.execute(f"update tbl{i} set j = {j};")
|
||||
|
||||
wait_for_last_flush_lsn(env, endpoint, tenant_id, timeline_id)
|
||||
endpoint.stop()
|
||||
|
||||
# Check we have generated the L0 stack we expected
|
||||
layers = pageserver_http.layer_map_info(tenant_id, timeline_id)
|
||||
initial_l0s = len(layers.delta_l0_layers())
|
||||
initial_l0s_size = sum(x.layer_file_size for x in layers.delta_l0_layers())
|
||||
log.info(f"l0s before compaction {initial_l0s} ({initial_l0s_size})")
|
||||
|
||||
def rss_hwm():
|
||||
v = pageserver_http.get_metric_value("libmetrics_maxrss_kb")
|
||||
assert v is not None
|
||||
assert v > 0
|
||||
return v * 1024
|
||||
|
||||
before = rss_hwm()
|
||||
pageserver_http.timeline_compact(tenant_id, timeline_id)
|
||||
after = rss_hwm()
|
||||
|
||||
log.info(f"RSS across compaction: {before} -> {after} (grew {after - before})")
|
||||
|
||||
layers = pageserver_http.layer_map_info(tenant_id, timeline_id)
|
||||
final_l0s_size = sum(x.layer_file_size for x in layers.delta_l0_layers())
|
||||
log.info(f"l0s after compaction {len(layers.delta_l0_layers())} ({final_l0s_size})")
|
||||
|
||||
assert after > before # If we didn't use some memory the test is probably buggy
|
||||
compaction_mapped_rss = after - before
|
||||
|
||||
# During L0 compaction, we require as much memory as the physical size of what we compacted, and then some,
|
||||
# because the key->value mapping in L0s compaction is exhaustive, non-streaming, and does not de-duplicate
|
||||
# repeated references to the same key.
|
||||
#
|
||||
# To be fixed in https://github.com/neondatabase/neon/issues/8184, after which
|
||||
# this memory estimate can be revised far downwards to something that doesn't scale
|
||||
# linearly with the layer sizes.
|
||||
MEMORY_ESTIMATE = (initial_l0s_size - final_l0s_size) * 1.25
|
||||
|
||||
# If we find that compaction is using more memory, this may indicate a regression
|
||||
assert compaction_mapped_rss < MEMORY_ESTIMATE
|
||||
|
||||
# If we find that compaction is using <0.5 the expected memory then:
|
||||
# - maybe we made a big efficiency improvement, in which case update the test
|
||||
# - maybe something is functionally wrong with the test and it's not driving the system as expected
|
||||
assert compaction_mapped_rss > MEMORY_ESTIMATE / 2
|
||||
|
||||
# We should have compacted some but not all of the l0s, based on the limit on how much
|
||||
# l0 to compact in one go
|
||||
assert len(layers.delta_l0_layers()) > 0
|
||||
assert len(layers.delta_l0_layers()) < initial_l0s
|
||||
|
||||
# The pageserver should have logged when it hit the compaction size limit
|
||||
env.pageserver.assert_log_contains(".*hit max delta layer size limit.*")
|
||||
|
||||
@@ -1,7 +1,6 @@
from __future__ import annotations

import time
import traceback
from typing import TYPE_CHECKING

import psycopg2
@@ -10,15 +9,12 @@ import pytest
from fixtures.benchmark_fixture import MetricReport
from fixtures.common_types import Lsn
from fixtures.log_helper import log
from fixtures.neon_api import connection_parameters_to_env
from fixtures.neon_fixtures import AuxFileStore, logical_replication_sync
from fixtures.pg_version import PgVersion

if TYPE_CHECKING:
    from fixtures.benchmark_fixture import NeonBenchmarker
    from fixtures.neon_api import NeonAPI
    from fixtures.neon_api import NeonApiEndpoint
    from fixtures.neon_fixtures import NeonEnv, PgBin
    from fixtures.pg_version import PgVersion


@pytest.mark.parametrize("pageserver_aux_file_policy", [AuxFileStore.V2])
@@ -86,8 +82,8 @@ def measure_logical_replication_lag(sub_cur, pub_cur, timeout_sec=600):
@pytest.mark.timeout(2 * 60 * 60)
def test_subscriber_lag(
pg_bin: PgBin,
neon_api: NeonAPI,
pg_version: PgVersion,
benchmark_project_pub: NeonApiEndpoint,
benchmark_project_sub: NeonApiEndpoint,
zenbenchmark: NeonBenchmarker,
):
"""
@@ -99,125 +95,82 @@ def test_subscriber_lag(
sync_interval_min = 5
pgbench_duration = f"-T{test_duration_min * 60 * 2}"

pub_project = neon_api.create_project(pg_version)
pub_project_id = pub_project["project"]["id"]
neon_api.wait_for_operation_to_finish(pub_project_id)
error_occurred = False
pub_env = benchmark_project_pub.pgbench_env
sub_env = benchmark_project_sub.pgbench_env
pub_connstr = benchmark_project_pub.connstr
sub_connstr = benchmark_project_sub.connstr

pg_bin.run_capture(["pgbench", "-i", "-s100"], env=pub_env)
pg_bin.run_capture(["pgbench", "-i", "-s100"], env=sub_env)

pub_conn = psycopg2.connect(pub_connstr)
sub_conn = psycopg2.connect(sub_connstr)
pub_conn.autocommit = True
sub_conn.autocommit = True
with pub_conn.cursor() as pub_cur, sub_conn.cursor() as sub_cur:
if benchmark_project_pub.is_new:
pub_cur.execute("create publication pub1 for table pgbench_accounts, pgbench_history")

if benchmark_project_sub.is_new:
sub_cur.execute("truncate table pgbench_accounts")
sub_cur.execute("truncate table pgbench_history")

sub_cur.execute(f"create subscription sub1 connection '{pub_connstr}' publication pub1")

initial_sync_lag = measure_logical_replication_lag(sub_cur, pub_cur)
pub_conn.close()
sub_conn.close()

zenbenchmark.record("initial_sync_lag", initial_sync_lag, "s", MetricReport.LOWER_IS_BETTER)

pub_workload = pg_bin.run_nonblocking(
["pgbench", "-c10", pgbench_duration, "-Mprepared"], env=pub_env
)
try:
sub_project = neon_api.create_project(pg_version)
sub_project_id = sub_project["project"]["id"]
sub_endpoint_id = sub_project["endpoints"][0]["id"]
neon_api.wait_for_operation_to_finish(sub_project_id)
sub_workload = pg_bin.run_nonblocking(
["pgbench", "-c10", pgbench_duration, "-S"],
env=sub_env,
)
try:
pub_env = connection_parameters_to_env(
pub_project["connection_uris"][0]["connection_parameters"]
)
sub_env = connection_parameters_to_env(
sub_project["connection_uris"][0]["connection_parameters"]
)
pub_connstr = pub_project["connection_uris"][0]["connection_uri"]
sub_connstr = sub_project["connection_uris"][0]["connection_uri"]
start = time.time()
while time.time() - start < test_duration_min * 60:
time.sleep(sync_interval_min * 60)
check_pgbench_still_running(pub_workload, "pub")
check_pgbench_still_running(sub_workload, "sub")

pg_bin.run_capture(["pgbench", "-i", "-s100"], env=pub_env)
pg_bin.run_capture(["pgbench", "-i", "-s100"], env=sub_env)
with psycopg2.connect(pub_connstr) as pub_conn, psycopg2.connect(
sub_connstr
) as sub_conn:
with pub_conn.cursor() as pub_cur, sub_conn.cursor() as sub_cur:
lag = measure_logical_replication_lag(sub_cur, pub_cur)

pub_conn = psycopg2.connect(pub_connstr)
sub_conn = psycopg2.connect(sub_connstr)
pub_conn.autocommit = True
sub_conn.autocommit = True
with pub_conn.cursor() as pub_cur, sub_conn.cursor() as sub_cur:
sub_cur.execute("truncate table pgbench_accounts")
sub_cur.execute("truncate table pgbench_history")
log.info(f"Replica lagged behind master by {lag} seconds")
zenbenchmark.record("replica_lag", lag, "s", MetricReport.LOWER_IS_BETTER)
sub_workload.terminate()
benchmark_project_sub.restart()

pub_cur.execute(
"create publication pub1 for table pgbench_accounts, pgbench_history"
)
sub_cur.execute(
f"create subscription sub1 connection '{pub_connstr}' publication pub1"
)

initial_sync_lag = measure_logical_replication_lag(sub_cur, pub_cur)
pub_conn.close()
sub_conn.close()

zenbenchmark.record(
"initial_sync_lag", initial_sync_lag, "s", MetricReport.LOWER_IS_BETTER
)

pub_workload = pg_bin.run_nonblocking(
["pgbench", "-c10", pgbench_duration, "-Mprepared"], env=pub_env
)
try:
sub_workload = pg_bin.run_nonblocking(
["pgbench", "-c10", pgbench_duration, "-S"],
env=sub_env,
)
try:
start = time.time()
while time.time() - start < test_duration_min * 60:
time.sleep(sync_interval_min * 60)
check_pgbench_still_running(pub_workload, "pub")
check_pgbench_still_running(sub_workload, "sub")

with psycopg2.connect(pub_connstr) as pub_conn, psycopg2.connect(
sub_connstr
) as sub_conn:
with pub_conn.cursor() as pub_cur, sub_conn.cursor() as sub_cur:
lag = measure_logical_replication_lag(sub_cur, pub_cur)

log.info(f"Replica lagged behind master by {lag} seconds")
zenbenchmark.record("replica_lag", lag, "s", MetricReport.LOWER_IS_BETTER)
sub_workload.terminate()
neon_api.restart_endpoint(
sub_project_id,
sub_endpoint_id,
)
neon_api.wait_for_operation_to_finish(sub_project_id)
sub_workload = pg_bin.run_nonblocking(
["pgbench", "-c10", pgbench_duration, "-S"],
env=sub_env,
)

# Measure storage to make sure replication information isn't bloating storage
sub_storage = neon_api.get_project_details(sub_project_id)["project"][
"synthetic_storage_size"
]
pub_storage = neon_api.get_project_details(pub_project_id)["project"][
"synthetic_storage_size"
]
zenbenchmark.record(
"sub_storage", sub_storage, "B", MetricReport.LOWER_IS_BETTER
)
zenbenchmark.record(
"pub_storage", pub_storage, "B", MetricReport.LOWER_IS_BETTER
)

finally:
sub_workload.terminate()
finally:
pub_workload.terminate()
except Exception as e:
error_occurred = True
log.error(f"Caught exception {e}")
log.error(traceback.format_exc())
# Measure storage to make sure replication information isn't bloating storage
sub_storage = benchmark_project_sub.get_synthetic_storage_size()
pub_storage = benchmark_project_pub.get_synthetic_storage_size()
zenbenchmark.record("sub_storage", sub_storage, "B", MetricReport.LOWER_IS_BETTER)
zenbenchmark.record("pub_storage", pub_storage, "B", MetricReport.LOWER_IS_BETTER)
finally:
if not error_occurred:
neon_api.delete_project(sub_project_id)
except Exception as e:
error_occurred = True
log.error(f"Caught exception {e}")
log.error(traceback.format_exc())
sub_workload.terminate()
finally:
assert not error_occurred
neon_api.delete_project(pub_project_id)
pub_workload.terminate()


@pytest.mark.remote_cluster
@pytest.mark.timeout(2 * 60 * 60)
def test_publisher_restart(
pg_bin: PgBin,
neon_api: NeonAPI,
pg_version: PgVersion,
benchmark_project_pub: NeonApiEndpoint,
benchmark_project_sub: NeonApiEndpoint,
zenbenchmark: NeonBenchmarker,
):
"""
@@ -229,114 +182,70 @@ def test_publisher_restart(
sync_interval_min = 5
pgbench_duration = f"-T{test_duration_min * 60 * 2}"

pub_project = neon_api.create_project(pg_version)
pub_project_id = pub_project["project"]["id"]
pub_endpoint_id = pub_project["endpoints"][0]["id"]
neon_api.wait_for_operation_to_finish(pub_project_id)
error_occurred = False
pub_env = benchmark_project_pub.pgbench_env
sub_env = benchmark_project_sub.pgbench_env
pub_connstr = benchmark_project_pub.connstr
sub_connstr = benchmark_project_sub.connstr

pg_bin.run_capture(["pgbench", "-i", "-s100"], env=pub_env)
pg_bin.run_capture(["pgbench", "-i", "-s100"], env=sub_env)

pub_conn = psycopg2.connect(pub_connstr)
sub_conn = psycopg2.connect(sub_connstr)
pub_conn.autocommit = True
sub_conn.autocommit = True
with pub_conn.cursor() as pub_cur, sub_conn.cursor() as sub_cur:
if benchmark_project_pub.is_new:
pub_cur.execute("create publication pub1 for table pgbench_accounts, pgbench_history")

if benchmark_project_sub.is_new:
sub_cur.execute("truncate table pgbench_accounts")
sub_cur.execute("truncate table pgbench_history")

sub_cur.execute(f"create subscription sub1 connection '{pub_connstr}' publication pub1")

initial_sync_lag = measure_logical_replication_lag(sub_cur, pub_cur)
pub_conn.close()
sub_conn.close()

zenbenchmark.record("initial_sync_lag", initial_sync_lag, "s", MetricReport.LOWER_IS_BETTER)

pub_workload = pg_bin.run_nonblocking(
["pgbench", "-c10", pgbench_duration, "-Mprepared"], env=pub_env
)
try:
sub_project = neon_api.create_project(pg_version)
sub_project_id = sub_project["project"]["id"]
neon_api.wait_for_operation_to_finish(sub_project_id)
sub_workload = pg_bin.run_nonblocking(
["pgbench", "-c10", pgbench_duration, "-S"],
env=sub_env,
)
try:
pub_env = connection_parameters_to_env(
pub_project["connection_uris"][0]["connection_parameters"]
)
sub_env = connection_parameters_to_env(
sub_project["connection_uris"][0]["connection_parameters"]
)
pub_connstr = pub_project["connection_uris"][0]["connection_uri"]
sub_connstr = sub_project["connection_uris"][0]["connection_uri"]
start = time.time()
while time.time() - start < test_duration_min * 60:
time.sleep(sync_interval_min * 60)
check_pgbench_still_running(pub_workload, "pub")
check_pgbench_still_running(sub_workload, "sub")

pg_bin.run_capture(["pgbench", "-i", "-s100"], env=pub_env)
pg_bin.run_capture(["pgbench", "-i", "-s100"], env=sub_env)

pub_conn = psycopg2.connect(pub_connstr)
sub_conn = psycopg2.connect(sub_connstr)
pub_conn.autocommit = True
sub_conn.autocommit = True
with pub_conn.cursor() as pub_cur, sub_conn.cursor() as sub_cur:
sub_cur.execute("truncate table pgbench_accounts")
sub_cur.execute("truncate table pgbench_history")

pub_cur.execute(
"create publication pub1 for table pgbench_accounts, pgbench_history"
)
sub_cur.execute(
f"create subscription sub1 connection '{pub_connstr}' publication pub1"
)

initial_sync_lag = measure_logical_replication_lag(sub_cur, pub_cur)
pub_conn.close()
sub_conn.close()

zenbenchmark.record(
"initial_sync_lag", initial_sync_lag, "s", MetricReport.LOWER_IS_BETTER
)

pub_workload = pg_bin.run_nonblocking(
["pgbench", "-c10", pgbench_duration, "-Mprepared"], env=pub_env
)
try:
sub_workload = pg_bin.run_nonblocking(
["pgbench", "-c10", pgbench_duration, "-S"],
env=sub_env,
)
try:
start = time.time()
while time.time() - start < test_duration_min * 60:
time.sleep(sync_interval_min * 60)
check_pgbench_still_running(pub_workload, "pub")
check_pgbench_still_running(sub_workload, "sub")

pub_workload.terminate()
neon_api.restart_endpoint(
pub_project_id,
pub_endpoint_id,
)
neon_api.wait_for_operation_to_finish(pub_project_id)
pub_workload = pg_bin.run_nonblocking(
["pgbench", "-c10", pgbench_duration, "-Mprepared"],
env=pub_env,
)
with psycopg2.connect(pub_connstr) as pub_conn, psycopg2.connect(
sub_connstr
) as sub_conn:
with pub_conn.cursor() as pub_cur, sub_conn.cursor() as sub_cur:
lag = measure_logical_replication_lag(sub_cur, pub_cur)

log.info(f"Replica lagged behind master by {lag} seconds")
zenbenchmark.record("replica_lag", lag, "s", MetricReport.LOWER_IS_BETTER)

# Measure storage to make sure replication information isn't bloating storage
sub_storage = neon_api.get_project_details(sub_project_id)["project"][
"synthetic_storage_size"
]
pub_storage = neon_api.get_project_details(pub_project_id)["project"][
"synthetic_storage_size"
]
zenbenchmark.record(
"sub_storage", sub_storage, "B", MetricReport.LOWER_IS_BETTER
)
zenbenchmark.record(
"pub_storage", pub_storage, "B", MetricReport.LOWER_IS_BETTER
)

finally:
sub_workload.terminate()
finally:
pub_workload.terminate()
except Exception as e:
error_occurred = True
log.error(f"Caught exception {e}")
log.error(traceback.format_exc())
benchmark_project_pub.restart()
pub_workload = pg_bin.run_nonblocking(
["pgbench", "-c10", pgbench_duration, "-Mprepared"],
env=pub_env,
)
with psycopg2.connect(pub_connstr) as pub_conn, psycopg2.connect(
sub_connstr
) as sub_conn:
with pub_conn.cursor() as pub_cur, sub_conn.cursor() as sub_cur:
lag = measure_logical_replication_lag(sub_cur, pub_cur)

log.info(f"Replica lagged behind master by {lag} seconds")
zenbenchmark.record("replica_lag", lag, "s", MetricReport.LOWER_IS_BETTER)

# Measure storage to make sure replication information isn't bloating storage
sub_storage = benchmark_project_sub.get_synthetic_storage_size()
pub_storage = benchmark_project_pub.get_synthetic_storage_size()
zenbenchmark.record("sub_storage", sub_storage, "B", MetricReport.LOWER_IS_BETTER)
zenbenchmark.record("pub_storage", pub_storage, "B", MetricReport.LOWER_IS_BETTER)
finally:
if not error_occurred:
neon_api.delete_project(sub_project_id)
except Exception as e:
error_occurred = True
log.error(f"Caught exception {e}")
log.error(traceback.format_exc())
sub_workload.terminate()
finally:
assert not error_occurred
neon_api.delete_project(pub_project_id)
pub_workload.terminate()

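Both benchmarks above lean on measure_logical_replication_lag(), which is not part of this hunk. A minimal sketch of how such a helper is typically written (an assumption for illustration, not the repository's exact implementation) could look like the following, reusing the Lsn type imported above:

def measure_logical_replication_lag_sketch(sub_cur, pub_cur, timeout_sec=600):
    # Snapshot the publisher's flushed WAL position, then wait for the subscriber
    # to report that it has replayed past it; return the elapsed seconds.
    start = time.time()
    pub_cur.execute("SELECT pg_current_wal_flush_lsn()")
    pub_lsn = Lsn(pub_cur.fetchall()[0][0])
    while time.time() - start < timeout_sec:
        sub_cur.execute("SELECT latest_end_lsn FROM pg_catalog.pg_stat_subscription")
        rows = sub_cur.fetchall()
        if rows and rows[0][0] is not None and Lsn(rows[0][0]) >= pub_lsn:
            return time.time() - start
        time.sleep(0.5)
    raise TimeoutError(f"Logical replication sync took more than {timeout_sec} sec")
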
@@ -211,7 +211,7 @@ def test_auth_failures(neon_env_builder: NeonEnvBuilder, auth_enabled: bool):
    def check_pageserver(expect_success: bool, **conn_kwargs):
        check_connection(
            env.pageserver,
            f"show {env.initial_tenant}",
            f"pagestream {env.initial_tenant} {env.initial_timeline}",
            expect_success,
            **conn_kwargs,
        )

@@ -65,8 +65,8 @@ def test_branch_and_gc(neon_simple_env: NeonEnv, build_type: str):
            "compaction_period": "1 s",
            "compaction_threshold": "2",
            "image_creation_threshold": "1",
            # set PITR interval to be small, so we can do GC
            "pitr_interval": "1 s",
            # Disable PITR, this test will set an explicit space-based GC limit
            "pitr_interval": "0 s",
        }
    )


@@ -6,7 +6,10 @@ from typing import Optional

import pytest
from fixtures.log_helper import log
from fixtures.neon_fixtures import NeonEnvBuilder, generate_uploads_and_deletions
from fixtures.neon_fixtures import (
    NeonEnvBuilder,
    generate_uploads_and_deletions,
)
from fixtures.pageserver.http import PageserverApiException
from fixtures.utils import wait_until
from fixtures.workload import Workload
@@ -142,6 +145,10 @@ def test_sharding_compaction(
        "image_layer_creation_check_threshold": 0,
    }

    # Disable compression, as we can't estimate the size of layers with compression enabled
    # TODO: implement eager layer cutting during compaction
    neon_env_builder.pageserver_config_override = "image_compression='disabled'"

    neon_env_builder.num_pageservers = 1 if shard_count is None else shard_count
    env = neon_env_builder.init_start(
        initial_tenant_conf=TENANT_CONF,
@@ -320,3 +327,87 @@ def test_pageserver_compaction_circuit_breaker(neon_env_builder: NeonEnvBuilder)
        or 0
    ) == 0
    assert not env.pageserver.log_contains(".*Circuit breaker failure ended.*")


@pytest.mark.parametrize("enabled", [True, False])
def test_image_layer_compression(neon_env_builder: NeonEnvBuilder, enabled: bool):
    tenant_conf = {
        # small checkpointing and compaction targets to ensure we generate many upload operations
        "checkpoint_distance": f"{128 * 1024}",
        "compaction_threshold": "1",
        "compaction_target_size": f"{128 * 1024}",
        # no PITR horizon, we specify the horizon when we request on-demand GC
        "pitr_interval": "0s",
        # disable background compaction and GC. We invoke it manually when we want it to happen.
        "gc_period": "0s",
        "compaction_period": "0s",
        # create image layers as eagerly as possible
        "image_creation_threshold": "1",
        "image_layer_creation_check_threshold": "0",
    }

    # Explicitly enable/disable compression, rather than using the default
    if enabled:
        neon_env_builder.pageserver_config_override = "image_compression='zstd'"
    else:
        neon_env_builder.pageserver_config_override = "image_compression='disabled'"

    env = neon_env_builder.init_start(initial_tenant_conf=tenant_conf)

    tenant_id = env.initial_tenant
    timeline_id = env.initial_timeline

    pageserver = env.pageserver
    ps_http = env.pageserver.http_client()
    with env.endpoints.create_start(
        "main", tenant_id=tenant_id, pageserver_id=pageserver.id
    ) as endpoint:
        endpoint.safe_psql("CREATE TABLE foo (id INTEGER PRIMARY KEY, val text)")
        # Generate around 800k worth of easily compressible data to store
        for v in range(100):
            endpoint.safe_psql(
                f"INSERT INTO foo (id, val) VALUES ({v}, repeat('abcde{v:0>3}', 500))"
            )
        # run compaction to create image layers
        ps_http.timeline_checkpoint(tenant_id, timeline_id, wait_until_uploaded=True)

        layer_map = ps_http.layer_map_info(tenant_id, timeline_id)
        image_layer_count = 0
        delta_layer_count = 0
        for layer in layer_map.historic_layers:
            if layer.kind == "Image":
                image_layer_count += 1
            elif layer.kind == "Delta":
                delta_layer_count += 1
        assert image_layer_count > 0
        assert delta_layer_count > 0

        log.info(f"images: {image_layer_count}, deltas: {delta_layer_count}")

    bytes_in = pageserver.http_client().get_metric_value(
        "pageserver_compression_image_in_bytes_total"
    )
    bytes_out = pageserver.http_client().get_metric_value(
        "pageserver_compression_image_out_bytes_total"
    )
    assert bytes_in is not None
    assert bytes_out is not None
    log.info(f"Compression ratio: {bytes_out/bytes_in} ({bytes_in} in, {bytes_out} out)")

    if enabled:
        # We are writing highly compressible repetitive plain text, expect excellent compression
        EXPECT_RATIO = 0.2
        assert bytes_out / bytes_in < EXPECT_RATIO
    else:
        # Nothing should be compressed if we disabled it.
        assert bytes_out >= bytes_in

    # Destroy the endpoint and create a new one to reset the caches
    with env.endpoints.create_start(
        "main", tenant_id=tenant_id, pageserver_id=pageserver.id
    ) as endpoint:
        for v in range(100):
            res = endpoint.safe_psql(
                f"SELECT count(*) FROM foo WHERE id={v} and val=repeat('abcde{v:0>3}', 500)"
            )
            assert res[0][0] == 1

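For a sense of scale, a worked example of the ratio check above (illustrative numbers, not measured output):

# Assumed figures: ~800 KiB of repetitive text going into the image layer compressor.
bytes_in = 800 * 1024   # uncompressed image data fed to zstd (assumed)
bytes_out = 120 * 1024  # compressed bytes actually written (assumed)
assert bytes_out / bytes_in < 0.2  # 0.15, comfortably under the EXPECT_RATIO asserted above
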