diff --git a/.github/actions/neon-project-create/action.yml b/.github/actions/neon-project-create/action.yml index 16759ad038..d4029bd37c 100644 --- a/.github/actions/neon-project-create/action.yml +++ b/.github/actions/neon-project-create/action.yml @@ -9,8 +9,8 @@ inputs: description: 'Region ID, if not set the project will be created in the default region' default: aws-us-east-2 postgres_version: - description: 'Postgres version; default is 15' - default: '15' + description: 'Postgres version; default is 16' + default: '16' api_host: description: 'Neon API host' default: console-stage.neon.build diff --git a/.github/workflows/benchmarking.yml b/.github/workflows/benchmarking.yml index d038f64f15..c132b5b513 100644 --- a/.github/workflows/benchmarking.yml +++ b/.github/workflows/benchmarking.yml @@ -57,9 +57,10 @@ jobs: bench: if: ${{ github.event.inputs.run_only_pgvector_tests == 'false' || github.event.inputs.run_only_pgvector_tests == null }} strategy: + fail-fast: false matrix: include: - - DEFAULT_PG_VERSION: 14 + - DEFAULT_PG_VERSION: 16 PLATFORM: "neon-staging" region_id: ${{ github.event.inputs.region_id || 'aws-us-east-2' }} provisioner: 'k8s-pod' @@ -146,6 +147,7 @@ jobs: SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }} replication-tests: + if: ${{ github.event.inputs.run_only_pgvector_tests == 'false' || github.event.inputs.run_only_pgvector_tests == null }} env: POSTGRES_DISTRIB_DIR: /tmp/neon/pg_install DEFAULT_PG_VERSION: 14 @@ -190,6 +192,7 @@ jobs: run_in_parallel: false save_perf_report: ${{ env.SAVE_PERF_REPORT }} extra_params: -m remote_cluster --timeout 5400 + pg_version: ${{ env.DEFAULT_PG_VERSION }} env: VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}" PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}" @@ -215,11 +218,14 @@ jobs: # Available platforms: # - neon-captest-new: Freshly created project (1 CU) # - neon-captest-freetier: Use freetier-sized compute (0.25 CU) + # - neonvm-captest-azure-new: Freshly created project (1 CU) in azure region + # - neonvm-captest-azure-freetier: Use freetier-sized compute (0.25 CU) in azure region # - neon-captest-reuse: Reusing existing project # - rds-aurora: Aurora Postgres Serverless v2 with autoscaling from 0.5 to 2 ACUs # - rds-postgres: RDS Postgres db.m5.large instance (2 vCPU, 8 GiB) with gp3 EBS storage env: RUN_AWS_RDS_AND_AURORA: ${{ github.event.inputs.run_AWS_RDS_AND_AURORA || 'false' }} + DEFAULT_REGION_ID: ${{ github.event.inputs.region_id || 'aws-us-east-2' }} runs-on: ubuntu-22.04 outputs: pgbench-compare-matrix: ${{ steps.pgbench-compare-matrix.outputs.matrix }} @@ -230,23 +236,33 @@ jobs: - name: Generate matrix for pgbench benchmark id: pgbench-compare-matrix run: | + region_id_default=${{ env.DEFAULT_REGION_ID }} matrix='{ + "pg_version" : [ + 16 + ], + "region_id" : [ + "'"$region_id_default"'" + ], "platform": [ "neon-captest-new", "neon-captest-reuse", "neonvm-captest-new" ], "db_size": [ "10gb" ], - "include": [{ "platform": "neon-captest-freetier", "db_size": "3gb" }, - { "platform": "neon-captest-new", "db_size": "50gb" }, - { "platform": "neonvm-captest-freetier", "db_size": "3gb" }, - { "platform": "neonvm-captest-new", "db_size": "50gb" }, - { "platform": "neonvm-captest-sharding-reuse", "db_size": "50gb" }] + "include": [{ "pg_version": 16, "region_id": "'"$region_id_default"'", "platform": "neon-captest-freetier", "db_size": "3gb" }, + { "pg_version": 16, "region_id": "'"$region_id_default"'", "platform": "neon-captest-new", "db_size": "50gb" }, + { "pg_version": 16, 
"region_id": "'"$region_id_default"'", "platform": "neonvm-captest-freetier", "db_size": "3gb" }, + { "pg_version": 16, "region_id": "'"$region_id_default"'", "platform": "neonvm-captest-new", "db_size": "50gb" }, + { "pg_version": 16, "region_id": "azure-eastus2", "platform": "neonvm-azure-captest-freetier", "db_size": "3gb" }, + { "pg_version": 16, "region_id": "azure-eastus2", "platform": "neonvm-azure-captest-new", "db_size": "10gb" }, + { "pg_version": 16, "region_id": "azure-eastus2", "platform": "neonvm-azure-captest-new", "db_size": "50gb" }, + { "pg_version": 16, "region_id": "'"$region_id_default"'", "platform": "neonvm-captest-sharding-reuse", "db_size": "50gb" }] }' if [ "$(date +%A)" = "Saturday" ]; then - matrix=$(echo "$matrix" | jq '.include += [{ "platform": "rds-postgres", "db_size": "10gb"}, - { "platform": "rds-aurora", "db_size": "50gb"}]') + matrix=$(echo "$matrix" | jq '.include += [{ "pg_version": 14, "region_id": "'"$region_id_default"'", "platform": "rds-postgres", "db_size": "10gb"}, + { "pg_version": 14, "region_id": "'"$region_id_default"'", "platform": "rds-aurora", "db_size": "50gb"}]') fi echo "matrix=$(echo "$matrix" | jq --compact-output '.')" >> $GITHUB_OUTPUT @@ -298,7 +314,7 @@ jobs: TEST_PG_BENCH_DURATIONS_MATRIX: "60m" TEST_PG_BENCH_SCALES_MATRIX: ${{ matrix.db_size }} POSTGRES_DISTRIB_DIR: /tmp/neon/pg_install - DEFAULT_PG_VERSION: 14 + DEFAULT_PG_VERSION: ${{ matrix.pg_version }} TEST_OUTPUT: /tmp/test_output BUILD_TYPE: remote SAVE_PERF_REPORT: ${{ github.event.inputs.save_perf_report || ( github.ref_name == 'main' ) }} @@ -323,14 +339,14 @@ jobs: prefix: latest - name: Create Neon Project - if: contains(fromJson('["neon-captest-new", "neon-captest-freetier", "neonvm-captest-new", "neonvm-captest-freetier"]'), matrix.platform) + if: contains(fromJson('["neon-captest-new", "neon-captest-freetier", "neonvm-captest-new", "neonvm-captest-freetier", "neonvm-azure-captest-freetier", "neonvm-azure-captest-new"]'), matrix.platform) id: create-neon-project uses: ./.github/actions/neon-project-create with: - region_id: ${{ github.event.inputs.region_id || 'aws-us-east-2' }} + region_id: ${{ matrix.region_id }} postgres_version: ${{ env.DEFAULT_PG_VERSION }} api_key: ${{ secrets.NEON_STAGING_API_KEY }} - compute_units: ${{ (matrix.platform == 'neon-captest-freetier' && '[0.25, 0.25]') || '[1, 1]' }} + compute_units: ${{ (contains(matrix.platform, 'captest-freetier') && '[0.25, 0.25]') || '[1, 1]' }} provisioner: ${{ (contains(matrix.platform, 'neonvm-') && 'k8s-neonvm') || 'k8s-pod' }} - name: Set up Connection String @@ -343,7 +359,7 @@ jobs: neonvm-captest-sharding-reuse) CONNSTR=${{ secrets.BENCHMARK_CAPTEST_SHARDING_CONNSTR }} ;; - neon-captest-new | neon-captest-freetier | neonvm-captest-new | neonvm-captest-freetier) + neon-captest-new | neon-captest-freetier | neonvm-captest-new | neonvm-captest-freetier | neonvm-azure-captest-new | neonvm-azure-captest-freetier) CONNSTR=${{ steps.create-neon-project.outputs.dsn }} ;; rds-aurora) @@ -368,6 +384,7 @@ jobs: run_in_parallel: false save_perf_report: ${{ env.SAVE_PERF_REPORT }} extra_params: -m remote_cluster --timeout 21600 -k test_pgbench_remote_init + pg_version: ${{ env.DEFAULT_PG_VERSION }} env: BENCHMARK_CONNSTR: ${{ steps.set-up-connstr.outputs.connstr }} VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}" @@ -381,6 +398,7 @@ jobs: run_in_parallel: false save_perf_report: ${{ env.SAVE_PERF_REPORT }} extra_params: -m remote_cluster --timeout 21600 -k test_pgbench_remote_simple_update + 
pg_version: ${{ env.DEFAULT_PG_VERSION }} env: BENCHMARK_CONNSTR: ${{ steps.set-up-connstr.outputs.connstr }} VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}" @@ -394,6 +412,7 @@ jobs: run_in_parallel: false save_perf_report: ${{ env.SAVE_PERF_REPORT }} extra_params: -m remote_cluster --timeout 21600 -k test_pgbench_remote_select_only + pg_version: ${{ env.DEFAULT_PG_VERSION }} env: BENCHMARK_CONNSTR: ${{ steps.set-up-connstr.outputs.connstr }} VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}" @@ -420,6 +439,13 @@ jobs: SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }} pgbench-pgvector: + strategy: + fail-fast: false + matrix: + include: + - PLATFORM: "neon-captest-pgvector" + - PLATFORM: "azure-captest-pgvector" + env: TEST_PG_BENCH_DURATIONS_MATRIX: "15m" TEST_PG_BENCH_SCALES_MATRIX: "1" @@ -427,8 +453,9 @@ jobs: DEFAULT_PG_VERSION: 16 TEST_OUTPUT: /tmp/test_output BUILD_TYPE: remote + LD_LIBRARY_PATH: /home/nonroot/pg/usr/lib/x86_64-linux-gnu SAVE_PERF_REPORT: ${{ github.event.inputs.save_perf_report || ( github.ref_name == 'main' ) }} - PLATFORM: "neon-captest-pgvector" + PLATFORM: ${{ matrix.PLATFORM }} runs-on: [ self-hosted, us-east-2, x64 ] container: @@ -438,17 +465,39 @@ jobs: steps: - uses: actions/checkout@v4 - - name: Download Neon artifact - uses: ./.github/actions/download - with: - name: neon-${{ runner.os }}-${{ runner.arch }}-release-artifact - path: /tmp/neon/ - prefix: latest + # until https://github.com/neondatabase/neon/issues/8275 is fixed we temporarily install postgresql-16 + # instead of using Neon artifacts containing pgbench + - name: Install postgresql-16 where pytest expects it + run: | + cd /home/nonroot + wget -q https://apt.postgresql.org/pub/repos/apt/pool/main/p/postgresql-16/libpq5_16.3-1.pgdg110%2B1_amd64.deb + wget -q https://apt.postgresql.org/pub/repos/apt/pool/main/p/postgresql-16/postgresql-client-16_16.3-1.pgdg110%2B1_amd64.deb + wget -q https://apt.postgresql.org/pub/repos/apt/pool/main/p/postgresql-16/postgresql-16_16.3-1.pgdg110%2B1_amd64.deb + dpkg -x libpq5_16.3-1.pgdg110+1_amd64.deb pg + dpkg -x postgresql-client-16_16.3-1.pgdg110+1_amd64.deb pg + dpkg -x postgresql-16_16.3-1.pgdg110+1_amd64.deb pg + mkdir -p /tmp/neon/pg_install/v16/bin + ln -s /home/nonroot/pg/usr/lib/postgresql/16/bin/pgbench /tmp/neon/pg_install/v16/bin/pgbench + ln -s /home/nonroot/pg/usr/lib/postgresql/16/bin/psql /tmp/neon/pg_install/v16/bin/psql + ln -s /home/nonroot/pg/usr/lib/x86_64-linux-gnu /tmp/neon/pg_install/v16/lib + /tmp/neon/pg_install/v16/bin/pgbench --version + /tmp/neon/pg_install/v16/bin/psql --version - name: Set up Connection String id: set-up-connstr run: | - CONNSTR=${{ secrets.BENCHMARK_PGVECTOR_CONNSTR }} + case "${PLATFORM}" in + neon-captest-pgvector) + CONNSTR=${{ secrets.BENCHMARK_PGVECTOR_CONNSTR }} + ;; + azure-captest-pgvector) + CONNSTR=${{ secrets.BENCHMARK_PGVECTOR_CONNSTR_AZURE }} + ;; + *) + echo >&2 "Unknown PLATFORM=${PLATFORM}" + exit 1 + ;; + esac echo "connstr=${CONNSTR}" >> $GITHUB_OUTPUT @@ -460,6 +509,7 @@ jobs: run_in_parallel: false save_perf_report: ${{ env.SAVE_PERF_REPORT }} extra_params: -m remote_cluster --timeout 21600 -k test_pgvector_indexing + pg_version: ${{ env.DEFAULT_PG_VERSION }} env: VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}" PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}" @@ -473,6 +523,7 @@ jobs: run_in_parallel: false save_perf_report: ${{ env.SAVE_PERF_REPORT }} extra_params: -m remote_cluster --timeout 21600 + pg_version: ${{ env.DEFAULT_PG_VERSION }} 
env: BENCHMARK_CONNSTR: ${{ steps.set-up-connstr.outputs.connstr }} VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}" @@ -487,11 +538,10 @@ jobs: uses: slackapi/slack-github-action@v1 with: channel-id: "C033QLM5P7D" # dev-staging-stream - slack-message: "Periodic perf testing neon-captest-pgvector: ${{ job.status }}\n${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}" + slack-message: "Periodic perf testing ${PLATFORM}: ${{ job.status }}\n${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}" env: SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }} - clickbench-compare: # ClichBench DB for rds-aurora and rds-Postgres deployed to the same clusters # we use for performance testing in pgbench-compare. @@ -735,6 +785,7 @@ jobs: run_in_parallel: false save_perf_report: ${{ env.SAVE_PERF_REPORT }} extra_params: -m remote_cluster --timeout 21600 -k test_user_examples + pg_version: ${{ env.DEFAULT_PG_VERSION }} env: VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}" PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}" diff --git a/Cargo.lock b/Cargo.lock index bab0b4dd1f..2505d4d3ed 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1368,6 +1368,7 @@ dependencies = [ "tracing", "url", "utils", + "whoami", "workspace_hack", ] @@ -3233,16 +3234,6 @@ dependencies = [ "winapi", ] -[[package]] -name = "nu-ansi-term" -version = "0.46.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "77a8165726e8236064dbb45459242600304b42a5ea24ee2948e18e023bf7ba84" -dependencies = [ - "overload", - "winapi", -] - [[package]] name = "num" version = "0.4.1" @@ -3538,12 +3529,6 @@ version = "0.5.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "4030760ffd992bef45b0ae3f10ce1aba99e33464c90d14dd7c039884963ddc7a" -[[package]] -name = "overload" -version = "0.1.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b15813163c1d831bf4a13c3610c05c0d03b39feb07f7e09fa234dac9b15aaf39" - [[package]] name = "p256" version = "0.11.1" @@ -4404,6 +4389,7 @@ dependencies = [ "tracing-opentelemetry", "tracing-subscriber", "tracing-utils", + "typed-json", "url", "urlencoding", "utils", @@ -4602,6 +4588,15 @@ dependencies = [ "bitflags 1.3.2", ] +[[package]] +name = "redox_syscall" +version = "0.4.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4722d768eff46b75989dd134e5c353f0d6296e5aaa3132e776cbdb56be7731aa" +dependencies = [ + "bitflags 1.3.2", +] + [[package]] name = "regex" version = "1.10.2" @@ -5811,6 +5806,28 @@ dependencies = [ "workspace_hack", ] +[[package]] +name = "storage_controller_client" +version = "0.1.0" +dependencies = [ + "anyhow", + "async-trait", + "bytes", + "futures", + "pageserver_api", + "pageserver_client", + "postgres", + "reqwest 0.12.4", + "serde", + "thiserror", + "tokio", + "tokio-postgres", + "tokio-stream", + "tokio-util", + "utils", + "workspace_hack", +] + [[package]] name = "storage_scrubber" version = "0.1.0" @@ -5845,6 +5862,7 @@ dependencies = [ "serde", "serde_json", "serde_with", + "storage_controller_client", "thiserror", "tokio", "tokio-postgres", @@ -5874,6 +5892,7 @@ dependencies = [ "reqwest 0.12.4", "serde", "serde_json", + "storage_controller_client", "thiserror", "tokio", "tracing", @@ -6600,7 +6619,6 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "30a651bc37f915e81f087d86e62a18eec5f79550c7faff886f7090b4ea757c77" dependencies = [ "matchers", - 
"nu-ansi-term", "once_cell", "regex", "serde", @@ -6665,6 +6683,16 @@ dependencies = [ "static_assertions", ] +[[package]] +name = "typed-json" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6024a8d0025400b3f6b189366e9aa92012cf9c4fe1cd2620848dd61425c49eed" +dependencies = [ + "serde", + "serde_json", +] + [[package]] name = "typenum" version = "1.16.0" @@ -6961,6 +6989,12 @@ version = "0.11.0+wasi-snapshot-preview1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9c8d87e72b64a3b4db28d11ce29237c246188f4f51057d65a7eab63b7987e423" +[[package]] +name = "wasite" +version = "0.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b8dad83b4f25e74f184f64c43b150b91efe7647395b42289f38e50566d82855b" + [[package]] name = "wasm-bindgen" version = "0.2.92" @@ -7113,6 +7147,17 @@ dependencies = [ "once_cell", ] +[[package]] +name = "whoami" +version = "1.5.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a44ab49fad634e88f55bf8f9bb3abd2f27d7204172a112c7c9987e01c1c94ea9" +dependencies = [ + "redox_syscall 0.4.1", + "wasite", + "web-sys", +] + [[package]] name = "winapi" version = "0.3.9" diff --git a/Cargo.toml b/Cargo.toml index 670e3241d5..615f5472ec 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -13,6 +13,7 @@ members = [ "safekeeper", "storage_broker", "storage_controller", + "storage_controller/client", "storage_scrubber", "workspace_hack", "libs/compute_api", @@ -182,14 +183,16 @@ tower-service = "0.3.2" tracing = "0.1" tracing-error = "0.2.0" tracing-opentelemetry = "0.21.0" -tracing-subscriber = { version = "0.3", default-features = false, features = ["smallvec", "fmt", "tracing-log", "std", "env-filter", "json", "ansi"] } +tracing-subscriber = { version = "0.3", default-features = false, features = ["smallvec", "fmt", "tracing-log", "std", "env-filter", "json"] } twox-hash = { version = "1.6.3", default-features = false } +typed-json = "0.1" url = "2.2" urlencoding = "2.1" uuid = { version = "1.6.1", features = ["v4", "v7", "serde"] } walkdir = "2.3.2" rustls-native-certs = "0.7" x509-parser = "0.15" +whoami = "1.5.1" ## TODO replace this with tracing env_logger = "0.10" @@ -219,6 +222,7 @@ remote_storage = { version = "0.1", path = "./libs/remote_storage/" } safekeeper_api = { version = "0.1", path = "./libs/safekeeper_api" } desim = { version = "0.1", path = "./libs/desim" } storage_broker = { version = "0.1", path = "./storage_broker/" } # Note: main broker code is inside the binary crate, so linking with the library shouldn't be heavy. +storage_controller_client = { path = "./storage_controller/client" } tenant_size_model = { version = "0.1", path = "./libs/tenant_size_model/" } tracing-utils = { version = "0.1", path = "./libs/tracing-utils/" } utils = { version = "0.1", path = "./libs/utils/" } diff --git a/Dockerfile.compute-node b/Dockerfile.compute-node index 7ab685625a..48a52bfc6d 100644 --- a/Dockerfile.compute-node +++ b/Dockerfile.compute-node @@ -311,9 +311,12 @@ RUN wget https://github.com/iCyberon/pg_hashids/archive/refs/tags/v1.2.1.tar.gz FROM build-deps AS rum-pg-build COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ +COPY patches/rum.patch /rum.patch + RUN wget https://github.com/postgrespro/rum/archive/refs/tags/1.3.13.tar.gz -O rum.tar.gz && \ echo "6ab370532c965568df6210bd844ac6ba649f53055e48243525b0b7e5c4d69a7d rum.tar.gz" | sha256sum --check && \ mkdir rum-src && cd rum-src && tar xzf ../rum.tar.gz --strip-components=1 -C . 
&& \ + patch -p1 < /rum.patch && \ make -j $(getconf _NPROCESSORS_ONLN) PG_CONFIG=/usr/local/pgsql/bin/pg_config USE_PGXS=1 && \ make -j $(getconf _NPROCESSORS_ONLN) install PG_CONFIG=/usr/local/pgsql/bin/pg_config USE_PGXS=1 && \ echo 'trusted = true' >> /usr/local/pgsql/share/extension/rum.control diff --git a/compute_tools/src/migration.rs b/compute_tools/src/migration.rs index 61dcf01c84..22ab145eda 100644 --- a/compute_tools/src/migration.rs +++ b/compute_tools/src/migration.rs @@ -9,6 +9,9 @@ pub(crate) struct MigrationRunner<'m> { impl<'m> MigrationRunner<'m> { pub fn new(client: &'m mut Client, migrations: &'m [&'m str]) -> Self { + // The neon_migration.migration_id::id column is a bigint, which is equivalent to an i64 + assert!(migrations.len() + 1 < i64::MAX as usize); + Self { client, migrations } } @@ -22,11 +25,8 @@ impl<'m> MigrationRunner<'m> { Ok(row.get::<&str, i64>("id")) } - fn update_migration_id(&mut self) -> Result<()> { - let setval = format!( - "UPDATE neon_migration.migration_id SET id={}", - self.migrations.len() - ); + fn update_migration_id(&mut self, migration_id: i64) -> Result<()> { + let setval = format!("UPDATE neon_migration.migration_id SET id={}", migration_id); self.client .simple_query(&setval) @@ -57,44 +57,49 @@ impl<'m> MigrationRunner<'m> { pub fn run_migrations(mut self) -> Result<()> { self.prepare_migrations()?; - let mut current_migration: usize = self.get_migration_id()? as usize; - let starting_migration_id = current_migration; - - let query = "BEGIN"; - self.client - .simple_query(query) - .context("run_migrations begin")?; - + let mut current_migration = self.get_migration_id()? as usize; while current_migration < self.migrations.len() { + macro_rules! migration_id { + ($cm:expr) => { + ($cm + 1) as i64 + }; + } + let migration = self.migrations[current_migration]; if migration.starts_with("-- SKIP") { - info!("Skipping migration id={}", current_migration); + info!("Skipping migration id={}", migration_id!(current_migration)); } else { info!( "Running migration id={}:\n{}\n", - current_migration, migration + migration_id!(current_migration), + migration ); + + self.client + .simple_query("BEGIN") + .context("begin migration")?; + self.client.simple_query(migration).with_context(|| { - format!("run_migration current_migration={}", current_migration) + format!( + "run_migrations migration id={}", + migration_id!(current_migration) + ) })?; + + // Migration IDs start at 1 + self.update_migration_id(migration_id!(current_migration))?; + + self.client + .simple_query("COMMIT") + .context("commit migration")?; + + info!("Finished migration id={}", migration_id!(current_migration)); } current_migration += 1; } - self.update_migration_id()?; - - let query = "COMMIT"; - self.client - .simple_query(query) - .context("run_migrations commit")?; - - info!( - "Ran {} migrations", - (self.migrations.len() - starting_migration_id) - ); - Ok(()) } } diff --git a/compute_tools/src/migrations/0000-neon_superuser_bypass_rls.sql b/compute_tools/src/migrations/0001-neon_superuser_bypass_rls.sql similarity index 100% rename from compute_tools/src/migrations/0000-neon_superuser_bypass_rls.sql rename to compute_tools/src/migrations/0001-neon_superuser_bypass_rls.sql diff --git a/compute_tools/src/migrations/0001-alter_roles.sql b/compute_tools/src/migrations/0002-alter_roles.sql similarity index 100% rename from compute_tools/src/migrations/0001-alter_roles.sql rename to compute_tools/src/migrations/0002-alter_roles.sql diff --git 
a/compute_tools/src/migrations/0002-grant_pg_create_subscription_to_neon_superuser.sql b/compute_tools/src/migrations/0003-grant_pg_create_subscription_to_neon_superuser.sql similarity index 100% rename from compute_tools/src/migrations/0002-grant_pg_create_subscription_to_neon_superuser.sql rename to compute_tools/src/migrations/0003-grant_pg_create_subscription_to_neon_superuser.sql diff --git a/compute_tools/src/migrations/0003-grant_pg_monitor_to_neon_superuser.sql b/compute_tools/src/migrations/0004-grant_pg_monitor_to_neon_superuser.sql similarity index 100% rename from compute_tools/src/migrations/0003-grant_pg_monitor_to_neon_superuser.sql rename to compute_tools/src/migrations/0004-grant_pg_monitor_to_neon_superuser.sql diff --git a/compute_tools/src/migrations/0004-grant_all_on_tables_to_neon_superuser.sql b/compute_tools/src/migrations/0005-grant_all_on_tables_to_neon_superuser.sql similarity index 100% rename from compute_tools/src/migrations/0004-grant_all_on_tables_to_neon_superuser.sql rename to compute_tools/src/migrations/0005-grant_all_on_tables_to_neon_superuser.sql diff --git a/compute_tools/src/migrations/0005-grant_all_on_sequences_to_neon_superuser.sql b/compute_tools/src/migrations/0006-grant_all_on_sequences_to_neon_superuser.sql similarity index 100% rename from compute_tools/src/migrations/0005-grant_all_on_sequences_to_neon_superuser.sql rename to compute_tools/src/migrations/0006-grant_all_on_sequences_to_neon_superuser.sql diff --git a/compute_tools/src/migrations/0006-grant_all_on_tables_to_neon_superuser_with_grant_option.sql b/compute_tools/src/migrations/0007-grant_all_on_tables_to_neon_superuser_with_grant_option.sql similarity index 100% rename from compute_tools/src/migrations/0006-grant_all_on_tables_to_neon_superuser_with_grant_option.sql rename to compute_tools/src/migrations/0007-grant_all_on_tables_to_neon_superuser_with_grant_option.sql diff --git a/compute_tools/src/migrations/0007-grant_all_on_sequences_to_neon_superuser_with_grant_option.sql b/compute_tools/src/migrations/0008-grant_all_on_sequences_to_neon_superuser_with_grant_option.sql similarity index 100% rename from compute_tools/src/migrations/0007-grant_all_on_sequences_to_neon_superuser_with_grant_option.sql rename to compute_tools/src/migrations/0008-grant_all_on_sequences_to_neon_superuser_with_grant_option.sql diff --git a/compute_tools/src/migrations/0008-revoke_replication_for_previously_allowed_roles.sql b/compute_tools/src/migrations/0009-revoke_replication_for_previously_allowed_roles.sql similarity index 100% rename from compute_tools/src/migrations/0008-revoke_replication_for_previously_allowed_roles.sql rename to compute_tools/src/migrations/0009-revoke_replication_for_previously_allowed_roles.sql diff --git a/compute_tools/src/migrations/0009-grant_snapshot_synchronization_funcs_to_neon_superuser.sql b/compute_tools/src/migrations/0010-grant_snapshot_synchronization_funcs_to_neon_superuser.sql similarity index 100% rename from compute_tools/src/migrations/0009-grant_snapshot_synchronization_funcs_to_neon_superuser.sql rename to compute_tools/src/migrations/0010-grant_snapshot_synchronization_funcs_to_neon_superuser.sql diff --git a/compute_tools/src/spec.rs b/compute_tools/src/spec.rs index 1d12b88c7c..6a87263821 100644 --- a/compute_tools/src/spec.rs +++ b/compute_tools/src/spec.rs @@ -777,21 +777,21 @@ pub fn handle_migrations(client: &mut Client) -> Result<()> { // Add new migrations in numerical order. 
let migrations = [ - include_str!("./migrations/0000-neon_superuser_bypass_rls.sql"), - include_str!("./migrations/0001-alter_roles.sql"), - include_str!("./migrations/0002-grant_pg_create_subscription_to_neon_superuser.sql"), - include_str!("./migrations/0003-grant_pg_monitor_to_neon_superuser.sql"), - include_str!("./migrations/0004-grant_all_on_tables_to_neon_superuser.sql"), - include_str!("./migrations/0005-grant_all_on_sequences_to_neon_superuser.sql"), + include_str!("./migrations/0001-neon_superuser_bypass_rls.sql"), + include_str!("./migrations/0002-alter_roles.sql"), + include_str!("./migrations/0003-grant_pg_create_subscription_to_neon_superuser.sql"), + include_str!("./migrations/0004-grant_pg_monitor_to_neon_superuser.sql"), + include_str!("./migrations/0005-grant_all_on_tables_to_neon_superuser.sql"), + include_str!("./migrations/0006-grant_all_on_sequences_to_neon_superuser.sql"), include_str!( - "./migrations/0006-grant_all_on_tables_to_neon_superuser_with_grant_option.sql" + "./migrations/0007-grant_all_on_tables_to_neon_superuser_with_grant_option.sql" ), include_str!( - "./migrations/0007-grant_all_on_sequences_to_neon_superuser_with_grant_option.sql" + "./migrations/0008-grant_all_on_sequences_to_neon_superuser_with_grant_option.sql" ), - include_str!("./migrations/0008-revoke_replication_for_previously_allowed_roles.sql"), + include_str!("./migrations/0009-revoke_replication_for_previously_allowed_roles.sql"), include_str!( - "./migrations/0009-grant_snapshot_synchronization_funcs_to_neon_superuser.sql" + "./migrations/0010-grant_snapshot_synchronization_funcs_to_neon_superuser.sql" ), ]; diff --git a/control_plane/Cargo.toml b/control_plane/Cargo.toml index e62f3b8a47..487ac8f047 100644 --- a/control_plane/Cargo.toml +++ b/control_plane/Cargo.toml @@ -40,6 +40,7 @@ safekeeper_api.workspace = true postgres_connection.workspace = true storage_broker.workspace = true utils.workspace = true +whoami.workspace = true compute_api.workspace = true workspace_hack.workspace = true diff --git a/control_plane/src/broker.rs b/control_plane/src/broker.rs index c3cfc140da..c8ac5d8981 100644 --- a/control_plane/src/broker.rs +++ b/control_plane/src/broker.rs @@ -1,9 +1,9 @@ //! Code to manage the storage broker //! -//! In the local test environment, the data for each safekeeper is stored in +//! In the local test environment, the storage broker stores its data directly in //! //! ```text -//! .neon/safekeepers/ +//! .neon //! ``` use std::time::Duration; diff --git a/control_plane/src/pageserver.rs b/control_plane/src/pageserver.rs index 5f2373e95a..e3d1d0e110 100644 --- a/control_plane/src/pageserver.rs +++ b/control_plane/src/pageserver.rs @@ -1,8 +1,10 @@ //! Code to manage pageservers //! -//! In the local test environment, the pageserver stores its data directly in +//! In the local test environment, the data for each pageserver is stored in //! -//! .neon/ +//! ```text +//! .neon/pageserver_ +//! ``` //! 
use std::collections::HashMap; diff --git a/control_plane/src/storage_controller.rs b/control_plane/src/storage_controller.rs index 47103a2e0a..d7aedd711a 100644 --- a/control_plane/src/storage_controller.rs +++ b/control_plane/src/storage_controller.rs @@ -29,7 +29,6 @@ use utils::{ pub struct StorageController { env: LocalEnv, listen: String, - path: Utf8PathBuf, private_key: Option>, public_key: Option, postgres_port: u16, @@ -41,6 +40,8 @@ const COMMAND: &str = "storage_controller"; const STORAGE_CONTROLLER_POSTGRES_VERSION: u32 = 16; +const DB_NAME: &str = "storage_controller"; + #[derive(Serialize, Deserialize)] pub struct AttachHookRequest { pub tenant_shard_id: TenantShardId, @@ -65,10 +66,6 @@ pub struct InspectResponse { impl StorageController { pub fn from_env(env: &LocalEnv) -> Self { - let path = Utf8PathBuf::from_path_buf(env.base_data_dir.clone()) - .unwrap() - .join("attachments.json"); - // Makes no sense to construct this if pageservers aren't going to use it: assume // pageservers have control plane API set let listen_url = env.control_plane_api.clone().unwrap(); @@ -128,7 +125,6 @@ impl StorageController { Self { env: env.clone(), - path, listen, private_key, public_key, @@ -203,7 +199,6 @@ impl StorageController { /// /// Returns the database url pub async fn setup_database(&self) -> anyhow::Result { - const DB_NAME: &str = "storage_controller"; let database_url = format!("postgresql://localhost:{}/{DB_NAME}", self.postgres_port); let pg_bin_dir = self.get_pg_bin_dir().await?; @@ -232,6 +227,30 @@ impl StorageController { Ok(database_url) } + pub async fn connect_to_database( + &self, + ) -> anyhow::Result<( + tokio_postgres::Client, + tokio_postgres::Connection, + )> { + tokio_postgres::Config::new() + .host("localhost") + .port(self.postgres_port) + // The user is the ambient operating system user name. + // That is an impurity which we want to fix in => TODO https://github.com/neondatabase/neon/issues/8400 + // + // Until we get there, use the ambient operating system user name. + // Recent tokio-postgres versions default to this if the user isn't specified. + // But tokio-postgres fork doesn't have this upstream commit: + // https://github.com/sfackler/rust-postgres/commit/cb609be758f3fb5af537f04b584a2ee0cebd5e79 + // => we should rebase our fork => TODO https://github.com/neondatabase/neon/issues/8399 + .user(&whoami::username()) + .dbname(DB_NAME) + .connect(tokio_postgres::NoTls) + .await + .map_err(anyhow::Error::new) + } + pub async fn start(&self, retry_timeout: &Duration) -> anyhow::Result<()> { // Start a vanilla Postgres process used by the storage controller for persistence. let pg_data_path = Utf8PathBuf::from_path_buf(self.env.base_data_dir.clone()) @@ -256,18 +275,21 @@ impl StorageController { if !status.success() { anyhow::bail!("initdb failed with status {status}"); } - - // Write a minimal config file: - // - Specify the port, since this is chosen dynamically - // - Switch off fsync, since we're running on lightweight test environments and when e.g. scale testing - // the storage controller we don't want a slow local disk to interfere with that. - tokio::fs::write( - &pg_data_path.join("postgresql.conf"), - format!("port = {}\nfsync=off\n", self.postgres_port), - ) - .await?; }; + // Write a minimal config file: + // - Specify the port, since this is chosen dynamically + // - Switch off fsync, since we're running on lightweight test environments and when e.g. 
scale testing + // the storage controller we don't want a slow local disk to interfere with that. + // + // NB: it's important that we rewrite this file on each start command so we propagate changes + // from `LocalEnv`'s config file (`.neon/config`). + tokio::fs::write( + &pg_data_path.join("postgresql.conf"), + format!("port = {}\nfsync=off\n", self.postgres_port), + ) + .await?; + println!("Starting storage controller database..."); let db_start_args = [ "-w", @@ -296,11 +318,38 @@ impl StorageController { // Run migrations on every startup, in case something changed. let database_url = self.setup_database().await?; + // We support running a startup SQL script to fiddle with the database before we launch storcon. + // This is used by the test suite. + let startup_script_path = self + .env + .base_data_dir + .join("storage_controller_db.startup.sql"); + let startup_script = match tokio::fs::read_to_string(&startup_script_path).await { + Ok(script) => { + tokio::fs::remove_file(startup_script_path).await?; + script + } + Err(e) => { + if e.kind() == std::io::ErrorKind::NotFound { + // always run some startup script so that this code path doesn't bit rot + "BEGIN; COMMIT;".to_string() + } else { + anyhow::bail!("Failed to read startup script: {e}") + } + } + }; + let (mut client, conn) = self.connect_to_database().await?; + let conn = tokio::spawn(conn); + let tx = client.build_transaction(); + let tx = tx.start().await?; + tx.batch_execute(&startup_script).await?; + tx.commit().await?; + drop(client); + conn.await??; + let mut args = vec![ "-l", &self.listen, - "-p", - self.path.as_ref(), "--dev", "--database-url", &database_url, diff --git a/control_plane/storcon_cli/Cargo.toml b/control_plane/storcon_cli/Cargo.toml index f96f0084b2..be69208d0d 100644 --- a/control_plane/storcon_cli/Cargo.toml +++ b/control_plane/storcon_cli/Cargo.toml @@ -17,6 +17,7 @@ pageserver_client.workspace = true reqwest.workspace = true serde.workspace = true serde_json = { workspace = true, features = ["raw_value"] } +storage_controller_client.workspace = true thiserror.workspace = true tokio.workspace = true tracing.workspace = true diff --git a/control_plane/storcon_cli/src/main.rs b/control_plane/storcon_cli/src/main.rs index 777a717a73..5c1add070a 100644 --- a/control_plane/storcon_cli/src/main.rs +++ b/control_plane/storcon_cli/src/main.rs @@ -14,15 +14,15 @@ use pageserver_api::{ }, shard::{ShardStripeSize, TenantShardId}, }; -use pageserver_client::mgmt_api::{self, ResponseErrorMessageExt}; +use pageserver_client::mgmt_api::{self}; use reqwest::{Method, StatusCode, Url}; -use serde::{de::DeserializeOwned, Serialize}; use utils::id::{NodeId, TenantId}; use pageserver_api::controller_api::{ NodeConfigureRequest, NodeRegisterRequest, NodeSchedulingPolicy, PlacementPolicy, TenantShardMigrateRequest, TenantShardMigrateResponse, }; +use storage_controller_client::control_api::Client; #[derive(Subcommand, Debug)] enum Command { @@ -249,64 +249,6 @@ impl FromStr for NodeAvailabilityArg { } } -struct Client { - base_url: Url, - jwt_token: Option, - client: reqwest::Client, -} - -impl Client { - fn new(base_url: Url, jwt_token: Option) -> Self { - Self { - base_url, - jwt_token, - client: reqwest::ClientBuilder::new() - .build() - .expect("Failed to construct http client"), - } - } - - /// Simple HTTP request wrapper for calling into storage controller - async fn dispatch( - &self, - method: Method, - path: String, - body: Option, - ) -> mgmt_api::Result - where - RQ: Serialize + Sized, - RS: DeserializeOwned + 
Sized, - { - // The configured URL has the /upcall path prefix for pageservers to use: we will strip that out - // for general purpose API access. - let url = Url::from_str(&format!( - "http://{}:{}/{path}", - self.base_url.host_str().unwrap(), - self.base_url.port().unwrap() - )) - .unwrap(); - - let mut builder = self.client.request(method, url); - if let Some(body) = body { - builder = builder.json(&body) - } - if let Some(jwt_token) = &self.jwt_token { - builder = builder.header( - reqwest::header::AUTHORIZATION, - format!("Bearer {jwt_token}"), - ); - } - - let response = builder.send().await.map_err(mgmt_api::Error::ReceiveBody)?; - let response = response.error_from_body().await?; - - response - .json() - .await - .map_err(pageserver_client::mgmt_api::Error::ReceiveBody) - } -} - #[tokio::main] async fn main() -> anyhow::Result<()> { let cli = Cli::parse(); diff --git a/docs/rfcs/034-ancestor-deletion.md b/docs/rfcs/034-ancestor-deletion.md new file mode 100644 index 0000000000..7341d930e2 --- /dev/null +++ b/docs/rfcs/034-ancestor-deletion.md @@ -0,0 +1,252 @@ +# Ancestor Timeline Deletion + +Created on: 2024-02-23 + +Author: John Spray + +# Summary + +When a tenant creates a new timeline that they will treat as their 'main' history, +it is awkward to permanently retain an 'old main' timeline as its ancestor. Currently +this is necessary because it is forbidden to delete a timeline which has descendants. + +A new pageserver API is proposed to 'adopt' data from a parent timeline into +one of its children, such that the link between ancestor and child can be severed, +leaving the parent in a state where it may then be deleted. + +# Motivation + +Retaining parent timelines currently has two costs: + +- Cognitive load on users, who have to remember which is the "real" main timeline. +- Storage capacity cost, as the parent timeline will retain layers up to the + child's timeline point, even if the child fully covers its keyspace with image + layers and will never actually read from the parent. + +# Solution + +A new pageserver API `PUT /v1/tenant/:tenant_id/timeline/:timeline_id/detach_ancestor` +will be added. The `timeline_id` in this URL is that of the _child_ timeline that we +wish to detach from its parent. + +On success, this API will leave the following state: + +- The detached child timeline will no longer have an ancestor, and will contain all + the data needed to service reads without recursing into an ancestor. +- Any other children of the parent whose timeline points were at a lower LSN than + the detached child timeline will be modified to have the child timeline as their + new parent. +- The parent timeline will still exist, but the child will no longer have it as an + ancestor. If this was the last timeline that depended on the parent, then the + parent will become deletable. + +This API's implementation will consist of a series of retryable steps, such that +on failures/timeout it can safely be called again to reach the target state. + +## Example + +### Before + +The user has "rolled back" their project to LSN X, resulting in a "new main" +timeline. The parent "old main" timeline still exists, and they would like +to clean it up. + +They have two other timelines A and B. A is from before the rollback point, +and B is from after the rollback point. 
+ +``` +----"old main" timeline-------X--------------------------------------------> + | | | + |-> child A | | + |-> "new main" timeline | + -> child B + +``` + +### After calling detach ancestor API + +The "new main" timeline is no longer dependent on old main, and neither +is child A, because it had a branch point before X. + +The user may now choose to delete child B and "old main" to get to +a pristine state. Child B is likely to be unwanted since the user +chose to roll back to X, and it branches from after X. However, we +don't assume this in the API; it is up to the user to delete it. + +``` +|----"old main" timeline----------------------------------------------------> + | + | + | + -> child B + +|----"new main" timeline---------> + | + |-> child A + + +``` + +### After removing timelines + +We end up with a totally clean state that leaves no trace that a rollback +ever happened: there is only one root timeline. + +``` +| ----"new main" timeline-----------> + | + |-> child A + + +``` + +## Caveats + +Important things for API users to bear in mind: + +- this API does not delete the parent timeline: you must still do that explicitly. +- if there are other child timelines ahead of the branch point of the detached + child, the parent won't be deletable: you must either delete or detach those + children. +- do _not_ simply loop over all children and detach them all: this can have an + extremely high storage cost. The detach ancestor API is intended for use on a single + timeline to make it the new "main". +- The detach ancestor API should also not be + exposed directly to the user as a button/API, because they might decide + to click it for all the children and thereby generate many copies of the + parent's data -- the detach ancestor API should be used as part + of a high-level "clean up after rollback" feature. + +## `detach_ancestor` API implementation + +Terms used in the following sections: + +- "the child": the timeline whose ID is specified in the detach ancestor API URL, also + called "new main" in the example. +- "the parent": the parent of "the child". Also called "old main" in the example. +- "the branch point": the ancestor_lsn of "the child" + +### Phase 1: write out adopted layers to S3 + +The child will "adopt" layers from the parent, such that its end state contains +all the parent's history as well as its own. + +For all layers in the parent's layer map whose high LSN is below the branch +point, issue S3 CopyObject requests to duplicate them into the child timeline's +prefix. Do not add them to the child's layer map yet. + +For delta layers in the parent's layer map which straddle the branch point, read them +and write out only content up to the branch point into new layer objects. + +This is a long-running operation if the parent has many layers: it should be +implemented in a way that resumes rather than restarting from scratch, if the API +times out and is called again. + +As an optimization, if there are no other timelines that will be adopted into +the child, _and_ the child's image layers already fully cover the branch LSN, +then we may skip adopting layers. + +### Phase 2: update the child's index + +Having written out all needed layers in phase 1, atomically link them all +into the child's IndexPart and upload to S3. This may be done while the +child Timeline is still running. + +### Phase 3: modify timelines ancestry + +Modify the child's ancestor to None, and upload its IndexPart to persist the change. 
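The ancestry updates in this phase can be summarised in a small sketch. The types below are illustrative stand-ins rather than the pageserver's real data structures; the helper only computes which timelines get a new ancestor (the detached child loses its ancestor, and siblings branched strictly below the child's branch point are re-parented onto it, as described next), with each change persisted by an IndexPart upload.

```rust
// Illustrative sketch only: simplified stand-ins for the real pageserver types.
#[derive(Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Debug)]
struct Lsn(u64);

#[derive(Clone, Copy, PartialEq, Eq, Debug)]
struct TimelineId(u128);

#[derive(Clone, Debug)]
struct TimelineMeta {
    id: TimelineId,
    ancestor: Option<TimelineId>,
    ancestor_lsn: Lsn,
}

/// Returns (timeline, new ancestor) pairs to persist: the detached child's
/// ancestor becomes None, and every sibling of the child whose branch point is
/// strictly below the child's branch point is re-parented onto the child.
fn plan_detach(
    child: &TimelineMeta,
    all_timelines: &[TimelineMeta],
) -> Vec<(TimelineId, Option<TimelineId>)> {
    let mut changes = vec![(child.id, None)];
    for t in all_timelines {
        let same_parent = t.ancestor.is_some() && t.ancestor == child.ancestor;
        if t.id != child.id && same_parent && t.ancestor_lsn < child.ancestor_lsn {
            changes.push((t.id, Some(child.id)));
        }
    }
    changes
}
```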
+ +For all timelines which have the same parent as the child, and have a branch +point lower than our branch point, switch their ancestor_timeline to the child, +and upload their IndexPart to persist the change. + +## Alternatives considered + +### Generate full image layer on child, rather than adopting parent deltas + +This would work for the case of a single child, but would prevent re-targeting +other timelines that depended on the parent. If we detached many children this +way, the storage cost would become prohibitive (consider a 1TB database with +100 child timelines: it would cost 100TiB if they all generated their own image layers). + +### Don't rewrite anything: just fake it in the API + +We could add a layer of indirection that let a child "pretend" that it had no +ancestor, when in reality it still had the parent. The pageserver API could +accept deletion of ancestor timelines, and just update child metadata to make +them look like they have no ancestor. + +This would not achieve the desired reduction in storage cost, and may well be more +complex to maintain than simply implementing the API described in this RFC. + +### Avoid copying objects: enable child index to use parent layers directly + +We could teach IndexPart to store a TimelineId for each layer, such that a child +timeline could reference a parent's layers directly, rather than copying them +into the child's prefix. + +This would impose a cost for the normal case of indices that only target the +timeline's own layers, add complexity, and break the useful simplifying +invariant that timelines "own" their own path. If child timelines were +referencing layers from the parent, we would have to ensure that the parent +never runs GC/compaction again, which would make the API less flexible (the +proposal in this RFC enables deletion of the parent but doesn't require it). + +## Performance + +### Adopting layers + +- CopyObject is a relatively cheap operation, but we may need to issue tens of thousands + of such requests: this can take up to tens of seconds and will compete for RemoteStorage + semaphore units with other activity on the pageserver. +- If we are running on a storage backend that doesn't implement CopyObject, then + this part will be much more expensive as we would stream all layer content + through the pageserver. This is no different to issuing a lot + of reads to a timeline that does not have a warm local cache: it will move + a lot of gigabytes, but that shouldn't break anything. +- Generating truncated layers for deltas that straddle the branch point will + require streaming read/write of all the layers in question. + +### Updating timeline ancestry + +The simplest way to update timeline ancestry will probably be to stop and start +all the Timeline objects: this is preferable to the complexity of making their +ancestry mutable at runtime. + +There will be a corresponding "stutter" in the availability of the timelines, +on the order of 10-100ms, which is the time taken to upload their IndexPart, and +restart the Timeline. + +# Interaction with other features + +## Concurrent timeline creation + +If new historic timelines are created using the parent as an ancestor while the +detach ancestor API is running, they will not be re-parented to the child. This +doesn't break anything, but it leaves the parent in a state where it might not +be possible to delete it. 
+ +Since timeline creations are an explicit user action, this is not something we need to +worry about as the storage layer: a user who wants to delete their parent timeline will not create +new children, and if they do, they can choose to delete those children to +enable deleting the parent. + +To avoid surprising the user, the control plane should wait until all branches +are created before starting the detach ancestor operation, and should not +allow any branches to be created before the branch point on the ancestor branch +while the operation is ongoing. + +## WAL based disaster recovery + +WAL based disaster recovery currently supports restoring only the main +branch. Enabling WAL based disaster recovery in the future requires that we +keep a record of which timeline generated the WAL and at which LSN the parent was +detached. Keep a list of timeline ids and the LSN at which they were detached in +the `index_part.json`. Limit the size of the list to the first 100 entries, after +which WAL disaster recovery will not be possible. + +## Sharded tenants + +For sharded tenants, calls to the detach ancestor API will pass through the storage +controller, which will handle them the same as timeline creations: invoke first +on shard zero, and then on all the other shards. diff --git a/docs/storage_controller.md b/docs/storage_controller.md index daf4d0c8b7..6d2ef929a4 100644 --- a/docs/storage_controller.md +++ b/docs/storage_controller.md @@ -44,7 +44,7 @@ If you need to modify the database schema, here’s how to create a migration: - Use `diesel migration generate ` to create a new migration - Populate the SQL files in the `migrations/` subdirectory - Use `DATABASE_URL=... diesel migration run` to apply the migration you just wrote: this will update the `[schema.rs](http://schema.rs)` file automatically. - - This requires a running database: the easiest way to do that is to just run `cargo neon init ; cargo neon start`, which will leave a database available at `postgresql://localhost:1235/attachment_service` + - This requires a running database: the easiest way to do that is to just run `cargo neon init ; cargo neon start`, which will leave a database available at `postgresql://localhost:1235/storage_controller` - Commit the migration files and the changes to schema.rs - If you need to iterate, you can rewind migrations with `diesel migration revert -a` and then `diesel migration run` again. - The migrations are build into the storage controller binary, and automatically run at startup after it is deployed, so once you’ve committed a migration no further steps are needed. 
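As a quick way to confirm the migrations were applied, one can query Diesel's tracking table in that dev database. This is only a sketch: the connection URL comes from the doc above, and `__diesel_schema_migrations` is Diesel's default bookkeeping table, so adjust if your setup differs.

```rust
use tokio_postgres::NoTls;

// Connect to the storage controller's dev database (left running by `cargo neon start`)
// and list the migrations Diesel has recorded as applied.
#[tokio::main]
async fn main() -> Result<(), tokio_postgres::Error> {
    let (client, connection) =
        tokio_postgres::connect("postgresql://localhost:1235/storage_controller", NoTls).await?;
    // The connection object drives the actual socket I/O; run it in the background.
    let _conn_task = tokio::spawn(connection);

    let rows = client
        .query(
            "SELECT version FROM __diesel_schema_migrations ORDER BY version",
            &[],
        )
        .await?;
    for row in rows {
        let version: String = row.get("version");
        println!("applied migration: {version}");
    }
    Ok(())
}
```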
diff --git a/libs/pageserver_api/src/controller_api.rs b/libs/pageserver_api/src/controller_api.rs index f05c1315ea..d0e1eb6b28 100644 --- a/libs/pageserver_api/src/controller_api.rs +++ b/libs/pageserver_api/src/controller_api.rs @@ -87,7 +87,7 @@ pub struct TenantLocateResponse { pub shard_params: ShardParameters, } -#[derive(Serialize, Deserialize)] +#[derive(Serialize, Deserialize, Debug)] pub struct TenantDescribeResponse { pub tenant_id: TenantId, pub shards: Vec, @@ -110,7 +110,7 @@ pub struct NodeDescribeResponse { pub listen_pg_port: u16, } -#[derive(Serialize, Deserialize)] +#[derive(Serialize, Deserialize, Debug)] pub struct TenantDescribeResponseShard { pub tenant_shard_id: TenantShardId, diff --git a/libs/pageserver_api/src/models.rs b/libs/pageserver_api/src/models.rs index 6abdcb88d0..231a604b47 100644 --- a/libs/pageserver_api/src/models.rs +++ b/libs/pageserver_api/src/models.rs @@ -651,6 +651,17 @@ pub struct TenantDetails { pub timelines: Vec, } +#[derive(Serialize, Deserialize, PartialEq, Eq, Clone, Copy, Debug)] +pub enum TimelineArchivalState { + Archived, + Unarchived, +} + +#[derive(Serialize, Deserialize, PartialEq, Eq, Clone)] +pub struct TimelineArchivalConfigRequest { + pub state: TimelineArchivalState, +} + /// This represents the output of the "timeline_detail" and "timeline_list" API calls. #[derive(Debug, Serialize, Deserialize, Clone)] pub struct TimelineInfo { diff --git a/libs/pageserver_api/src/models/detach_ancestor.rs b/libs/pageserver_api/src/models/detach_ancestor.rs index fc1f10e734..ae5a21bab9 100644 --- a/libs/pageserver_api/src/models/detach_ancestor.rs +++ b/libs/pageserver_api/src/models/detach_ancestor.rs @@ -1,6 +1,6 @@ use utils::id::TimelineId; -#[derive(Default, serde::Serialize)] +#[derive(Debug, Default, PartialEq, serde::Serialize, serde::Deserialize)] pub struct AncestorDetached { pub reparented_timelines: Vec, } diff --git a/libs/remote_storage/src/lib.rs b/libs/remote_storage/src/lib.rs index d440c03a0e..3381c4296f 100644 --- a/libs/remote_storage/src/lib.rs +++ b/libs/remote_storage/src/lib.rs @@ -443,7 +443,7 @@ impl GenericRemoteStorage> { } impl GenericRemoteStorage { - pub fn from_config(storage_config: &RemoteStorageConfig) -> anyhow::Result { + pub async fn from_config(storage_config: &RemoteStorageConfig) -> anyhow::Result { let timeout = storage_config.timeout; Ok(match &storage_config.storage { RemoteStorageKind::LocalFs { local_path: path } => { @@ -458,7 +458,7 @@ impl GenericRemoteStorage { std::env::var("AWS_ACCESS_KEY_ID").unwrap_or_else(|_| "".into()); info!("Using s3 bucket '{}' in region '{}' as a remote storage, prefix in bucket: '{:?}', bucket endpoint: '{:?}', profile: {profile}, access_key_id: {access_key_id}", s3_config.bucket_name, s3_config.bucket_region, s3_config.prefix_in_bucket, s3_config.endpoint); - Self::AwsS3(Arc::new(S3Bucket::new(s3_config, timeout)?)) + Self::AwsS3(Arc::new(S3Bucket::new(s3_config, timeout).await?)) } RemoteStorageKind::AzureContainer(azure_config) => { let storage_account = azure_config diff --git a/libs/remote_storage/src/s3_bucket.rs b/libs/remote_storage/src/s3_bucket.rs index ef1bd2c047..b65d8b7e9e 100644 --- a/libs/remote_storage/src/s3_bucket.rs +++ b/libs/remote_storage/src/s3_bucket.rs @@ -16,16 +16,10 @@ use std::{ use anyhow::{anyhow, Context as _}; use aws_config::{ - environment::credentials::EnvironmentVariableCredentialsProvider, - imds::credentials::ImdsCredentialsProvider, - meta::credentials::CredentialsProviderChain, - profile::ProfileFileCredentialsProvider, - 
provider_config::ProviderConfig, + default_provider::credentials::DefaultCredentialsChain, retry::{RetryConfigBuilder, RetryMode}, - web_identity_token::WebIdentityTokenCredentialsProvider, BehaviorVersion, }; -use aws_credential_types::provider::SharedCredentialsProvider; use aws_sdk_s3::{ config::{AsyncSleep, IdentityCache, Region, SharedAsyncSleep}, error::SdkError, @@ -76,40 +70,27 @@ struct GetObjectRequest { } impl S3Bucket { /// Creates the S3 storage, errors if incorrect AWS S3 configuration provided. - pub fn new(remote_storage_config: &S3Config, timeout: Duration) -> anyhow::Result { + pub async fn new(remote_storage_config: &S3Config, timeout: Duration) -> anyhow::Result { tracing::debug!( "Creating s3 remote storage for S3 bucket {}", remote_storage_config.bucket_name ); - let region = Some(Region::new(remote_storage_config.bucket_region.clone())); + let region = Region::new(remote_storage_config.bucket_region.clone()); + let region_opt = Some(region.clone()); - let provider_conf = ProviderConfig::without_region().with_region(region.clone()); - - let credentials_provider = { - // uses "AWS_ACCESS_KEY_ID", "AWS_SECRET_ACCESS_KEY" - CredentialsProviderChain::first_try( - "env", - EnvironmentVariableCredentialsProvider::new(), - ) - // uses "AWS_PROFILE" / `aws sso login --profile ` - .or_else( - "profile-sso", - ProfileFileCredentialsProvider::builder() - .configure(&provider_conf) - .build(), - ) - // uses "AWS_WEB_IDENTITY_TOKEN_FILE", "AWS_ROLE_ARN", "AWS_ROLE_SESSION_NAME" - // needed to access remote extensions bucket - .or_else( - "token", - WebIdentityTokenCredentialsProvider::builder() - .configure(&provider_conf) - .build(), - ) - // uses imds v2 - .or_else("imds", ImdsCredentialsProvider::builder().build()) - }; + // https://docs.aws.amazon.com/sdkref/latest/guide/standardized-credentials.html + // https://docs.rs/aws-config/latest/aws_config/default_provider/credentials/struct.DefaultCredentialsChain.html + // Incomplete list of auth methods used by this: + // * "AWS_ACCESS_KEY_ID", "AWS_SECRET_ACCESS_KEY" + // * "AWS_PROFILE" / `aws sso login --profile ` + // * "AWS_WEB_IDENTITY_TOKEN_FILE", "AWS_ROLE_ARN", "AWS_ROLE_SESSION_NAME" + // * http (ECS/EKS) container credentials + // * imds v2 + let credentials_provider = DefaultCredentialsChain::builder() + .region(region) + .build() + .await; // AWS SDK requires us to specify how the RetryConfig should sleep when it wants to back off let sleep_impl: Arc = Arc::new(TokioSleep::new()); @@ -118,9 +99,9 @@ impl S3Bucket { #[allow(deprecated)] /* TODO: https://github.com/neondatabase/neon/issues/7665 */ BehaviorVersion::v2023_11_09(), ) - .region(region) + .region(region_opt) .identity_cache(IdentityCache::lazy().build()) - .credentials_provider(SharedCredentialsProvider::new(credentials_provider)) + .credentials_provider(credentials_provider) .sleep_impl(SharedAsyncSleep::from(sleep_impl)); let sdk_config: aws_config::SdkConfig = std::thread::scope(|s| { @@ -1041,8 +1022,8 @@ mod tests { use crate::{RemotePath, S3Bucket, S3Config}; - #[test] - fn relative_path() { + #[tokio::test] + async fn relative_path() { let all_paths = ["", "some/path", "some/path/"]; let all_paths: Vec = all_paths .iter() @@ -1085,8 +1066,9 @@ mod tests { max_keys_per_list_response: Some(5), upload_storage_class: None, }; - let storage = - S3Bucket::new(&config, std::time::Duration::ZERO).expect("remote storage init"); + let storage = S3Bucket::new(&config, std::time::Duration::ZERO) + .await + .expect("remote storage init"); for (test_path_idx, 
test_path) in all_paths.iter().enumerate() { let result = storage.relative_path_to_s3_object(test_path); let expected = expected_outputs[prefix_idx][test_path_idx]; diff --git a/libs/remote_storage/tests/test_real_azure.rs b/libs/remote_storage/tests/test_real_azure.rs index 23628dfebe..3a20649490 100644 --- a/libs/remote_storage/tests/test_real_azure.rs +++ b/libs/remote_storage/tests/test_real_azure.rs @@ -31,6 +31,7 @@ struct EnabledAzure { impl EnabledAzure { async fn setup(max_keys_in_list_response: Option) -> Self { let client = create_azure_client(max_keys_in_list_response) + .await .context("Azure client creation") .expect("Azure client creation failed"); @@ -187,7 +188,7 @@ impl AsyncTestContext for MaybeEnabledStorageWithSimpleTestBlobs { } } -fn create_azure_client( +async fn create_azure_client( max_keys_per_list_response: Option, ) -> anyhow::Result> { use rand::Rng; @@ -221,6 +222,8 @@ fn create_azure_client( timeout: Duration::from_secs(120), }; Ok(Arc::new( - GenericRemoteStorage::from_config(&remote_storage_config).context("remote storage init")?, + GenericRemoteStorage::from_config(&remote_storage_config) + .await + .context("remote storage init")?, )) } diff --git a/libs/remote_storage/tests/test_real_s3.rs b/libs/remote_storage/tests/test_real_s3.rs index a273abe867..342bc6da0b 100644 --- a/libs/remote_storage/tests/test_real_s3.rs +++ b/libs/remote_storage/tests/test_real_s3.rs @@ -197,6 +197,7 @@ struct EnabledS3 { impl EnabledS3 { async fn setup(max_keys_in_list_response: Option) -> Self { let client = create_s3_client(max_keys_in_list_response) + .await .context("S3 client creation") .expect("S3 client creation failed"); @@ -352,7 +353,7 @@ impl AsyncTestContext for MaybeEnabledStorageWithSimpleTestBlobs { } } -fn create_s3_client( +async fn create_s3_client( max_keys_per_list_response: Option, ) -> anyhow::Result> { use rand::Rng; @@ -385,7 +386,9 @@ fn create_s3_client( timeout: RemoteStorageConfig::DEFAULT_TIMEOUT, }; Ok(Arc::new( - GenericRemoteStorage::from_config(&remote_storage_config).context("remote storage init")?, + GenericRemoteStorage::from_config(&remote_storage_config) + .await + .context("remote storage init")?, )) } diff --git a/libs/utils/src/auth.rs b/libs/utils/src/auth.rs index 03e65f74fe..a1170a460d 100644 --- a/libs/utils/src/auth.rs +++ b/libs/utils/src/auth.rs @@ -33,6 +33,10 @@ pub enum Scope { GenerationsApi, // Allows access to control plane managment API and some storage controller endpoints. Admin, + + /// Allows access to storage controller APIs used by the scrubber, to interrogate the state + /// of a tenant & post scrub results. + Scrubber, } /// JWT payload. See docs/authentication.md for the format diff --git a/pageserver/client/src/mgmt_api.rs b/pageserver/client/src/mgmt_api.rs index e3ddb446fa..ac3ff1bb89 100644 --- a/pageserver/client/src/mgmt_api.rs +++ b/pageserver/client/src/mgmt_api.rs @@ -1,6 +1,7 @@ use std::collections::HashMap; use bytes::Bytes; +use detach_ancestor::AncestorDetached; use pageserver_api::{models::*, shard::TenantShardId}; use reqwest::{IntoUrl, Method, StatusCode}; use utils::{ @@ -418,6 +419,23 @@ impl Client { } } + pub async fn timeline_detach_ancestor( + &self, + tenant_shard_id: TenantShardId, + timeline_id: TimelineId, + ) -> Result { + let uri = format!( + "{}/v1/tenant/{tenant_shard_id}/timeline/{timeline_id}/detach_ancestor", + self.mgmt_api_endpoint + ); + + self.request(Method::PUT, &uri, ()) + .await? 
+ .json() + .await + .map_err(Error::ReceiveBody) + } + pub async fn tenant_reset(&self, tenant_shard_id: TenantShardId) -> Result<()> { let uri = format!( "{}/v1/tenant/{}/reset", diff --git a/pageserver/ctl/src/main.rs b/pageserver/ctl/src/main.rs index ea09a011e5..3fabf62987 100644 --- a/pageserver/ctl/src/main.rs +++ b/pageserver/ctl/src/main.rs @@ -179,7 +179,7 @@ async fn main() -> anyhow::Result<()> { .get("remote_storage") .expect("need remote_storage"); let config = RemoteStorageConfig::from_toml(toml_item)?; - let storage = remote_storage::GenericRemoteStorage::from_config(&config); + let storage = remote_storage::GenericRemoteStorage::from_config(&config).await; let cancel = CancellationToken::new(); storage .unwrap() diff --git a/pageserver/src/auth.rs b/pageserver/src/auth.rs index 4785c8c4c5..9e3dedb75a 100644 --- a/pageserver/src/auth.rs +++ b/pageserver/src/auth.rs @@ -14,12 +14,14 @@ pub fn check_permission(claims: &Claims, tenant_id: Option) -> Result< } (Scope::PageServerApi, None) => Ok(()), // access to management api for PageServerApi scope (Scope::PageServerApi, Some(_)) => Ok(()), // access to tenant api using PageServerApi scope - (Scope::Admin | Scope::SafekeeperData | Scope::GenerationsApi, _) => Err(AuthError( - format!( - "JWT scope '{:?}' is ineligible for Pageserver auth", - claims.scope - ) - .into(), - )), + (Scope::Admin | Scope::SafekeeperData | Scope::GenerationsApi | Scope::Scrubber, _) => { + Err(AuthError( + format!( + "JWT scope '{:?}' is ineligible for Pageserver auth", + claims.scope + ) + .into(), + )) + } } } diff --git a/pageserver/src/bin/pageserver.rs b/pageserver/src/bin/pageserver.rs index 9f705f0bc9..ec1ceb54ce 100644 --- a/pageserver/src/bin/pageserver.rs +++ b/pageserver/src/bin/pageserver.rs @@ -385,7 +385,7 @@ fn start_pageserver( let shutdown_pageserver = tokio_util::sync::CancellationToken::new(); // Set up remote storage client - let remote_storage = create_remote_storage_client(conf)?; + let remote_storage = BACKGROUND_RUNTIME.block_on(create_remote_storage_client(conf))?; // Set up deletion queue let (deletion_queue, deletion_workers) = DeletionQueue::new( @@ -622,7 +622,6 @@ fn start_pageserver( metric_collection_endpoint, &conf.metric_collection_bucket, conf.metric_collection_interval, - conf.cached_metric_collection_interval, conf.synthetic_size_calculation_interval, conf.id, local_disk_storage, @@ -702,7 +701,7 @@ fn start_pageserver( } } -fn create_remote_storage_client( +async fn create_remote_storage_client( conf: &'static PageServerConf, ) -> anyhow::Result { let config = if let Some(config) = &conf.remote_storage_config { @@ -712,7 +711,7 @@ fn create_remote_storage_client( }; // Create the client - let mut remote_storage = GenericRemoteStorage::from_config(config)?; + let mut remote_storage = GenericRemoteStorage::from_config(config).await?; // If `test_remote_failures` is non-zero, wrap the client with a // wrapper that simulates failures. 
diff --git a/pageserver/src/config.rs b/pageserver/src/config.rs index 5b103b551f..35b4e79365 100644 --- a/pageserver/src/config.rs +++ b/pageserver/src/config.rs @@ -68,7 +68,6 @@ pub mod defaults { super::ConfigurableSemaphore::DEFAULT_INITIAL.get(); pub const DEFAULT_METRIC_COLLECTION_INTERVAL: &str = "10 min"; - pub const DEFAULT_CACHED_METRIC_COLLECTION_INTERVAL: &str = "0s"; pub const DEFAULT_METRIC_COLLECTION_ENDPOINT: Option = None; pub const DEFAULT_SYNTHETIC_SIZE_CALCULATION_INTERVAL: &str = "10 min"; pub const DEFAULT_BACKGROUND_TASK_MAXIMUM_DELAY: &str = "10s"; @@ -123,7 +122,6 @@ pub mod defaults { #concurrent_tenant_warmup = '{DEFAULT_CONCURRENT_TENANT_WARMUP}' #metric_collection_interval = '{DEFAULT_METRIC_COLLECTION_INTERVAL}' -#cached_metric_collection_interval = '{DEFAULT_CACHED_METRIC_COLLECTION_INTERVAL}' #synthetic_size_calculation_interval = '{DEFAULT_SYNTHETIC_SIZE_CALCULATION_INTERVAL}' #disk_usage_based_eviction = {{ max_usage_pct = .., min_avail_bytes = .., period = "10s"}} @@ -238,7 +236,6 @@ pub struct PageServerConf { // How often to collect metrics and send them to the metrics endpoint. pub metric_collection_interval: Duration, // How often to send unchanged cached metrics to the metrics endpoint. - pub cached_metric_collection_interval: Duration, pub metric_collection_endpoint: Option, pub metric_collection_bucket: Option, pub synthetic_size_calculation_interval: Duration, @@ -370,7 +367,6 @@ struct PageServerConfigBuilder { concurrent_tenant_size_logical_size_queries: BuilderValue, metric_collection_interval: BuilderValue, - cached_metric_collection_interval: BuilderValue, metric_collection_endpoint: BuilderValue>, synthetic_size_calculation_interval: BuilderValue, metric_collection_bucket: BuilderValue>, @@ -454,10 +450,6 @@ impl PageServerConfigBuilder { DEFAULT_METRIC_COLLECTION_INTERVAL, ) .expect("cannot parse default metric collection interval")), - cached_metric_collection_interval: Set(humantime::parse_duration( - DEFAULT_CACHED_METRIC_COLLECTION_INTERVAL, - ) - .expect("cannot parse default cached_metric_collection_interval")), synthetic_size_calculation_interval: Set(humantime::parse_duration( DEFAULT_SYNTHETIC_SIZE_CALCULATION_INTERVAL, ) @@ -589,14 +581,6 @@ impl PageServerConfigBuilder { self.metric_collection_interval = BuilderValue::Set(metric_collection_interval) } - pub fn cached_metric_collection_interval( - &mut self, - cached_metric_collection_interval: Duration, - ) { - self.cached_metric_collection_interval = - BuilderValue::Set(cached_metric_collection_interval) - } - pub fn metric_collection_endpoint(&mut self, metric_collection_endpoint: Option) { self.metric_collection_endpoint = BuilderValue::Set(metric_collection_endpoint) } @@ -730,7 +714,6 @@ impl PageServerConfigBuilder { broker_keepalive_interval, log_format, metric_collection_interval, - cached_metric_collection_interval, metric_collection_endpoint, metric_collection_bucket, synthetic_size_calculation_interval, @@ -947,7 +930,6 @@ impl PageServerConf { NonZeroUsize::new(permits).context("initial semaphore permits out of range: 0, use other configuration to disable a feature")? 
}), "metric_collection_interval" => builder.metric_collection_interval(parse_toml_duration(key, item)?), - "cached_metric_collection_interval" => builder.cached_metric_collection_interval(parse_toml_duration(key, item)?), "metric_collection_endpoint" => { let endpoint = parse_toml_string(key, item)?.parse().context("failed to parse metric_collection_endpoint")?; builder.metric_collection_endpoint(Some(endpoint)); @@ -1080,7 +1062,6 @@ impl PageServerConf { eviction_task_immitated_concurrent_logical_size_queries: ConfigurableSemaphore::default( ), metric_collection_interval: Duration::from_secs(60), - cached_metric_collection_interval: Duration::from_secs(60 * 60), metric_collection_endpoint: defaults::DEFAULT_METRIC_COLLECTION_ENDPOINT, metric_collection_bucket: None, synthetic_size_calculation_interval: Duration::from_secs(60), @@ -1259,7 +1240,6 @@ initial_superuser_name = 'zzzz' id = 10 metric_collection_interval = '222 s' -cached_metric_collection_interval = '22200 s' metric_collection_endpoint = 'http://localhost:80/metrics' synthetic_size_calculation_interval = '333 s' @@ -1315,9 +1295,6 @@ background_task_maximum_delay = '334 s' metric_collection_interval: humantime::parse_duration( defaults::DEFAULT_METRIC_COLLECTION_INTERVAL )?, - cached_metric_collection_interval: humantime::parse_duration( - defaults::DEFAULT_CACHED_METRIC_COLLECTION_INTERVAL - )?, metric_collection_endpoint: defaults::DEFAULT_METRIC_COLLECTION_ENDPOINT, metric_collection_bucket: None, synthetic_size_calculation_interval: humantime::parse_duration( @@ -1396,7 +1373,6 @@ background_task_maximum_delay = '334 s' eviction_task_immitated_concurrent_logical_size_queries: ConfigurableSemaphore::default(), metric_collection_interval: Duration::from_secs(222), - cached_metric_collection_interval: Duration::from_secs(22200), metric_collection_endpoint: Some(Url::parse("http://localhost:80/metrics")?), metric_collection_bucket: None, synthetic_size_calculation_interval: Duration::from_secs(333), diff --git a/pageserver/src/consumption_metrics.rs b/pageserver/src/consumption_metrics.rs index 18c1a6cd9b..9104da6072 100644 --- a/pageserver/src/consumption_metrics.rs +++ b/pageserver/src/consumption_metrics.rs @@ -46,19 +46,12 @@ pub async fn collect_metrics( metric_collection_endpoint: &Url, metric_collection_bucket: &Option, metric_collection_interval: Duration, - _cached_metric_collection_interval: Duration, synthetic_size_calculation_interval: Duration, node_id: NodeId, local_disk_storage: Utf8PathBuf, cancel: CancellationToken, ctx: RequestContext, ) -> anyhow::Result<()> { - if _cached_metric_collection_interval != Duration::ZERO { - tracing::warn!( - "cached_metric_collection_interval is no longer used, please set it to zero." 
- ) - } - // spin up background worker that caclulates tenant sizes let worker_ctx = ctx.detached_child(TaskKind::CalculateSyntheticSize, DownloadBehavior::Download); @@ -103,7 +96,7 @@ pub async fn collect_metrics( .expect("Failed to create http client with timeout"); let bucket_client = if let Some(bucket_config) = metric_collection_bucket { - match GenericRemoteStorage::from_config(bucket_config) { + match GenericRemoteStorage::from_config(bucket_config).await { Ok(client) => Some(client), Err(e) => { // Non-fatal error: if we were given an invalid config, we will proceed diff --git a/pageserver/src/deletion_queue.rs b/pageserver/src/deletion_queue.rs index 3e48552ace..22f7d5b824 100644 --- a/pageserver/src/deletion_queue.rs +++ b/pageserver/src/deletion_queue.rs @@ -828,9 +828,9 @@ mod test { } } - fn setup(test_name: &str) -> anyhow::Result { + async fn setup(test_name: &str) -> anyhow::Result { let test_name = Box::leak(Box::new(format!("deletion_queue__{test_name}"))); - let harness = TenantHarness::create(test_name)?; + let harness = TenantHarness::create(test_name).await?; // We do not load() the harness: we only need its config and remote_storage @@ -844,7 +844,9 @@ mod test { }, timeout: RemoteStorageConfig::DEFAULT_TIMEOUT, }; - let storage = GenericRemoteStorage::from_config(&storage_config).unwrap(); + let storage = GenericRemoteStorage::from_config(&storage_config) + .await + .unwrap(); let mock_control_plane = MockControlPlane::new(); @@ -922,7 +924,9 @@ mod test { #[tokio::test] async fn deletion_queue_smoke() -> anyhow::Result<()> { // Basic test that the deletion queue processes the deletions we pass into it - let ctx = setup("deletion_queue_smoke").expect("Failed test setup"); + let ctx = setup("deletion_queue_smoke") + .await + .expect("Failed test setup"); let client = ctx.deletion_queue.new_client(); client.recover(HashMap::new())?; @@ -992,7 +996,9 @@ mod test { #[tokio::test] async fn deletion_queue_validation() -> anyhow::Result<()> { - let ctx = setup("deletion_queue_validation").expect("Failed test setup"); + let ctx = setup("deletion_queue_validation") + .await + .expect("Failed test setup"); let client = ctx.deletion_queue.new_client(); client.recover(HashMap::new())?; @@ -1051,7 +1057,9 @@ mod test { #[tokio::test] async fn deletion_queue_recovery() -> anyhow::Result<()> { // Basic test that the deletion queue processes the deletions we pass into it - let mut ctx = setup("deletion_queue_recovery").expect("Failed test setup"); + let mut ctx = setup("deletion_queue_recovery") + .await + .expect("Failed test setup"); let client = ctx.deletion_queue.new_client(); client.recover(HashMap::new())?; diff --git a/pageserver/src/http/openapi_spec.yml b/pageserver/src/http/openapi_spec.yml index ae109ec1e7..087d281a0c 100644 --- a/pageserver/src/http/openapi_spec.yml +++ b/pageserver/src/http/openapi_spec.yml @@ -377,7 +377,7 @@ paths: schema: $ref: "#/components/schemas/ConflictError" - /v1/tenant/{tenant_id}/{timeline_id}/preserve_initdb_archive: + /v1/tenant/{tenant_id}/timeline/{timeline_id}/preserve_initdb_archive: parameters: - name: tenant_id in: path @@ -397,6 +397,51 @@ paths: "202": description: Tenant scheduled to load successfully + /v1/tenant/{tenant_shard_id}/timeline/{timeline_id}/archival_config: + parameters: + - name: tenant_shard_id + in: path + required: true + schema: + type: string + - name: timeline_id + in: path + required: true + schema: + type: string + put: + description: | + Either archives or unarchives the given timeline. 
+ An archived timeline may not have any non-archived children. + requestBody: + required: false + content: + application/json: + schema: + $ref: "#/components/schemas/ArchivalConfigRequest" + responses: + "200": + description: Timeline (un)archived successfully + "409": + description: | + The tenant/timeline is already being modified, perhaps by a concurrent call to this API + content: + application/json: + schema: + $ref: "#/components/schemas/ConflictError" + "500": + description: Generic operation error + content: + application/json: + schema: + $ref: "#/components/schemas/Error" + "503": + description: Temporarily unavailable, please retry. + content: + application/json: + schema: + $ref: "#/components/schemas/ServiceUnavailableError" + /v1/tenant/{tenant_id}/synthetic_size: parameters: - name: tenant_id @@ -429,7 +474,9 @@ paths: schema: $ref: "#/components/schemas/SyntheticSizeResponse" text/html: - description: SVG representation of the tenant and it's timelines. + schema: + type: string + description: SVG representation of the tenant and its timelines. "401": description: Unauthorized Error content: @@ -568,7 +615,7 @@ paths: type: string - name: timeline_id in: path - ŕequired: true + required: true schema: type: string @@ -774,15 +821,13 @@ components: TenantCreateRequest: allOf: - $ref: '#/components/schemas/TenantConfig' + - $ref: '#/components/schemas/TenantLoadRequest' - type: object required: - new_tenant_id properties: new_tenant_id: type: string - generation: - type: integer - description: Attachment generation number. TenantLoadRequest: type: object properties: @@ -846,6 +891,15 @@ components: warm: type: boolean description: Whether to poll remote storage for layers to download. If false, secondary locations don't download anything. 
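To make the new endpoint concrete, here is a hedged sketch of a client call against it, assuming a reqwest-based client (with the `json` feature enabled), a local pageserver address, and made-up tenant/timeline ids; only the path shape and the `state` values (`Archived`/`Unarchived`) come from the spec above and the `ArchivalConfigRequest` schema that follows.

```rust
use serde_json::json;

#[tokio::main]
async fn main() -> anyhow::Result<()> {
    // Assumed values for illustration; real deployments use their own endpoint and ids.
    let base = "http://localhost:9898";
    let tenant_shard_id = "1f359dd625e519a1a4e8d7509690f6fc";
    let timeline_id = "de200bd42b49cc1814412c7e592dd6e9";
    let url =
        format!("{base}/v1/tenant/{tenant_shard_id}/timeline/{timeline_id}/archival_config");

    // Body matches the ArchivalConfigRequest schema: {"state": "Archived" | "Unarchived"}.
    let resp = reqwest::Client::new()
        .put(url)
        .json(&json!({ "state": "Archived" }))
        .send()
        .await?;
    println!("archival_config returned {}", resp.status());
    Ok(())
}
```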
+ ArchivalConfigRequest: + type: object + required: + - state + properties: + state: + description: The archival state of a timeline + type: string + enum: ["Archived", "Unarchived"] TenantConfig: type: object properties: @@ -1106,7 +1160,7 @@ components: reparented_timelines: type: array description: Set of reparented timeline ids - properties: + items: type: string format: hex description: TimelineId diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs index 6f8f3e6389..b8063eb5a2 100644 --- a/pageserver/src/http/routes.rs +++ b/pageserver/src/http/routes.rs @@ -18,14 +18,17 @@ use hyper::StatusCode; use hyper::{Body, Request, Response, Uri}; use metrics::launch_timestamp::LaunchTimestamp; use pageserver_api::models::AuxFilePolicy; +use pageserver_api::models::DownloadRemoteLayersTaskSpawnRequest; use pageserver_api::models::IngestAuxFilesRequest; use pageserver_api::models::ListAuxFilesRequest; use pageserver_api::models::LocationConfig; use pageserver_api::models::LocationConfigListResponse; +use pageserver_api::models::LocationConfigMode; use pageserver_api::models::LsnLease; use pageserver_api::models::LsnLeaseRequest; use pageserver_api::models::ShardParameters; use pageserver_api::models::TenantDetails; +use pageserver_api::models::TenantLocationConfigRequest; use pageserver_api::models::TenantLocationConfigResponse; use pageserver_api::models::TenantScanRemoteStorageResponse; use pageserver_api::models::TenantScanRemoteStorageShard; @@ -33,12 +36,10 @@ use pageserver_api::models::TenantShardLocation; use pageserver_api::models::TenantShardSplitRequest; use pageserver_api::models::TenantShardSplitResponse; use pageserver_api::models::TenantSorting; +use pageserver_api::models::TimelineArchivalConfigRequest; use pageserver_api::models::TopTenantShardItem; use pageserver_api::models::TopTenantShardsRequest; use pageserver_api::models::TopTenantShardsResponse; -use pageserver_api::models::{ - DownloadRemoteLayersTaskSpawnRequest, LocationConfigMode, TenantLocationConfigRequest, -}; use pageserver_api::shard::ShardCount; use pageserver_api::shard::TenantShardId; use remote_storage::DownloadError; @@ -664,6 +665,39 @@ async fn timeline_preserve_initdb_handler( json_response(StatusCode::OK, ()) } +async fn timeline_archival_config_handler( + mut request: Request, + _cancel: CancellationToken, +) -> Result, ApiError> { + let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?; + let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?; + + let request_data: TimelineArchivalConfigRequest = json_request(&mut request).await?; + check_permission(&request, Some(tenant_shard_id.tenant_id))?; + let state = get_state(&request); + + async { + let tenant = state + .tenant_manager + .get_attached_tenant_shard(tenant_shard_id)?; + + tenant + .apply_timeline_archival_config(timeline_id, request_data.state) + .await + .context("applying archival config") + .map_err(ApiError::InternalServerError)?; + Ok::<_, ApiError>(()) + } + .instrument(info_span!("timeline_archival_config", + tenant_id = %tenant_shard_id.tenant_id, + shard_id = %tenant_shard_id.shard_slug(), + state = ?request_data.state, + %timeline_id)) + .await?; + + json_response(StatusCode::OK, ()) +} + async fn timeline_detail_handler( request: Request, _cancel: CancellationToken, ) -> Result, ApiError> { @@ -1721,7 +1755,9 @@ async fn timeline_detach_ancestor_handler( request: Request, _cancel: CancellationToken, ) -> Result, ApiError> { - use crate::tenant::timeline::detach_ancestor::Options; + 
use crate::tenant::timeline::detach_ancestor; + use pageserver_api::models::detach_ancestor::AncestorDetached; + let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?; check_permission(&request, Some(tenant_shard_id.tenant_id))?; let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?; @@ -1729,7 +1765,7 @@ async fn timeline_detach_ancestor_handler( let span = tracing::info_span!("detach_ancestor", tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(), %timeline_id); async move { - let mut options = Options::default(); + let mut options = detach_ancestor::Options::default(); let rewrite_concurrency = parse_query_param::<_, std::num::NonZeroUsize>(&request, "rewrite_concurrency")?; @@ -1757,27 +1793,36 @@ async fn timeline_detach_ancestor_handler( let timeline = tenant.get_timeline(timeline_id, true)?; - let (_guard, prepared) = timeline + let progress = timeline .prepare_to_detach_from_ancestor(&tenant, options, ctx) .await?; - let res = state - .tenant_manager - .complete_detaching_timeline_ancestor(tenant_shard_id, timeline_id, prepared, ctx) - .await; + // uncomment to allow early as possible Tenant::drop + // drop(tenant); - match res { - Ok(reparented_timelines) => { - let resp = pageserver_api::models::detach_ancestor::AncestorDetached { + let resp = match progress { + detach_ancestor::Progress::Prepared(_guard, prepared) => { + // it would be great to tag the guard on to the tenant activation future + let reparented_timelines = state + .tenant_manager + .complete_detaching_timeline_ancestor( + tenant_shard_id, + timeline_id, + prepared, + ctx, + ) + .await + .context("timeline detach ancestor completion") + .map_err(ApiError::InternalServerError)?; + + AncestorDetached { reparented_timelines, - }; - - json_response(StatusCode::OK, resp) + } } - Err(e) => Err(ApiError::InternalServerError( - e.context("timeline detach completion"), - )), - } + detach_ancestor::Progress::Done(resp) => resp, + }; + + json_response(StatusCode::OK, resp) } .instrument(span) .await @@ -2778,6 +2823,10 @@ pub fn make_router( "/v1/tenant/:tenant_shard_id/timeline/:timeline_id/preserve_initdb_archive", |r| api_handler(r, timeline_preserve_initdb_handler), ) + .post( + "/v1/tenant/:tenant_shard_id/timeline/:timeline_id/archival_config", + |r| api_handler(r, timeline_archival_config_handler), + ) .get("/v1/tenant/:tenant_shard_id/timeline/:timeline_id", |r| { api_handler(r, timeline_detail_handler) }) diff --git a/pageserver/src/metrics.rs b/pageserver/src/metrics.rs index 9b3bb481b9..c03567f6ef 100644 --- a/pageserver/src/metrics.rs +++ b/pageserver/src/metrics.rs @@ -473,6 +473,31 @@ static PITR_HISTORY_SIZE: Lazy = Lazy::new(|| { .expect("failed to define a metric") }); +#[derive(strum_macros::EnumString, strum_macros::Display, strum_macros::IntoStaticStr)] +#[strum(serialize_all = "kebab_case")] +pub(crate) enum MetricLayerKind { + Delta, + Image, +} + +static TIMELINE_LAYER_SIZE: Lazy = Lazy::new(|| { + register_uint_gauge_vec!( + "pageserver_layer_bytes", + "Sum of layer physical sizes in bytes", + &["tenant_id", "shard_id", "timeline_id", "kind"] + ) + .expect("failed to define a metric") +}); + +static TIMELINE_LAYER_COUNT: Lazy = Lazy::new(|| { + register_uint_gauge_vec!( + "pageserver_layer_count", + "Number of layers that exist", + &["tenant_id", "shard_id", "timeline_id", "kind"] + ) + .expect("failed to define a metric") +}); + static TIMELINE_ARCHIVE_SIZE: Lazy = Lazy::new(|| { register_uint_gauge_vec!( 
"pageserver_archive_size", @@ -585,6 +610,22 @@ pub(crate) static CIRCUIT_BREAKERS_UNBROKEN: Lazy = Lazy::new(|| { .expect("failed to define a metric") }); +pub(crate) static COMPRESSION_IMAGE_INPUT_BYTES: Lazy = Lazy::new(|| { + register_int_counter!( + "pageserver_compression_image_in_bytes_total", + "Size of uncompressed data written into image layers" + ) + .expect("failed to define a metric") +}); + +pub(crate) static COMPRESSION_IMAGE_OUTPUT_BYTES: Lazy = Lazy::new(|| { + register_int_counter!( + "pageserver_compression_image_out_bytes_total", + "Size of compressed image layer written" + ) + .expect("failed to define a metric") +}); + pub(crate) mod initial_logical_size { use metrics::{register_int_counter, register_int_counter_vec, IntCounter, IntCounterVec}; use once_cell::sync::Lazy; @@ -1490,7 +1531,6 @@ pub(crate) enum ComputeCommandKind { Basebackup, Fullbackup, LeaseLsn, - Show, } pub(crate) struct ComputeCommandCounters { @@ -2142,6 +2182,10 @@ pub(crate) struct TimelineMetrics { pub last_record_gauge: IntGauge, pub pitr_history_size: UIntGauge, pub archival_size: UIntGauge, + pub(crate) layer_size_image: UIntGauge, + pub(crate) layer_count_image: UIntGauge, + pub(crate) layer_size_delta: UIntGauge, + pub(crate) layer_count_delta: UIntGauge, pub standby_horizon_gauge: IntGauge, pub resident_physical_size_gauge: UIntGauge, /// copy of LayeredTimeline.current_logical_size @@ -2224,6 +2268,42 @@ impl TimelineMetrics { .get_metric_with_label_values(&[&tenant_id, &shard_id, &timeline_id]) .unwrap(); + let layer_size_image = TIMELINE_LAYER_SIZE + .get_metric_with_label_values(&[ + &tenant_id, + &shard_id, + &timeline_id, + MetricLayerKind::Image.into(), + ]) + .unwrap(); + + let layer_count_image = TIMELINE_LAYER_COUNT + .get_metric_with_label_values(&[ + &tenant_id, + &shard_id, + &timeline_id, + MetricLayerKind::Image.into(), + ]) + .unwrap(); + + let layer_size_delta = TIMELINE_LAYER_SIZE + .get_metric_with_label_values(&[ + &tenant_id, + &shard_id, + &timeline_id, + MetricLayerKind::Delta.into(), + ]) + .unwrap(); + + let layer_count_delta = TIMELINE_LAYER_COUNT + .get_metric_with_label_values(&[ + &tenant_id, + &shard_id, + &timeline_id, + MetricLayerKind::Delta.into(), + ]) + .unwrap(); + let standby_horizon_gauge = STANDBY_HORIZON .get_metric_with_label_values(&[&tenant_id, &shard_id, &timeline_id]) .unwrap(); @@ -2278,6 +2358,10 @@ impl TimelineMetrics { last_record_gauge, pitr_history_size, archival_size, + layer_size_image, + layer_count_image, + layer_size_delta, + layer_count_delta, standby_horizon_gauge, resident_physical_size_gauge, current_logical_size_gauge, @@ -2339,6 +2423,31 @@ impl TimelineMetrics { let _ = TIMELINE_ARCHIVE_SIZE.remove_label_values(&[tenant_id, shard_id, timeline_id]); let _ = PITR_HISTORY_SIZE.remove_label_values(&[tenant_id, shard_id, timeline_id]); + let _ = TIMELINE_LAYER_SIZE.remove_label_values(&[ + tenant_id, + shard_id, + timeline_id, + MetricLayerKind::Image.into(), + ]); + let _ = TIMELINE_LAYER_COUNT.remove_label_values(&[ + tenant_id, + shard_id, + timeline_id, + MetricLayerKind::Image.into(), + ]); + let _ = TIMELINE_LAYER_SIZE.remove_label_values(&[ + tenant_id, + shard_id, + timeline_id, + MetricLayerKind::Delta.into(), + ]); + let _ = TIMELINE_LAYER_COUNT.remove_label_values(&[ + tenant_id, + shard_id, + timeline_id, + MetricLayerKind::Delta.into(), + ]); + let _ = EVICTIONS.remove_label_values(&[tenant_id, shard_id, timeline_id]); let _ = AUX_FILE_SIZE.remove_label_values(&[tenant_id, shard_id, timeline_id]); let _ = 
VALID_LSN_LEASE_COUNT.remove_label_values(&[tenant_id, shard_id, timeline_id]); diff --git a/pageserver/src/page_service.rs b/pageserver/src/page_service.rs index f94b0d335e..00147a8ca6 100644 --- a/pageserver/src/page_service.rs +++ b/pageserver/src/page_service.rs @@ -1479,66 +1479,6 @@ where ))? } }; - } else if let Some(params) = parts.strip_prefix(&["show"]) { - // show - if params.len() != 1 { - return Err(QueryError::Other(anyhow::anyhow!( - "invalid param number for config command" - ))); - } - let tenant_id = TenantId::from_str(params[0]) - .with_context(|| format!("Failed to parse tenant id from {}", params[0]))?; - - tracing::Span::current().record("tenant_id", field::display(tenant_id)); - - self.check_permission(Some(tenant_id))?; - - COMPUTE_COMMANDS_COUNTERS - .for_command(ComputeCommandKind::Show) - .inc(); - - let tenant = self - .get_active_tenant_with_timeout( - tenant_id, - ShardSelector::Zero, - ACTIVE_TENANT_TIMEOUT, - ) - .await?; - pgb.write_message_noflush(&BeMessage::RowDescription(&[ - RowDescriptor::int8_col(b"checkpoint_distance"), - RowDescriptor::int8_col(b"checkpoint_timeout"), - RowDescriptor::int8_col(b"compaction_target_size"), - RowDescriptor::int8_col(b"compaction_period"), - RowDescriptor::int8_col(b"compaction_threshold"), - RowDescriptor::int8_col(b"gc_horizon"), - RowDescriptor::int8_col(b"gc_period"), - RowDescriptor::int8_col(b"image_creation_threshold"), - RowDescriptor::int8_col(b"pitr_interval"), - ]))? - .write_message_noflush(&BeMessage::DataRow(&[ - Some(tenant.get_checkpoint_distance().to_string().as_bytes()), - Some( - tenant - .get_checkpoint_timeout() - .as_secs() - .to_string() - .as_bytes(), - ), - Some(tenant.get_compaction_target_size().to_string().as_bytes()), - Some( - tenant - .get_compaction_period() - .as_secs() - .to_string() - .as_bytes(), - ), - Some(tenant.get_compaction_threshold().to_string().as_bytes()), - Some(tenant.get_gc_horizon().to_string().as_bytes()), - Some(tenant.get_gc_period().as_secs().to_string().as_bytes()), - Some(tenant.get_image_creation_threshold().to_string().as_bytes()), - Some(tenant.get_pitr_interval().as_secs().to_string().as_bytes()), - ]))? 
- .write_message_noflush(&BeMessage::CommandComplete(b"SELECT 1"))?; } else { return Err(QueryError::Other(anyhow::anyhow!( "unknown command {query_string}" diff --git a/pageserver/src/pgdatadir_mapping.rs b/pageserver/src/pgdatadir_mapping.rs index a821b824d0..3bbd084ab4 100644 --- a/pageserver/src/pgdatadir_mapping.rs +++ b/pageserver/src/pgdatadir_mapping.rs @@ -2031,7 +2031,7 @@ mod tests { #[tokio::test] async fn aux_files_round_trip() -> anyhow::Result<()> { let name = "aux_files_round_trip"; - let harness = TenantHarness::create(name)?; + let harness = TenantHarness::create(name).await?; pub const TIMELINE_ID: TimelineId = TimelineId::from_array(hex!("11223344556677881122334455667788")); diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index 6333fd3b63..6d59752606 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -21,6 +21,7 @@ use futures::FutureExt; use futures::StreamExt; use pageserver_api::models; use pageserver_api::models::AuxFilePolicy; +use pageserver_api::models::TimelineArchivalState; use pageserver_api::models::TimelineState; use pageserver_api::models::TopTenantShardItem; use pageserver_api::models::WalRedoManagerStatus; @@ -1228,6 +1229,14 @@ impl Tenant { Ok(timeline_preloads) } + pub async fn apply_timeline_archival_config( + &self, + _timeline_id: TimelineId, + _config: TimelineArchivalState, + ) -> anyhow::Result<()> { + Ok(()) + } + pub(crate) fn tenant_shard_id(&self) -> TenantShardId { self.tenant_shard_id } @@ -2912,7 +2921,7 @@ impl Tenant { if let Some(ancestor_id) = timeline.get_ancestor_timeline_id() { if let Some(ancestor_gc_cutoffs) = gc_cutoffs.get(&ancestor_id) { target.within_ancestor_pitr = - timeline.get_ancestor_lsn() >= ancestor_gc_cutoffs.pitr; + timeline.get_ancestor_lsn() >= ancestor_gc_cutoffs.time; } } @@ -2928,7 +2937,7 @@ impl Tenant { timeline.metrics.pitr_history_size.set( timeline .get_last_record_lsn() - .checked_sub(target.cutoffs.pitr) + .checked_sub(target.cutoffs.time) .unwrap_or(Lsn(0)) .0, ); @@ -3788,7 +3797,7 @@ pub(crate) mod harness { } impl TenantHarness { - pub fn create_custom( + pub async fn create_custom( test_name: &'static str, tenant_conf: TenantConf, tenant_id: TenantId, @@ -3824,7 +3833,7 @@ pub(crate) mod harness { }, timeout: RemoteStorageConfig::DEFAULT_TIMEOUT, }; - let remote_storage = GenericRemoteStorage::from_config(&config).unwrap(); + let remote_storage = GenericRemoteStorage::from_config(&config).await.unwrap(); let deletion_queue = MockDeletionQueue::new(Some(remote_storage.clone())); Ok(Self { @@ -3839,7 +3848,7 @@ pub(crate) mod harness { }) } - pub fn create(test_name: &'static str) -> anyhow::Result { + pub async fn create(test_name: &'static str) -> anyhow::Result { // Disable automatic GC and compaction to make the unit tests more deterministic. // The tests perform them manually if needed. 
let tenant_conf = TenantConf { @@ -3856,6 +3865,7 @@ pub(crate) mod harness { shard, Generation::new(0xdeadbeef), ) + .await } pub fn span(&self) -> tracing::Span { @@ -3992,7 +4002,7 @@ mod tests { #[tokio::test] async fn test_basic() -> anyhow::Result<()> { - let (tenant, ctx) = TenantHarness::create("test_basic")?.load().await; + let (tenant, ctx) = TenantHarness::create("test_basic").await?.load().await; let tline = tenant .create_test_timeline(TIMELINE_ID, Lsn(0x08), DEFAULT_PG_VERSION, &ctx) .await?; @@ -4039,7 +4049,8 @@ mod tests { #[tokio::test] async fn no_duplicate_timelines() -> anyhow::Result<()> { - let (tenant, ctx) = TenantHarness::create("no_duplicate_timelines")? + let (tenant, ctx) = TenantHarness::create("no_duplicate_timelines") + .await? .load() .await; let _ = tenant @@ -4071,7 +4082,7 @@ mod tests { async fn test_branch() -> anyhow::Result<()> { use std::str::from_utf8; - let (tenant, ctx) = TenantHarness::create("test_branch")?.load().await; + let (tenant, ctx) = TenantHarness::create("test_branch").await?.load().await; let tline = tenant .create_test_timeline(TIMELINE_ID, Lsn(0x10), DEFAULT_PG_VERSION, &ctx) .await?; @@ -4193,7 +4204,8 @@ mod tests { #[tokio::test] async fn test_prohibit_branch_creation_on_garbage_collected_data() -> anyhow::Result<()> { let (tenant, ctx) = - TenantHarness::create("test_prohibit_branch_creation_on_garbage_collected_data")? + TenantHarness::create("test_prohibit_branch_creation_on_garbage_collected_data") + .await? .load() .await; let tline = tenant @@ -4240,7 +4252,8 @@ mod tests { #[tokio::test] async fn test_prohibit_branch_creation_on_pre_initdb_lsn() -> anyhow::Result<()> { let (tenant, ctx) = - TenantHarness::create("test_prohibit_branch_creation_on_pre_initdb_lsn")? + TenantHarness::create("test_prohibit_branch_creation_on_pre_initdb_lsn") + .await? .load() .await; @@ -4262,7 +4275,7 @@ mod tests { .source() .unwrap() .to_string() - .contains("is earlier than latest GC horizon")); + .contains("is earlier than latest GC cutoff")); } } @@ -4295,7 +4308,8 @@ mod tests { #[tokio::test] async fn test_get_branchpoints_from_an_inactive_timeline() -> anyhow::Result<()> { let (tenant, ctx) = - TenantHarness::create("test_get_branchpoints_from_an_inactive_timeline")? + TenantHarness::create("test_get_branchpoints_from_an_inactive_timeline") + .await? .load() .await; let tline = tenant @@ -4352,7 +4366,8 @@ mod tests { #[tokio::test] async fn test_retain_data_in_parent_which_is_needed_for_child() -> anyhow::Result<()> { let (tenant, ctx) = - TenantHarness::create("test_retain_data_in_parent_which_is_needed_for_child")? + TenantHarness::create("test_retain_data_in_parent_which_is_needed_for_child") + .await? .load() .await; let tline = tenant @@ -4382,10 +4397,10 @@ mod tests { } #[tokio::test] async fn test_parent_keeps_data_forever_after_branching() -> anyhow::Result<()> { - let (tenant, ctx) = - TenantHarness::create("test_parent_keeps_data_forever_after_branching")? - .load() - .await; + let (tenant, ctx) = TenantHarness::create("test_parent_keeps_data_forever_after_branching") + .await? 
+ .load() + .await; let tline = tenant .create_test_timeline(TIMELINE_ID, Lsn(0x10), DEFAULT_PG_VERSION, &ctx) .await?; @@ -4423,7 +4438,7 @@ mod tests { #[tokio::test] async fn timeline_load() -> anyhow::Result<()> { const TEST_NAME: &str = "timeline_load"; - let harness = TenantHarness::create(TEST_NAME)?; + let harness = TenantHarness::create(TEST_NAME).await?; { let (tenant, ctx) = harness.load().await; let tline = tenant @@ -4450,7 +4465,7 @@ mod tests { #[tokio::test] async fn timeline_load_with_ancestor() -> anyhow::Result<()> { const TEST_NAME: &str = "timeline_load_with_ancestor"; - let harness = TenantHarness::create(TEST_NAME)?; + let harness = TenantHarness::create(TEST_NAME).await?; // create two timelines { let (tenant, ctx) = harness.load().await; @@ -4498,7 +4513,10 @@ mod tests { #[tokio::test] async fn delta_layer_dumping() -> anyhow::Result<()> { use storage_layer::AsLayerDesc; - let (tenant, ctx) = TenantHarness::create("test_layer_dumping")?.load().await; + let (tenant, ctx) = TenantHarness::create("test_layer_dumping") + .await? + .load() + .await; let tline = tenant .create_test_timeline(TIMELINE_ID, Lsn(0x10), DEFAULT_PG_VERSION, &ctx) .await?; @@ -4525,7 +4543,7 @@ mod tests { #[tokio::test] async fn test_images() -> anyhow::Result<()> { - let (tenant, ctx) = TenantHarness::create("test_images")?.load().await; + let (tenant, ctx) = TenantHarness::create("test_images").await?.load().await; let tline = tenant .create_test_timeline(TIMELINE_ID, Lsn(0x08), DEFAULT_PG_VERSION, &ctx) .await?; @@ -4696,7 +4714,7 @@ mod tests { // #[tokio::test] async fn test_bulk_insert() -> anyhow::Result<()> { - let harness = TenantHarness::create("test_bulk_insert")?; + let harness = TenantHarness::create("test_bulk_insert").await?; let (tenant, ctx) = harness.load().await; let tline = tenant .create_test_timeline(TIMELINE_ID, Lsn(0x08), DEFAULT_PG_VERSION, &ctx) @@ -4727,7 +4745,7 @@ mod tests { // so the search can stop at the first delta layer and doesn't traverse any deeper. 
#[tokio::test] async fn test_get_vectored() -> anyhow::Result<()> { - let harness = TenantHarness::create("test_get_vectored")?; + let harness = TenantHarness::create("test_get_vectored").await?; let (tenant, ctx) = harness.load().await; let tline = tenant .create_test_timeline(TIMELINE_ID, Lsn(0x08), DEFAULT_PG_VERSION, &ctx) @@ -4805,7 +4823,7 @@ mod tests { #[tokio::test] async fn test_get_vectored_aux_files() -> anyhow::Result<()> { - let harness = TenantHarness::create("test_get_vectored_aux_files")?; + let harness = TenantHarness::create("test_get_vectored_aux_files").await?; let (tenant, ctx) = harness.load().await; let tline = tenant @@ -4891,7 +4909,8 @@ mod tests { TenantId::generate(), ShardIdentity::unsharded(), Generation::new(0xdeadbeef), - )?; + ) + .await?; let (tenant, ctx) = harness.load().await; let mut current_key = Key::from_hex("010000000033333333444444445500000000").unwrap(); @@ -5034,7 +5053,7 @@ mod tests { // ``` #[tokio::test] async fn test_get_vectored_ancestor_descent() -> anyhow::Result<()> { - let harness = TenantHarness::create("test_get_vectored_on_lsn_axis")?; + let harness = TenantHarness::create("test_get_vectored_on_lsn_axis").await?; let (tenant, ctx) = harness.load().await; let start_key = Key::from_hex("010000000033333333444444445500000000").unwrap(); @@ -5183,7 +5202,7 @@ mod tests { name: &'static str, compaction_algorithm: CompactionAlgorithm, ) -> anyhow::Result<()> { - let mut harness = TenantHarness::create(name)?; + let mut harness = TenantHarness::create(name).await?; harness.tenant_conf.compaction_algorithm = CompactionAlgorithmSettings { kind: compaction_algorithm, }; @@ -5267,7 +5286,8 @@ mod tests { #[tokio::test] async fn test_traverse_branches() -> anyhow::Result<()> { - let (tenant, ctx) = TenantHarness::create("test_traverse_branches")? + let (tenant, ctx) = TenantHarness::create("test_traverse_branches") + .await? .load() .await; let mut tline = tenant @@ -5357,7 +5377,8 @@ mod tests { #[tokio::test] async fn test_traverse_ancestors() -> anyhow::Result<()> { - let (tenant, ctx) = TenantHarness::create("test_traverse_ancestors")? + let (tenant, ctx) = TenantHarness::create("test_traverse_ancestors") + .await? .load() .await; let mut tline = tenant @@ -5423,7 +5444,8 @@ mod tests { #[tokio::test] async fn test_write_at_initdb_lsn_takes_optimization_code_path() -> anyhow::Result<()> { - let (tenant, ctx) = TenantHarness::create("test_empty_test_timeline_is_usable")? + let (tenant, ctx) = TenantHarness::create("test_empty_test_timeline_is_usable") + .await? 
.load() .await; @@ -5492,7 +5514,7 @@ mod tests { #[tokio::test] async fn test_create_guard_crash() -> anyhow::Result<()> { let name = "test_create_guard_crash"; - let harness = TenantHarness::create(name)?; + let harness = TenantHarness::create(name).await?; { let (tenant, ctx) = harness.load().await; let tline = tenant @@ -5545,7 +5567,7 @@ mod tests { name: &'static str, compaction_algorithm: CompactionAlgorithm, ) -> anyhow::Result<()> { - let mut harness = TenantHarness::create(name)?; + let mut harness = TenantHarness::create(name).await?; harness.tenant_conf.compaction_algorithm = CompactionAlgorithmSettings { kind: compaction_algorithm, }; @@ -5569,7 +5591,7 @@ mod tests { #[tokio::test] async fn test_metadata_scan() -> anyhow::Result<()> { - let harness = TenantHarness::create("test_metadata_scan")?; + let harness = TenantHarness::create("test_metadata_scan").await?; let (tenant, ctx) = harness.load().await; let tline = tenant .create_test_timeline(TIMELINE_ID, Lsn(0x10), DEFAULT_PG_VERSION, &ctx) @@ -5688,7 +5710,7 @@ mod tests { #[tokio::test] async fn test_metadata_compaction_trigger() -> anyhow::Result<()> { - let harness = TenantHarness::create("test_metadata_compaction_trigger")?; + let harness = TenantHarness::create("test_metadata_compaction_trigger").await?; let (tenant, ctx) = harness.load().await; let tline = tenant .create_test_timeline(TIMELINE_ID, Lsn(0x10), DEFAULT_PG_VERSION, &ctx) @@ -5747,7 +5769,9 @@ mod tests { #[tokio::test] async fn test_branch_copies_dirty_aux_file_flag() { - let harness = TenantHarness::create("test_branch_copies_dirty_aux_file_flag").unwrap(); + let harness = TenantHarness::create("test_branch_copies_dirty_aux_file_flag") + .await + .unwrap(); // the default aux file policy to switch is v1 if not set by the admins assert_eq!( @@ -5849,7 +5873,9 @@ mod tests { #[tokio::test] async fn aux_file_policy_switch() { - let mut harness = TenantHarness::create("aux_file_policy_switch").unwrap(); + let mut harness = TenantHarness::create("aux_file_policy_switch") + .await + .unwrap(); harness.tenant_conf.switch_aux_file_policy = AuxFilePolicy::CrossValidation; // set to cross-validation mode let (tenant, ctx) = harness.load().await; @@ -6023,7 +6049,9 @@ mod tests { #[tokio::test] async fn aux_file_policy_force_switch() { - let mut harness = TenantHarness::create("aux_file_policy_force_switch").unwrap(); + let mut harness = TenantHarness::create("aux_file_policy_force_switch") + .await + .unwrap(); harness.tenant_conf.switch_aux_file_policy = AuxFilePolicy::V1; let (tenant, ctx) = harness.load().await; @@ -6084,7 +6112,9 @@ mod tests { #[tokio::test] async fn aux_file_policy_auto_detect() { - let mut harness = TenantHarness::create("aux_file_policy_auto_detect").unwrap(); + let mut harness = TenantHarness::create("aux_file_policy_auto_detect") + .await + .unwrap(); harness.tenant_conf.switch_aux_file_policy = AuxFilePolicy::V2; // set to cross-validation mode let (tenant, ctx) = harness.load().await; @@ -6147,7 +6177,7 @@ mod tests { #[tokio::test] async fn test_metadata_image_creation() -> anyhow::Result<()> { - let harness = TenantHarness::create("test_metadata_image_creation")?; + let harness = TenantHarness::create("test_metadata_image_creation").await?; let (tenant, ctx) = harness.load().await; let tline = tenant .create_test_timeline(TIMELINE_ID, Lsn(0x10), DEFAULT_PG_VERSION, &ctx) @@ -6246,7 +6276,7 @@ mod tests { #[tokio::test] async fn test_vectored_missing_data_key_reads() -> anyhow::Result<()> { - let harness = 
TenantHarness::create("test_vectored_missing_data_key_reads")?; + let harness = TenantHarness::create("test_vectored_missing_data_key_reads").await?; let (tenant, ctx) = harness.load().await; let base_key = Key::from_hex("000000000033333333444444445500000000").unwrap(); @@ -6318,7 +6348,7 @@ mod tests { #[tokio::test] async fn test_vectored_missing_metadata_key_reads() -> anyhow::Result<()> { - let harness = TenantHarness::create("test_vectored_missing_metadata_key_reads")?; + let harness = TenantHarness::create("test_vectored_missing_metadata_key_reads").await?; let (tenant, ctx) = harness.load().await; let base_key = Key::from_hex("620000000033333333444444445500000000").unwrap(); @@ -6410,7 +6440,7 @@ mod tests { #[tokio::test] async fn test_metadata_tombstone_reads() -> anyhow::Result<()> { - let harness = TenantHarness::create("test_metadata_tombstone_reads")?; + let harness = TenantHarness::create("test_metadata_tombstone_reads").await?; let (tenant, ctx) = harness.load().await; let key0 = Key::from_hex("620000000033333333444444445500000000").unwrap(); let key1 = Key::from_hex("620000000033333333444444445500000001").unwrap(); @@ -6490,7 +6520,9 @@ mod tests { #[tokio::test] async fn test_metadata_tombstone_image_creation() { - let harness = TenantHarness::create("test_metadata_tombstone_image_creation").unwrap(); + let harness = TenantHarness::create("test_metadata_tombstone_image_creation") + .await + .unwrap(); let (tenant, ctx) = harness.load().await; let key0 = Key::from_hex("620000000033333333444444445500000000").unwrap(); @@ -6562,8 +6594,9 @@ mod tests { #[tokio::test] async fn test_metadata_tombstone_empty_image_creation() { - let harness = - TenantHarness::create("test_metadata_tombstone_empty_image_creation").unwrap(); + let harness = TenantHarness::create("test_metadata_tombstone_empty_image_creation") + .await + .unwrap(); let (tenant, ctx) = harness.load().await; let key1 = Key::from_hex("620000000033333333444444445500000001").unwrap(); @@ -6626,7 +6659,7 @@ mod tests { #[tokio::test] async fn test_simple_bottom_most_compaction_images() -> anyhow::Result<()> { - let harness = TenantHarness::create("test_simple_bottom_most_compaction_images")?; + let harness = TenantHarness::create("test_simple_bottom_most_compaction_images").await?; let (tenant, ctx) = harness.load().await; fn get_key(id: u32) -> Key { @@ -6718,8 +6751,8 @@ mod tests { { // Update GC info let mut guard = tline.gc_info.write().unwrap(); - guard.cutoffs.pitr = Lsn(0x30); - guard.cutoffs.horizon = Lsn(0x30); + guard.cutoffs.time = Lsn(0x30); + guard.cutoffs.space = Lsn(0x30); } let expected_result = [ @@ -6810,7 +6843,7 @@ mod tests { vec![ // Image layer at GC horizon PersistentLayerKey { - key_range: Key::MIN..get_key(10), + key_range: Key::MIN..Key::MAX, lsn_range: Lsn(0x30)..Lsn(0x31), is_delta: false }, @@ -6834,7 +6867,7 @@ mod tests { #[tokio::test] async fn test_neon_test_record() -> anyhow::Result<()> { - let harness = TenantHarness::create("test_neon_test_record")?; + let harness = TenantHarness::create("test_neon_test_record").await?; let (tenant, ctx) = harness.load().await; fn get_key(id: u32) -> Key { @@ -6915,7 +6948,7 @@ mod tests { #[tokio::test] async fn test_lsn_lease() -> anyhow::Result<()> { - let (tenant, ctx) = TenantHarness::create("test_lsn_lease")?.load().await; + let (tenant, ctx) = TenantHarness::create("test_lsn_lease").await?.load().await; let key = Key::from_hex("010000000033333333444444445500000000").unwrap(); let end_lsn = Lsn(0x100); @@ -7004,7 +7037,7 @@ mod tests { 
#[tokio::test] async fn test_simple_bottom_most_compaction_deltas() -> anyhow::Result<()> { - let harness = TenantHarness::create("test_simple_bottom_most_compaction_deltas")?; + let harness = TenantHarness::create("test_simple_bottom_most_compaction_deltas").await?; let (tenant, ctx) = harness.load().await; fn get_key(id: u32) -> Key { @@ -7109,8 +7142,8 @@ mod tests { *guard = GcInfo { retain_lsns: vec![], cutoffs: GcCutoffs { - pitr: Lsn(0x30), - horizon: Lsn(0x30), + time: Lsn(0x30), + space: Lsn(0x30), }, leases: Default::default(), within_ancestor_pitr: false, diff --git a/pageserver/src/tenant/disk_btree.rs b/pageserver/src/tenant/disk_btree.rs index 251d2ab4ad..1583a3826a 100644 --- a/pageserver/src/tenant/disk_btree.rs +++ b/pageserver/src/tenant/disk_btree.rs @@ -262,7 +262,7 @@ where pub fn iter<'a>(self, start_key: &'a [u8; L], ctx: &'a RequestContext) -> DiskBtreeIterator<'a> where - R: 'a, + R: 'a + Send, { DiskBtreeIterator { stream: Box::pin(self.into_stream(start_key, ctx)), @@ -521,7 +521,7 @@ where pub struct DiskBtreeIterator<'a> { #[allow(clippy::type_complexity)] stream: std::pin::Pin< - Box, u64), DiskBtreeError>> + 'a>, + Box, u64), DiskBtreeError>> + 'a + Send>, >, } diff --git a/pageserver/src/tenant/mgr.rs b/pageserver/src/tenant/mgr.rs index b0159e22bf..4912608677 100644 --- a/pageserver/src/tenant/mgr.rs +++ b/pageserver/src/tenant/mgr.rs @@ -2698,7 +2698,9 @@ mod tests { // Test that if an InProgress tenant is in the map during shutdown, the shutdown will gracefully // wait for it to complete before proceeding. - let h = TenantHarness::create("shutdown_awaits_in_progress_tenant").unwrap(); + let h = TenantHarness::create("shutdown_awaits_in_progress_tenant") + .await + .unwrap(); let (t, _ctx) = h.load().await; // harness loads it to active, which is forced and nothing is running on the tenant diff --git a/pageserver/src/tenant/remote_timeline_client.rs b/pageserver/src/tenant/remote_timeline_client.rs index bc9364de61..bb42fbeebf 100644 --- a/pageserver/src/tenant/remote_timeline_client.rs +++ b/pageserver/src/tenant/remote_timeline_client.rs @@ -241,7 +241,7 @@ use self::index::IndexPart; use super::metadata::MetadataUpdate; use super::storage_layer::{Layer, LayerName, ResidentLayer}; -use super::upload_queue::SetDeletedFlagProgress; +use super::upload_queue::{NotInitialized, SetDeletedFlagProgress}; use super::Generation; pub(crate) use download::{ @@ -1930,6 +1930,31 @@ impl RemoteTimelineClient { } } } + + /// Returns an accessor which will hold the UploadQueue mutex for accessing the upload queue + /// externally to RemoteTimelineClient. 
+ pub(crate) fn initialized_upload_queue( + &self, + ) -> Result, NotInitialized> { + let mut inner = self.upload_queue.lock().unwrap(); + inner.initialized_mut()?; + Ok(UploadQueueAccessor { inner }) + } +} + +pub(crate) struct UploadQueueAccessor<'a> { + inner: std::sync::MutexGuard<'a, UploadQueue>, +} + +impl<'a> UploadQueueAccessor<'a> { + pub(crate) fn latest_uploaded_index_part(&self) -> &IndexPart { + match &*self.inner { + UploadQueue::Initialized(x) => &x.clean.0, + UploadQueue::Uninitialized | UploadQueue::Stopped(_) => { + unreachable!("checked before constructing") + } + } + } } pub fn remote_tenant_path(tenant_shard_id: &TenantShardId) -> RemotePath { @@ -2103,7 +2128,7 @@ mod tests { impl TestSetup { async fn new(test_name: &str) -> anyhow::Result { let test_name = Box::leak(Box::new(format!("remote_timeline_client__{test_name}"))); - let harness = TenantHarness::create(test_name)?; + let harness = TenantHarness::create(test_name).await?; let (tenant, ctx) = harness.load().await; let timeline = tenant diff --git a/pageserver/src/tenant/remote_timeline_client/index.rs b/pageserver/src/tenant/remote_timeline_client/index.rs index 6233a3477e..b439df8edb 100644 --- a/pageserver/src/tenant/remote_timeline_client/index.rs +++ b/pageserver/src/tenant/remote_timeline_client/index.rs @@ -176,6 +176,24 @@ pub(crate) struct Lineage { /// /// If you are adding support for detaching from a hierarchy, consider changing the ancestry /// into a `Vec<(TimelineId, Lsn)>` to be a path instead. + // FIXME: this is insufficient even for path of two timelines for future wal recovery + // purposes: + // + // assuming a "old main" which has received most of the WAL, and has a branch "new main", + // starting a bit before "old main" last_record_lsn. the current version works fine, + // because we will know to replay wal and branch at the recorded Lsn to do wal recovery. + // + // then assuming "new main" would similarly receive a branch right before its last_record_lsn, + // "new new main". the current implementation would just store ("new main", ancestor_lsn, _) + // here. however, we cannot recover from WAL using only that information, we would need the + // whole ancestry here: + // + // ```json + // [ + // ["old main", ancestor_lsn("new main"), _], + // ["new main", ancestor_lsn("new new main"), _] + // ] + // ``` #[serde(skip_serializing_if = "Option::is_none", default)] original_ancestor: Option<(TimelineId, Lsn, NaiveDateTime)>, } @@ -217,6 +235,14 @@ impl Lineage { self.original_ancestor .is_some_and(|(_, ancestor_lsn, _)| ancestor_lsn == lsn) } + + pub(crate) fn is_detached_from_original_ancestor(&self) -> bool { + self.original_ancestor.is_some() + } + + pub(crate) fn is_reparented(&self) -> bool { + !self.reparenting_history.is_empty() + } } #[cfg(test)] diff --git a/pageserver/src/tenant/size.rs b/pageserver/src/tenant/size.rs index 23354417e7..e4728ca8a8 100644 --- a/pageserver/src/tenant/size.rs +++ b/pageserver/src/tenant/size.rs @@ -135,11 +135,9 @@ pub struct TimelineInputs { ancestor_lsn: Lsn, last_record: Lsn, latest_gc_cutoff: Lsn, - horizon_cutoff: Lsn, - pitr_cutoff: Lsn, /// Cutoff point based on GC settings - next_gc_cutoff: Lsn, + next_pitr_cutoff: Lsn, /// Cutoff point calculated from the user-supplied 'max_retention_period' retention_param_cutoff: Option, @@ -150,7 +148,7 @@ pub struct TimelineInputs { /// Gathers the inputs for the tenant sizing model. 
/// -/// Tenant size does not consider the latest state, but only the state until next_gc_cutoff, which +/// Tenant size does not consider the latest state, but only the state until next_pitr_cutoff, which /// is updated on-demand, during the start of this calculation and separate from the /// [`TimelineInputs::latest_gc_cutoff`]. /// @@ -158,11 +156,8 @@ pub struct TimelineInputs { /// /// ```text /// 0-----|---------|----|------------| · · · · · |·> lsn -/// initdb_lsn branchpoints* next_gc_cutoff latest +/// initdb_lsn branchpoints* next_pitr_cutoff latest /// ``` -/// -/// Until gc_horizon_cutoff > `Timeline::last_record_lsn` for any of the tenant's timelines, the -/// tenant size will be zero. pub(super) async fn gather_inputs( tenant: &Tenant, limit: &Arc, @@ -172,7 +167,7 @@ pub(super) async fn gather_inputs( cancel: &CancellationToken, ctx: &RequestContext, ) -> Result { - // refresh is needed to update gc related pitr_cutoff and horizon_cutoff + // refresh is needed to update [`timeline::GcCutoffs`] tenant.refresh_gc_info(cancel, ctx).await?; // Collect information about all the timelines @@ -236,20 +231,18 @@ pub(super) async fn gather_inputs( // we don't consider the `Timeline::disk_consistent_lsn` at all, because we are not // actually removing files. // - // We only consider [`GcInfo::pitr_cutoff`], and not [`GcInfo::horizon_cutoff`], because from + // We only consider [`timeline::GcCutoffs::time`], and not [`timeline::GcCutoffs::space`], because from // a user's perspective they have only requested retention up to the time bound (pitr_cutoff), rather - // than a space bound (horizon cutoff). This means that if someone drops a database and waits for their + // than our internal space cutoff. This means that if someone drops a database and waits for their // PITR interval, they will see synthetic size decrease, even if we are still storing data inside - // horizon_cutoff. - let pitr_cutoff = gc_info.cutoffs.pitr; - let horizon_cutoff = gc_info.cutoffs.horizon; - let mut next_gc_cutoff = pitr_cutoff; + // the space cutoff. + let mut next_pitr_cutoff = gc_info.cutoffs.time; // If the caller provided a shorter retention period, use that instead of the GC cutoff. let retention_param_cutoff = if let Some(max_retention_period) = max_retention_period { let param_cutoff = Lsn(last_record_lsn.0.saturating_sub(max_retention_period)); - if next_gc_cutoff < param_cutoff { - next_gc_cutoff = param_cutoff; + if next_pitr_cutoff < param_cutoff { + next_pitr_cutoff = param_cutoff; } Some(param_cutoff) } else { @@ -263,7 +256,7 @@ pub(super) async fn gather_inputs( .copied() .collect::>(); - // next_gc_cutoff in parent branch are not of interest (right now at least), nor do we + // next_pitr_cutoff in parent branch are not of interest (right now at least), nor do we // want to query any logical size before initdb_lsn. 
let branch_start_lsn = cmp::max(ancestor_lsn, timeline.initdb_lsn); @@ -291,10 +284,10 @@ pub(super) async fn gather_inputs( ) } - // Add a point for the GC cutoff - let branch_start_needed = next_gc_cutoff <= branch_start_lsn; + // Add a point for the PITR cutoff + let branch_start_needed = next_pitr_cutoff <= branch_start_lsn; if !branch_start_needed { - lsns.push((next_gc_cutoff, LsnKind::GcCutOff)); + lsns.push((next_pitr_cutoff, LsnKind::GcCutOff)); } lsns.sort_unstable(); @@ -333,7 +326,7 @@ pub(super) async fn gather_inputs( parent: Some(parent), lsn: lsn.0, size: None, - needed: lsn > next_gc_cutoff, + needed: lsn > next_pitr_cutoff, }, timeline_id: timeline.timeline_id, kind, @@ -357,8 +350,8 @@ pub(super) async fn gather_inputs( segment: Segment { parent: Some(lease_parent), lsn: lsn.0, - size: None, // Filled in later, if necessary - needed: lsn > next_gc_cutoff, // only needed if the point is within rentention. + size: None, // Filled in later, if necessary + needed: lsn > next_pitr_cutoff, // only needed if the point is within rentention. }, timeline_id: timeline.timeline_id, kind: LsnKind::LeaseStart, @@ -398,9 +391,7 @@ pub(super) async fn gather_inputs( last_record: last_record_lsn, // this is not used above, because it might not have updated recently enough latest_gc_cutoff: *timeline.get_latest_gc_cutoff_lsn(), - horizon_cutoff, - pitr_cutoff, - next_gc_cutoff, + next_pitr_cutoff, retention_param_cutoff, lease_points, }); @@ -742,9 +733,7 @@ fn verify_size_for_multiple_branches() { "ancestor_lsn": "0/18D3D98", "last_record": "0/2230CD0", "latest_gc_cutoff": "0/1698C48", - "horizon_cutoff": "0/2210CD0", - "pitr_cutoff": "0/2210CD0", - "next_gc_cutoff": "0/2210CD0", + "next_pitr_cutoff": "0/2210CD0", "retention_param_cutoff": null, "lease_points": [] }, @@ -753,9 +742,7 @@ fn verify_size_for_multiple_branches() { "ancestor_lsn": "0/176D998", "last_record": "0/1837770", "latest_gc_cutoff": "0/1698C48", - "horizon_cutoff": "0/1817770", - "pitr_cutoff": "0/1817770", - "next_gc_cutoff": "0/1817770", + "next_pitr_cutoff": "0/1817770", "retention_param_cutoff": null, "lease_points": [] }, @@ -764,9 +751,7 @@ fn verify_size_for_multiple_branches() { "ancestor_lsn": "0/0", "last_record": "0/18D3D98", "latest_gc_cutoff": "0/1698C48", - "horizon_cutoff": "0/18B3D98", - "pitr_cutoff": "0/18B3D98", - "next_gc_cutoff": "0/18B3D98", + "next_pitr_cutoff": "0/18B3D98", "retention_param_cutoff": null, "lease_points": [] } @@ -820,9 +805,7 @@ fn verify_size_for_one_branch() { "ancestor_lsn": "0/0", "last_record": "47/280A5860", "latest_gc_cutoff": "47/240A5860", - "horizon_cutoff": "47/240A5860", - "pitr_cutoff": "47/240A5860", - "next_gc_cutoff": "47/240A5860", + "next_pitr_cutoff": "47/240A5860", "retention_param_cutoff": "0/0", "lease_points": [] } diff --git a/pageserver/src/tenant/storage_layer.rs b/pageserver/src/tenant/storage_layer.rs index 62730f88b2..a389358f0d 100644 --- a/pageserver/src/tenant/storage_layer.rs +++ b/pageserver/src/tenant/storage_layer.rs @@ -6,8 +6,6 @@ pub(crate) mod inmemory_layer; pub(crate) mod layer; mod layer_desc; mod layer_name; - -#[cfg(test)] pub mod merge_iterator; use crate::context::{AccessStatsBehavior, RequestContext}; @@ -676,6 +674,26 @@ impl LayerAccessStats { }, } } + + /// Whether this layer has been accessed (excluding in [`AccessStatsBehavior::Skip`]). + /// + /// This indicates whether the layer has been used for some purpose that would motivate + /// us to keep it on disk, such as for serving a getpage request. 
+ fn accessed(&self) -> bool { + let locked = self.0.lock().unwrap(); + let inner = &locked.for_eviction_policy; + + // Consider it accessed if the most recent access is more recent than + // the most recent change in residence status. + match ( + inner.last_accesses.recent(), + inner.last_residence_changes.recent(), + ) { + (None, _) => false, + (Some(_), None) => true, + (Some(a), Some(r)) => a.when >= r.timestamp, + } + } } /// Get a layer descriptor from a layer. diff --git a/pageserver/src/tenant/storage_layer/delta_layer.rs b/pageserver/src/tenant/storage_layer/delta_layer.rs index 2d36ac7442..512e9e86fa 100644 --- a/pageserver/src/tenant/storage_layer/delta_layer.rs +++ b/pageserver/src/tenant/storage_layer/delta_layer.rs @@ -33,11 +33,14 @@ use crate::page_cache::{self, FileId, PAGE_SZ}; use crate::repository::{Key, Value, KEY_SIZE}; use crate::tenant::blob_io::BlobWriter; use crate::tenant::block_io::{BlockBuf, BlockCursor, BlockLease, BlockReader, FileBlockReader}; -use crate::tenant::disk_btree::{DiskBtreeBuilder, DiskBtreeReader, VisitDirection}; +use crate::tenant::disk_btree::{ + DiskBtreeBuilder, DiskBtreeIterator, DiskBtreeReader, VisitDirection, +}; use crate::tenant::storage_layer::{Layer, ValueReconstructResult, ValueReconstructState}; use crate::tenant::timeline::GetVectoredError; use crate::tenant::vectored_blob_io::{ - BlobFlag, MaxVectoredReadBytes, VectoredBlobReader, VectoredRead, VectoredReadPlanner, + BlobFlag, MaxVectoredReadBytes, StreamingVectoredReadPlanner, VectoredBlobReader, VectoredRead, + VectoredReadPlanner, }; use crate::tenant::{PageReconstructError, Timeline}; use crate::virtual_file::{self, VirtualFile}; @@ -53,6 +56,7 @@ use pageserver_api::models::{ImageCompressionAlgorithm, LayerAccessKind}; use pageserver_api::shard::TenantShardId; use rand::{distributions::Alphanumeric, Rng}; use serde::{Deserialize, Serialize}; +use std::collections::VecDeque; use std::fs::File; use std::io::SeekFrom; use std::ops::Range; @@ -747,12 +751,10 @@ impl DeltaLayer { } impl DeltaLayerInner { - #[cfg(test)] pub(crate) fn key_range(&self) -> &Range { &self.layer_key_range } - #[cfg(test)] pub(crate) fn lsn_range(&self) -> &Range { &self.layer_lsn_range } @@ -1180,9 +1182,7 @@ impl DeltaLayerInner { let delta_key = DeltaKey::from_slice(key); let val_ref = ValueRef { blob_ref: BlobRef(value), - reader: BlockCursor::new(crate::tenant::block_io::BlockReaderRef::Adapter( - Adapter(self), - )), + layer: self, }; let pos = BlobRef(value).pos(); if let Some(last) = all_keys.last_mut() { @@ -1426,7 +1426,7 @@ impl DeltaLayerInner { let keys = self.load_keys(ctx).await?; async fn dump_blob(val: &ValueRef<'_>, ctx: &RequestContext) -> anyhow::Result { - let buf = val.reader.read_blob(val.blob_ref.pos(), ctx).await?; + let buf = val.load_raw(ctx).await?; let val = Value::des(&buf)?; let desc = match val { Value::Image(img) => { @@ -1461,8 +1461,7 @@ impl DeltaLayerInner { use pageserver_api::key::CHECKPOINT_KEY; use postgres_ffi::CheckPoint; if key == CHECKPOINT_KEY { - let buf = val.reader.read_blob(val.blob_ref.pos(), ctx).await?; - let val = Value::des(&buf)?; + let val = val.load(ctx).await?; match val { Value::Image(img) => { let checkpoint = CheckPoint::decode(&img)?; @@ -1515,7 +1514,6 @@ impl DeltaLayerInner { offset } - #[cfg(test)] pub(crate) fn iter<'a>(&'a self, ctx: &'a RequestContext) -> DeltaLayerIterator<'a> { let block_reader = FileBlockReader::new(&self.file, self.file_id); let tree_reader = @@ -1526,7 +1524,7 @@ impl DeltaLayerInner { index_iter: 
tree_reader.iter(&[0; DELTA_KEY_SIZE], ctx), key_values_batch: std::collections::VecDeque::new(), is_end: false, - planner: crate::tenant::vectored_blob_io::StreamingVectoredReadPlanner::new( + planner: StreamingVectoredReadPlanner::new( 1024 * 8192, // The default value. Unit tests might use a different value. 1024 * 8K = 8MB buffer. 1024, // The default value. Unit tests might use a different value ), @@ -1547,17 +1545,24 @@ pub struct DeltaEntry<'a> { /// Reference to an on-disk value pub struct ValueRef<'a> { blob_ref: BlobRef, - reader: BlockCursor<'a>, + layer: &'a DeltaLayerInner, } impl<'a> ValueRef<'a> { /// Loads the value from disk pub async fn load(&self, ctx: &RequestContext) -> Result { - // theoretically we *could* record an access time for each, but it does not really matter - let buf = self.reader.read_blob(self.blob_ref.pos(), ctx).await?; + let buf = self.load_raw(ctx).await?; let val = Value::des(&buf)?; Ok(val) } + + async fn load_raw(&self, ctx: &RequestContext) -> Result> { + let reader = BlockCursor::new(crate::tenant::block_io::BlockReaderRef::Adapter(Adapter( + self.layer, + ))); + let buf = reader.read_blob(self.blob_ref.pos(), ctx).await?; + Ok(buf) + } } pub(crate) struct Adapter(T); @@ -1591,17 +1596,15 @@ impl<'a> pageserver_compaction::interface::CompactionDeltaEntry<'a, Key> for Del } } -#[cfg(test)] pub struct DeltaLayerIterator<'a> { delta_layer: &'a DeltaLayerInner, ctx: &'a RequestContext, - planner: crate::tenant::vectored_blob_io::StreamingVectoredReadPlanner, - index_iter: crate::tenant::disk_btree::DiskBtreeIterator<'a>, - key_values_batch: std::collections::VecDeque<(Key, Lsn, Value)>, + planner: StreamingVectoredReadPlanner, + index_iter: DiskBtreeIterator<'a>, + key_values_batch: VecDeque<(Key, Lsn, Value)>, is_end: bool, } -#[cfg(test)] impl<'a> DeltaLayerIterator<'a> { /// Retrieve a batch of key-value pairs into the iterator buffer. async fn next_batch(&mut self) -> anyhow::Result<()> { @@ -1668,6 +1671,7 @@ pub(crate) mod test { use rand::RngCore; use super::*; + use crate::repository::Value; use crate::tenant::harness::TIMELINE_ID; use crate::tenant::vectored_blob_io::StreamingVectoredReadPlanner; use crate::tenant::Tenant; @@ -1677,6 +1681,7 @@ pub(crate) mod test { tenant::{disk_btree::tests::TestDisk, harness::TenantHarness}, DEFAULT_PG_VERSION, }; + use bytes::Bytes; /// Construct an index for a fictional delta layer and and then /// traverse in order to plan vectored reads for a query. 
Finally, @@ -1929,7 +1934,7 @@ pub(crate) mod test { #[tokio::test] async fn test_delta_layer_vectored_read_end_to_end() -> anyhow::Result<()> { - let harness = TenantHarness::create("test_delta_layer_oversized_vectored_read")?; + let harness = TenantHarness::create("test_delta_layer_oversized_vectored_read").await?; let (tenant, ctx) = harness.load().await; let timeline_id = TimelineId::generate(); @@ -2029,7 +2034,9 @@ pub(crate) mod test { use crate::walrecord::NeonWalRecord; use bytes::Bytes; - let h = crate::tenant::harness::TenantHarness::create("truncate_delta_smoke").unwrap(); + let h = crate::tenant::harness::TenantHarness::create("truncate_delta_smoke") + .await + .unwrap(); let (tenant, ctx) = h.load().await; let ctx = &ctx; let timeline = tenant @@ -2245,6 +2252,15 @@ pub(crate) mod test { (k1, l1).cmp(&(k2, l2)) } + pub(crate) fn sort_delta_value( + (k1, l1, v1): &(Key, Lsn, Value), + (k2, l2, v2): &(Key, Lsn, Value), + ) -> std::cmp::Ordering { + let order_1 = if v1.is_image() { 0 } else { 1 }; + let order_2 = if v2.is_image() { 0 } else { 1 }; + (k1, l1, order_1).cmp(&(k2, l2, order_2)) + } + pub(crate) async fn produce_delta_layer( tenant: &Tenant, tline: &Arc, @@ -2253,7 +2269,7 @@ pub(crate) mod test { ) -> anyhow::Result { deltas.sort_by(sort_delta); let (key_start, _, _) = deltas.first().unwrap(); - let (key_max, _, _) = deltas.first().unwrap(); + let (key_max, _, _) = deltas.last().unwrap(); let lsn_min = deltas.iter().map(|(_, lsn, _)| lsn).min().unwrap(); let lsn_max = deltas.iter().map(|(_, lsn, _)| lsn).max().unwrap(); let lsn_end = Lsn(lsn_max.0 + 1); @@ -2298,10 +2314,7 @@ pub(crate) mod test { #[tokio::test] async fn delta_layer_iterator() { - use crate::repository::Value; - use bytes::Bytes; - - let harness = TenantHarness::create("delta_layer_iterator").unwrap(); + let harness = TenantHarness::create("delta_layer_iterator").await.unwrap(); let (tenant, ctx) = harness.load().await; let tline = tenant diff --git a/pageserver/src/tenant/storage_layer/image_layer.rs b/pageserver/src/tenant/storage_layer/image_layer.rs index a88a1e6429..19e4e9e2e9 100644 --- a/pageserver/src/tenant/storage_layer/image_layer.rs +++ b/pageserver/src/tenant/storage_layer/image_layer.rs @@ -29,13 +29,16 @@ use crate::page_cache::{self, FileId, PAGE_SZ}; use crate::repository::{Key, Value, KEY_SIZE}; use crate::tenant::blob_io::BlobWriter; use crate::tenant::block_io::{BlockBuf, BlockReader, FileBlockReader}; -use crate::tenant::disk_btree::{DiskBtreeBuilder, DiskBtreeReader, VisitDirection}; +use crate::tenant::disk_btree::{ + DiskBtreeBuilder, DiskBtreeIterator, DiskBtreeReader, VisitDirection, +}; use crate::tenant::storage_layer::{ LayerAccessStats, ValueReconstructResult, ValueReconstructState, }; use crate::tenant::timeline::GetVectoredError; use crate::tenant::vectored_blob_io::{ - BlobFlag, MaxVectoredReadBytes, VectoredBlobReader, VectoredRead, VectoredReadPlanner, + BlobFlag, MaxVectoredReadBytes, StreamingVectoredReadPlanner, VectoredBlobReader, VectoredRead, + VectoredReadPlanner, }; use crate::tenant::{PageReconstructError, Timeline}; use crate::virtual_file::{self, VirtualFile}; @@ -50,6 +53,7 @@ use pageserver_api::models::LayerAccessKind; use pageserver_api::shard::{ShardIdentity, TenantShardId}; use rand::{distributions::Alphanumeric, Rng}; use serde::{Deserialize, Serialize}; +use std::collections::VecDeque; use std::fs::File; use std::io::SeekFrom; use std::ops::Range; @@ -369,12 +373,10 @@ impl ImageLayer { } impl ImageLayerInner { - #[cfg(test)] pub(crate) fn 
key_range(&self) -> &Range { &self.key_range } - #[cfg(test)] pub(crate) fn lsn(&self) -> Lsn { self.lsn } @@ -699,7 +701,6 @@ impl ImageLayerInner { } } - #[cfg(test)] pub(crate) fn iter<'a>(&'a self, ctx: &'a RequestContext) -> ImageLayerIterator<'a> { let block_reader = FileBlockReader::new(&self.file, self.file_id); let tree_reader = @@ -708,9 +709,9 @@ impl ImageLayerInner { image_layer: self, ctx, index_iter: tree_reader.iter(&[0; KEY_SIZE], ctx), - key_values_batch: std::collections::VecDeque::new(), + key_values_batch: VecDeque::new(), is_end: false, - planner: crate::tenant::vectored_blob_io::StreamingVectoredReadPlanner::new( + planner: StreamingVectoredReadPlanner::new( 1024 * 8192, // The default value. Unit tests might use a different value. 1024 * 8K = 8MB buffer. 1024, // The default value. Unit tests might use a different value ), @@ -737,6 +738,9 @@ struct ImageLayerWriterInner { key_range: Range, lsn: Lsn, + // Total uncompressed bytes passed into put_image + uncompressed_bytes: u64, + blob_writer: BlobWriter, tree: DiskBtreeBuilder, } @@ -792,6 +796,7 @@ impl ImageLayerWriterInner { lsn, tree: tree_builder, blob_writer, + uncompressed_bytes: 0, }; Ok(writer) @@ -810,6 +815,7 @@ impl ImageLayerWriterInner { ) -> anyhow::Result<()> { ensure!(self.key_range.contains(&key)); let compression = self.conf.image_compression; + self.uncompressed_bytes += img.len() as u64; let (_img, res) = self .blob_writer .write_blob_maybe_compressed(img, ctx, compression) @@ -835,6 +841,11 @@ impl ImageLayerWriterInner { let index_start_blk = ((self.blob_writer.size() + PAGE_SZ as u64 - 1) / PAGE_SZ as u64) as u32; + // Calculate compression ratio + let compressed_size = self.blob_writer.size() - PAGE_SZ as u64; // Subtract PAGE_SZ for header + crate::metrics::COMPRESSION_IMAGE_INPUT_BYTES.inc_by(self.uncompressed_bytes); + crate::metrics::COMPRESSION_IMAGE_OUTPUT_BYTES.inc_by(compressed_size); + let mut file = self.blob_writer.into_inner(); // Write out the index @@ -974,17 +985,15 @@ impl Drop for ImageLayerWriter { } } -#[cfg(test)] pub struct ImageLayerIterator<'a> { image_layer: &'a ImageLayerInner, ctx: &'a RequestContext, - planner: crate::tenant::vectored_blob_io::StreamingVectoredReadPlanner, - index_iter: crate::tenant::disk_btree::DiskBtreeIterator<'a>, - key_values_batch: std::collections::VecDeque<(Key, Lsn, Value)>, + planner: StreamingVectoredReadPlanner, + index_iter: DiskBtreeIterator<'a>, + key_values_batch: VecDeque<(Key, Lsn, Value)>, is_end: bool, } -#[cfg(test)] impl<'a> ImageLayerIterator<'a> { /// Retrieve a batch of key-value pairs into the iterator buffer. async fn next_batch(&mut self) -> anyhow::Result<()> { @@ -1102,6 +1111,7 @@ mod test { ShardIdentity::unsharded(), get_next_gen(), ) + .await .unwrap(); let (tenant, ctx) = harness.load().await; let timeline = tenant @@ -1168,6 +1178,7 @@ mod test { // But here, all we care about is that the gen number is unique. 
get_next_gen(), ) + .await .unwrap(); let (tenant, ctx) = harness.load().await; let timeline = tenant @@ -1299,7 +1310,7 @@ mod test { #[tokio::test] async fn image_layer_iterator() { - let harness = TenantHarness::create("image_layer_iterator").unwrap(); + let harness = TenantHarness::create("image_layer_iterator").await.unwrap(); let (tenant, ctx) = harness.load().await; let tline = tenant diff --git a/pageserver/src/tenant/storage_layer/layer.rs b/pageserver/src/tenant/storage_layer/layer.rs index 02069c29d2..d9cbaba529 100644 --- a/pageserver/src/tenant/storage_layer/layer.rs +++ b/pageserver/src/tenant/storage_layer/layer.rs @@ -385,6 +385,7 @@ impl Layer { } /// Get all key/values in the layer. Should be replaced with an iterator-based API in the future. + #[allow(dead_code)] pub(crate) async fn load_key_values( &self, ctx: &RequestContext, @@ -693,6 +694,18 @@ impl Drop for LayerInner { // and we could be delaying shutdown for nothing. } + if let Some(timeline) = self.timeline.upgrade() { + // Only need to decrement metrics if the timeline still exists: otherwise + // it will have already de-registered these metrics via TimelineMetrics::shutdown + if self.desc.is_delta() { + timeline.metrics.layer_count_delta.dec(); + timeline.metrics.layer_size_delta.sub(self.desc.file_size); + } else { + timeline.metrics.layer_count_image.dec(); + timeline.metrics.layer_size_image.sub(self.desc.file_size); + } + } + if !*self.wanted_deleted.get_mut() { return; } @@ -791,6 +804,15 @@ impl LayerInner { (heavier_once_cell::OnceCell::default(), 0, Status::Evicted) }; + // This object acts as a RAII guard on these metrics: increment on construction + if desc.is_delta() { + timeline.metrics.layer_count_delta.inc(); + timeline.metrics.layer_size_delta.add(desc.file_size); + } else { + timeline.metrics.layer_count_image.inc(); + timeline.metrics.layer_size_image.add(desc.file_size); + } + LayerInner { conf, debug_str: { @@ -1469,14 +1491,22 @@ impl LayerInner { let duration = SystemTime::now().duration_since(local_layer_mtime); match duration { Ok(elapsed) => { - timeline - .metrics - .evictions_with_low_residence_duration - .read() - .unwrap() - .observe(elapsed); + let accessed = self.access_stats.accessed(); + if accessed { + // Only layers used for reads contribute to our "low residence" metric that is used + // to detect thrashing. Layers promoted for other reasons (e.g. compaction) are allowed + // to be rapidly evicted without contributing to this metric. + timeline + .metrics + .evictions_with_low_residence_duration + .read() + .unwrap() + .observe(elapsed); + } + tracing::info!( residence_millis = elapsed.as_millis(), + accessed, "evicted layer after known residence period" ); } @@ -1889,7 +1919,7 @@ impl ResidentLayer { self.owner.metadata() } - #[cfg(test)] + /// Cast the layer to a delta, return an error if it is an image layer. pub(crate) async fn get_as_delta( &self, ctx: &RequestContext, @@ -1901,7 +1931,7 @@ impl ResidentLayer { } } - #[cfg(test)] + /// Cast the layer to an image, return an error if it is a delta layer. 
pub(crate) async fn get_as_image( &self, ctx: &RequestContext, diff --git a/pageserver/src/tenant/storage_layer/layer/tests.rs b/pageserver/src/tenant/storage_layer/layer/tests.rs index 3a7aca7a6c..8a3737f8a7 100644 --- a/pageserver/src/tenant/storage_layer/layer/tests.rs +++ b/pageserver/src/tenant/storage_layer/layer/tests.rs @@ -22,7 +22,7 @@ const FOREVER: std::time::Duration = std::time::Duration::from_secs(ADVANCE.as_s async fn smoke_test() { let handle = tokio::runtime::Handle::current(); - let h = TenantHarness::create("smoke_test").unwrap(); + let h = TenantHarness::create("smoke_test").await.unwrap(); let span = h.span(); let download_span = span.in_scope(|| tracing::info_span!("downloading", timeline_id = 1)); let (tenant, _) = h.load().await; @@ -176,7 +176,9 @@ async fn evict_and_wait_on_wanted_deleted() { // this is the runtime on which Layer spawns the blocking tasks on let handle = tokio::runtime::Handle::current(); - let h = TenantHarness::create("evict_and_wait_on_wanted_deleted").unwrap(); + let h = TenantHarness::create("evict_and_wait_on_wanted_deleted") + .await + .unwrap(); utils::logging::replace_panic_hook_with_tracing_panic_hook().forget(); let (tenant, ctx) = h.load().await; @@ -258,7 +260,9 @@ fn read_wins_pending_eviction() { rt.block_on(async move { // this is the runtime on which Layer spawns the blocking tasks on let handle = tokio::runtime::Handle::current(); - let h = TenantHarness::create("read_wins_pending_eviction").unwrap(); + let h = TenantHarness::create("read_wins_pending_eviction") + .await + .unwrap(); let (tenant, ctx) = h.load().await; let span = h.span(); let download_span = span.in_scope(|| tracing::info_span!("downloading", timeline_id = 1)); @@ -390,7 +394,7 @@ fn multiple_pending_evictions_scenario(name: &'static str, in_order: bool) { rt.block_on(async move { // this is the runtime on which Layer spawns the blocking tasks on let handle = tokio::runtime::Handle::current(); - let h = TenantHarness::create(name).unwrap(); + let h = TenantHarness::create(name).await.unwrap(); let (tenant, ctx) = h.load().await; let span = h.span(); let download_span = span.in_scope(|| tracing::info_span!("downloading", timeline_id = 1)); @@ -559,8 +563,9 @@ fn multiple_pending_evictions_scenario(name: &'static str, in_order: bool) { #[tokio::test(start_paused = true)] async fn cancelled_get_or_maybe_download_does_not_cancel_eviction() { let handle = tokio::runtime::Handle::current(); - let h = - TenantHarness::create("cancelled_get_or_maybe_download_does_not_cancel_eviction").unwrap(); + let h = TenantHarness::create("cancelled_get_or_maybe_download_does_not_cancel_eviction") + .await + .unwrap(); let (tenant, ctx) = h.load().await; let timeline = tenant @@ -636,7 +641,9 @@ async fn cancelled_get_or_maybe_download_does_not_cancel_eviction() { #[tokio::test(start_paused = true)] async fn evict_and_wait_does_not_wait_for_download() { // let handle = tokio::runtime::Handle::current(); - let h = TenantHarness::create("evict_and_wait_does_not_wait_for_download").unwrap(); + let h = TenantHarness::create("evict_and_wait_does_not_wait_for_download") + .await + .unwrap(); let (tenant, ctx) = h.load().await; let span = h.span(); let download_span = span.in_scope(|| tracing::info_span!("downloading", timeline_id = 1)); @@ -733,7 +740,9 @@ async fn eviction_cancellation_on_drop() { // this is the runtime on which Layer spawns the blocking tasks on let handle = tokio::runtime::Handle::current(); - let h = TenantHarness::create("eviction_cancellation_on_drop").unwrap(); + 
let h = TenantHarness::create("eviction_cancellation_on_drop") + .await + .unwrap(); utils::logging::replace_panic_hook_with_tracing_panic_hook().forget(); let (tenant, ctx) = h.load().await; diff --git a/pageserver/src/tenant/storage_layer/merge_iterator.rs b/pageserver/src/tenant/storage_layer/merge_iterator.rs index 68759f7585..eb4a1f28a1 100644 --- a/pageserver/src/tenant/storage_layer/merge_iterator.rs +++ b/pageserver/src/tenant/storage_layer/merge_iterator.rs @@ -96,15 +96,22 @@ impl<'a> std::cmp::PartialOrd for IteratorWrapper<'a> { impl<'a> std::cmp::Ord for IteratorWrapper<'a> { fn cmp(&self, other: &Self) -> std::cmp::Ordering { use std::cmp::Ordering; - let a = self.peek_next_key_lsn(); - let b = other.peek_next_key_lsn(); + let a = self.peek_next_key_lsn_value(); + let b = other.peek_next_key_lsn_value(); match (a, b) { - (Some((k1, l1)), Some((k2, l2))) => { - let loaded_1 = if self.is_loaded() { 1 } else { 0 }; - let loaded_2 = if other.is_loaded() { 1 } else { 0 }; + (Some((k1, l1, v1)), Some((k2, l2, v2))) => { + fn map_value_to_num(val: &Option<&Value>) -> usize { + match val { + None => 0, + Some(Value::Image(_)) => 1, + Some(Value::WalRecord(_)) => 2, + } + } + let order_1 = map_value_to_num(&v1); + let order_2 = map_value_to_num(&v2); // When key_lsn are the same, the unloaded iter will always appear before the loaded one. // And note that we do a reverse at the end of the comparison, so it works with the max heap. - (k1, l1, loaded_1).cmp(&(k2, l2, loaded_2)) + (k1, l1, order_1).cmp(&(k2, l2, order_2)) } (Some(_), None) => Ordering::Less, (None, Some(_)) => Ordering::Greater, @@ -137,13 +144,16 @@ impl<'a> IteratorWrapper<'a> { } } - fn peek_next_key_lsn(&self) -> Option<(&Key, Lsn)> { + fn peek_next_key_lsn_value(&self) -> Option<(&Key, Lsn, Option<&Value>)> { match self { - Self::Loaded { iter } => iter.peek().as_ref().map(|(key, lsn, _)| (key, *lsn)), + Self::Loaded { iter } => iter + .peek() + .as_ref() + .map(|(key, lsn, val)| (key, *lsn, Some(val))), Self::NotLoaded { first_key_lower_bound: (key, lsn), .. - } => Some((key, *lsn)), + } => Some((key, *lsn, None)), } } @@ -191,6 +201,13 @@ impl<'a> IteratorWrapper<'a> { } } +/// A merge iterator over delta/image layer iterators. When duplicated records are +/// found, the iterator does not perform any deduplication; the caller should handle +/// these situations. Duplicated records can arise in several ways: +/// * Two identical deltas at the same LSN. +/// * Two identical images at the same LSN. +/// * A delta and an image at the same LSN, where the image has already applied the delta. +/// The iterator will always put the image before the delta.
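+///
+/// A minimal consumption sketch (illustrative only; `collect_latest` is a hypothetical helper,
+/// not part of this change), showing how a caller might drain the iterator and collapse
+/// duplicates, keeping the first copy at each (key, LSN), which is the image if one exists:
+///
+/// ```ignore
+/// async fn collect_latest(mut iter: MergeIterator<'_>) -> anyhow::Result<Vec<(Key, Lsn, Value)>> {
+///     let mut out: Vec<(Key, Lsn, Value)> = Vec::new();
+///     while let Some((key, lsn, value)) = iter.next().await? {
+///         // Duplicates arrive adjacently; skip anything at a (key, lsn) we have already kept.
+///         if matches!(out.last(), Some((k, l, _)) if *k == key && *l == lsn) {
+///             continue;
+///         }
+///         out.push((key, lsn, value));
+///     }
+///     Ok(out)
+/// }
+/// ```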
pub struct MergeIterator<'a> { heap: BinaryHeap>, } @@ -245,8 +262,9 @@ mod tests { use crate::{ tenant::{ harness::{TenantHarness, TIMELINE_ID}, - storage_layer::delta_layer::test::{produce_delta_layer, sort_delta}, + storage_layer::delta_layer::test::{produce_delta_layer, sort_delta, sort_delta_value}, }, + walrecord::NeonWalRecord, DEFAULT_PG_VERSION, }; @@ -275,7 +293,9 @@ mod tests { use crate::repository::Value; use bytes::Bytes; - let harness = TenantHarness::create("merge_iterator_merge_in_between").unwrap(); + let harness = TenantHarness::create("merge_iterator_merge_in_between") + .await + .unwrap(); let (tenant, ctx) = harness.load().await; let tline = tenant @@ -338,7 +358,9 @@ mod tests { use crate::repository::Value; use bytes::Bytes; - let harness = TenantHarness::create("merge_iterator_delta_merge").unwrap(); + let harness = TenantHarness::create("merge_iterator_delta_merge") + .await + .unwrap(); let (tenant, ctx) = harness.load().await; let tline = tenant @@ -407,6 +429,133 @@ mod tests { // TODO: test layers are loaded only when needed, reducing num of active iterators in k-merge } - // TODO: image layer merge, delta+image mixed merge - // TODO: is it possible to have duplicated delta at same LSN now? we might need to test that + #[tokio::test] + async fn delta_image_mixed_merge() { + use crate::repository::Value; + use bytes::Bytes; + + let harness = TenantHarness::create("merge_iterator_delta_image_mixed_merge") + .await + .unwrap(); + let (tenant, ctx) = harness.load().await; + + let tline = tenant + .create_test_timeline(TIMELINE_ID, Lsn(0x10), DEFAULT_PG_VERSION, &ctx) + .await + .unwrap(); + + fn get_key(id: u32) -> Key { + let mut key = Key::from_hex("000000000033333333444444445500000000").unwrap(); + key.field6 = id; + key + } + // In this test case, we want to test if the iterator still works correctly with multiple copies + // of a delta+image at the same LSN, for example, the following sequence a@10=+a, a@10=+a, a@10=ab, a@10=ab. + // Duplicated deltas/images are possible for old tenants before the full L0 compaction file name fix. + // An incomplete compaction could produce multiple exactly-the-same delta layers. Force image generation + // could produce overlapping images. Apart from duplicated deltas/images, in the current storage implementation + // one key-lsn could have a delta in the delta layer and one image in the image layer. The iterator should + // correctly process these situations and return everything as-is, and the upper layer of the system + // will handle duplicated LSNs. 
+ let test_deltas1 = vec![ + ( + get_key(0), + Lsn(0x10), + Value::WalRecord(NeonWalRecord::wal_init()), + ), + ( + get_key(0), + Lsn(0x18), + Value::WalRecord(NeonWalRecord::wal_append("a")), + ), + ( + get_key(5), + Lsn(0x10), + Value::WalRecord(NeonWalRecord::wal_init()), + ), + ( + get_key(5), + Lsn(0x18), + Value::WalRecord(NeonWalRecord::wal_append("b")), + ), + ]; + let resident_layer_1 = produce_delta_layer(&tenant, &tline, test_deltas1.clone(), &ctx) + .await + .unwrap(); + let mut test_deltas2 = test_deltas1.clone(); + test_deltas2.push(( + get_key(10), + Lsn(0x20), + Value::Image(Bytes::copy_from_slice(b"test")), + )); + let resident_layer_2 = produce_delta_layer(&tenant, &tline, test_deltas2.clone(), &ctx) + .await + .unwrap(); + let test_deltas3 = vec![ + ( + get_key(0), + Lsn(0x10), + Value::Image(Bytes::copy_from_slice(b"")), + ), + ( + get_key(5), + Lsn(0x18), + Value::Image(Bytes::copy_from_slice(b"b")), + ), + ( + get_key(15), + Lsn(0x20), + Value::Image(Bytes::copy_from_slice(b"test")), + ), + ]; + let resident_layer_3 = produce_delta_layer(&tenant, &tline, test_deltas3.clone(), &ctx) + .await + .unwrap(); + let mut test_deltas4 = test_deltas3.clone(); + test_deltas4.push(( + get_key(20), + Lsn(0x20), + Value::Image(Bytes::copy_from_slice(b"test")), + )); + let resident_layer_4 = produce_delta_layer(&tenant, &tline, test_deltas4.clone(), &ctx) + .await + .unwrap(); + let mut expect = Vec::new(); + expect.extend(test_deltas1); + expect.extend(test_deltas2); + expect.extend(test_deltas3); + expect.extend(test_deltas4); + expect.sort_by(sort_delta_value); + + // Test with different layer order for MergeIterator::create to ensure the order + // is stable. + + let mut merge_iter = MergeIterator::create( + &[ + resident_layer_4.get_as_delta(&ctx).await.unwrap(), + resident_layer_1.get_as_delta(&ctx).await.unwrap(), + resident_layer_3.get_as_delta(&ctx).await.unwrap(), + resident_layer_2.get_as_delta(&ctx).await.unwrap(), + ], + &[], + &ctx, + ); + assert_merge_iter_equal(&mut merge_iter, &expect).await; + + let mut merge_iter = MergeIterator::create( + &[ + resident_layer_1.get_as_delta(&ctx).await.unwrap(), + resident_layer_4.get_as_delta(&ctx).await.unwrap(), + resident_layer_3.get_as_delta(&ctx).await.unwrap(), + resident_layer_2.get_as_delta(&ctx).await.unwrap(), + ], + &[], + &ctx, + ); + assert_merge_iter_equal(&mut merge_iter, &expect).await; + + is_send(merge_iter); + } + + fn is_send(_: impl Send) {} } diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index a3ddb3a1d1..19b1396981 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -69,6 +69,7 @@ use std::{ use crate::{ aux_file::AuxFileSizeEstimator, tenant::{ + config::defaults::DEFAULT_PITR_INTERVAL, layer_map::{LayerMap, SearchResult}, metadata::TimelineMetadata, storage_layer::PersistentLayerDesc, @@ -197,7 +198,7 @@ impl PartialOrd for Hole { /// Temporary function for immutable storage state refactor, ensures we are dropping mutex guard instead of other things. /// Can be removed after all refactors are done. -fn drop_rlock(rlock: tokio::sync::OwnedRwLockReadGuard) { +fn drop_rlock(rlock: tokio::sync::RwLockReadGuard) { drop(rlock) } @@ -270,7 +271,7 @@ pub struct Timeline { /// /// In the future, we'll be able to split up the tuple of LayerMap and `LayerFileManager`, /// so that e.g. on-demand-download/eviction, and layer spreading, can operate just on `LayerFileManager`. 
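+ /// Readers now take this lock in place, e.g. `let guard = self.layers.read().await;`. With the
+ /// `Arc` wrapper removed (below), the owned-guard `read_owned()` pattern previously used by
+ /// compaction is no longer needed.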
- pub(crate) layers: Arc>, + pub(crate) layers: tokio::sync::RwLock, last_freeze_at: AtomicLsn, // Atomic would be more appropriate here. @@ -477,37 +478,32 @@ impl GcInfo { } } -/// The `GcInfo` component describing which Lsns need to be retained. +/// The `GcInfo` component describing which Lsns need to be retained. Functionally, this +/// is a single number (the oldest LSN which we must retain), but it internally distinguishes +/// between time-based and space-based retention for observability and consumption metrics purposes. #[derive(Debug)] pub(crate) struct GcCutoffs { - /// Keep everything newer than this point. - /// - /// This is calculated by subtracting 'gc_horizon' setting from - /// last-record LSN - /// - /// FIXME: is this inclusive or exclusive? - pub(crate) horizon: Lsn, + /// Calculated from the [`TenantConf::gc_horizon`], this LSN indicates how much + /// history we must keep to retain a specified number of bytes of WAL. + pub(crate) space: Lsn, - /// In addition to 'retain_lsns' and 'horizon_cutoff', keep everything newer than this - /// point. - /// - /// This is calculated by finding a number such that a record is needed for PITR - /// if only if its LSN is larger than 'pitr_cutoff'. - pub(crate) pitr: Lsn, + /// Calculated from [`TenantConf::pitr_interval`], this LSN indicates how much + /// history we must keep to enable reading back at least the PITR interval duration. + pub(crate) time: Lsn, } impl Default for GcCutoffs { fn default() -> Self { Self { - horizon: Lsn::INVALID, - pitr: Lsn::INVALID, + space: Lsn::INVALID, + time: Lsn::INVALID, } } } impl GcCutoffs { fn select_min(&self) -> Lsn { - std::cmp::min(self.horizon, self.pitr) + std::cmp::min(self.space, self.time) } } @@ -866,7 +862,7 @@ impl Timeline { let gc_info = self.gc_info.read().unwrap(); let history = self .get_last_record_lsn() - .checked_sub(gc_info.cutoffs.pitr) + .checked_sub(gc_info.cutoffs.time) .unwrap_or(Lsn(0)) .0; (history, gc_info.within_ancestor_pitr) @@ -1565,7 +1561,7 @@ impl Timeline { ) -> anyhow::Result<()> { ensure!( lsn >= **latest_gc_cutoff_lsn, - "LSN {} is earlier than latest GC horizon {} (we might've already garbage collected needed data)", + "LSN {} is earlier than latest GC cutoff {} (we might've already garbage collected needed data)", lsn, **latest_gc_cutoff_lsn, ); @@ -3408,6 +3404,7 @@ impl Timeline { } } + #[allow(unknown_lints)] // doc_lazy_continuation is still a new lint #[allow(clippy::doc_lazy_continuation)] /// Get the data needed to reconstruct all keys in the provided keyspace /// @@ -4732,13 +4729,7 @@ impl Timeline { tenant: &crate::tenant::Tenant, options: detach_ancestor::Options, ctx: &RequestContext, - ) -> Result< - ( - completion::Completion, - detach_ancestor::PreparedTimelineDetach, - ), - detach_ancestor::Error, - > { + ) -> Result { detach_ancestor::prepare(self, tenant, options, ctx).await } @@ -4945,24 +4936,21 @@ impl Timeline { } /// Find the Lsns above which layer files need to be retained on - /// garbage collection. This is separate from actually performing the GC, - /// and is updated more frequently, so that compaction can remove obsolete - /// page versions more aggressively. + /// garbage collection. /// - /// TODO: that's wishful thinking, compaction doesn't actually do that - /// currently. + /// We calculate two cutoffs, one based on time and one based on WAL size. `pitr` + /// controls the time cutoff (or ZERO to disable time-based retention), and `space_cutoff` controls + /// the space-based retention. 
/// - /// The 'cutoff_horizon' point is used to retain recent versions that might still be - /// needed by read-only nodes. (As of this writing, the caller just passes - /// the latest LSN subtracted by a constant, and doesn't do anything smart - /// to figure out what read-only nodes might actually need.) - /// - /// The 'pitr' duration is used to calculate a 'pitr_cutoff', which can be used to determine - /// whether a record is needed for PITR. + /// This function doesn't simply calculate time- and space-based retention: it treats time-based + /// retention as authoritative if enabled, and falls back to space-based retention if calculating + /// the LSN for a time point isn't possible. Therefore the `GcCutoffs::space` in the response might + /// differ from the `space_cutoff` input. Callers should treat the min() of the two cutoffs + /// in the response as the GC cutoff point for the timeline. #[instrument(skip_all, fields(timeline_id=%self.timeline_id))] pub(super) async fn find_gc_cutoffs( &self, - cutoff_horizon: Lsn, + space_cutoff: Lsn, pitr: Duration, cancel: &CancellationToken, ctx: &RequestContext, @@ -4975,58 +4963,87 @@ impl Timeline { pausable_failpoint!("Timeline::find_gc_cutoffs-pausable"); - // First, calculate pitr_cutoff_timestamp and then convert it to LSN. - // - // Some unit tests depend on garbage-collection working even when - // CLOG data is missing, so that find_lsn_for_timestamp() doesn't - // work, so avoid calling it altogether if time-based retention is not - // configured. It would be pointless anyway. - let pitr_cutoff = if pitr != Duration::ZERO { - let now = SystemTime::now(); - if let Some(pitr_cutoff_timestamp) = now.checked_sub(pitr) { - let pitr_timestamp = to_pg_timestamp(pitr_cutoff_timestamp); - - match self - .find_lsn_for_timestamp(pitr_timestamp, cancel, ctx) - .await? - { - LsnForTimestamp::Present(lsn) => lsn, - LsnForTimestamp::Future(lsn) => { - // The timestamp is in the future. That sounds impossible, - // but what it really means is that there hasn't been - // any commits since the cutoff timestamp. - // - // In this case we should use the LSN of the most recent commit, - // which is implicitly the last LSN in the log. - debug!("future({})", lsn); - self.get_last_record_lsn() - } - LsnForTimestamp::Past(lsn) => { - debug!("past({})", lsn); - // conservative, safe default is to remove nothing, when we - // have no commit timestamp data available - *self.get_latest_gc_cutoff_lsn() - } - LsnForTimestamp::NoData(lsn) => { - debug!("nodata({})", lsn); - // conservative, safe default is to remove nothing, when we - // have no commit timestamp data available - *self.get_latest_gc_cutoff_lsn() - } - } - } else { - // If we don't have enough data to convert to LSN, - // play safe and don't remove any layers. - *self.get_latest_gc_cutoff_lsn() + if cfg!(test) { + // Unit tests which specify zero PITR interval expect to avoid doing any I/O for timestamp lookup + if pitr == Duration::ZERO { + return Ok(GcCutoffs { + time: self.get_last_record_lsn(), + space: space_cutoff, + }); + } + } + + // Calculate a time-based limit on how much to retain: + // - if PITR interval is set, then this is our cutoff. + // - if PITR interval is not set, then we do a lookup + // based on DEFAULT_PITR_INTERVAL, so that size-based retention does not result in keeping history around permanently on idle databases.
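+
+ // At a glance (a summary of the match at the end of this function): the `pitr` setting and the
+ // outcome of the timestamp-to-LSN lookup below combine into the returned GcCutoffs as follows:
+ //
+ //   pitr         | lookup result | GcCutoffs.time        | GcCutoffs.space
+ //   -------------+---------------+-----------------------+--------------------------------
+ //   ZERO (unset) | Some(lsn)     | last_record_lsn       | max(time_cutoff, space_cutoff)
+ //   ZERO (unset) | None          | last_record_lsn       | space_cutoff
+ //   configured   | None          | latest_gc_cutoff_lsn  | space_cutoff
+ //   configured   | Some(lsn)     | time_cutoff           | time_cutoff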
+ let time_cutoff = { + let now = SystemTime::now(); + let time_range = if pitr == Duration::ZERO { + humantime::parse_duration(DEFAULT_PITR_INTERVAL).expect("constant is invalid") + } else { + pitr + }; + + // If PITR is so large or `now` is so small that this underflows, we will retain no history (highly unexpected case) + let time_cutoff = now.checked_sub(time_range).unwrap_or(now); + let timestamp = to_pg_timestamp(time_cutoff); + + match self.find_lsn_for_timestamp(timestamp, cancel, ctx).await? { + LsnForTimestamp::Present(lsn) => Some(lsn), + LsnForTimestamp::Future(lsn) => { + // The timestamp is in the future. That sounds impossible, + // but what it really means is that there hasn't been + // any commits since the cutoff timestamp. + // + // In this case we should use the LSN of the most recent commit, + // which is implicitly the last LSN in the log. + debug!("future({})", lsn); + Some(self.get_last_record_lsn()) + } + LsnForTimestamp::Past(lsn) => { + debug!("past({})", lsn); + None + } + LsnForTimestamp::NoData(lsn) => { + debug!("nodata({})", lsn); + None + } } - } else { - // No time-based retention was configured. Interpret this as "keep no history". - self.get_last_record_lsn() }; - Ok(GcCutoffs { - horizon: cutoff_horizon, - pitr: pitr_cutoff, + Ok(match (pitr, time_cutoff) { + (Duration::ZERO, Some(time_cutoff)) => { + // PITR is not set. Retain the size-based limit, or the default time retention, + // whichever requires less data. + GcCutoffs { + time: self.get_last_record_lsn(), + space: std::cmp::max(time_cutoff, space_cutoff), + } + } + (Duration::ZERO, None) => { + // PITR is not set, and time lookup failed + GcCutoffs { + time: self.get_last_record_lsn(), + space: space_cutoff, + } + } + (_, None) => { + // PITR interval is set & we didn't look up a timestamp successfully. Conservatively assume PITR + // cannot advance beyond what was already GC'd, and respect space-based retention + GcCutoffs { + time: *self.get_latest_gc_cutoff_lsn(), + space: space_cutoff, + } + } + (_, Some(time_cutoff)) => { + // PITR interval is set and we looked up timestamp successfully. Ignore + // size based retention and make time cutoff authoritative + GcCutoffs { + time: time_cutoff, + space: time_cutoff, + } + } }) } @@ -5051,11 +5068,11 @@ impl Timeline { return Err(GcError::TimelineCancelled); } - let (horizon_cutoff, pitr_cutoff, retain_lsns, max_lsn_with_valid_lease) = { + let (space_cutoff, time_cutoff, retain_lsns, max_lsn_with_valid_lease) = { let gc_info = self.gc_info.read().unwrap(); - let horizon_cutoff = min(gc_info.cutoffs.horizon, self.get_disk_consistent_lsn()); - let pitr_cutoff = gc_info.cutoffs.pitr; + let space_cutoff = min(gc_info.cutoffs.space, self.get_disk_consistent_lsn()); + let time_cutoff = gc_info.cutoffs.time; let retain_lsns = gc_info.retain_lsns.clone(); // Gets the maximum LSN that holds the valid lease. @@ -5065,14 +5082,14 @@ impl Timeline { let max_lsn_with_valid_lease = gc_info.leases.last_key_value().map(|(lsn, _)| *lsn); ( - horizon_cutoff, - pitr_cutoff, + space_cutoff, + time_cutoff, retain_lsns, max_lsn_with_valid_lease, ) }; - let mut new_gc_cutoff = Lsn::min(horizon_cutoff, pitr_cutoff); + let mut new_gc_cutoff = Lsn::min(space_cutoff, time_cutoff); let standby_horizon = self.standby_horizon.load(); // Hold GC for the standby, but as a safety guard do it only within some // reasonable lag. 
@@ -5101,8 +5118,8 @@ impl Timeline { let res = self .gc_timeline( - horizon_cutoff, - pitr_cutoff, + space_cutoff, + time_cutoff, retain_lsns, max_lsn_with_valid_lease, new_gc_cutoff, @@ -5120,8 +5137,8 @@ impl Timeline { async fn gc_timeline( &self, - horizon_cutoff: Lsn, - pitr_cutoff: Lsn, + space_cutoff: Lsn, + time_cutoff: Lsn, retain_lsns: Vec, max_lsn_with_valid_lease: Option, new_gc_cutoff: Lsn, @@ -5182,22 +5199,22 @@ impl Timeline { result.layers_total += 1; // 1. Is it newer than GC horizon cutoff point? - if l.get_lsn_range().end > horizon_cutoff { + if l.get_lsn_range().end > space_cutoff { debug!( - "keeping {} because it's newer than horizon_cutoff {}", + "keeping {} because it's newer than space_cutoff {}", l.layer_name(), - horizon_cutoff, + space_cutoff, ); result.layers_needed_by_cutoff += 1; continue 'outer; } // 2. It is newer than PiTR cutoff point? - if l.get_lsn_range().end > pitr_cutoff { + if l.get_lsn_range().end > time_cutoff { debug!( - "keeping {} because it's newer than pitr_cutoff {}", + "keeping {} because it's newer than time_cutoff {}", l.layer_name(), - pitr_cutoff, + time_cutoff, ); result.layers_needed_by_pitr += 1; continue 'outer; @@ -6029,8 +6046,9 @@ mod tests { #[tokio::test] async fn two_layer_eviction_attempts_at_the_same_time() { - let harness = - TenantHarness::create("two_layer_eviction_attempts_at_the_same_time").unwrap(); + let harness = TenantHarness::create("two_layer_eviction_attempts_at_the_same_time") + .await + .unwrap(); let (tenant, ctx) = harness.load().await; let timeline = tenant diff --git a/pageserver/src/tenant/timeline/compaction.rs b/pageserver/src/tenant/timeline/compaction.rs index efaa6144af..a648432b4d 100644 --- a/pageserver/src/tenant/timeline/compaction.rs +++ b/pageserver/src/tenant/timeline/compaction.rs @@ -26,9 +26,11 @@ use utils::id::TimelineId; use crate::context::{AccessStatsBehavior, RequestContext, RequestContextBuilder}; use crate::page_cache; +use crate::tenant::config::defaults::{DEFAULT_CHECKPOINT_DISTANCE, DEFAULT_COMPACTION_THRESHOLD}; +use crate::tenant::storage_layer::merge_iterator::MergeIterator; use crate::tenant::storage_layer::{AsLayerDesc, PersistentLayerDesc}; -use crate::tenant::timeline::{drop_rlock, Hole, ImageLayerCreationOutcome}; -use crate::tenant::timeline::{DeltaLayerWriter, ImageLayerWriter}; +use crate::tenant::timeline::{drop_rlock, DeltaLayerWriter, ImageLayerWriter}; +use crate::tenant::timeline::{Hole, ImageLayerCreationOutcome}; use crate::tenant::timeline::{Layer, ResidentLayer}; use crate::tenant::DeltaLayer; use crate::virtual_file::{MaybeFatalIo, VirtualFile}; @@ -195,7 +197,7 @@ impl Timeline { tracing::info!( "latest_gc_cutoff: {}, pitr cutoff {}", *latest_gc_cutoff, - self.gc_info.read().unwrap().cutoffs.pitr + self.gc_info.read().unwrap().cutoffs.time ); let layers = self.layers.read().await; @@ -379,7 +381,7 @@ impl Timeline { }; let begin = tokio::time::Instant::now(); - let phase1_layers_locked = Arc::clone(&self.layers).read_owned().await; + let phase1_layers_locked = self.layers.read().await; let now = tokio::time::Instant::now(); stats.read_lock_acquisition_micros = DurationRecorder::Recorded(RecordedDuration(now - begin), now); @@ -399,9 +401,9 @@ impl Timeline { } /// Level0 files first phase of compaction, explained in the [`Self::compact_legacy`] comment. 
- async fn compact_level0_phase1( - self: &Arc, - guard: tokio::sync::OwnedRwLockReadGuard, + async fn compact_level0_phase1<'a>( + self: &'a Arc, + guard: tokio::sync::RwLockReadGuard<'a, LayerManager>, mut stats: CompactLevel0Phase1StatsBuilder, target_file_size: u64, ctx: &RequestContext, @@ -415,6 +417,7 @@ impl Timeline { .map(|x| guard.get_from_desc(&x)) .collect_vec(); stats.level0_deltas_count = Some(level0_deltas.len()); + // Only compact if enough layers have accumulated. let threshold = self.get_compaction_threshold(); if level0_deltas.is_empty() || level0_deltas.len() < threshold { @@ -445,6 +448,22 @@ impl Timeline { let mut prev_lsn_end = first_level0_delta.layer_desc().lsn_range.end; let mut deltas_to_compact = Vec::with_capacity(level0_deltas.len()); + // Accumulate the size of layers in `deltas_to_compact` + let mut deltas_to_compact_bytes = 0; + + // Under normal circumstances, we will accumulate up to compaction_interval L0s of size + // checkpoint_distance each. To avoid edge cases using extra system resources, bound our + // work in this function to only operate on this much delta data at once. + // + // Take the max of the configured value & the default, so that tests that configure tiny values + // can still use a sensible amount of memory, but if a deployed system configures bigger values we + // still let them compact a full stack of L0s in one go. + let delta_size_limit = std::cmp::max( + self.get_compaction_threshold(), + DEFAULT_COMPACTION_THRESHOLD, + ) as u64 + * std::cmp::max(self.get_checkpoint_distance(), DEFAULT_CHECKPOINT_DISTANCE); + deltas_to_compact.push(first_level0_delta.download_and_keep_resident().await?); for l in level0_deltas_iter { let lsn_range = &l.layer_desc().lsn_range; @@ -453,7 +472,20 @@ impl Timeline { break; } deltas_to_compact.push(l.download_and_keep_resident().await?); + deltas_to_compact_bytes += l.metadata().file_size; prev_lsn_end = lsn_range.end; + + if deltas_to_compact_bytes >= delta_size_limit { + info!( + l0_deltas_selected = deltas_to_compact.len(), + l0_deltas_total = level0_deltas.len(), + "L0 compaction picker hit max delta layer size limit: {}", + delta_size_limit + ); + + // Proceed with compaction, but only a subset of L0s + break; + } } let lsn_range = Range { start: deltas_to_compact @@ -990,7 +1022,7 @@ impl Timeline { "enhanced legacy compaction currently does not support retain_lsns (branches)" ))); } - let gc_cutoff = Lsn::min(gc_info.cutoffs.horizon, gc_info.cutoffs.pitr); + let gc_cutoff = gc_info.cutoffs.select_min(); let mut selected_layers = Vec::new(); // TODO: consider retain_lsns drop(gc_info); @@ -1008,10 +1040,12 @@ impl Timeline { ); // Step 1: (In the future) construct a k-merge iterator over all layers. For now, simply collect all keys + LSNs. // Also, collect the layer information to decide when to split the new delta layers. - let mut all_key_values = Vec::new(); + let mut downloaded_layers = Vec::new(); let mut delta_split_points = BTreeSet::new(); for layer in &layer_selection { - all_key_values.extend(layer.load_key_values(ctx).await?); + let resident_layer = layer.download_and_keep_resident().await?; + downloaded_layers.push(resident_layer); + let desc = layer.layer_desc(); if desc.is_delta() { // TODO: is it correct to only record split points for deltas intersecting with the GC horizon? 
(exclude those below/above the horizon) @@ -1021,44 +1055,28 @@ impl Timeline { delta_split_points.insert(key_range.end); } } - // Key small to large, LSN low to high, if the same LSN has both image and delta due to the merge of delta layers and - // image layers, make image appear before than delta. - struct ValueWrapper<'a>(&'a crate::repository::Value); - impl Ord for ValueWrapper<'_> { - fn cmp(&self, other: &Self) -> std::cmp::Ordering { - use crate::repository::Value; - use std::cmp::Ordering; - match (self.0, other.0) { - (Value::Image(_), Value::WalRecord(_)) => Ordering::Less, - (Value::WalRecord(_), Value::Image(_)) => Ordering::Greater, - _ => Ordering::Equal, - } + let mut delta_layers = Vec::new(); + let mut image_layers = Vec::new(); + for resident_layer in &downloaded_layers { + if resident_layer.layer_desc().is_delta() { + let layer = resident_layer.get_as_delta(ctx).await?; + delta_layers.push(layer); + } else { + let layer = resident_layer.get_as_image(ctx).await?; + image_layers.push(layer); } } - impl PartialOrd for ValueWrapper<'_> { - fn partial_cmp(&self, other: &Self) -> Option { - Some(self.cmp(other)) - } - } - impl PartialEq for ValueWrapper<'_> { - fn eq(&self, other: &Self) -> bool { - self.cmp(other) == std::cmp::Ordering::Equal - } - } - impl Eq for ValueWrapper<'_> {} - all_key_values.sort_by(|(k1, l1, v1), (k2, l2, v2)| { - (k1, l1, ValueWrapper(v1)).cmp(&(k2, l2, ValueWrapper(v2))) - }); + let mut merge_iter = MergeIterator::create(&delta_layers, &image_layers, ctx); // Step 2: Produce images+deltas. TODO: ensure newly-produced delta does not overlap with other deltas. // Data of the same key. let mut accumulated_values = Vec::new(); - let mut last_key = all_key_values.first().unwrap().0; // TODO: assert all_key_values not empty + let mut last_key: Option = None; /// Take a list of images and deltas, produce an image at the GC horizon, and a list of deltas above the GC horizon. async fn flush_accumulated_states( tline: &Arc, key: Key, - accumulated_values: &[&(Key, Lsn, crate::repository::Value)], + accumulated_values: &[(Key, Lsn, crate::repository::Value)], horizon: Lsn, ) -> anyhow::Result<(Vec<(Key, Lsn, crate::repository::Value)>, bytes::Bytes)> { let mut base_image = None; @@ -1159,7 +1177,7 @@ impl Timeline { self.conf, self.timeline_id, self.tenant_shard_id, - &(all_key_values.first().unwrap().0..all_key_values.last().unwrap().0.next()), + &(Key::MIN..Key::MAX), // covers the full key range gc_cutoff, ctx, ) @@ -1169,20 +1187,24 @@ impl Timeline { let delta_split_points = delta_split_points.into_iter().collect_vec(); let mut current_delta_split_point = 0; let mut delta_layers = Vec::new(); - for item @ (key, _, _) in &all_key_values { - if &last_key == key { - accumulated_values.push(item); + while let Some((key, lsn, val)) = merge_iter.next().await? { + if last_key.is_none() || last_key.as_ref() == Some(&key) { + if last_key.is_none() { + last_key = Some(key); + } + accumulated_values.push((key, lsn, val)); } else { + let last_key = last_key.as_mut().unwrap(); let (deltas, image) = - flush_accumulated_states(self, last_key, &accumulated_values, gc_cutoff) + flush_accumulated_states(self, *last_key, &accumulated_values, gc_cutoff) .await?; // Put the image into the image layer. Currently we have a single big layer for the compaction. 
- image_layer_writer.put_image(last_key, image, ctx).await?; + image_layer_writer.put_image(*last_key, image, ctx).await?; delta_values.extend(deltas); delta_layers.extend( flush_deltas( &mut delta_values, - last_key, + *last_key, &delta_split_points, &mut current_delta_split_point, self, @@ -1192,11 +1214,12 @@ impl Timeline { .await?, ); accumulated_values.clear(); - accumulated_values.push(item); - last_key = *key; + *last_key = key; + accumulated_values.push((key, lsn, val)); } } + let last_key = last_key.expect("no keys produced during compaction"); // TODO: move this part to the loop body let (deltas, image) = flush_accumulated_states(self, last_key, &accumulated_values, gc_cutoff).await?; diff --git a/pageserver/src/tenant/timeline/detach_ancestor.rs b/pageserver/src/tenant/timeline/detach_ancestor.rs index 4fc89330ba..49ce3db3e6 100644 --- a/pageserver/src/tenant/timeline/detach_ancestor.rs +++ b/pageserver/src/tenant/timeline/detach_ancestor.rs @@ -10,6 +10,7 @@ use crate::{ }, virtual_file::{MaybeFatalIo, VirtualFile}, }; +use pageserver_api::models::detach_ancestor::AncestorDetached; use tokio_util::sync::CancellationToken; use tracing::Instrument; use utils::{completion, generation::Generation, http::error::ApiError, id::TimelineId, lsn::Lsn}; @@ -39,6 +40,9 @@ pub(crate) enum Error { #[error("unexpected error")] Unexpected(#[source] anyhow::Error), + + #[error("failpoint: {}", .0)] + Failpoint(&'static str), } impl From for ApiError { @@ -57,11 +61,41 @@ impl From for ApiError { | e @ Error::CopyDeltaPrefix(_) | e @ Error::UploadRewritten(_) | e @ Error::CopyFailed(_) - | e @ Error::Unexpected(_) => ApiError::InternalServerError(e.into()), + | e @ Error::Unexpected(_) + | e @ Error::Failpoint(_) => ApiError::InternalServerError(e.into()), } } } +impl From for Error { + fn from(_: crate::tenant::upload_queue::NotInitialized) -> Self { + // treat all as shutting down signals, even though that is not entirely correct + // (uninitialized state) + Error::ShuttingDown + } +} + +impl From for Error { + fn from(value: FlushLayerError) -> Self { + match value { + FlushLayerError::Cancelled => Error::ShuttingDown, + FlushLayerError::NotRunning(_) => { + // FIXME(#6424): technically statically unreachable right now, given how we never + // drop the sender + Error::ShuttingDown + } + FlushLayerError::CreateImageLayersError(_) | FlushLayerError::Other(_) => { + Error::FlushAncestor(value) + } + } + } +} + +pub(crate) enum Progress { + Prepared(completion::Completion, PreparedTimelineDetach), + Done(AncestorDetached), +} + pub(crate) struct PreparedTimelineDetach { layers: Vec, } @@ -88,7 +122,7 @@ pub(super) async fn prepare( tenant: &Tenant, options: Options, ctx: &RequestContext, -) -> Result<(completion::Completion, PreparedTimelineDetach), Error> { +) -> Result { use Error::*; let Some((ancestor, ancestor_lsn)) = detached @@ -96,15 +130,67 @@ pub(super) async fn prepare( .as_ref() .map(|tl| (tl.clone(), detached.ancestor_lsn)) else { - // TODO: check if we have already been detached; for this we need to read the stored data - // on remote client, for that we need a follow-up which makes uploads cheaper and maintains - // a projection of the commited data. + { + let accessor = detached.remote_client.initialized_upload_queue()?; + + // we are safe to inspect the latest uploaded, because we can only witness this after + // restart is complete and ancestor is no more. 
+ let latest = accessor.latest_uploaded_index_part(); + if !latest.lineage.is_detached_from_original_ancestor() { + return Err(NoAncestor); + } + } + + // detached has previously been detached; let's inspect each of the current timelines and + // report back the timelines which have been reparented by our detach + let mut all_direct_children = tenant + .timelines + .lock() + .unwrap() + .values() + .filter(|tl| matches!(tl.ancestor_timeline.as_ref(), Some(ancestor) if Arc::ptr_eq(ancestor, detached))) + .map(|tl| (tl.ancestor_lsn, tl.clone())) + .collect::>(); + + let mut any_shutdown = false; + + all_direct_children.retain( + |(_, tl)| match tl.remote_client.initialized_upload_queue() { + Ok(accessor) => accessor + .latest_uploaded_index_part() + .lineage + .is_reparented(), + Err(_shutdownalike) => { + // not 100% a shutdown, but let's bail early not to give inconsistent results in + // sharded enviroment. + any_shutdown = true; + true + } + }, + ); + + if any_shutdown { + // it could be one or many being deleted; have client retry + return Err(Error::ShuttingDown); + } + + let mut reparented = all_direct_children; + // why this instead of hashset? there is a reason, but I've forgotten it many times. // - // the error is wrong per openapi - return Err(NoAncestor); + // maybe if this was a hashset we would not be able to distinguish some race condition. + reparented.sort_unstable_by_key(|(lsn, tl)| (*lsn, tl.timeline_id)); + + return Ok(Progress::Done(AncestorDetached { + reparented_timelines: reparented + .into_iter() + .map(|(_, tl)| tl.timeline_id) + .collect(), + })); }; if !ancestor_lsn.is_valid() { + // rare case, probably wouldn't even load + tracing::error!("ancestor is set, but ancestor_lsn is invalid, this timeline needs fixing"); return Err(NoAncestor); } @@ -131,6 +217,15 @@ pub(super) async fn prepare( let _gate_entered = detached.gate.enter().map_err(|_| ShuttingDown)?; + utils::pausable_failpoint!("timeline-detach-ancestor::before_starting_after_locking_pausable"); + + fail::fail_point!( + "timeline-detach-ancestor::before_starting_after_locking", + |_| Err(Error::Failpoint( + "timeline-detach-ancestor::before_starting_after_locking" + )) + ); + if ancestor_lsn >= ancestor.get_disk_consistent_lsn() { let span = tracing::info_span!("freeze_and_flush", ancestor_timeline_id=%ancestor.timeline_id); @@ -151,7 +246,7 @@ pub(super) async fn prepare( } }; - res.map_err(FlushAncestor)?; + res?; // we do not need to wait for uploads to complete but we do need `struct Layer`, // copying delta prefix is unsupported currently for `InMemoryLayer`. 
@@ -159,7 +254,7 @@ pub(super) async fn prepare( elapsed_ms = started_at.elapsed().as_millis(), "froze and flushed the ancestor" ); - Ok(()) + Ok::<_, Error>(()) } .instrument(span) .await?; @@ -283,7 +378,7 @@ pub(super) async fn prepare( let prepared = PreparedTimelineDetach { layers: new_layers }; - Ok((guard, prepared)) + Ok(Progress::Prepared(guard, prepared)) } fn partition_work( @@ -350,7 +445,11 @@ async fn copy_lsn_prefix( target_timeline: &Arc, ctx: &RequestContext, ) -> Result, Error> { - use Error::{CopyDeltaPrefix, RewrittenDeltaDownloadFailed}; + use Error::{CopyDeltaPrefix, RewrittenDeltaDownloadFailed, ShuttingDown}; + + if target_timeline.cancel.is_cancelled() { + return Err(ShuttingDown); + } tracing::debug!(%layer, %end_lsn, "copying lsn prefix"); @@ -529,7 +628,7 @@ pub(super) async fn complete( match res { Ok(Some(timeline)) => { tracing::info!(reparented=%timeline.timeline_id, "reparenting done"); - reparented.push(timeline.timeline_id); + reparented.push((timeline.ancestor_lsn, timeline.timeline_id)); } Ok(None) => { // lets just ignore this for now. one or all reparented timelines could had @@ -551,5 +650,12 @@ pub(super) async fn complete( tracing::info!("failed to reparent some candidates"); } + reparented.sort_unstable(); + + let reparented = reparented + .into_iter() + .map(|(_, timeline_id)| timeline_id) + .collect(); + Ok(reparented) } diff --git a/pageserver/src/tenant/timeline/walreceiver/connection_manager.rs b/pageserver/src/tenant/timeline/walreceiver/connection_manager.rs index 1d2ffec08f..de50f217d8 100644 --- a/pageserver/src/tenant/timeline/walreceiver/connection_manager.rs +++ b/pageserver/src/tenant/timeline/walreceiver/connection_manager.rs @@ -1118,7 +1118,7 @@ mod tests { #[tokio::test] async fn no_connection_no_candidate() -> anyhow::Result<()> { - let harness = TenantHarness::create("no_connection_no_candidate")?; + let harness = TenantHarness::create("no_connection_no_candidate").await?; let mut state = dummy_state(&harness).await; let now = Utc::now().naive_utc(); @@ -1151,7 +1151,7 @@ mod tests { #[tokio::test] async fn connection_no_candidate() -> anyhow::Result<()> { - let harness = TenantHarness::create("connection_no_candidate")?; + let harness = TenantHarness::create("connection_no_candidate").await?; let mut state = dummy_state(&harness).await; let now = Utc::now().naive_utc(); @@ -1216,7 +1216,7 @@ mod tests { #[tokio::test] async fn no_connection_candidate() -> anyhow::Result<()> { - let harness = TenantHarness::create("no_connection_candidate")?; + let harness = TenantHarness::create("no_connection_candidate").await?; let mut state = dummy_state(&harness).await; let now = Utc::now().naive_utc(); @@ -1279,7 +1279,7 @@ mod tests { #[tokio::test] async fn candidate_with_many_connection_failures() -> anyhow::Result<()> { - let harness = TenantHarness::create("candidate_with_many_connection_failures")?; + let harness = TenantHarness::create("candidate_with_many_connection_failures").await?; let mut state = dummy_state(&harness).await; let now = Utc::now().naive_utc(); @@ -1319,7 +1319,7 @@ mod tests { #[tokio::test] async fn lsn_wal_over_threshold_current_candidate() -> anyhow::Result<()> { - let harness = TenantHarness::create("lsn_wal_over_threshcurrent_candidate")?; + let harness = TenantHarness::create("lsn_wal_over_threshcurrent_candidate").await?; let mut state = dummy_state(&harness).await; let current_lsn = Lsn(100_000).align(); let now = Utc::now().naive_utc(); @@ -1385,7 +1385,8 @@ mod tests { #[tokio::test] async fn 
timeout_connection_threshold_current_candidate() -> anyhow::Result<()> { - let harness = TenantHarness::create("timeout_connection_threshold_current_candidate")?; + let harness = + TenantHarness::create("timeout_connection_threshold_current_candidate").await?; let mut state = dummy_state(&harness).await; let current_lsn = Lsn(100_000).align(); let now = Utc::now().naive_utc(); @@ -1448,7 +1449,7 @@ mod tests { #[tokio::test] async fn timeout_wal_over_threshold_current_candidate() -> anyhow::Result<()> { - let harness = TenantHarness::create("timeout_wal_over_threshold_current_candidate")?; + let harness = TenantHarness::create("timeout_wal_over_threshold_current_candidate").await?; let mut state = dummy_state(&harness).await; let current_lsn = Lsn(100_000).align(); let new_lsn = Lsn(100_100).align(); @@ -1550,7 +1551,7 @@ mod tests { // and pageserver should prefer to connect to it. let test_az = Some("test_az".to_owned()); - let harness = TenantHarness::create("switch_to_same_availability_zone")?; + let harness = TenantHarness::create("switch_to_same_availability_zone").await?; let mut state = dummy_state(&harness).await; state.conf.availability_zone.clone_from(&test_az); let current_lsn = Lsn(100_000).align(); diff --git a/pageserver/src/tenant/upload_queue.rs b/pageserver/src/tenant/upload_queue.rs index 50c977a950..f7440ecdae 100644 --- a/pageserver/src/tenant/upload_queue.rs +++ b/pageserver/src/tenant/upload_queue.rs @@ -228,18 +228,20 @@ impl UploadQueue { Ok(self.initialized_mut().expect("we just set it")) } - pub(crate) fn initialized_mut(&mut self) -> anyhow::Result<&mut UploadQueueInitialized> { + pub(crate) fn initialized_mut( + &mut self, + ) -> Result<&mut UploadQueueInitialized, NotInitialized> { use UploadQueue::*; match self { - Uninitialized => Err(NotInitialized::Uninitialized.into()), + Uninitialized => Err(NotInitialized::Uninitialized), Initialized(x) => { if x.shutting_down { - Err(NotInitialized::ShuttingDown.into()) + Err(NotInitialized::ShuttingDown) } else { Ok(x) } } - Stopped(_) => Err(NotInitialized::Stopped.into()), + Stopped(_) => Err(NotInitialized::Stopped), } } diff --git a/pageserver/src/tenant/vectored_blob_io.rs b/pageserver/src/tenant/vectored_blob_io.rs index 5a0986ea12..54a3ad789b 100644 --- a/pageserver/src/tenant/vectored_blob_io.rs +++ b/pageserver/src/tenant/vectored_blob_io.rs @@ -396,7 +396,6 @@ impl<'a> VectoredBlobReader<'a> { /// Read planner used in [`crate::tenant::storage_layer::image_layer::ImageLayerIterator`]. It provides a streaming API for /// getting read blobs. It returns a batch when `handle` gets called and when the current key would just exceed the read_size and /// max_cnt constraints. 
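The `initialized_mut` change above trades `anyhow::Result` for a concrete `NotInitialized` error, which matches how the detach-ancestor code earlier in this diff treats any unusable upload queue as a shutdown-like condition without downcasting. A rough sketch of that calling pattern, with simplified stand-ins for the real types:

    // Sketch only: the real NotInitialized and upload queue live in pageserver's upload_queue module.
    #[derive(Debug)]
    #[allow(dead_code)]
    enum NotInitialized {
        Uninitialized,
        ShuttingDown,
        Stopped,
    }

    struct UploadQueue {
        state: Option<String>, // stand-in for UploadQueueInitialized
    }

    impl UploadQueue {
        fn initialized_mut(&mut self) -> Result<&mut String, NotInitialized> {
            self.state.as_mut().ok_or(NotInitialized::Uninitialized)
        }
    }

    fn main() {
        let mut queue = UploadQueue { state: None };
        match queue.initialized_mut() {
            Ok(q) => println!("queue usable: {q}"),
            // Callers can lump every variant together as "effectively shutting down"
            // instead of string-matching an anyhow error chain.
            Err(not_init) => println!("bailing early: {not_init:?}"),
        }
    }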
-#[cfg(test)] pub struct StreamingVectoredReadPlanner { read_builder: Option, // Arguments for previous blob passed into [`StreamingVectoredReadPlanner::handle`] @@ -410,7 +409,6 @@ pub struct StreamingVectoredReadPlanner { cnt: usize, } -#[cfg(test)] impl StreamingVectoredReadPlanner { pub fn new(max_read_size: u64, max_cnt: usize) -> Self { assert!(max_cnt > 0); diff --git a/pageserver/src/walingest.rs b/pageserver/src/walingest.rs index 07c90385e6..dff3a8f52d 100644 --- a/pageserver/src/walingest.rs +++ b/pageserver/src/walingest.rs @@ -1754,7 +1754,7 @@ mod tests { #[tokio::test] async fn test_relsize() -> Result<()> { - let (tenant, ctx) = TenantHarness::create("test_relsize")?.load().await; + let (tenant, ctx) = TenantHarness::create("test_relsize").await?.load().await; let tline = tenant .create_test_timeline(TIMELINE_ID, Lsn(8), DEFAULT_PG_VERSION, &ctx) .await?; @@ -1975,7 +1975,10 @@ mod tests { // and then created it again within the same layer. #[tokio::test] async fn test_drop_extend() -> Result<()> { - let (tenant, ctx) = TenantHarness::create("test_drop_extend")?.load().await; + let (tenant, ctx) = TenantHarness::create("test_drop_extend") + .await? + .load() + .await; let tline = tenant .create_test_timeline(TIMELINE_ID, Lsn(8), DEFAULT_PG_VERSION, &ctx) .await?; @@ -2046,7 +2049,10 @@ mod tests { // and then extended it again within the same layer. #[tokio::test] async fn test_truncate_extend() -> Result<()> { - let (tenant, ctx) = TenantHarness::create("test_truncate_extend")?.load().await; + let (tenant, ctx) = TenantHarness::create("test_truncate_extend") + .await? + .load() + .await; let tline = tenant .create_test_timeline(TIMELINE_ID, Lsn(8), DEFAULT_PG_VERSION, &ctx) .await?; @@ -2188,7 +2194,7 @@ mod tests { /// split into multiple 1 GB segments in Postgres. 
#[tokio::test] async fn test_large_rel() -> Result<()> { - let (tenant, ctx) = TenantHarness::create("test_large_rel")?.load().await; + let (tenant, ctx) = TenantHarness::create("test_large_rel").await?.load().await; let tline = tenant .create_test_timeline(TIMELINE_ID, Lsn(8), DEFAULT_PG_VERSION, &ctx) .await?; @@ -2296,7 +2302,7 @@ mod tests { let startpoint = Lsn::from_hex("14AEC08").unwrap(); let _endpoint = Lsn::from_hex("1FFFF98").unwrap(); - let harness = TenantHarness::create("test_ingest_real_wal").unwrap(); + let harness = TenantHarness::create("test_ingest_real_wal").await.unwrap(); let (tenant, ctx) = harness.load().await; let remote_initdb_path = diff --git a/patches/rum.patch b/patches/rum.patch new file mode 100644 index 0000000000..3041f8df81 --- /dev/null +++ b/patches/rum.patch @@ -0,0 +1,54 @@ +commit 68f3b3b0d594f08aacc4a082ee210749ed5677eb +Author: Anastasia Lubennikova +Date: Mon Jul 15 12:31:56 2024 +0100 + + Neon: fix unlogged index build patch + +diff --git a/src/ruminsert.c b/src/ruminsert.c +index e8b209d..e89bf2a 100644 +--- a/src/ruminsert.c ++++ b/src/ruminsert.c +@@ -628,6 +628,10 @@ rumbuild(Relation heap, Relation index, struct IndexInfo *indexInfo) + elog(ERROR, "index \"%s\" already contains data", + RelationGetRelationName(index)); + ++#ifdef NEON_SMGR ++ smgr_start_unlogged_build(index->rd_smgr); ++#endif ++ + initRumState(&buildstate.rumstate, index); + buildstate.rumstate.isBuild = true; + buildstate.indtuples = 0; +@@ -693,6 +697,10 @@ rumbuild(Relation heap, Relation index, struct IndexInfo *indexInfo) + buildstate.buildStats.nTotalPages = RelationGetNumberOfBlocks(index); + rumUpdateStats(index, &buildstate.buildStats, buildstate.rumstate.isBuild); + ++#ifdef NEON_SMGR ++ smgr_finish_unlogged_build_phase_1(index->rd_smgr); ++#endif ++ + /* + * Write index to xlog + */ +@@ -713,6 +721,21 @@ rumbuild(Relation heap, Relation index, struct IndexInfo *indexInfo) + UnlockReleaseBuffer(buffer); + } + ++#ifdef NEON_SMGR ++ { ++#if PG_VERSION_NUM >= 160000 ++ RelFileLocator rlocator = RelationGetSmgr(index)->smgr_rlocator.locator; ++#else ++ RelFileNode rlocator = RelationGetSmgr(index)->smgr_rnode.node; ++#endif ++ ++ SetLastWrittenLSNForBlockRange(XactLastRecEnd, rlocator, MAIN_FORKNUM, 0, RelationGetNumberOfBlocks(index)); ++ SetLastWrittenLSNForRelation(XactLastRecEnd, rlocator, MAIN_FORKNUM); ++ ++ smgr_end_unlogged_build(index->rd_smgr); ++ } ++#endif ++ + /* + * Return statistics + */ diff --git a/poetry.lock b/poetry.lock index 8091141411..5192a574cc 100644 --- a/poetry.lock +++ b/poetry.lock @@ -2641,19 +2641,18 @@ pbr = "*" [[package]] name = "setuptools" -version = "65.5.1" +version = "70.0.0" description = "Easily download, build, install, upgrade, and uninstall Python packages" optional = false -python-versions = ">=3.7" +python-versions = ">=3.8" files = [ - {file = "setuptools-65.5.1-py3-none-any.whl", hash = "sha256:d0b9a8433464d5800cbe05094acf5c6d52a91bfac9b52bcfc4d41382be5d5d31"}, - {file = "setuptools-65.5.1.tar.gz", hash = "sha256:e197a19aa8ec9722928f2206f8de752def0e4c9fc6953527360d1c36d94ddb2f"}, + {file = "setuptools-70.0.0-py3-none-any.whl", hash = "sha256:54faa7f2e8d2d11bcd2c07bed282eef1046b5c080d1c32add737d7b5817b1ad4"}, + {file = "setuptools-70.0.0.tar.gz", hash = "sha256:f211a66637b8fa059bb28183da127d4e86396c991a942b028c6650d4319c3fd0"}, ] [package.extras] -docs = ["furo", "jaraco.packaging (>=9)", "jaraco.tidelift (>=1.4)", "pygments-github-lexers (==0.0.5)", "rst.linker (>=1.9)", "sphinx (>=3.5)", "sphinx-favicon", 
"sphinx-hoverxref (<2)", "sphinx-inline-tabs", "sphinx-notfound-page (==0.8.3)", "sphinx-reredirects", "sphinxcontrib-towncrier"] -testing = ["build[virtualenv]", "filelock (>=3.4.0)", "flake8 (<5)", "flake8-2020", "ini2toml[lite] (>=0.9)", "jaraco.envs (>=2.2)", "jaraco.path (>=3.2.0)", "pip (>=19.1)", "pip-run (>=8.8)", "pytest (>=6)", "pytest-black (>=0.3.7)", "pytest-checkdocs (>=2.4)", "pytest-cov", "pytest-enabler (>=1.3)", "pytest-flake8", "pytest-mypy (>=0.9.1)", "pytest-perf", "pytest-timeout", "pytest-xdist", "tomli-w (>=1.0.0)", "virtualenv (>=13.0.0)", "wheel"] -testing-integration = ["build[virtualenv]", "filelock (>=3.4.0)", "jaraco.envs (>=2.2)", "jaraco.path (>=3.2.0)", "pytest", "pytest-enabler", "pytest-xdist", "tomli", "virtualenv (>=13.0.0)", "wheel"] +docs = ["furo", "jaraco.packaging (>=9.3)", "jaraco.tidelift (>=1.4)", "pygments-github-lexers (==0.0.5)", "pyproject-hooks (!=1.1)", "rst.linker (>=1.9)", "sphinx (>=3.5)", "sphinx-favicon", "sphinx-inline-tabs", "sphinx-lint", "sphinx-notfound-page (>=1,<2)", "sphinx-reredirects", "sphinxcontrib-towncrier"] +testing = ["build[virtualenv] (>=1.0.3)", "filelock (>=3.4.0)", "importlib-metadata", "ini2toml[lite] (>=0.14)", "jaraco.develop (>=7.21)", "jaraco.envs (>=2.2)", "jaraco.path (>=3.2.0)", "mypy (==1.9)", "packaging (>=23.2)", "pip (>=19.1)", "pyproject-hooks (!=1.1)", "pytest (>=6,!=8.1.1)", "pytest-checkdocs (>=2.4)", "pytest-cov", "pytest-enabler (>=2.2)", "pytest-home (>=0.5)", "pytest-mypy", "pytest-perf", "pytest-ruff (>=0.2.1)", "pytest-subprocess", "pytest-timeout", "pytest-xdist (>=3)", "tomli", "tomli-w (>=1.0.0)", "virtualenv (>=13.0.0)", "wheel"] [[package]] name = "six" diff --git a/proxy/Cargo.toml b/proxy/Cargo.toml index 288f7769fe..2f18b5fbc6 100644 --- a/proxy/Cargo.toml +++ b/proxy/Cargo.toml @@ -92,6 +92,7 @@ tracing-opentelemetry.workspace = true tracing-subscriber.workspace = true tracing-utils.workspace = true tracing.workspace = true +typed-json.workspace = true url.workspace = true urlencoding.workspace = true utils.workspace = true diff --git a/proxy/src/context/parquet.rs b/proxy/src/context/parquet.rs index cfc1f8e89e..543a458274 100644 --- a/proxy/src/context/parquet.rs +++ b/proxy/src/context/parquet.rs @@ -181,8 +181,9 @@ pub async fn worker( let rx = futures::stream::poll_fn(move |cx| rx.poll_recv(cx)); let rx = rx.map(RequestData::from); - let storage = - GenericRemoteStorage::from_config(&remote_storage_config).context("remote storage init")?; + let storage = GenericRemoteStorage::from_config(&remote_storage_config) + .await + .context("remote storage init")?; let properties = WriterProperties::builder() .set_data_page_size_limit(config.parquet_upload_page_size) @@ -217,6 +218,7 @@ pub async fn worker( let storage_disconnect = GenericRemoteStorage::from_config(&disconnect_events_storage_config) + .await .context("remote storage for disconnect events init")?; let parquet_config_disconnect = parquet_config.clone(); tokio::try_join!( @@ -545,7 +547,9 @@ mod tests { }, timeout: std::time::Duration::from_secs(120), }; - let storage = GenericRemoteStorage::from_config(&remote_storage_config).unwrap(); + let storage = GenericRemoteStorage::from_config(&remote_storage_config) + .await + .unwrap(); worker_inner(storage, rx, config).await.unwrap(); diff --git a/proxy/src/serverless/sql_over_http.rs b/proxy/src/serverless/sql_over_http.rs index 8118ae5ea8..6400e4ac7b 100644 --- a/proxy/src/serverless/sql_over_http.rs +++ b/proxy/src/serverless/sql_over_http.rs @@ -18,7 +18,7 @@ use 
hyper1::Response; use hyper1::StatusCode; use hyper1::{HeaderMap, Request}; use pq_proto::StartupMessageParamsBuilder; -use serde_json::json; +use serde::Serialize; use serde_json::Value; use tokio::time; use tokio_postgres::error::DbError; @@ -32,6 +32,7 @@ use tokio_postgres::Transaction; use tokio_util::sync::CancellationToken; use tracing::error; use tracing::info; +use typed_json::json; use url::Url; use utils::http::error::ApiError; @@ -263,13 +264,8 @@ pub async fn handle( | SqlOverHttpError::Postgres(e) => e.as_db_error(), _ => None, }; - fn get<'a, T: serde::Serialize>( - db: Option<&'a DbError>, - x: impl FnOnce(&'a DbError) -> T, - ) -> Value { - db.map(x) - .and_then(|t| serde_json::to_value(t).ok()) - .unwrap_or_default() + fn get<'a, T: Default>(db: Option<&'a DbError>, x: impl FnOnce(&'a DbError) -> T) -> T { + db.map(x).unwrap_or_default() } if let Some(db_error) = db_error { @@ -278,17 +274,11 @@ pub async fn handle( let position = db_error.and_then(|db| db.position()); let (position, internal_position, internal_query) = match position { - Some(ErrorPosition::Original(position)) => ( - Value::String(position.to_string()), - Value::Null, - Value::Null, - ), - Some(ErrorPosition::Internal { position, query }) => ( - Value::Null, - Value::String(position.to_string()), - Value::String(query.clone()), - ), - None => (Value::Null, Value::Null, Value::Null), + Some(ErrorPosition::Original(position)) => (Some(position.to_string()), None, None), + Some(ErrorPosition::Internal { position, query }) => { + (None, Some(position.to_string()), Some(query.clone())) + } + None => (None, None, None), }; let code = get(db_error, |db| db.code().code()); @@ -578,10 +568,8 @@ async fn handle_inner( .status(StatusCode::OK) .header(header::CONTENT_TYPE, "application/json"); - // - // Now execute the query and return the result - // - let result = match payload { + // Now execute the query and return the result. + let json_output = match payload { Payload::Single(stmt) => stmt.process(cancel, &mut client, parsed_headers).await?, Payload::Batch(statements) => { if parsed_headers.txn_read_only { @@ -605,11 +593,9 @@ async fn handle_inner( let metrics = client.metrics(); - // how could this possibly fail - let body = serde_json::to_string(&result).expect("json serialization should not fail"); - let len = body.len(); + let len = json_output.len(); let response = response - .body(Full::new(Bytes::from(body))) + .body(Full::new(Bytes::from(json_output))) // only fails if invalid status code or invalid header/values are given. // these are not user configurable so it cannot fail dynamically .expect("building response payload should not fail"); @@ -631,7 +617,7 @@ impl QueryData { cancel: CancellationToken, client: &mut Client, parsed_headers: HttpHeaders, - ) -> Result { + ) -> Result { let (inner, mut discard) = client.inner(); let cancel_token = inner.cancel_token(); @@ -644,7 +630,10 @@ impl QueryData { // The query successfully completed. Either::Left((Ok((status, results)), __not_yet_cancelled)) => { discard.check_idle(status); - Ok(results) + + let json_output = + serde_json::to_string(&results).expect("json serialization should not fail"); + Ok(json_output) } // The query failed with an error Either::Left((Err(e), __not_yet_cancelled)) => { @@ -662,7 +651,10 @@ impl QueryData { // query successed before it was cancelled. 
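The sql_over_http changes above replace `serde_json::Value` plumbing with `typed_json::json!` plus eager serialization: each statement's result becomes a JSON `String` as soon as it is produced, and later stages only pass that string along. A small sketch of the eager-serialization side of that idea, assuming only serde/serde_json (a derived struct stands in where the proxy uses the `json!` macro, and the field names are illustrative):

    use serde::Serialize;

    // Loosely mirrors the per-query payload shape from the diff.
    #[derive(Serialize)]
    struct QueryResults {
        command: String,
        #[serde(rename = "rowCount")]
        row_count: Option<u64>,
        rows: Vec<Vec<String>>,
    }

    fn process_one(results: &QueryResults) -> Result<String, serde_json::Error> {
        // Serialize to the final JSON text right away; callers never rebuild a Value tree.
        serde_json::to_string(results)
    }

    fn main() -> Result<(), serde_json::Error> {
        let body = process_one(&QueryResults {
            command: "SELECT 1".into(),
            row_count: Some(1),
            rows: vec![vec!["1".into()]],
        })?;
        println!("{body}");
        Ok(())
    }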
Ok(Ok((status, results))) => { discard.check_idle(status); - Ok(results) + + let json_output = serde_json::to_string(&results) + .expect("json serialization should not fail"); + Ok(json_output) } // query failed or was cancelled. Ok(Err(error)) => { @@ -696,7 +688,7 @@ impl BatchQueryData { cancel: CancellationToken, client: &mut Client, parsed_headers: HttpHeaders, - ) -> Result { + ) -> Result { info!("starting transaction"); let (inner, mut discard) = client.inner(); let cancel_token = inner.cancel_token(); @@ -718,9 +710,9 @@ impl BatchQueryData { e })?; - let results = + let json_output = match query_batch(cancel.child_token(), &transaction, self, parsed_headers).await { - Ok(results) => { + Ok(json_output) => { info!("commit"); let status = transaction.commit().await.map_err(|e| { // if we cannot commit - for now don't return connection to pool @@ -729,7 +721,7 @@ impl BatchQueryData { e })?; discard.check_idle(status); - results + json_output } Err(SqlOverHttpError::Cancelled(_)) => { if let Err(err) = cancel_token.cancel_query(NoTls).await { @@ -753,7 +745,7 @@ impl BatchQueryData { } }; - Ok(json!({ "results": results })) + Ok(json_output) } } @@ -762,7 +754,7 @@ async fn query_batch( transaction: &Transaction<'_>, queries: BatchQueryData, parsed_headers: HttpHeaders, -) -> Result, SqlOverHttpError> { +) -> Result { let mut results = Vec::with_capacity(queries.queries.len()); let mut current_size = 0; for stmt in queries.queries { @@ -787,7 +779,11 @@ async fn query_batch( } } } - Ok(results) + + let results = json!({ "results": results }); + let json_output = serde_json::to_string(&results).expect("json serialization should not fail"); + + Ok(json_output) } async fn query_to_json( @@ -795,7 +791,7 @@ async fn query_to_json( data: QueryData, current_size: &mut usize, parsed_headers: HttpHeaders, -) -> Result<(ReadyForQueryStatus, Value), SqlOverHttpError> { +) -> Result<(ReadyForQueryStatus, impl Serialize), SqlOverHttpError> { info!("executing query"); let query_params = data.params; let mut row_stream = std::pin::pin!(client.query_raw_txt(&data.query, query_params).await?); @@ -844,8 +840,8 @@ async fn query_to_json( for c in row_stream.columns() { fields.push(json!({ - "name": Value::String(c.name().to_owned()), - "dataTypeID": Value::Number(c.type_().oid().into()), + "name": c.name().to_owned(), + "dataTypeID": c.type_().oid(), "tableID": c.table_oid(), "columnID": c.column_id(), "dataTypeSize": c.type_size(), @@ -863,15 +859,14 @@ async fn query_to_json( .map(|row| pg_text_row_to_json(row, &columns, parsed_headers.raw_output, array_mode)) .collect::, _>>()?; - // resulting JSON format is based on the format of node-postgres result - Ok(( - ready, - json!({ - "command": command_tag_name, - "rowCount": command_tag_count, - "rows": rows, - "fields": fields, - "rowAsArray": array_mode, - }), - )) + // Resulting JSON format is based on the format of node-postgres result. + let results = json!({ + "command": command_tag_name.to_string(), + "rowCount": command_tag_count, + "rows": rows, + "fields": fields, + "rowAsArray": array_mode, + }); + + Ok((ready, results)) } diff --git a/proxy/src/usage_metrics.rs b/proxy/src/usage_metrics.rs index 56ed2145dc..a8735fe0bb 100644 --- a/proxy/src/usage_metrics.rs +++ b/proxy/src/usage_metrics.rs @@ -357,11 +357,15 @@ pub async fn task_backup( info!("metrics backup has shut down"); } // Even if the remote storage is not configured, we still want to clear the metrics. 
- let storage = backup_config - .remote_storage_config - .as_ref() - .map(|config| GenericRemoteStorage::from_config(config).context("remote storage init")) - .transpose()?; + let storage = if let Some(config) = backup_config.remote_storage_config.as_ref() { + Some( + GenericRemoteStorage::from_config(config) + .await + .context("remote storage init")?, + ) + } else { + None + }; let mut ticker = tokio::time::interval(backup_config.interval); let mut prev = Utc::now(); let hostname = hostname::get()?.as_os_str().to_string_lossy().into_owned(); diff --git a/safekeeper/src/auth.rs b/safekeeper/src/auth.rs index dd9058c468..b8bc3f3e06 100644 --- a/safekeeper/src/auth.rs +++ b/safekeeper/src/auth.rs @@ -12,13 +12,15 @@ pub fn check_permission(claims: &Claims, tenant_id: Option) -> Result< } Ok(()) } - (Scope::Admin | Scope::PageServerApi | Scope::GenerationsApi, _) => Err(AuthError( - format!( - "JWT scope '{:?}' is ineligible for Safekeeper auth", - claims.scope - ) - .into(), - )), + (Scope::Admin | Scope::PageServerApi | Scope::GenerationsApi | Scope::Scrubber, _) => { + Err(AuthError( + format!( + "JWT scope '{:?}' is ineligible for Safekeeper auth", + claims.scope + ) + .into(), + )) + } (Scope::SafekeeperData, _) => Ok(()), } } diff --git a/safekeeper/src/bin/safekeeper.rs b/safekeeper/src/bin/safekeeper.rs index 9eb6546d6b..2365fd0587 100644 --- a/safekeeper/src/bin/safekeeper.rs +++ b/safekeeper/src/bin/safekeeper.rs @@ -418,7 +418,7 @@ async fn start_safekeeper(conf: SafeKeeperConf) -> Result<()> { let timeline_collector = safekeeper::metrics::TimelineCollector::new(); metrics::register_internal(Box::new(timeline_collector))?; - wal_backup::init_remote_storage(&conf); + wal_backup::init_remote_storage(&conf).await; // Keep handles to main tasks to die if any of them disappears. 
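The usage_metrics change above is a direct consequence of `GenericRemoteStorage::from_config` becoming async: the old `.map(|config| ...context(...)).transpose()?` chain cannot `.await` inside the closure, so the Option is unfolded by hand. A generic sketch of the same pattern with a dummy async constructor:

    struct Storage;

    async fn from_config(_conf: &str) -> Result<Storage, std::io::Error> {
        Ok(Storage) // stand-in for the real async constructor
    }

    async fn init(conf: Option<&str>) -> Result<Option<Storage>, std::io::Error> {
        // Option::map(|c| async { .. }) would yield an Option of futures, so
        // unfold the Option explicitly and await inside the branch.
        Ok(if let Some(c) = conf {
            Some(from_config(c).await?)
        } else {
            None
        })
    }

    #[tokio::main]
    async fn main() -> Result<(), std::io::Error> {
        assert!(init(Some("s3://bucket/prefix")).await?.is_some());
        assert!(init(None).await?.is_none());
        Ok(())
    }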
let mut tasks_handles: FuturesUnordered> = diff --git a/safekeeper/src/copy_timeline.rs b/safekeeper/src/copy_timeline.rs index 14bd3c03b8..220988c3ce 100644 --- a/safekeeper/src/copy_timeline.rs +++ b/safekeeper/src/copy_timeline.rs @@ -74,10 +74,16 @@ pub async fn handle_request(request: Request) -> Result<()> { assert!(flush_lsn >= start_lsn); if request.until_lsn > flush_lsn { - bail!("requested LSN is beyond the end of the timeline"); + bail!(format!( + "requested LSN {} is beyond the end of the timeline {}", + request.until_lsn, flush_lsn + )); } if request.until_lsn < start_lsn { - bail!("requested LSN is before the start of the timeline"); + bail!(format!( + "requested LSN {} is before the start of the timeline {}", + request.until_lsn, start_lsn + )); } if request.until_lsn > commit_lsn { diff --git a/safekeeper/src/lib.rs b/safekeeper/src/lib.rs index af83feb77f..8f2920ada3 100644 --- a/safekeeper/src/lib.rs +++ b/safekeeper/src/lib.rs @@ -173,15 +173,6 @@ pub static BROKER_RUNTIME: Lazy = Lazy::new(|| { .expect("Failed to create broker runtime") }); -pub static WAL_REMOVER_RUNTIME: Lazy = Lazy::new(|| { - tokio::runtime::Builder::new_multi_thread() - .thread_name("WAL remover") - .worker_threads(1) - .enable_all() - .build() - .expect("Failed to create broker runtime") -}); - pub static WAL_BACKUP_RUNTIME: Lazy = Lazy::new(|| { tokio::runtime::Builder::new_multi_thread() .thread_name("WAL backup worker") @@ -189,12 +180,3 @@ pub static WAL_BACKUP_RUNTIME: Lazy = Lazy::new(|| { .build() .expect("Failed to create WAL backup runtime") }); - -pub static METRICS_SHIFTER_RUNTIME: Lazy = Lazy::new(|| { - tokio::runtime::Builder::new_multi_thread() - .thread_name("metric shifter") - .worker_threads(1) - .enable_all() - .build() - .expect("Failed to create broker runtime") -}); diff --git a/safekeeper/src/timeline_eviction.rs b/safekeeper/src/timeline_eviction.rs index 0b8d58ee8a..7947d83eb4 100644 --- a/safekeeper/src/timeline_eviction.rs +++ b/safekeeper/src/timeline_eviction.rs @@ -199,10 +199,7 @@ async fn redownload_partial_segment( file.flush().await?; let final_path = local_segment_path(mgr, partial); - info!( - "downloaded {} bytes, renaming to {}", - final_path, final_path, - ); + info!("downloaded {actual_len} bytes, renaming to {final_path}"); if let Err(e) = durable_rename(&tmp_file, &final_path, !mgr.conf.no_sync).await { // Probably rename succeeded, but fsync of it failed. Remove // the file then to avoid using it. 
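Both safekeeper fixes above are about messages carrying the values they describe: the copy_timeline errors now include the requested and actual LSNs, and the eviction log prints the downloaded byte count instead of repeating the path. A tiny sketch of the same idea using inline format-argument capture:

    // Sketch: a range check whose error message names the offending values.
    fn check_until_lsn(until_lsn: u64, start_lsn: u64, flush_lsn: u64) -> Result<(), String> {
        if until_lsn > flush_lsn {
            return Err(format!(
                "requested LSN {until_lsn} is beyond the end of the timeline {flush_lsn}"
            ));
        }
        if until_lsn < start_lsn {
            return Err(format!(
                "requested LSN {until_lsn} is before the start of the timeline {start_lsn}"
            ));
        }
        Ok(())
    }

    fn main() {
        // The failure is self-describing; no debugger session is needed to learn the numbers.
        println!("{:?}", check_until_lsn(0x5000, 0x1000, 0x4000));
    }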
diff --git a/safekeeper/src/timeline_guard.rs b/safekeeper/src/timeline_guard.rs index e249c859b4..dbdf46412d 100644 --- a/safekeeper/src/timeline_guard.rs +++ b/safekeeper/src/timeline_guard.rs @@ -4,7 +4,7 @@ use std::collections::HashSet; -use tracing::{debug, warn}; +use tracing::debug; use crate::timeline_manager::ManagerCtlMessage; @@ -23,7 +23,7 @@ impl Drop for ResidenceGuard { .manager_tx .send(ManagerCtlMessage::GuardDrop(self.guard_id)); if let Err(e) = res { - warn!("failed to send GuardDrop message: {:?}", e); + debug!("failed to send GuardDrop message: {:?}", e); } } } diff --git a/safekeeper/src/wal_backup.rs b/safekeeper/src/wal_backup.rs index 5a590689c3..7ecee178f3 100644 --- a/safekeeper/src/wal_backup.rs +++ b/safekeeper/src/wal_backup.rs @@ -22,7 +22,7 @@ use tokio::fs::File; use tokio::select; use tokio::sync::mpsc::{self, Receiver, Sender}; -use tokio::sync::watch; +use tokio::sync::{watch, OnceCell}; use tokio::time::sleep; use tracing::*; @@ -33,8 +33,6 @@ use crate::timeline::{PeerInfo, WalResidentTimeline}; use crate::timeline_manager::{Manager, StateSnapshot}; use crate::{SafeKeeperConf, WAL_BACKUP_RUNTIME}; -use once_cell::sync::OnceCell; - const UPLOAD_FAILURE_RETRY_MIN_MS: u64 = 10; const UPLOAD_FAILURE_RETRY_MAX_MS: u64 = 5000; @@ -167,7 +165,7 @@ fn determine_offloader( } } -static REMOTE_STORAGE: OnceCell<Option<GenericRemoteStorage>> = OnceCell::new(); +static REMOTE_STORAGE: OnceCell<Option<GenericRemoteStorage>> = OnceCell::const_new(); // Storage must be configured and initialized when this is called. fn get_configured_remote_storage() -> &'static GenericRemoteStorage { @@ -178,14 +176,22 @@ fn get_configured_remote_storage() -> &'static GenericRemoteStorage { .unwrap() } -pub fn init_remote_storage(conf: &SafeKeeperConf) { +pub async fn init_remote_storage(conf: &SafeKeeperConf) { // TODO: refactor REMOTE_STORAGE to avoid using global variables, and provide // dependencies to all tasks instead.
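Switching `REMOTE_STORAGE` to `tokio::sync::OnceCell` above is what allows the one-time initializer itself to await. A minimal sketch of the `get_or_init(|| async { .. }).await` pattern, with a dummy async constructor in place of `GenericRemoteStorage::from_config`:

    use tokio::sync::OnceCell;

    static STORAGE: OnceCell<Option<String>> = OnceCell::const_new();

    async fn make_storage() -> Option<String> {
        // Stand-in for GenericRemoteStorage::from_config(..).await.
        Some("remote-storage-handle".to_owned())
    }

    async fn init_remote_storage() {
        // The async initializer runs at most once; concurrent callers await the same value.
        STORAGE.get_or_init(make_storage).await;
    }

    #[tokio::main]
    async fn main() {
        init_remote_storage().await;
        assert!(STORAGE.get().and_then(|opt| opt.as_ref()).is_some());
    }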
- REMOTE_STORAGE.get_or_init(|| { - conf.remote_storage - .as_ref() - .map(|c| GenericRemoteStorage::from_config(c).expect("failed to create remote storage")) - }); + REMOTE_STORAGE + .get_or_init(|| async { + if let Some(conf) = conf.remote_storage.as_ref() { + Some( + GenericRemoteStorage::from_config(conf) + .await + .expect("failed to create remote storage"), + ) + } else { + None + } + }) + .await; } struct WalBackupTask { diff --git a/safekeeper/src/wal_backup_partial.rs b/safekeeper/src/wal_backup_partial.rs index 825851c97c..b1efa9749f 100644 --- a/safekeeper/src/wal_backup_partial.rs +++ b/safekeeper/src/wal_backup_partial.rs @@ -289,6 +289,18 @@ impl PartialBackup { }) .collect(); + if new_segments.len() == 1 { + // we have an uploaded segment, it must not be deleted from remote storage + segments_to_delete.retain(|name| name != &new_segments[0].name); + } else { + // there should always be zero or one uploaded segment + assert!( + new_segments.is_empty(), + "too many uploaded segments: {:?}", + new_segments + ); + } + info!("deleting objects: {:?}", segments_to_delete); let mut objects_to_delete = vec![]; for seg in segments_to_delete.iter() { diff --git a/storage_controller/client/Cargo.toml b/storage_controller/client/Cargo.toml new file mode 100644 index 0000000000..c3bfe2bfd2 --- /dev/null +++ b/storage_controller/client/Cargo.toml @@ -0,0 +1,23 @@ +[package] +name = "storage_controller_client" +version = "0.1.0" +edition.workspace = true +license.workspace = true + +[dependencies] +pageserver_api.workspace = true +pageserver_client.workspace = true +thiserror.workspace = true +async-trait.workspace = true +reqwest.workspace = true +utils.workspace = true +serde.workspace = true +workspace_hack = { version = "0.1", path = "../../workspace_hack" } +tokio-postgres.workspace = true +tokio-stream.workspace = true +tokio.workspace = true +futures.workspace = true +tokio-util.workspace = true +anyhow.workspace = true +postgres.workspace = true +bytes.workspace = true diff --git a/storage_controller/client/src/control_api.rs b/storage_controller/client/src/control_api.rs new file mode 100644 index 0000000000..a981b5020e --- /dev/null +++ b/storage_controller/client/src/control_api.rs @@ -0,0 +1,62 @@ +use pageserver_client::mgmt_api::{self, ResponseErrorMessageExt}; +use reqwest::{Method, Url}; +use serde::{de::DeserializeOwned, Serialize}; +use std::str::FromStr; + +pub struct Client { + base_url: Url, + jwt_token: Option, + client: reqwest::Client, +} + +impl Client { + pub fn new(base_url: Url, jwt_token: Option) -> Self { + Self { + base_url, + jwt_token, + client: reqwest::ClientBuilder::new() + .build() + .expect("Failed to construct http client"), + } + } + + /// Simple HTTP request wrapper for calling into storage controller + pub async fn dispatch( + &self, + method: Method, + path: String, + body: Option, + ) -> mgmt_api::Result + where + RQ: Serialize + Sized, + RS: DeserializeOwned + Sized, + { + // The configured URL has the /upcall path prefix for pageservers to use: we will strip that out + // for general purpose API access. 
+ let url = Url::from_str(&format!( + "http://{}:{}/{path}", + self.base_url.host_str().unwrap(), + self.base_url.port().unwrap() + )) + .unwrap(); + + let mut builder = self.client.request(method, url); + if let Some(body) = body { + builder = builder.json(&body) + } + if let Some(jwt_token) = &self.jwt_token { + builder = builder.header( + reqwest::header::AUTHORIZATION, + format!("Bearer {jwt_token}"), + ); + } + + let response = builder.send().await.map_err(mgmt_api::Error::ReceiveBody)?; + let response = response.error_from_body().await?; + + response + .json() + .await + .map_err(pageserver_client::mgmt_api::Error::ReceiveBody) + } +} diff --git a/storage_controller/client/src/lib.rs b/storage_controller/client/src/lib.rs new file mode 100644 index 0000000000..6d5e202942 --- /dev/null +++ b/storage_controller/client/src/lib.rs @@ -0,0 +1 @@ +pub mod control_api; diff --git a/storage_controller/src/http.rs b/storage_controller/src/http.rs index 3a62c0dd4f..8fb4be93e0 100644 --- a/storage_controller/src/http.rs +++ b/storage_controller/src/http.rs @@ -330,6 +330,22 @@ async fn handle_tenant_timeline_delete( .await } +async fn handle_tenant_timeline_detach_ancestor( + service: Arc, + req: Request, +) -> Result, ApiError> { + let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?; + check_permissions(&req, Scope::PageServerApi)?; + + let timeline_id: TimelineId = parse_request_param(&req, "timeline_id")?; + + let res = service + .tenant_timeline_detach_ancestor(tenant_id, timeline_id) + .await?; + + json_response(StatusCode::OK, res) +} + async fn handle_tenant_timeline_passthrough( service: Arc, req: Request, @@ -414,7 +430,7 @@ async fn handle_tenant_describe( service: Arc, req: Request, ) -> Result, ApiError> { - check_permissions(&req, Scope::Admin)?; + check_permissions(&req, Scope::Scrubber)?; let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?; json_response(StatusCode::OK, service.tenant_describe(tenant_id)?) 
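The new storage_controller_client crate above gives other tools (such as the scrubber later in this diff) a typed way to call the controller: the caller picks the request/response types and a relative path, and the client handles the base URL, the optional JWT bearer header, and JSON decoding. A hedged usage fragment, assuming the workspace crates introduced in this diff are available; error handling is simplified:

    use pageserver_api::controller_api::TenantDescribeResponse;
    use reqwest::{Method, Url};
    use storage_controller_client::control_api::Client;
    use utils::id::TenantId;

    async fn describe_tenant(
        base_url: Url,
        jwt: Option<String>,
        tenant_id: TenantId,
    ) -> anyhow::Result<TenantDescribeResponse> {
        let client = Client::new(base_url, jwt);
        // GET with no body: the unit type stands in for the request payload.
        let desc = client
            .dispatch::<(), TenantDescribeResponse>(
                Method::GET,
                format!("control/v1/tenant/{tenant_id}"),
                None,
            )
            .await?;
        Ok(desc)
    }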
@@ -1006,6 +1022,16 @@ pub fn make_router( RequestName("v1_tenant_timeline"), ) }) + .put( + "/v1/tenant/:tenant_id/timeline/:timeline_id/detach_ancestor", + |r| { + tenant_service_handler( + r, + handle_tenant_timeline_detach_ancestor, + RequestName("v1_tenant_timeline_detach_ancestor"), + ) + }, + ) // Tenant detail GET passthrough to shard zero: .get("/v1/tenant/:tenant_id", |r| { tenant_service_handler( diff --git a/storage_controller/src/main.rs b/storage_controller/src/main.rs index f1eb0b30fc..789f96beb3 100644 --- a/storage_controller/src/main.rs +++ b/storage_controller/src/main.rs @@ -1,11 +1,11 @@ use anyhow::{anyhow, Context}; -use camino::Utf8PathBuf; use clap::Parser; use diesel::Connection; use metrics::launch_timestamp::LaunchTimestamp; use metrics::BuildInfo; use std::path::PathBuf; use std::sync::Arc; +use std::time::Duration; use storage_controller::http::make_router; use storage_controller::metrics::preinitialize_metrics; use storage_controller::persistence::Persistence; @@ -51,10 +51,6 @@ struct Cli { #[arg(long)] compute_hook_url: Option, - /// Path to the .json file to store state (will be created if it doesn't exist) - #[arg(short, long)] - path: Option, - /// URL to connect to postgres, like postgresql://localhost:1234/storage_controller #[arg(long)] database_url: Option, @@ -206,11 +202,10 @@ async fn async_main() -> anyhow::Result<()> { let args = Cli::parse(); tracing::info!( - "version: {}, launch_timestamp: {}, build_tag {}, state at {}, listening on {}", + "version: {}, launch_timestamp: {}, build_tag {}, listening on {}", GIT_VERSION, launch_ts.to_string(), BUILD_TAG, - args.path.as_ref().unwrap_or(&Utf8PathBuf::from("")), args.listen ); @@ -277,8 +272,7 @@ async fn async_main() -> anyhow::Result<()> { .await .context("Running database migrations")?; - let json_path = args.path; - let persistence = Arc::new(Persistence::new(secrets.database_url, json_path.clone())); + let persistence = Arc::new(Persistence::new(secrets.database_url)); let service = Service::spawn(config, persistence.clone()).await?; @@ -316,22 +310,23 @@ async fn async_main() -> anyhow::Result<()> { } tracing::info!("Terminating on signal"); - if json_path.is_some() { - // Write out a JSON dump on shutdown: this is used in compat tests to avoid passing - // full postgres dumps around. - if let Err(e) = persistence.write_tenants_json().await { - tracing::error!("Failed to write JSON on shutdown: {e}") + // Stop HTTP server first, so that we don't have to service requests + // while shutting down Service. + server_shutdown.cancel(); + match tokio::time::timeout(Duration::from_secs(5), server_task).await { + Ok(Ok(_)) => { + tracing::info!("Joined HTTP server task"); + } + Ok(Err(e)) => { + tracing::error!("Error joining HTTP server task: {e}") + } + Err(_) => { + tracing::warn!("Timed out joining HTTP server task"); + // We will fall through and shut down the service anyway, any request handlers + // in flight will experience cancellation & their clients will see a torn connection. 
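The shutdown ordering above now bounds the HTTP server join with a five-second timeout, so a wedged request handler cannot stall the rest of process shutdown. A generic sketch of that pattern (the task body and durations are placeholders):

    use std::time::Duration;

    #[tokio::main]
    async fn main() {
        let server_task = tokio::spawn(async {
            // Stand-in for the HTTP server future winding down after cancellation.
            tokio::time::sleep(Duration::from_millis(50)).await;
        });

        match tokio::time::timeout(Duration::from_secs(5), server_task).await {
            Ok(Ok(())) => println!("joined server task"),
            Ok(Err(join_err)) => eprintln!("server task panicked or was aborted: {join_err}"),
            // On timeout, fall through and keep shutting the service down anyway.
            Err(_elapsed) => eprintln!("timed out joining server task"),
        }
    }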
} } - // Stop HTTP server first, so that we don't have to service requests - // while shutting down Service - server_shutdown.cancel(); - if let Err(e) = server_task.await { - tracing::error!("Error joining HTTP server task: {e}") - } - tracing::info!("Joined HTTP server task"); - service.shutdown().await; tracing::info!("Service shutdown complete"); diff --git a/storage_controller/src/pageserver_client.rs b/storage_controller/src/pageserver_client.rs index 769aba80ca..8d64201cd9 100644 --- a/storage_controller/src/pageserver_client.rs +++ b/storage_controller/src/pageserver_client.rs @@ -1,8 +1,9 @@ use pageserver_api::{ models::{ - LocationConfig, LocationConfigListResponse, PageserverUtilization, SecondaryProgress, - TenantScanRemoteStorageResponse, TenantShardSplitRequest, TenantShardSplitResponse, - TimelineCreateRequest, TimelineInfo, TopTenantShardsRequest, TopTenantShardsResponse, + detach_ancestor::AncestorDetached, LocationConfig, LocationConfigListResponse, + PageserverUtilization, SecondaryProgress, TenantScanRemoteStorageResponse, + TenantShardSplitRequest, TenantShardSplitResponse, TimelineCreateRequest, TimelineInfo, + TopTenantShardsRequest, TopTenantShardsResponse, }, shard::TenantShardId, }; @@ -226,6 +227,21 @@ impl PageserverClient { ) } + pub(crate) async fn timeline_detach_ancestor( + &self, + tenant_shard_id: TenantShardId, + timeline_id: TimelineId, + ) -> Result { + measured_request!( + "timeline_detach_ancestor", + crate::metrics::Method::Put, + &self.node_id_label, + self.inner + .timeline_detach_ancestor(tenant_shard_id, timeline_id) + .await + ) + } + pub(crate) async fn get_utilization(&self) -> Result { measured_request!( "utilization", diff --git a/storage_controller/src/persistence.rs b/storage_controller/src/persistence.rs index 9f7b2f775e..d8f31e86e5 100644 --- a/storage_controller/src/persistence.rs +++ b/storage_controller/src/persistence.rs @@ -5,8 +5,6 @@ use std::time::Duration; use std::time::Instant; use self::split_state::SplitState; -use camino::Utf8Path; -use camino::Utf8PathBuf; use diesel::pg::PgConnection; use diesel::prelude::*; use diesel::Connection; @@ -55,11 +53,6 @@ use crate::node::Node; /// we can UPDATE a node's scheduling mode reasonably quickly to mark a bad node offline. pub struct Persistence { connection_pool: diesel::r2d2::Pool>, - - // In test environments, we support loading+saving a JSON file. This is temporary, for the benefit of - // test_compatibility.py, so that we don't have to commit to making the database contents fully backward/forward - // compatible just yet. - json_path: Option, } /// Legacy format, for use in JSON compat objects in test environment @@ -124,7 +117,7 @@ impl Persistence { const IDLE_CONNECTION_TIMEOUT: Duration = Duration::from_secs(10); const MAX_CONNECTION_LIFETIME: Duration = Duration::from_secs(60); - pub fn new(database_url: String, json_path: Option) -> Self { + pub fn new(database_url: String) -> Self { let manager = diesel::r2d2::ConnectionManager::::new(database_url); // We will use a connection pool: this is primarily to _limit_ our connection count, rather than to optimize time @@ -139,10 +132,7 @@ impl Persistence { .build(manager) .expect("Could not build connection pool"); - Self { - connection_pool, - json_path, - } + Self { connection_pool } } /// A helper for use during startup, where we would like to tolerate concurrent restarts of the @@ -302,85 +292,13 @@ impl Persistence { /// At startup, load the high level state for shards, such as their config + policy. 
This will /// be enriched at runtime with state discovered on pageservers. pub(crate) async fn list_tenant_shards(&self) -> DatabaseResult> { - let loaded = self - .with_measured_conn( - DatabaseOperation::ListTenantShards, - move |conn| -> DatabaseResult<_> { - Ok(crate::schema::tenant_shards::table.load::(conn)?) - }, - ) - .await?; - - if loaded.is_empty() { - if let Some(path) = &self.json_path { - if tokio::fs::try_exists(path) - .await - .map_err(|e| DatabaseError::Logical(format!("Error stat'ing JSON file: {e}")))? - { - tracing::info!("Importing from legacy JSON format at {path}"); - return self.list_tenant_shards_json(path).await; - } - } - } - Ok(loaded) - } - - /// Shim for automated compatibility tests: load tenants from a JSON file instead of database - pub(crate) async fn list_tenant_shards_json( - &self, - path: &Utf8Path, - ) -> DatabaseResult> { - let bytes = tokio::fs::read(path) - .await - .map_err(|e| DatabaseError::Logical(format!("Failed to load JSON: {e}")))?; - - let mut decoded = serde_json::from_slice::(&bytes) - .map_err(|e| DatabaseError::Logical(format!("Deserialization error: {e}")))?; - for shard in decoded.tenants.values_mut() { - if shard.placement_policy == "\"Single\"" { - // Backward compat for test data after PR https://github.com/neondatabase/neon/pull/7165 - shard.placement_policy = "{\"Attached\":0}".to_string(); - } - - if shard.scheduling_policy.is_empty() { - shard.scheduling_policy = - serde_json::to_string(&ShardSchedulingPolicy::default()).unwrap(); - } - } - - let tenants: Vec = decoded.tenants.into_values().collect(); - - // Synchronize database with what is in the JSON file - self.insert_tenant_shards(tenants.clone()).await?; - - Ok(tenants) - } - - /// For use in testing environments, where we dump out JSON on shutdown. - pub async fn write_tenants_json(&self) -> anyhow::Result<()> { - let Some(path) = &self.json_path else { - anyhow::bail!("Cannot write JSON if path isn't set (test environment bug)"); - }; - tracing::info!("Writing state to {path}..."); - let tenants = self.list_tenant_shards().await?; - let mut tenants_map = HashMap::new(); - for tsp in tenants { - let tenant_shard_id = TenantShardId { - tenant_id: TenantId::from_str(tsp.tenant_id.as_str())?, - shard_number: ShardNumber(tsp.shard_number as u8), - shard_count: ShardCount::new(tsp.shard_count as u8), - }; - - tenants_map.insert(tenant_shard_id, tsp); - } - let json = serde_json::to_string(&JsonPersistence { - tenants: tenants_map, - })?; - - tokio::fs::write(path, &json).await?; - tracing::info!("Wrote {} bytes to {path}...", json.len()); - - Ok(()) + self.with_measured_conn( + DatabaseOperation::ListTenantShards, + move |conn| -> DatabaseResult<_> { + Ok(crate::schema::tenant_shards::table.load::(conn)?) + }, + ) + .await } /// Tenants must be persisted before we schedule them for the first time. 
This enables us diff --git a/storage_controller/src/service.rs b/storage_controller/src/service.rs index deaac83ea5..a163453dca 100644 --- a/storage_controller/src/service.rs +++ b/storage_controller/src/service.rs @@ -117,6 +117,7 @@ enum TenantOperations { TimelineCreate, TimelineDelete, AttachHook, + TimelineDetachAncestor, } #[derive(Clone, strum_macros::Display)] @@ -2376,18 +2377,18 @@ impl Service { tracing::info!("Doing time travel recovery for shard {tenant_shard_id}",); client - .tenant_time_travel_remote_storage( - tenant_shard_id, - ×tamp, - &done_if_after, - ) - .await - .map_err(|e| { - ApiError::InternalServerError(anyhow::anyhow!( - "Error doing time travel recovery for shard {tenant_shard_id} on node {}: {e}", - node - )) - })?; + .tenant_time_travel_remote_storage( + tenant_shard_id, + ×tamp, + &done_if_after, + ) + .await + .map_err(|e| { + ApiError::InternalServerError(anyhow::anyhow!( + "Error doing time travel recovery for shard {tenant_shard_id} on node {}: {e}", + node + )) + })?; } } Ok(()) @@ -2757,7 +2758,7 @@ impl Service { // Create timeline on remaining shards with number >0 if !targets.is_empty() { // If we had multiple shards, issue requests for the remainder now. - let jwt = self.config.jwt_token.clone(); + let jwt = &self.config.jwt_token; self.tenant_for_shards(targets, |tenant_shard_id: TenantShardId, node: Node| { let create_req = create_req.clone(); Box::pin(create_one(tenant_shard_id, node, jwt.clone(), create_req)) @@ -2768,6 +2769,114 @@ impl Service { Ok(timeline_info) } + pub(crate) async fn tenant_timeline_detach_ancestor( + &self, + tenant_id: TenantId, + timeline_id: TimelineId, + ) -> Result { + tracing::info!("Detaching timeline {tenant_id}/{timeline_id}",); + + let _tenant_lock = trace_shared_lock( + &self.tenant_op_locks, + tenant_id, + TenantOperations::TimelineDetachAncestor, + ) + .await; + + self.ensure_attached_wait(tenant_id).await?; + + let targets = { + let locked = self.inner.read().unwrap(); + let mut targets = Vec::new(); + + for (tenant_shard_id, shard) in + locked.tenants.range(TenantShardId::tenant_range(tenant_id)) + { + let node_id = shard.intent.get_attached().ok_or_else(|| { + ApiError::InternalServerError(anyhow::anyhow!("Shard not scheduled")) + })?; + let node = locked + .nodes + .get(&node_id) + .expect("Pageservers may not be deleted while referenced"); + + targets.push((*tenant_shard_id, node.clone())); + } + targets + }; + + if targets.is_empty() { + return Err(ApiError::NotFound( + anyhow::anyhow!("Tenant not found").into(), + )); + } + + async fn detach_one( + tenant_shard_id: TenantShardId, + timeline_id: TimelineId, + node: Node, + jwt: Option, + ) -> Result<(ShardNumber, models::detach_ancestor::AncestorDetached), ApiError> { + tracing::info!( + "Detaching timeline on shard {tenant_shard_id}/{timeline_id}, attached to node {node}", + ); + + let client = PageserverClient::new(node.get_id(), node.base_url(), jwt.as_deref()); + client + .timeline_detach_ancestor(tenant_shard_id, timeline_id) + .await + .map_err(|e| { + use mgmt_api::Error; + + match e { + // no ancestor (ever) + Error::ApiError(StatusCode::CONFLICT, msg) => ApiError::Conflict(format!( + "{node}: {}", + msg.strip_prefix("Conflict: ").unwrap_or(&msg) + )), + // too many ancestors + Error::ApiError(StatusCode::BAD_REQUEST, msg) => { + ApiError::BadRequest(anyhow::anyhow!("{node}: {msg}")) + } + // rest can be mapped + other => passthrough_api_error(&node, other), + } + }) + .map(|res| (tenant_shard_id.shard_number, res)) + } + + // no shard needs to 
go first/last; the operation should be idempotent + // TODO: it would be great to ensure that all shards return the same error + let mut results = self + .tenant_for_shards(targets, |tenant_shard_id, node| { + futures::FutureExt::boxed(detach_one( + tenant_shard_id, + timeline_id, + node, + self.config.jwt_token.clone(), + )) + }) + .await?; + + let any = results.pop().expect("we must have at least one response"); + + let mismatching = results + .iter() + .filter(|(_, res)| res != &any.1) + .collect::>(); + if !mismatching.is_empty() { + let matching = results.len() - mismatching.len(); + tracing::error!( + matching, + compared_against=?any, + ?mismatching, + "shards returned different results" + ); + } + + Ok(any.1) + } + /// Helper for concurrently calling a pageserver API on a number of shards, such as timeline creation. /// /// On success, the returned vector contains exactly the same number of elements as the input `locations`. @@ -2894,8 +3003,8 @@ impl Service { .await .map_err(|e| { ApiError::InternalServerError(anyhow::anyhow!( - "Error deleting timeline {timeline_id} on {tenant_shard_id} on node {node}: {e}", - )) + "Error deleting timeline {timeline_id} on {tenant_shard_id} on node {node}: {e}", + )) }) } @@ -3847,6 +3956,8 @@ impl Service { "failpoint".to_string() ))); + failpoint_support::sleep_millis_async!("shard-split-post-remote-sleep", &self.cancel); + tracing::info!( "Split {} into {}", parent_id, diff --git a/storage_scrubber/Cargo.toml b/storage_scrubber/Cargo.toml index 050be66483..5233afbebe 100644 --- a/storage_scrubber/Cargo.toml +++ b/storage_scrubber/Cargo.toml @@ -34,6 +34,7 @@ camino.workspace = true rustls.workspace = true rustls-native-certs.workspace = true once_cell.workspace = true +storage_controller_client.workspace = true tokio = { workspace = true, features = ["macros", "rt-multi-thread"] } chrono = { workspace = true, default-features = false, features = ["clock", "serde"] } diff --git a/storage_scrubber/src/lib.rs b/storage_scrubber/src/lib.rs index 9102ad9906..a0b6d7ea30 100644 --- a/storage_scrubber/src/lib.rs +++ b/storage_scrubber/src/lib.rs @@ -24,6 +24,7 @@ use camino::{Utf8Path, Utf8PathBuf}; use clap::ValueEnum; use pageserver::tenant::TENANTS_SEGMENT_NAME; use pageserver_api::shard::TenantShardId; +use remote_storage::RemotePath; use reqwest::Url; use serde::{Deserialize, Serialize}; use tokio::io::AsyncReadExt; @@ -31,7 +32,7 @@ use tracing::error; use tracing_appender::non_blocking::WorkerGuard; use tracing_subscriber::{fmt, prelude::*, EnvFilter}; use utils::fs_ext; -use utils::id::{TenantId, TimelineId}; +use utils::id::{TenantId, TenantTimelineId, TimelineId}; const MAX_RETRIES: usize = 20; const CLOUD_ADMIN_API_TOKEN_ENV_VAR: &str = "CLOUD_ADMIN_API_TOKEN"; @@ -54,7 +55,7 @@ pub struct S3Target { /// in the pageserver, as all timeline objects existing in the scope of a particular /// tenant: the scrubber is different in that it handles collections of data referring to many /// TenantShardTimelineIds in on place. 
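In `tenant_timeline_detach_ancestor` above, the request is fanned out to every shard and the per-shard `AncestorDetached` responses are expected to agree; one response is returned and any divergence is only logged, since the operation is idempotent per shard. A condensed sketch of that consistency check over plain values:

    // Sketch: keep one shard response, report if the others disagree.
    fn pick_consistent<T: PartialEq + std::fmt::Debug>(mut results: Vec<T>) -> T {
        let any = results.pop().expect("at least one shard response");
        let mismatching: Vec<&T> = results.iter().filter(|r| **r != any).collect();
        if !mismatching.is_empty() {
            // Divergence is logged for investigation rather than failing the request.
            eprintln!("shards returned different results: {mismatching:?} vs {any:?}");
        }
        any
    }

    fn main() {
        let per_shard = vec![vec![1u64, 2], vec![1, 2], vec![1, 2]];
        assert_eq!(pick_consistent(per_shard), vec![1, 2]);
    }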
-#[derive(Serialize, Deserialize, Debug, Clone, Copy, Hash, PartialEq, Eq)] +#[derive(Serialize, Deserialize, Debug, Clone, Copy, Hash, PartialEq, Eq, PartialOrd, Ord)] pub struct TenantShardTimelineId { tenant_shard_id: TenantShardId, timeline_id: TimelineId, @@ -67,6 +68,10 @@ impl TenantShardTimelineId { timeline_id, } } + + fn as_tenant_timeline_id(&self) -> TenantTimelineId { + TenantTimelineId::new(self.tenant_shard_id.tenant_id, self.timeline_id) + } } impl Display for TenantShardTimelineId { @@ -179,6 +184,22 @@ impl RootTarget { .with_sub_segment(&id.timeline_id.to_string()) } + /// Given RemotePath "tenants/foo/timelines/bar/layerxyz", prefix it to a literal + /// key in the S3 bucket. + pub fn absolute_key(&self, key: &RemotePath) -> String { + let root = match self { + Self::Pageserver(root) => root, + Self::Safekeeper(root) => root, + }; + + let prefix = &root.prefix_in_bucket; + if prefix.ends_with('/') { + format!("{prefix}{key}") + } else { + format!("{prefix}/{key}") + } + } + pub fn bucket_name(&self) -> &str { match self { Self::Pageserver(root) => &root.bucket_name, @@ -216,6 +237,14 @@ impl BucketConfig { } } +pub struct ControllerClientConfig { + /// URL to storage controller. e.g. http://127.0.0.1:1234 when using `neon_local` + pub controller_api: Url, + + /// JWT token for authenticating with storage controller. Requires scope 'scrubber' or 'admin'. + pub controller_jwt: String, +} + pub struct ConsoleConfig { pub token: String, pub base_url: Url, diff --git a/storage_scrubber/src/main.rs b/storage_scrubber/src/main.rs index d816121192..b3ed6f6451 100644 --- a/storage_scrubber/src/main.rs +++ b/storage_scrubber/src/main.rs @@ -1,11 +1,12 @@ -use anyhow::bail; +use anyhow::{anyhow, bail}; use camino::Utf8PathBuf; use pageserver_api::shard::TenantShardId; -use storage_scrubber::find_large_objects; +use reqwest::Url; use storage_scrubber::garbage::{find_garbage, purge_garbage, PurgeMode}; use storage_scrubber::pageserver_physical_gc::GcMode; use storage_scrubber::scan_pageserver_metadata::scan_metadata; use storage_scrubber::tenant_snapshot::SnapshotDownloader; +use storage_scrubber::{find_large_objects, ControllerClientConfig}; use storage_scrubber::{ init_logging, pageserver_physical_gc::pageserver_physical_gc, scan_safekeeper_metadata::scan_safekeeper_metadata, BucketConfig, ConsoleConfig, NodeKind, @@ -24,6 +25,14 @@ struct Cli { #[arg(short, long, default_value_t = false)] delete: bool, + + #[arg(long)] + /// URL to storage controller. e.g. http://127.0.0.1:1234 when using `neon_local` + controller_api: Option, + + #[arg(long)] + /// JWT token for authenticating with storage controller. Requires scope 'scrubber' or 'admin'. + controller_jwt: Option, } #[derive(Subcommand, Debug)] @@ -204,8 +213,37 @@ async fn main() -> anyhow::Result<()> { min_age, mode, } => { - let summary = - pageserver_physical_gc(bucket_config, tenant_ids, min_age.into(), mode).await?; + let controller_client_conf = cli.controller_api.map(|controller_api| { + ControllerClientConfig { + controller_api, + // Default to no key: this is a convenience when working in a development environment + controller_jwt: cli.controller_jwt.unwrap_or("".to_owned()), + } + }); + + match (&controller_client_conf, mode) { + (Some(_), _) => { + // Any mode may run when controller API is set + } + (None, GcMode::Full) => { + // The part of physical GC where we erase ancestor layers cannot be done safely without + // confirming the most recent complete shard split with the controller. 
Refuse to run, rather + // than doing it unsafely. + return Err(anyhow!("Full physical GC requires `--controller-api` and `--controller-jwt` to run")); + } + (None, GcMode::DryRun | GcMode::IndicesOnly) => { + // These GcModes do not require the controller to run. + } + } + + let summary = pageserver_physical_gc( + bucket_config, + controller_client_conf, + tenant_ids, + min_age.into(), + mode, + ) + .await?; println!("{}", serde_json::to_string(&summary).unwrap()); Ok(()) } diff --git a/storage_scrubber/src/pageserver_physical_gc.rs b/storage_scrubber/src/pageserver_physical_gc.rs index fb8fbc1635..e977fd49f7 100644 --- a/storage_scrubber/src/pageserver_physical_gc.rs +++ b/storage_scrubber/src/pageserver_physical_gc.rs @@ -1,22 +1,50 @@ -use std::time::{Duration, UNIX_EPOCH}; +use std::collections::{BTreeMap, HashMap}; +use std::sync::Arc; +use std::time::{Duration, SystemTime}; use crate::checks::{list_timeline_blobs, BlobDataParseResult}; use crate::metadata_stream::{stream_tenant_timelines, stream_tenants}; -use crate::{init_remote, BucketConfig, NodeKind, RootTarget, TenantShardTimelineId}; +use crate::{ + init_remote, BucketConfig, ControllerClientConfig, NodeKind, RootTarget, TenantShardTimelineId, +}; use aws_sdk_s3::Client; use futures_util::{StreamExt, TryStreamExt}; -use pageserver::tenant::remote_timeline_client::parse_remote_index_path; +use pageserver::tenant::remote_timeline_client::index::LayerFileMetadata; +use pageserver::tenant::remote_timeline_client::{parse_remote_index_path, remote_layer_path}; +use pageserver::tenant::storage_layer::LayerName; use pageserver::tenant::IndexPart; -use pageserver_api::shard::TenantShardId; +use pageserver_api::controller_api::TenantDescribeResponse; +use pageserver_api::shard::{ShardIndex, TenantShardId}; use remote_storage::RemotePath; +use reqwest::Method; use serde::Serialize; +use storage_controller_client::control_api; use tracing::{info_span, Instrument}; use utils::generation::Generation; +use utils::id::{TenantId, TenantTimelineId}; #[derive(Serialize, Default)] pub struct GcSummary { indices_deleted: usize, remote_storage_errors: usize, + controller_api_errors: usize, + ancestor_layers_deleted: usize, +} + +impl GcSummary { + fn merge(&mut self, other: Self) { + let Self { + indices_deleted, + remote_storage_errors, + ancestor_layers_deleted, + controller_api_errors, + } = other; + + self.indices_deleted += indices_deleted; + self.remote_storage_errors += remote_storage_errors; + self.ancestor_layers_deleted += ancestor_layers_deleted; + self.controller_api_errors += controller_api_errors; + } } #[derive(clap::ValueEnum, Debug, Clone, Copy)] @@ -26,9 +54,9 @@ pub enum GcMode { // Enable only removing old-generation indices IndicesOnly, + // Enable all forms of GC - // TODO: this will be used when shard split ancestor layer deletion is added - // All, + Full, } impl std::fmt::Display for GcMode { @@ -36,10 +64,232 @@ impl std::fmt::Display for GcMode { match self { GcMode::DryRun => write!(f, "dry-run"), GcMode::IndicesOnly => write!(f, "indices-only"), + GcMode::Full => write!(f, "full"), } } } +mod refs { + use super::*; + // Map of cross-shard layer references, giving a refcount for each layer in each shard that is referenced by some other + // shard in the same tenant. This is sparse! The vast majority of timelines will have no cross-shard refs, and those that + // do have cross shard refs should eventually drop most of them via compaction. 
+ // + // In our inner map type, the TTID in the key is shard-agnostic, and the ShardIndex in the value refers to the _ancestor + // which is is referenced_. + #[derive(Default)] + pub(super) struct AncestorRefs( + BTreeMap>, + ); + + impl AncestorRefs { + /// Insert references for layers discovered in a particular shard-timeline that refer to an ancestral shard-timeline. + pub(super) fn update( + &mut self, + ttid: TenantShardTimelineId, + layers: Vec<(LayerName, LayerFileMetadata)>, + ) { + let ttid_refs = self.0.entry(ttid.as_tenant_timeline_id()).or_default(); + for (layer_name, layer_metadata) in layers { + // Increment refcount of this layer in the ancestor shard + *(ttid_refs + .entry((layer_metadata.shard, layer_name)) + .or_default()) += 1; + } + } + + /// For a particular TTID, return the map of all ancestor layers referenced by a descendent to their refcount + /// + /// The `ShardIndex` in the result's key is the index of the _ancestor_, not the descendent. + pub(super) fn get_ttid_refcounts( + &self, + ttid: &TenantTimelineId, + ) -> Option<&HashMap<(ShardIndex, LayerName), usize>> { + self.0.get(ttid) + } + } +} + +use refs::AncestorRefs; + +// As we see shards for a tenant, acccumulate knowledge needed for cross-shard GC: +// - Are there any ancestor shards? +// - Are there any refs to ancestor shards' layers? +#[derive(Default)] +struct TenantRefAccumulator { + shards_seen: HashMap>, + + // For each shard that has refs to an ancestor's layers, the set of ancestor layers referred to + ancestor_ref_shards: AncestorRefs, +} + +impl TenantRefAccumulator { + fn update(&mut self, ttid: TenantShardTimelineId, index_part: &IndexPart) { + let this_shard_idx = ttid.tenant_shard_id.to_index(); + (*self + .shards_seen + .entry(ttid.tenant_shard_id.tenant_id) + .or_default()) + .push(this_shard_idx); + + let mut ancestor_refs = Vec::new(); + for (layer_name, layer_metadata) in &index_part.layer_metadata { + if layer_metadata.shard != this_shard_idx { + // This is a reference from this shard to a layer in an ancestor shard: we must track this + // as a marker to not GC this layer from the parent. + ancestor_refs.push((layer_name.clone(), layer_metadata.clone())); + } + } + + if !ancestor_refs.is_empty() { + tracing::info!(%ttid, "Found {} ancestor refs", ancestor_refs.len()); + self.ancestor_ref_shards.update(ttid, ancestor_refs); + } + } + + /// Consume Self and return a vector of ancestor tenant shards that should be GC'd, and map of referenced ancestor layers to preserve + async fn into_gc_ancestors( + self, + controller_client: &control_api::Client, + summary: &mut GcSummary, + ) -> (Vec, AncestorRefs) { + let mut ancestors_to_gc = Vec::new(); + for (tenant_id, mut shard_indices) in self.shards_seen { + // Find the highest shard count + let latest_count = shard_indices + .iter() + .map(|i| i.shard_count) + .max() + .expect("Always at least one shard"); + + let (mut latest_shards, ancestor_shards) = { + let at = + itertools::partition(&mut shard_indices, |i| i.shard_count == latest_count); + (shard_indices[0..at].to_owned(), &shard_indices[at..]) + }; + // Sort shards, as we will later compare them with a sorted list from the controller + latest_shards.sort(); + + // Check that we have a complete view of the latest shard count: this should always be the case unless we happened + // to scan the S3 bucket halfway through a shard split. + if latest_shards.len() != latest_count.count() as usize { + // This should be extremely rare, so we warn on it. 
+ tracing::warn!(%tenant_id, "Missed some shards at count {:?}", latest_count); + continue; + } + + // Check if we have any non-latest-count shards + if ancestor_shards.is_empty() { + tracing::debug!(%tenant_id, "No ancestor shards to clean up"); + continue; + } + + // Based on S3 view, this tenant looks like it might have some ancestor shard work to do. We + // must only do this work if the tenant is not currently being split: otherwise, it is not safe + // to GC ancestors, because if the split fails then the controller will try to attach ancestor + // shards again. + match controller_client + .dispatch::<(), TenantDescribeResponse>( + Method::GET, + format!("control/v1/tenant/{tenant_id}"), + None, + ) + .await + { + Err(e) => { + // We were not able to learn the latest shard split state from the controller, so we will not + // do ancestor GC on this tenant. + tracing::warn!(%tenant_id, "Failed to query storage controller, will not do ancestor GC: {e}"); + summary.controller_api_errors += 1; + continue; + } + Ok(desc) => { + // We expect to see that the latest shard count matches the one we saw in S3, and that none + // of the shards indicate splitting in progress. + + let controller_indices: Vec = desc + .shards + .iter() + .map(|s| s.tenant_shard_id.to_index()) + .collect(); + if controller_indices != latest_shards { + tracing::info!(%tenant_id, "Latest shards seen in S3 ({latest_shards:?}) don't match controller state ({controller_indices:?})"); + continue; + } + + if desc.shards.iter().any(|s| s.is_splitting) { + tracing::info!(%tenant_id, "One or more shards is currently splitting"); + continue; + } + + // This shouldn't be too noisy, because we only log this for tenants that have some ancestral refs. + tracing::info!(%tenant_id, "Validated state with controller: {desc:?}"); + } + } + + // GC ancestor shards + for ancestor_shard in ancestor_shards.iter().map(|idx| TenantShardId { + tenant_id, + shard_count: idx.shard_count, + shard_number: idx.shard_number, + }) { + ancestors_to_gc.push(ancestor_shard); + } + } + + (ancestors_to_gc, self.ancestor_ref_shards) + } +} + +async fn is_old_enough( + s3_client: &Client, + bucket_config: &BucketConfig, + min_age: &Duration, + key: &str, + summary: &mut GcSummary, +) -> bool { + // Validation: we will only GC indices & layers after a time threshold (e.g. one week) so that during an incident + // it is easier to read old data for analysis, and easier to roll back shard splits without having to un-delete any objects. 
+ let age: Duration = match s3_client + .head_object() + .bucket(&bucket_config.bucket) + .key(key) + .send() + .await + { + Ok(response) => match response.last_modified { + None => { + tracing::warn!("Missing last_modified"); + summary.remote_storage_errors += 1; + return false; + } + Some(last_modified) => match SystemTime::try_from(last_modified).map(|t| t.elapsed()) { + Ok(Ok(e)) => e, + Err(_) | Ok(Err(_)) => { + tracing::warn!("Bad last_modified time: {last_modified:?}"); + return false; + } + }, + }, + Err(e) => { + tracing::warn!("Failed to HEAD {key}: {e}"); + summary.remote_storage_errors += 1; + return false; + } + }; + let old_enough = &age > min_age; + + if !old_enough { + tracing::info!( + "Skipping young object {} < {}", + humantime::format_duration(age), + humantime::format_duration(*min_age) + ); + } + + old_enough +} + async fn maybe_delete_index( s3_client: &Client, bucket_config: &BucketConfig, @@ -79,45 +329,7 @@ async fn maybe_delete_index( return; } - // Validation: we will only delete indices after one week, so that during incidents we will have - // easy access to recent indices. - let age: Duration = match s3_client - .head_object() - .bucket(&bucket_config.bucket) - .key(key) - .send() - .await - { - Ok(response) => match response.last_modified { - None => { - tracing::warn!("Missing last_modified"); - summary.remote_storage_errors += 1; - return; - } - Some(last_modified) => { - let last_modified = - UNIX_EPOCH + Duration::from_secs_f64(last_modified.as_secs_f64()); - match last_modified.elapsed() { - Ok(e) => e, - Err(_) => { - tracing::warn!("Bad last_modified time: {last_modified:?}"); - return; - } - } - } - }, - Err(e) => { - tracing::warn!("Failed to HEAD {key}: {e}"); - summary.remote_storage_errors += 1; - return; - } - }; - if &age < min_age { - tracing::info!( - "Skipping young object {} < {}", - age.as_secs_f64(), - min_age.as_secs_f64() - ); + if !is_old_enough(s3_client, bucket_config, min_age, key, summary).await { return; } @@ -145,6 +357,108 @@ async fn maybe_delete_index( } } +#[allow(clippy::too_many_arguments)] +async fn gc_ancestor( + s3_client: &Client, + bucket_config: &BucketConfig, + root_target: &RootTarget, + min_age: &Duration, + ancestor: TenantShardId, + refs: &AncestorRefs, + mode: GcMode, + summary: &mut GcSummary, +) -> anyhow::Result<()> { + // Scan timelines in the ancestor + let timelines = stream_tenant_timelines(s3_client, root_target, ancestor).await?; + let mut timelines = std::pin::pin!(timelines); + + // Build a list of keys to retain + + while let Some(ttid) = timelines.next().await { + let ttid = ttid?; + + let data = list_timeline_blobs(s3_client, ttid, root_target).await?; + + let s3_layers = match data.blob_data { + BlobDataParseResult::Parsed { + index_part: _, + index_part_generation: _, + s3_layers, + } => s3_layers, + BlobDataParseResult::Relic => { + // Post-deletion tenant location: don't try and GC it. 
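+ // Nothing for ancestor GC to count or delete in an already-deleted location: move on to the next timeline.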
+ continue; + } + BlobDataParseResult::Incorrect(reasons) => { + // Our primary purpose isn't to report on bad data, but log this rather than skipping silently + tracing::warn!( + "Skipping ancestor GC for timeline {ttid}, bad metadata: {reasons:?}" + ); + continue; + } + }; + + let ttid_refs = refs.get_ttid_refcounts(&ttid.as_tenant_timeline_id()); + let ancestor_shard_index = ttid.tenant_shard_id.to_index(); + + for (layer_name, layer_gen) in s3_layers { + let ref_count = ttid_refs + .and_then(|m| m.get(&(ancestor_shard_index, layer_name.clone()))) + .copied() + .unwrap_or(0); + + if ref_count > 0 { + tracing::debug!(%ttid, "Ancestor layer {layer_name} has {ref_count} refs"); + continue; + } + + tracing::info!(%ttid, "Ancestor layer {layer_name} is not referenced"); + + // Build the key for the layer we are considering deleting + let key = root_target.absolute_key(&remote_layer_path( + &ttid.tenant_shard_id.tenant_id, + &ttid.timeline_id, + ancestor_shard_index, + &layer_name, + layer_gen, + )); + + // We apply a time threshold to GCing objects that are un-referenced: this preserves our ability + // to roll back a shard split if we have to, by avoiding deleting ancestor layers right away + if !is_old_enough(s3_client, bucket_config, min_age, &key, summary).await { + continue; + } + + if !matches!(mode, GcMode::Full) { + tracing::info!("Dry run: would delete key {key}"); + continue; + } + + // All validations passed: erase the object + match s3_client + .delete_object() + .bucket(&bucket_config.bucket) + .key(&key) + .send() + .await + { + Ok(_) => { + tracing::info!("Successfully deleted unreferenced ancestor layer {key}"); + summary.ancestor_layers_deleted += 1; + } + Err(e) => { + tracing::warn!("Failed to delete layer {key}: {e}"); + summary.remote_storage_errors += 1; + } + } + } + + // TODO: if all the layers are gone, clean up the whole timeline dir (remove index) + } + + Ok(()) +} + /// Physical garbage collection: removing unused S3 objects. This is distinct from the garbage collection /// done inside the pageserver, which operates at a higher level (keys, layers). This type of garbage collection /// is about removing: @@ -156,22 +470,26 @@ async fn maybe_delete_index( /// make sure that object listings don't get slowed down by large numbers of garbage objects. pub async fn pageserver_physical_gc( bucket_config: BucketConfig, - tenant_ids: Vec, + controller_client_conf: Option, + tenant_shard_ids: Vec, min_age: Duration, mode: GcMode, ) -> anyhow::Result { let (s3_client, target) = init_remote(bucket_config.clone(), NodeKind::Pageserver).await?; - let tenants = if tenant_ids.is_empty() { + let tenants = if tenant_shard_ids.is_empty() { futures::future::Either::Left(stream_tenants(&s3_client, &target)) } else { - futures::future::Either::Right(futures::stream::iter(tenant_ids.into_iter().map(Ok))) + futures::future::Either::Right(futures::stream::iter(tenant_shard_ids.into_iter().map(Ok))) }; // How many tenants to process in parallel. We need to be mindful of pageservers // accessing the same per tenant prefixes, so use a lower setting than pageservers. 
const CONCURRENCY: usize = 32; + // Accumulate information about each tenant for cross-shard GC step we'll do at the end + let accumulator = Arc::new(std::sync::Mutex::new(TenantRefAccumulator::default())); + // Generate a stream of TenantTimelineId let timelines = tenants.map_ok(|t| stream_tenant_timelines(&s3_client, &target, t)); let timelines = timelines.try_buffered(CONCURRENCY); @@ -185,16 +503,17 @@ pub async fn pageserver_physical_gc( target: &RootTarget, mode: GcMode, ttid: TenantShardTimelineId, + accumulator: &Arc>, ) -> anyhow::Result { let mut summary = GcSummary::default(); let data = list_timeline_blobs(s3_client, ttid, target).await?; - let (latest_gen, candidates) = match &data.blob_data { + let (index_part, latest_gen, candidates) = match &data.blob_data { BlobDataParseResult::Parsed { - index_part: _index_part, + index_part, index_part_generation, s3_layers: _s3_layers, - } => (*index_part_generation, data.unused_index_keys), + } => (index_part, *index_part_generation, data.unused_index_keys), BlobDataParseResult::Relic => { // Post-deletion tenant location: don't try and GC it. return Ok(summary); @@ -206,6 +525,8 @@ pub async fn pageserver_physical_gc( } }; + accumulator.lock().unwrap().update(ttid, index_part); + for key in candidates { maybe_delete_index( s3_client, @@ -222,17 +543,61 @@ pub async fn pageserver_physical_gc( Ok(summary) } - let timelines = timelines - .map_ok(|ttid| gc_timeline(&s3_client, &bucket_config, &min_age, &target, mode, ttid)); - let mut timelines = std::pin::pin!(timelines.try_buffered(CONCURRENCY)); let mut summary = GcSummary::default(); - while let Some(i) = timelines.next().await { - let tl_summary = i?; + // Drain futures for per-shard GC, populating accumulator as a side effect + { + let timelines = timelines.map_ok(|ttid| { + gc_timeline( + &s3_client, + &bucket_config, + &min_age, + &target, + mode, + ttid, + &accumulator, + ) + }); + let mut timelines = std::pin::pin!(timelines.try_buffered(CONCURRENCY)); - summary.indices_deleted += tl_summary.indices_deleted; - summary.remote_storage_errors += tl_summary.remote_storage_errors; + while let Some(i) = timelines.next().await { + summary.merge(i?); + } + } + + // Execute cross-shard GC, using the accumulator's full view of all the shards built in the per-shard GC + let Some(controller_client) = controller_client_conf.as_ref().map(|c| { + let ControllerClientConfig { + controller_api, + controller_jwt, + } = c; + control_api::Client::new(controller_api.clone(), Some(controller_jwt.clone())) + }) else { + tracing::info!("Skipping ancestor layer GC, because no `--controller-api` was specified"); + return Ok(summary); + }; + + let (ancestor_shards, ancestor_refs) = Arc::into_inner(accumulator) + .unwrap() + .into_inner() + .unwrap() + .into_gc_ancestors(&controller_client, &mut summary) + .await; + + for ancestor_shard in ancestor_shards { + gc_ancestor( + &s3_client, + &bucket_config, + &target, + &min_age, + ancestor_shard, + &ancestor_refs, + mode, + &mut summary, + ) + .instrument(info_span!("gc_ancestor", %ancestor_shard)) + .await?; } Ok(summary) diff --git a/test_runner/fixtures/metrics.py b/test_runner/fixtures/metrics.py index c019cbbc77..4836d42db5 100644 --- a/test_runner/fixtures/metrics.py +++ b/test_runner/fixtures/metrics.py @@ -146,6 +146,8 @@ PAGESERVER_PER_TENANT_METRICS: Tuple[str, ...] 
= ( "pageserver_smgr_query_seconds_sum", "pageserver_archive_size", "pageserver_pitr_history_size", + "pageserver_layer_bytes", + "pageserver_layer_count", "pageserver_storage_operations_seconds_count_total", "pageserver_storage_operations_seconds_sum_total", "pageserver_evictions_total", diff --git a/test_runner/fixtures/neon_api.py b/test_runner/fixtures/neon_api.py index 39baf5fab6..658ed119a1 100644 --- a/test_runner/fixtures/neon_api.py +++ b/test_runner/fixtures/neon_api.py @@ -261,3 +261,47 @@ class NeonAPI: if op["status"] in {"scheduling", "running", "cancelling"}: has_running = True time.sleep(0.5) + + +class NeonApiEndpoint: + def __init__(self, neon_api: NeonAPI, pg_version: PgVersion, project_id: Optional[str]): + self.neon_api = neon_api + if project_id is None: + project = neon_api.create_project(pg_version) + neon_api.wait_for_operation_to_finish(project["project"]["id"]) + self.project_id = project["project"]["id"] + self.endpoint_id = project["endpoints"][0]["id"] + self.connstr = project["connection_uris"][0]["connection_uri"] + self.pgbench_env = connection_parameters_to_env( + project["connection_uris"][0]["connection_parameters"] + ) + self.is_new = True + else: + project = neon_api.get_project_details(project_id) + if int(project["project"]["pg_version"]) != int(pg_version): + raise Exception( + f"A project with the provided ID exists, but it's not of the specified version (expected {pg_version}, got {project['project']['pg_version']})" + ) + self.project_id = project_id + eps = neon_api.get_endpoints(project_id)["endpoints"] + self.endpoint_id = eps[0]["id"] + self.connstr = neon_api.get_connection_uri(project_id, endpoint_id=self.endpoint_id)[ + "uri" + ] + pw = self.connstr.split("@")[0].split(":")[-1] + self.pgbench_env = { + "PGHOST": eps[0]["host"], + "PGDATABASE": "neondb", + "PGUSER": "neondb_owner", + "PGPASSWORD": pw, + } + self.is_new = False + + def restart(self): + self.neon_api.restart_endpoint(self.project_id, self.endpoint_id) + self.neon_api.wait_for_operation_to_finish(self.project_id) + + def get_synthetic_storage_size(self) -> int: + return int( + self.neon_api.get_project_details(self.project_id)["project"]["synthetic_storage_size"] + ) diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index 463e4a3b01..9e39457c06 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -31,6 +31,7 @@ import backoff import httpx import jwt import psycopg2 +import psycopg2.sql import pytest import requests import toml @@ -87,7 +88,7 @@ from fixtures.utils import ( ) from fixtures.utils import AuxFileStore as AuxFileStore # reexport -from .neon_api import NeonAPI +from .neon_api import NeonAPI, NeonApiEndpoint """ This file contains pytest fixtures. 
A fixture is a test resource that can be @@ -727,8 +728,30 @@ class NeonEnvBuilder: self.repo_dir / "local_fs_remote_storage", ) - if (attachments_json := Path(repo_dir / "attachments.json")).exists(): - shutil.copyfile(attachments_json, self.repo_dir / attachments_json.name) + # restore storage controller (the db is small, don't bother with overlayfs) + storcon_db_from_dir = repo_dir / "storage_controller_db" + storcon_db_to_dir = self.repo_dir / "storage_controller_db" + log.info(f"Copying storage_controller_db from {storcon_db_from_dir} to {storcon_db_to_dir}") + assert storcon_db_from_dir.is_dir() + assert not storcon_db_to_dir.exists() + + def ignore_postgres_log(path: str, _names): + if Path(path) == storcon_db_from_dir: + return {"postgres.log"} + return set() + + shutil.copytree(storcon_db_from_dir, storcon_db_to_dir, ignore=ignore_postgres_log) + assert not (storcon_db_to_dir / "postgres.log").exists() + # NB: neon_local rewrites postgresql.conf on each start based on neon_local config. No need to patch it. + # However, in this new NeonEnv, the pageservers listen on different ports, and the storage controller + # will currently reject re-attach requests from them because the NodeMetadata isn't identical. + # So, from_repo_dir patches up the the storcon database. + patch_script_path = self.repo_dir / "storage_controller_db.startup.sql" + assert not patch_script_path.exists() + patch_script = "" + for ps in self.env.pageservers: + patch_script += f"UPDATE nodes SET listen_http_port={ps.service_port.http}, listen_pg_port={ps.service_port.pg} WHERE node_id = '{ps.id}';" + patch_script_path.write_text(patch_script) # Update the config with info about tenants and timelines with (self.repo_dir / "config").open("r") as f: @@ -974,7 +997,7 @@ class NeonEnvBuilder: if self.scrub_on_exit: try: - StorageScrubber(self).scan_metadata() + self.env.storage_scrubber.scan_metadata() except Exception as e: log.error(f"Error during remote storage scrub: {e}") cleanup_error = e @@ -1135,6 +1158,7 @@ class NeonEnv: "listen_http_addr": f"localhost:{pageserver_port.http}", "pg_auth_type": pg_auth_type, "http_auth_type": http_auth_type, + "image_compression": "zstd", } if self.pageserver_virtual_file_io_engine is not None: ps_cfg["virtual_file_io_engine"] = self.pageserver_virtual_file_io_engine @@ -1201,6 +1225,9 @@ class NeonEnv: ) cfg["safekeepers"].append(sk_cfg) + # Scrubber instance for tests that use it, and for use during teardown checks + self.storage_scrubber = StorageScrubber(self, log_dir=config.test_output_dir) + log.info(f"Config: {cfg}") self.neon_cli.init( cfg, @@ -2400,7 +2427,7 @@ class NeonStorageController(MetricsGetter, LogUtils): def locate(self, tenant_id: TenantId) -> list[dict[str, Any]]: """ - :return: list of {"shard_id": "", "node_id": int, "listen_pg_addr": str, "listen_pg_port": int, "listen_http_addr: str, "listen_http_port: int} + :return: list of {"shard_id": "", "node_id": int, "listen_pg_addr": str, "listen_pg_port": int, "listen_http_addr": str, "listen_http_port": int} """ response = self.request( "GET", @@ -2786,8 +2813,8 @@ class NeonPageserver(PgProtocol, LogUtils): ) return client.tenant_attach( tenant_id, + generation, config, - generation=generation, ) def tenant_detach(self, tenant_id: TenantId): @@ -3158,6 +3185,18 @@ class RemotePostgres(PgProtocol): pass +@pytest.fixture(scope="function") +def benchmark_project_pub(neon_api: NeonAPI, pg_version: PgVersion) -> NeonApiEndpoint: + project_id = os.getenv("BENCHMARK_PROJECT_ID_PUB") + return 
NeonApiEndpoint(neon_api, pg_version, project_id) + + +@pytest.fixture(scope="function") +def benchmark_project_sub(neon_api: NeonAPI, pg_version: PgVersion) -> NeonApiEndpoint: + project_id = os.getenv("BENCHMARK_PROJECT_ID_SUB") + return NeonApiEndpoint(neon_api, pg_version, project_id) + + @pytest.fixture(scope="function") def remote_pg( test_output_dir: Path, pg_distrib_dir: Path, pg_version: PgVersion @@ -3773,12 +3812,12 @@ class Endpoint(PgProtocol, LogUtils): self.endpoint_id, self.tenant_id, pageserver_id, self.active_safekeepers ) - def respec(self, **kwargs): + def respec(self, **kwargs: Any) -> None: """Update the endpoint.json file used by control_plane.""" # Read config config_path = os.path.join(self.endpoint_path(), "endpoint.json") with open(config_path, "r") as f: - data_dict = json.load(f) + data_dict: dict[str, Any] = json.load(f) # Write it back updated with open(config_path, "w") as file: @@ -3786,13 +3825,13 @@ class Endpoint(PgProtocol, LogUtils): json.dump(dict(data_dict, **kwargs), file, indent=4) # Please note: Migrations only run if pg_skip_catalog_updates is false - def wait_for_migrations(self): + def wait_for_migrations(self, num_migrations: int = 10): with self.cursor() as cur: def check_migrations_done(): cur.execute("SELECT id FROM neon_migration.migration_id") - migration_id = cur.fetchall()[0][0] - assert migration_id != 0 + migration_id: int = cur.fetchall()[0][0] + assert migration_id >= num_migrations wait_until(20, 0.5, check_migrations_done) @@ -4042,6 +4081,22 @@ class Safekeeper(LogUtils): self.id = id self.running = running self.logfile = Path(self.data_dir) / f"safekeeper-{id}.log" + + if extra_opts is None: + # Testing defaults: enable everything, and set short timeouts so that background + # work will happen during short tests. + # **Note**: Any test that explicitly sets extra_opts will not get these defaults. 
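+ # The short intervals below keep offload, partial backup, control-file saves and eviction on a
+ # seconds scale, so this background work actually runs within a short test's lifetime.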
+ extra_opts = [ + "--enable-offload", + "--delete-offloaded-wal", + "--partial-backup-timeout", + "10s", + "--control-file-save-interval", + "1s", + "--eviction-min-resident", + "10s", + ] + self.extra_opts = extra_opts def start( @@ -4213,9 +4268,9 @@ class Safekeeper(LogUtils): class StorageScrubber: - def __init__(self, env: NeonEnvBuilder, log_dir: Optional[Path] = None): + def __init__(self, env: NeonEnv, log_dir: Path): self.env = env - self.log_dir = log_dir or env.test_output_dir + self.log_dir = log_dir def scrubber_cli(self, args: list[str], timeout) -> str: assert isinstance(self.env.pageserver_remote_storage, S3Storage) @@ -4232,11 +4287,14 @@ class StorageScrubber: if s3_storage.endpoint is not None: env.update({"AWS_ENDPOINT_URL": s3_storage.endpoint}) - base_args = [str(self.env.neon_binpath / "storage_scrubber")] + base_args = [ + str(self.env.neon_binpath / "storage_scrubber"), + f"--controller-api={self.env.storage_controller_api}", + ] args = base_args + args (output_path, stdout, status_code) = subprocess_capture( - self.env.test_output_dir, + self.log_dir, args, echo_stderr=True, echo_stdout=True, @@ -4275,7 +4333,10 @@ class StorageScrubber: log.info(f"tenant-snapshot output: {stdout}") def pageserver_physical_gc( - self, min_age_secs: int, tenant_ids: Optional[list[TenantId]] = None + self, + min_age_secs: int, + tenant_ids: Optional[list[TenantId]] = None, + mode: Optional[str] = None, ): args = ["pageserver-physical-gc", "--min-age", f"{min_age_secs}s"] @@ -4285,6 +4346,9 @@ class StorageScrubber: for tenant_id in tenant_ids: args.extend(["--tenant-id", str(tenant_id)]) + if mode is not None: + args.extend(["--mode", mode]) + stdout = self.scrubber_cli( args, timeout=30, diff --git a/test_runner/fixtures/pageserver/http.py b/test_runner/fixtures/pageserver/http.py index 03aee9e5c5..c7cea4ec04 100644 --- a/test_runner/fixtures/pageserver/http.py +++ b/test_runner/fixtures/pageserver/http.py @@ -117,6 +117,9 @@ class LayerMapInfo: def image_layers(self) -> List[HistoricLayerInfo]: return [x for x in self.historic_layers if x.kind == "Image"] + def delta_l0_layers(self) -> List[HistoricLayerInfo]: + return [x for x in self.historic_layers if x.kind == "Delta" and x.l0] + def historic_by_name(self) -> Set[str]: return set(x.layer_file_name for x in self.historic_layers) @@ -172,6 +175,21 @@ class PageserverHttpClient(requests.Session, MetricsGetter): if auth_token is not None: self.headers["Authorization"] = f"Bearer {auth_token}" + def without_status_retrying(self) -> PageserverHttpClient: + retries = Retry( + status=0, + connect=5, + read=False, + backoff_factor=0.2, + status_forcelist=[], + allowed_methods=None, + remove_headers_on_redirect=[], + ) + + return PageserverHttpClient( + self.port, self.is_testing_enabled_or_skip, self.auth_token, retries + ) + @property def base_url(self) -> str: return f"http://localhost:{self.port}" @@ -223,8 +241,8 @@ class PageserverHttpClient(requests.Session, MetricsGetter): def tenant_attach( self, tenant_id: Union[TenantId, TenantShardId], + generation: int, config: None | Dict[str, Any] = None, - generation: Optional[int] = None, ): config = config or {} @@ -814,17 +832,19 @@ class PageserverHttpClient(requests.Session, MetricsGetter): tenant_id: Union[TenantId, TenantShardId], timeline_id: TimelineId, batch_size: int | None = None, - ) -> Set[TimelineId]: + **kwargs, + ) -> List[TimelineId]: params = {} if batch_size is not None: params["batch_size"] = batch_size res = self.put( 
f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}/detach_ancestor", params=params, + **kwargs, ) self.verbose_error(res) json = res.json() - return set(map(TimelineId, json["reparented_timelines"])) + return list(map(TimelineId, json["reparented_timelines"])) def evict_layer( self, tenant_id: Union[TenantId, TenantShardId], timeline_id: TimelineId, layer_name: str diff --git a/test_runner/performance/pageserver/pagebench/test_pageserver_max_throughput_getpage_at_latest_lsn.py b/test_runner/performance/pageserver/pagebench/test_pageserver_max_throughput_getpage_at_latest_lsn.py index 60861cf939..949813c984 100644 --- a/test_runner/performance/pageserver/pagebench/test_pageserver_max_throughput_getpage_at_latest_lsn.py +++ b/test_runner/performance/pageserver/pagebench/test_pageserver_max_throughput_getpage_at_latest_lsn.py @@ -255,11 +255,3 @@ def run_pagebench_benchmark( unit="ms", report=MetricReport.LOWER_IS_BETTER, ) - - env.storage_controller.allowed_errors.append( - # The test setup swaps NeonEnv instances, hence different - # pg instances are used for the storage controller db. This means - # the storage controller doesn't know about the nodes mentioned - # in attachments.json at start-up. - ".* Scheduler missing node 1", - ) diff --git a/test_runner/performance/test_compaction.py b/test_runner/performance/test_compaction.py index 326c4f5c6f..077b76104c 100644 --- a/test_runner/performance/test_compaction.py +++ b/test_runner/performance/test_compaction.py @@ -2,6 +2,7 @@ from contextlib import closing import pytest from fixtures.compare_fixtures import NeonCompare +from fixtures.log_helper import log from fixtures.neon_fixtures import wait_for_last_flush_lsn @@ -56,3 +57,98 @@ def test_compaction(neon_compare: NeonCompare): pageserver_http.timeline_compact(tenant_id, timeline_id) neon_compare.report_size() + + +def test_compaction_l0_memory(neon_compare: NeonCompare): + """ + Generate a large stack of L0s pending compaction into L1s, and + measure the pageserver's peak RSS while doing so + """ + + env = neon_compare.env + pageserver_http = env.pageserver.http_client() + + tenant_id, timeline_id = env.neon_cli.create_tenant( + conf={ + # Initially disable compaction so that we will build up a stack of L0s + "compaction_period": "0s", + "gc_period": "0s", + } + ) + neon_compare.tenant = tenant_id + neon_compare.timeline = timeline_id + + endpoint = env.endpoints.create_start( + "main", tenant_id=tenant_id, config_lines=["shared_buffers=512MB"] + ) + + # Read tenant effective config and assert on checkpoint_distance and compaction_threshold, + # as we do want to test with defaults (to be same as the field), but this test's workload size makes assumptions about them. + # + # If these assertions fail, it probably means we changed the default. 
+ tenant_conf = pageserver_http.tenant_config(tenant_id) + assert tenant_conf.effective_config["checkpoint_distance"] == 256 * 1024 * 1024 + assert tenant_conf.effective_config["compaction_threshold"] == 10 + + # Aim to write about 20 L0s, so that we will hit the limit on how many + # to compact at once + with closing(endpoint.connect()) as conn: + with conn.cursor() as cur: + for i in range(200): + cur.execute(f"create table tbl{i} (i int, j int);") + cur.execute(f"insert into tbl{i} values (generate_series(1, 1000), 0);") + for j in range(100): + cur.execute(f"update tbl{i} set j = {j};") + + wait_for_last_flush_lsn(env, endpoint, tenant_id, timeline_id) + endpoint.stop() + + # Check we have generated the L0 stack we expected + layers = pageserver_http.layer_map_info(tenant_id, timeline_id) + initial_l0s = len(layers.delta_l0_layers()) + initial_l0s_size = sum(x.layer_file_size for x in layers.delta_l0_layers()) + log.info(f"l0s before compaction {initial_l0s} ({initial_l0s_size})") + + def rss_hwm(): + v = pageserver_http.get_metric_value("libmetrics_maxrss_kb") + assert v is not None + assert v > 0 + return v * 1024 + + before = rss_hwm() + pageserver_http.timeline_compact(tenant_id, timeline_id) + after = rss_hwm() + + log.info(f"RSS across compaction: {before} -> {after} (grew {after - before})") + + layers = pageserver_http.layer_map_info(tenant_id, timeline_id) + final_l0s_size = sum(x.layer_file_size for x in layers.delta_l0_layers()) + log.info(f"l0s after compaction {len(layers.delta_l0_layers())} ({final_l0s_size})") + + assert after > before # If we didn't use some memory the test is probably buggy + compaction_mapped_rss = after - before + + # During L0 compaction, we require as much memory as the physical size of what we compacted, and then some, + # because the key->value mapping in L0s compaction is exhaustive, non-streaming, and does not de-duplicate + # repeated references to the same key. + # + # To be fixed in https://github.com/neondatabase/neon/issues/8184, after which + # this memory estimate can be revised far downwards to something that doesn't scale + # linearly with the layer sizes. 
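+    # The 1.25 factor allows some headroom above the physical bytes being compacted, so that fixed
+    # allocator and bookkeeping overheads do not make the upper-bound assertion below flaky.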
+ MEMORY_ESTIMATE = (initial_l0s_size - final_l0s_size) * 1.25 + + # If we find that compaction is using more memory, this may indicate a regression + assert compaction_mapped_rss < MEMORY_ESTIMATE + + # If we find that compaction is using <0.5 the expected memory then: + # - maybe we made a big efficiency improvement, in which case update the test + # - maybe something is functionally wrong with the test and it's not driving the system as expected + assert compaction_mapped_rss > MEMORY_ESTIMATE / 2 + + # We should have compacted some but not all of the l0s, based on the limit on how much + # l0 to compact in one go + assert len(layers.delta_l0_layers()) > 0 + assert len(layers.delta_l0_layers()) < initial_l0s + + # The pageserver should have logged when it hit the compaction size limit + env.pageserver.assert_log_contains(".*hit max delta layer size limit.*") diff --git a/test_runner/performance/test_logical_replication.py b/test_runner/performance/test_logical_replication.py index 5ab83dd31d..53bb29a659 100644 --- a/test_runner/performance/test_logical_replication.py +++ b/test_runner/performance/test_logical_replication.py @@ -1,7 +1,6 @@ from __future__ import annotations import time -import traceback from typing import TYPE_CHECKING import psycopg2 @@ -10,15 +9,12 @@ import pytest from fixtures.benchmark_fixture import MetricReport from fixtures.common_types import Lsn from fixtures.log_helper import log -from fixtures.neon_api import connection_parameters_to_env from fixtures.neon_fixtures import AuxFileStore, logical_replication_sync -from fixtures.pg_version import PgVersion if TYPE_CHECKING: from fixtures.benchmark_fixture import NeonBenchmarker - from fixtures.neon_api import NeonAPI + from fixtures.neon_api import NeonApiEndpoint from fixtures.neon_fixtures import NeonEnv, PgBin - from fixtures.pg_version import PgVersion @pytest.mark.parametrize("pageserver_aux_file_policy", [AuxFileStore.V2]) @@ -86,8 +82,8 @@ def measure_logical_replication_lag(sub_cur, pub_cur, timeout_sec=600): @pytest.mark.timeout(2 * 60 * 60) def test_subscriber_lag( pg_bin: PgBin, - neon_api: NeonAPI, - pg_version: PgVersion, + benchmark_project_pub: NeonApiEndpoint, + benchmark_project_sub: NeonApiEndpoint, zenbenchmark: NeonBenchmarker, ): """ @@ -99,125 +95,82 @@ def test_subscriber_lag( sync_interval_min = 5 pgbench_duration = f"-T{test_duration_min * 60 * 2}" - pub_project = neon_api.create_project(pg_version) - pub_project_id = pub_project["project"]["id"] - neon_api.wait_for_operation_to_finish(pub_project_id) - error_occurred = False + pub_env = benchmark_project_pub.pgbench_env + sub_env = benchmark_project_sub.pgbench_env + pub_connstr = benchmark_project_pub.connstr + sub_connstr = benchmark_project_sub.connstr + + pg_bin.run_capture(["pgbench", "-i", "-s100"], env=pub_env) + pg_bin.run_capture(["pgbench", "-i", "-s100"], env=sub_env) + + pub_conn = psycopg2.connect(pub_connstr) + sub_conn = psycopg2.connect(sub_connstr) + pub_conn.autocommit = True + sub_conn.autocommit = True + with pub_conn.cursor() as pub_cur, sub_conn.cursor() as sub_cur: + if benchmark_project_pub.is_new: + pub_cur.execute("create publication pub1 for table pgbench_accounts, pgbench_history") + + if benchmark_project_sub.is_new: + sub_cur.execute("truncate table pgbench_accounts") + sub_cur.execute("truncate table pgbench_history") + + sub_cur.execute(f"create subscription sub1 connection '{pub_connstr}' publication pub1") + + initial_sync_lag = measure_logical_replication_lag(sub_cur, pub_cur) + pub_conn.close() + 
sub_conn.close() + + zenbenchmark.record("initial_sync_lag", initial_sync_lag, "s", MetricReport.LOWER_IS_BETTER) + + pub_workload = pg_bin.run_nonblocking( + ["pgbench", "-c10", pgbench_duration, "-Mprepared"], env=pub_env + ) try: - sub_project = neon_api.create_project(pg_version) - sub_project_id = sub_project["project"]["id"] - sub_endpoint_id = sub_project["endpoints"][0]["id"] - neon_api.wait_for_operation_to_finish(sub_project_id) + sub_workload = pg_bin.run_nonblocking( + ["pgbench", "-c10", pgbench_duration, "-S"], + env=sub_env, + ) try: - pub_env = connection_parameters_to_env( - pub_project["connection_uris"][0]["connection_parameters"] - ) - sub_env = connection_parameters_to_env( - sub_project["connection_uris"][0]["connection_parameters"] - ) - pub_connstr = pub_project["connection_uris"][0]["connection_uri"] - sub_connstr = sub_project["connection_uris"][0]["connection_uri"] + start = time.time() + while time.time() - start < test_duration_min * 60: + time.sleep(sync_interval_min * 60) + check_pgbench_still_running(pub_workload, "pub") + check_pgbench_still_running(sub_workload, "sub") - pg_bin.run_capture(["pgbench", "-i", "-s100"], env=pub_env) - pg_bin.run_capture(["pgbench", "-i", "-s100"], env=sub_env) + with psycopg2.connect(pub_connstr) as pub_conn, psycopg2.connect( + sub_connstr + ) as sub_conn: + with pub_conn.cursor() as pub_cur, sub_conn.cursor() as sub_cur: + lag = measure_logical_replication_lag(sub_cur, pub_cur) - pub_conn = psycopg2.connect(pub_connstr) - sub_conn = psycopg2.connect(sub_connstr) - pub_conn.autocommit = True - sub_conn.autocommit = True - with pub_conn.cursor() as pub_cur, sub_conn.cursor() as sub_cur: - sub_cur.execute("truncate table pgbench_accounts") - sub_cur.execute("truncate table pgbench_history") + log.info(f"Replica lagged behind master by {lag} seconds") + zenbenchmark.record("replica_lag", lag, "s", MetricReport.LOWER_IS_BETTER) + sub_workload.terminate() + benchmark_project_sub.restart() - pub_cur.execute( - "create publication pub1 for table pgbench_accounts, pgbench_history" - ) - sub_cur.execute( - f"create subscription sub1 connection '{pub_connstr}' publication pub1" - ) - - initial_sync_lag = measure_logical_replication_lag(sub_cur, pub_cur) - pub_conn.close() - sub_conn.close() - - zenbenchmark.record( - "initial_sync_lag", initial_sync_lag, "s", MetricReport.LOWER_IS_BETTER - ) - - pub_workload = pg_bin.run_nonblocking( - ["pgbench", "-c10", pgbench_duration, "-Mprepared"], env=pub_env - ) - try: sub_workload = pg_bin.run_nonblocking( ["pgbench", "-c10", pgbench_duration, "-S"], env=sub_env, ) - try: - start = time.time() - while time.time() - start < test_duration_min * 60: - time.sleep(sync_interval_min * 60) - check_pgbench_still_running(pub_workload, "pub") - check_pgbench_still_running(sub_workload, "sub") - with psycopg2.connect(pub_connstr) as pub_conn, psycopg2.connect( - sub_connstr - ) as sub_conn: - with pub_conn.cursor() as pub_cur, sub_conn.cursor() as sub_cur: - lag = measure_logical_replication_lag(sub_cur, pub_cur) - - log.info(f"Replica lagged behind master by {lag} seconds") - zenbenchmark.record("replica_lag", lag, "s", MetricReport.LOWER_IS_BETTER) - sub_workload.terminate() - neon_api.restart_endpoint( - sub_project_id, - sub_endpoint_id, - ) - neon_api.wait_for_operation_to_finish(sub_project_id) - sub_workload = pg_bin.run_nonblocking( - ["pgbench", "-c10", pgbench_duration, "-S"], - env=sub_env, - ) - - # Measure storage to make sure replication information isn't bloating storage - sub_storage = 
neon_api.get_project_details(sub_project_id)["project"][ - "synthetic_storage_size" - ] - pub_storage = neon_api.get_project_details(pub_project_id)["project"][ - "synthetic_storage_size" - ] - zenbenchmark.record( - "sub_storage", sub_storage, "B", MetricReport.LOWER_IS_BETTER - ) - zenbenchmark.record( - "pub_storage", pub_storage, "B", MetricReport.LOWER_IS_BETTER - ) - - finally: - sub_workload.terminate() - finally: - pub_workload.terminate() - except Exception as e: - error_occurred = True - log.error(f"Caught exception {e}") - log.error(traceback.format_exc()) + # Measure storage to make sure replication information isn't bloating storage + sub_storage = benchmark_project_sub.get_synthetic_storage_size() + pub_storage = benchmark_project_pub.get_synthetic_storage_size() + zenbenchmark.record("sub_storage", sub_storage, "B", MetricReport.LOWER_IS_BETTER) + zenbenchmark.record("pub_storage", pub_storage, "B", MetricReport.LOWER_IS_BETTER) finally: - if not error_occurred: - neon_api.delete_project(sub_project_id) - except Exception as e: - error_occurred = True - log.error(f"Caught exception {e}") - log.error(traceback.format_exc()) + sub_workload.terminate() finally: - assert not error_occurred - neon_api.delete_project(pub_project_id) + pub_workload.terminate() @pytest.mark.remote_cluster @pytest.mark.timeout(2 * 60 * 60) def test_publisher_restart( pg_bin: PgBin, - neon_api: NeonAPI, - pg_version: PgVersion, + benchmark_project_pub: NeonApiEndpoint, + benchmark_project_sub: NeonApiEndpoint, zenbenchmark: NeonBenchmarker, ): """ @@ -229,114 +182,70 @@ def test_publisher_restart( sync_interval_min = 5 pgbench_duration = f"-T{test_duration_min * 60 * 2}" - pub_project = neon_api.create_project(pg_version) - pub_project_id = pub_project["project"]["id"] - pub_endpoint_id = pub_project["endpoints"][0]["id"] - neon_api.wait_for_operation_to_finish(pub_project_id) - error_occurred = False + pub_env = benchmark_project_pub.pgbench_env + sub_env = benchmark_project_sub.pgbench_env + pub_connstr = benchmark_project_pub.connstr + sub_connstr = benchmark_project_sub.connstr + + pg_bin.run_capture(["pgbench", "-i", "-s100"], env=pub_env) + pg_bin.run_capture(["pgbench", "-i", "-s100"], env=sub_env) + + pub_conn = psycopg2.connect(pub_connstr) + sub_conn = psycopg2.connect(sub_connstr) + pub_conn.autocommit = True + sub_conn.autocommit = True + with pub_conn.cursor() as pub_cur, sub_conn.cursor() as sub_cur: + if benchmark_project_pub.is_new: + pub_cur.execute("create publication pub1 for table pgbench_accounts, pgbench_history") + + if benchmark_project_sub.is_new: + sub_cur.execute("truncate table pgbench_accounts") + sub_cur.execute("truncate table pgbench_history") + + sub_cur.execute(f"create subscription sub1 connection '{pub_connstr}' publication pub1") + + initial_sync_lag = measure_logical_replication_lag(sub_cur, pub_cur) + pub_conn.close() + sub_conn.close() + + zenbenchmark.record("initial_sync_lag", initial_sync_lag, "s", MetricReport.LOWER_IS_BETTER) + + pub_workload = pg_bin.run_nonblocking( + ["pgbench", "-c10", pgbench_duration, "-Mprepared"], env=pub_env + ) try: - sub_project = neon_api.create_project(pg_version) - sub_project_id = sub_project["project"]["id"] - neon_api.wait_for_operation_to_finish(sub_project_id) + sub_workload = pg_bin.run_nonblocking( + ["pgbench", "-c10", pgbench_duration, "-S"], + env=sub_env, + ) try: - pub_env = connection_parameters_to_env( - pub_project["connection_uris"][0]["connection_parameters"] - ) - sub_env = connection_parameters_to_env( - 
sub_project["connection_uris"][0]["connection_parameters"] - ) - pub_connstr = pub_project["connection_uris"][0]["connection_uri"] - sub_connstr = sub_project["connection_uris"][0]["connection_uri"] + start = time.time() + while time.time() - start < test_duration_min * 60: + time.sleep(sync_interval_min * 60) + check_pgbench_still_running(pub_workload, "pub") + check_pgbench_still_running(sub_workload, "sub") - pg_bin.run_capture(["pgbench", "-i", "-s100"], env=pub_env) - pg_bin.run_capture(["pgbench", "-i", "-s100"], env=sub_env) - - pub_conn = psycopg2.connect(pub_connstr) - sub_conn = psycopg2.connect(sub_connstr) - pub_conn.autocommit = True - sub_conn.autocommit = True - with pub_conn.cursor() as pub_cur, sub_conn.cursor() as sub_cur: - sub_cur.execute("truncate table pgbench_accounts") - sub_cur.execute("truncate table pgbench_history") - - pub_cur.execute( - "create publication pub1 for table pgbench_accounts, pgbench_history" - ) - sub_cur.execute( - f"create subscription sub1 connection '{pub_connstr}' publication pub1" - ) - - initial_sync_lag = measure_logical_replication_lag(sub_cur, pub_cur) - pub_conn.close() - sub_conn.close() - - zenbenchmark.record( - "initial_sync_lag", initial_sync_lag, "s", MetricReport.LOWER_IS_BETTER - ) - - pub_workload = pg_bin.run_nonblocking( - ["pgbench", "-c10", pgbench_duration, "-Mprepared"], env=pub_env - ) - try: - sub_workload = pg_bin.run_nonblocking( - ["pgbench", "-c10", pgbench_duration, "-S"], - env=sub_env, - ) - try: - start = time.time() - while time.time() - start < test_duration_min * 60: - time.sleep(sync_interval_min * 60) - check_pgbench_still_running(pub_workload, "pub") - check_pgbench_still_running(sub_workload, "sub") - - pub_workload.terminate() - neon_api.restart_endpoint( - pub_project_id, - pub_endpoint_id, - ) - neon_api.wait_for_operation_to_finish(pub_project_id) - pub_workload = pg_bin.run_nonblocking( - ["pgbench", "-c10", pgbench_duration, "-Mprepared"], - env=pub_env, - ) - with psycopg2.connect(pub_connstr) as pub_conn, psycopg2.connect( - sub_connstr - ) as sub_conn: - with pub_conn.cursor() as pub_cur, sub_conn.cursor() as sub_cur: - lag = measure_logical_replication_lag(sub_cur, pub_cur) - - log.info(f"Replica lagged behind master by {lag} seconds") - zenbenchmark.record("replica_lag", lag, "s", MetricReport.LOWER_IS_BETTER) - - # Measure storage to make sure replication information isn't bloating storage - sub_storage = neon_api.get_project_details(sub_project_id)["project"][ - "synthetic_storage_size" - ] - pub_storage = neon_api.get_project_details(pub_project_id)["project"][ - "synthetic_storage_size" - ] - zenbenchmark.record( - "sub_storage", sub_storage, "B", MetricReport.LOWER_IS_BETTER - ) - zenbenchmark.record( - "pub_storage", pub_storage, "B", MetricReport.LOWER_IS_BETTER - ) - - finally: - sub_workload.terminate() - finally: pub_workload.terminate() - except Exception as e: - error_occurred = True - log.error(f"Caught exception {e}") - log.error(traceback.format_exc()) + benchmark_project_pub.restart() + pub_workload = pg_bin.run_nonblocking( + ["pgbench", "-c10", pgbench_duration, "-Mprepared"], + env=pub_env, + ) + with psycopg2.connect(pub_connstr) as pub_conn, psycopg2.connect( + sub_connstr + ) as sub_conn: + with pub_conn.cursor() as pub_cur, sub_conn.cursor() as sub_cur: + lag = measure_logical_replication_lag(sub_cur, pub_cur) + + log.info(f"Replica lagged behind master by {lag} seconds") + zenbenchmark.record("replica_lag", lag, "s", MetricReport.LOWER_IS_BETTER) + + # Measure storage 
to make sure replication information isn't bloating storage + sub_storage = benchmark_project_sub.get_synthetic_storage_size() + pub_storage = benchmark_project_pub.get_synthetic_storage_size() + zenbenchmark.record("sub_storage", sub_storage, "B", MetricReport.LOWER_IS_BETTER) + zenbenchmark.record("pub_storage", pub_storage, "B", MetricReport.LOWER_IS_BETTER) finally: - if not error_occurred: - neon_api.delete_project(sub_project_id) - except Exception as e: - error_occurred = True - log.error(f"Caught exception {e}") - log.error(traceback.format_exc()) + sub_workload.terminate() finally: - assert not error_occurred - neon_api.delete_project(pub_project_id) + pub_workload.terminate() diff --git a/test_runner/regress/test_auth.py b/test_runner/regress/test_auth.py index 922a21a999..7cb85e3dd1 100644 --- a/test_runner/regress/test_auth.py +++ b/test_runner/regress/test_auth.py @@ -211,7 +211,7 @@ def test_auth_failures(neon_env_builder: NeonEnvBuilder, auth_enabled: bool): def check_pageserver(expect_success: bool, **conn_kwargs): check_connection( env.pageserver, - f"show {env.initial_tenant}", + f"pagestream {env.initial_tenant} {env.initial_timeline}", expect_success, **conn_kwargs, ) diff --git a/test_runner/regress/test_branch_and_gc.py b/test_runner/regress/test_branch_and_gc.py index eb503ddbfa..f2e3855c12 100644 --- a/test_runner/regress/test_branch_and_gc.py +++ b/test_runner/regress/test_branch_and_gc.py @@ -65,8 +65,8 @@ def test_branch_and_gc(neon_simple_env: NeonEnv, build_type: str): "compaction_period": "1 s", "compaction_threshold": "2", "image_creation_threshold": "1", - # set PITR interval to be small, so we can do GC - "pitr_interval": "1 s", + # Disable PITR, this test will set an explicit space-based GC limit + "pitr_interval": "0 s", } ) diff --git a/test_runner/regress/test_compaction.py b/test_runner/regress/test_compaction.py index f321c09b27..be787e0642 100644 --- a/test_runner/regress/test_compaction.py +++ b/test_runner/regress/test_compaction.py @@ -6,7 +6,10 @@ from typing import Optional import pytest from fixtures.log_helper import log -from fixtures.neon_fixtures import NeonEnvBuilder, generate_uploads_and_deletions +from fixtures.neon_fixtures import ( + NeonEnvBuilder, + generate_uploads_and_deletions, +) from fixtures.pageserver.http import PageserverApiException from fixtures.utils import wait_until from fixtures.workload import Workload @@ -142,6 +145,10 @@ def test_sharding_compaction( "image_layer_creation_check_threshold": 0, } + # Disable compression, as we can't estimate the size of layers with compression enabled + # TODO: implement eager layer cutting during compaction + neon_env_builder.pageserver_config_override = "image_compression='disabled'" + neon_env_builder.num_pageservers = 1 if shard_count is None else shard_count env = neon_env_builder.init_start( initial_tenant_conf=TENANT_CONF, @@ -320,3 +327,87 @@ def test_pageserver_compaction_circuit_breaker(neon_env_builder: NeonEnvBuilder) or 0 ) == 0 assert not env.pageserver.log_contains(".*Circuit breaker failure ended.*") + + +@pytest.mark.parametrize("enabled", [True, False]) +def test_image_layer_compression(neon_env_builder: NeonEnvBuilder, enabled: bool): + tenant_conf = { + # small checkpointing and compaction targets to ensure we generate many upload operations + "checkpoint_distance": f"{128 * 1024}", + "compaction_threshold": "1", + "compaction_target_size": f"{128 * 1024}", + # no PITR horizon, we specify the horizon when we request on-demand GC + "pitr_interval": "0s", + # disable 
background compaction and GC. We invoke it manually when we want it to happen. + "gc_period": "0s", + "compaction_period": "0s", + # create image layers as eagerly as possible + "image_creation_threshold": "1", + "image_layer_creation_check_threshold": "0", + } + + # Explicitly enable/disable compression, rather than using default + if enabled: + neon_env_builder.pageserver_config_override = "image_compression='zstd'" + else: + neon_env_builder.pageserver_config_override = "image_compression='disabled'" + + env = neon_env_builder.init_start(initial_tenant_conf=tenant_conf) + + tenant_id = env.initial_tenant + timeline_id = env.initial_timeline + + pageserver = env.pageserver + ps_http = env.pageserver.http_client() + with env.endpoints.create_start( + "main", tenant_id=tenant_id, pageserver_id=pageserver.id + ) as endpoint: + endpoint.safe_psql("CREATE TABLE foo (id INTEGER PRIMARY KEY, val text)") + # Generate around 800k worth of easily compressible data to store + for v in range(100): + endpoint.safe_psql( + f"INSERT INTO foo (id, val) VALUES ({v}, repeat('abcde{v:0>3}', 500))" + ) + # run compaction to create image layers + ps_http.timeline_checkpoint(tenant_id, timeline_id, wait_until_uploaded=True) + + layer_map = ps_http.layer_map_info(tenant_id, timeline_id) + image_layer_count = 0 + delta_layer_count = 0 + for layer in layer_map.historic_layers: + if layer.kind == "Image": + image_layer_count += 1 + elif layer.kind == "Delta": + delta_layer_count += 1 + assert image_layer_count > 0 + assert delta_layer_count > 0 + + log.info(f"images: {image_layer_count}, deltas: {delta_layer_count}") + + bytes_in = pageserver.http_client().get_metric_value( + "pageserver_compression_image_in_bytes_total" + ) + bytes_out = pageserver.http_client().get_metric_value( + "pageserver_compression_image_out_bytes_total" + ) + assert bytes_in is not None + assert bytes_out is not None + log.info(f"Compression ratio: {bytes_out/bytes_in} ({bytes_out} in, {bytes_out} out)") + + if enabled: + # We are writing high compressible repetitive plain text, expect excellent compression + EXPECT_RATIO = 0.2 + assert bytes_out / bytes_in < EXPECT_RATIO + else: + # Nothing should be compressed if we disabled it. + assert bytes_out >= bytes_in + + # Destroy the endpoint and create a new one to resetthe caches + with env.endpoints.create_start( + "main", tenant_id=tenant_id, pageserver_id=pageserver.id + ) as endpoint: + for v in range(100): + res = endpoint.safe_psql( + f"SELECT count(*) FROM foo WHERE id={v} and val=repeat('abcde{v:0>3}', 500)" + ) + assert res[0][0] == 1 diff --git a/test_runner/regress/test_compatibility.py b/test_runner/regress/test_compatibility.py index 1e5e320e0e..65649e0c0a 100644 --- a/test_runner/regress/test_compatibility.py +++ b/test_runner/regress/test_compatibility.py @@ -93,29 +93,6 @@ check_ondisk_data_compatibility_if_enabled = pytest.mark.skipif( ) -def fixup_storage_controller(env: NeonEnv): - """ - After importing a repo_dir, we need to massage the storage controller's state a bit: it will have - initially started up with no nodes, but some tenants, and thereby those tenants won't be scheduled - anywhere. - - After NeonEnv.start() is done (i.e. nodes are started + registered), call this function to get - the storage controller into a good state. - - This function should go away once compat tests carry the controller database in their snapshots, so - that the controller properly remembers nodes between creating + restoring the snapshot. 
- """ - env.storage_controller.allowed_errors.extend( - [ - ".*Tenant shard .+ references non-existent node.*", - ".*Failed to schedule tenant .+ at startup.*", - ] - ) - env.storage_controller.stop() - env.storage_controller.start() - env.storage_controller.reconcile_until_idle() - - @pytest.mark.xdist_group("compatibility") @pytest.mark.order(before="test_forward_compatibility") def test_create_snapshot( @@ -198,7 +175,6 @@ def test_backward_compatibility( neon_env_builder.num_safekeepers = 3 env = neon_env_builder.from_repo_dir(compatibility_snapshot_dir / "repo") neon_env_builder.start() - fixup_storage_controller(env) check_neon_works( env, @@ -287,7 +263,6 @@ def test_forward_compatibility( assert not env.pageserver.log_contains("git-env:" + prev_pageserver_version) neon_env_builder.start() - fixup_storage_controller(env) # ensure the specified pageserver is running assert env.pageserver.log_contains("git-env:" + prev_pageserver_version) diff --git a/test_runner/regress/test_disk_usage_eviction.py b/test_runner/regress/test_disk_usage_eviction.py index fb8b7b22fa..3c834f430b 100644 --- a/test_runner/regress/test_disk_usage_eviction.py +++ b/test_runner/regress/test_disk_usage_eviction.py @@ -230,6 +230,9 @@ def _eviction_env( neon_env_builder.num_pageservers = num_pageservers neon_env_builder.enable_pageserver_remote_storage(RemoteStorageKind.LOCAL_FS) + # Disable compression support for EvictionEnv to get larger layer sizes + neon_env_builder.pageserver_config_override = "image_compression='disabled'" + # initial tenant will not be present on this pageserver env = neon_env_builder.init_configs() env.start() diff --git a/test_runner/regress/test_migrations.py b/test_runner/regress/test_migrations.py index 91bd3ea50c..bdc5ca907e 100644 --- a/test_runner/regress/test_migrations.py +++ b/test_runner/regress/test_migrations.py @@ -1,6 +1,10 @@ -import time +from __future__ import annotations -from fixtures.neon_fixtures import NeonEnv +import time +from typing import TYPE_CHECKING + +if TYPE_CHECKING: + from fixtures.neon_fixtures import NeonEnv def test_migrations(neon_simple_env: NeonEnv): @@ -11,17 +15,14 @@ def test_migrations(neon_simple_env: NeonEnv): endpoint.respec(skip_pg_catalog_updates=False) endpoint.start() - endpoint.wait_for_migrations() - num_migrations = 10 + endpoint.wait_for_migrations(num_migrations=num_migrations) with endpoint.cursor() as cur: cur.execute("SELECT id FROM neon_migration.migration_id") migration_id = cur.fetchall() assert migration_id[0][0] == num_migrations - endpoint.assert_log_contains(f"INFO handle_migrations: Ran {num_migrations} migrations") - endpoint.stop() endpoint.start() # We don't have a good way of knowing that the migrations code path finished executing @@ -31,5 +32,3 @@ def test_migrations(neon_simple_env: NeonEnv): cur.execute("SELECT id FROM neon_migration.migration_id") migration_id = cur.fetchall() assert migration_id[0][0] == num_migrations - - endpoint.assert_log_contains("INFO handle_migrations: Ran 0 migrations") diff --git a/test_runner/regress/test_pageserver_generations.py b/test_runner/regress/test_pageserver_generations.py index 7ce38c5c3c..041942cda3 100644 --- a/test_runner/regress/test_pageserver_generations.py +++ b/test_runner/regress/test_pageserver_generations.py @@ -22,7 +22,6 @@ from fixtures.neon_fixtures import ( NeonEnv, NeonEnvBuilder, PgBin, - StorageScrubber, generate_uploads_and_deletions, ) from fixtures.pageserver.common_types import parse_layer_file_name @@ -215,7 +214,7 @@ def 
test_generations_upgrade(neon_env_builder: NeonEnvBuilder): # Having written a mixture of generation-aware and legacy index_part.json, # ensure the scrubber handles the situation as expected. - metadata_summary = StorageScrubber(neon_env_builder).scan_metadata() + metadata_summary = env.storage_scrubber.scan_metadata() assert metadata_summary["tenant_count"] == 1 # Scrubber should have seen our timeline assert metadata_summary["timeline_count"] == 1 assert metadata_summary["timeline_shard_count"] == 1 diff --git a/test_runner/regress/test_pageserver_metric_collection.py b/test_runner/regress/test_pageserver_metric_collection.py index cea35a6acb..24a37b04ec 100644 --- a/test_runner/regress/test_pageserver_metric_collection.py +++ b/test_runner/regress/test_pageserver_metric_collection.py @@ -58,7 +58,6 @@ def test_metric_collection( metric_collection_interval="1s" metric_collection_endpoint="{metric_collection_endpoint}" metric_collection_bucket={remote_storage_to_toml_inline_table(neon_env_builder.pageserver_remote_storage)} - cached_metric_collection_interval="0s" synthetic_size_calculation_interval="3s" """ @@ -216,7 +215,6 @@ def test_metric_collection_cleans_up_tempfile( neon_env_builder.pageserver_config_override = f""" metric_collection_interval="1s" metric_collection_endpoint="{metric_collection_endpoint}" - cached_metric_collection_interval="0s" synthetic_size_calculation_interval="3s" """ diff --git a/test_runner/regress/test_pageserver_secondary.py b/test_runner/regress/test_pageserver_secondary.py index 0416078ebc..58d61eab0d 100644 --- a/test_runner/regress/test_pageserver_secondary.py +++ b/test_runner/regress/test_pageserver_secondary.py @@ -7,7 +7,7 @@ from typing import Any, Dict, Optional import pytest from fixtures.common_types import TenantId, TimelineId from fixtures.log_helper import log -from fixtures.neon_fixtures import NeonEnvBuilder, NeonPageserver, StorageScrubber +from fixtures.neon_fixtures import NeonEnvBuilder, NeonPageserver from fixtures.pageserver.common_types import parse_layer_file_name from fixtures.pageserver.utils import ( assert_prefix_empty, @@ -234,7 +234,7 @@ def test_location_conf_churn(neon_env_builder: NeonEnvBuilder, make_httpserver, # Having done a bunch of attach/detach cycles, we will have generated some index garbage: check # that the scrubber sees it and cleans it up. We do this before the final attach+validate pass, # to also validate that the scrubber isn't breaking anything. - gc_summary = StorageScrubber(neon_env_builder).pageserver_physical_gc(min_age_secs=1) + gc_summary = env.storage_scrubber.pageserver_physical_gc(min_age_secs=1) assert gc_summary["remote_storage_errors"] == 0 assert gc_summary["indices_deleted"] > 0 @@ -555,7 +555,7 @@ def test_secondary_downloads(neon_env_builder: NeonEnvBuilder): # Scrub the remote storage # ======================== # This confirms that the scrubber isn't upset by the presence of the heatmap - StorageScrubber(neon_env_builder).scan_metadata() + env.storage_scrubber.scan_metadata() # Detach secondary and delete tenant # =================================== diff --git a/test_runner/regress/test_pg_regress.py b/test_runner/regress/test_pg_regress.py index 54b493ec70..d5b5ac3f75 100644 --- a/test_runner/regress/test_pg_regress.py +++ b/test_runner/regress/test_pg_regress.py @@ -117,7 +117,7 @@ def post_checks(env: NeonEnv, test_output_dir: Path, db_name: str, endpoint: End # Run the main PostgreSQL regression tests, in src/test/regress. 
# -@pytest.mark.timeout(600) +@pytest.mark.timeout(900) # Contains many sub-tests, is slow in debug builds @pytest.mark.parametrize("shard_count", [None, 4]) def test_pg_regress( neon_env_builder: NeonEnvBuilder, @@ -186,6 +186,7 @@ def test_pg_regress( # Run the PostgreSQL "isolation" tests, in src/test/isolation. # +@pytest.mark.timeout(600) # Contains many sub-tests, is slow in debug builds @pytest.mark.parametrize("shard_count", [None, 4]) def test_isolation( neon_env_builder: NeonEnvBuilder, diff --git a/test_runner/regress/test_remote_storage.py b/test_runner/regress/test_remote_storage.py index fac7fe9dee..09f941f582 100644 --- a/test_runner/regress/test_remote_storage.py +++ b/test_runner/regress/test_remote_storage.py @@ -577,7 +577,7 @@ def test_timeline_deletion_with_files_stuck_in_upload_queue( > 0 ) - wait_until(20, 0.1, assert_compacted_and_uploads_queued) + wait_until(200, 0.1, assert_compacted_and_uploads_queued) # Regardless, give checkpoint some time to block for good. # Not strictly necessary, but might help uncover failure modes in the future. @@ -619,7 +619,7 @@ def test_timeline_deletion_with_files_stuck_in_upload_queue( ) # timeline deletion should be unblocking checkpoint ops - checkpoint_thread.join(2.0) + checkpoint_thread.join(20.0) assert not checkpoint_thread.is_alive() # Just to be sure, unblock ongoing uploads. If the previous assert was incorrect, or the prometheus metric broken, diff --git a/test_runner/regress/test_sharding.py b/test_runner/regress/test_sharding.py index 4471237900..90c6e26d01 100644 --- a/test_runner/regress/test_sharding.py +++ b/test_runner/regress/test_sharding.py @@ -12,7 +12,6 @@ from fixtures.neon_fixtures import ( NeonEnv, NeonEnvBuilder, StorageControllerApiException, - StorageScrubber, last_flush_lsn_upload, tenant_get_shards, wait_for_last_flush_lsn, @@ -128,7 +127,7 @@ def test_sharding_smoke( # Check the scrubber isn't confused by sharded content, then disable # it during teardown because we'll have deleted by then - StorageScrubber(neon_env_builder).scan_metadata() + env.storage_scrubber.scan_metadata() neon_env_builder.scrub_on_exit = False env.storage_controller.pageserver_api().tenant_delete(tenant_id) diff --git a/test_runner/regress/test_storage_scrubber.py b/test_runner/regress/test_storage_scrubber.py index 35ae61c380..635690fc7f 100644 --- a/test_runner/regress/test_storage_scrubber.py +++ b/test_runner/regress/test_storage_scrubber.py @@ -1,14 +1,19 @@ import os import shutil +import threading +import time +from concurrent.futures import ThreadPoolExecutor from typing import Optional import pytest from fixtures.common_types import TenantId, TenantShardId, TimelineId +from fixtures.log_helper import log from fixtures.neon_fixtures import ( + NeonEnv, NeonEnvBuilder, - StorageScrubber, ) from fixtures.remote_storage import S3Storage, s3_storage +from fixtures.utils import wait_until from fixtures.workload import Workload @@ -60,8 +65,7 @@ def test_scrubber_tenant_snapshot(neon_env_builder: NeonEnvBuilder, shard_count: output_path = neon_env_builder.test_output_dir / "snapshot" os.makedirs(output_path) - scrubber = StorageScrubber(neon_env_builder) - scrubber.tenant_snapshot(tenant_id, output_path) + env.storage_scrubber.tenant_snapshot(tenant_id, output_path) assert len(os.listdir(output_path)) > 0 @@ -111,6 +115,14 @@ def test_scrubber_tenant_snapshot(neon_env_builder: NeonEnvBuilder, shard_count: workload.validate() +def drop_local_state(env: NeonEnv, tenant_id: TenantId): + 
env.storage_controller.tenant_policy_update(tenant_id, {"placement": "Detached"}) + env.storage_controller.reconcile_until_idle() + + env.storage_controller.tenant_policy_update(tenant_id, {"placement": {"Attached": 0}}) + env.storage_controller.reconcile_until_idle() + + @pytest.mark.parametrize("shard_count", [None, 4]) def test_scrubber_physical_gc(neon_env_builder: NeonEnvBuilder, shard_count: Optional[int]): neon_env_builder.enable_pageserver_remote_storage(s3_storage()) @@ -133,28 +145,231 @@ def test_scrubber_physical_gc(neon_env_builder: NeonEnvBuilder, shard_count: Opt # For each cycle, detach and attach the tenant to bump the generation, and do some writes to generate uploads for _i in range(0, n_cycles): - env.storage_controller.tenant_policy_update(tenant_id, {"placement": "Detached"}) - env.storage_controller.reconcile_until_idle() - - env.storage_controller.tenant_policy_update(tenant_id, {"placement": {"Attached": 0}}) - env.storage_controller.reconcile_until_idle() + drop_local_state(env, tenant_id) # This write includes remote upload, will generate an index in this generation workload.write_rows(1) # With a high min_age, the scrubber should decline to delete anything - gc_summary = StorageScrubber(neon_env_builder).pageserver_physical_gc(min_age_secs=3600) + gc_summary = env.storage_scrubber.pageserver_physical_gc(min_age_secs=3600) assert gc_summary["remote_storage_errors"] == 0 assert gc_summary["indices_deleted"] == 0 # If targeting a different tenant, the scrubber shouldn't do anything - gc_summary = StorageScrubber(neon_env_builder).pageserver_physical_gc( + gc_summary = env.storage_scrubber.pageserver_physical_gc( min_age_secs=1, tenant_ids=[TenantId.generate()] ) assert gc_summary["remote_storage_errors"] == 0 assert gc_summary["indices_deleted"] == 0 # With a low min_age, the scrubber should go ahead and clean up all but the latest 2 generations - gc_summary = StorageScrubber(neon_env_builder).pageserver_physical_gc(min_age_secs=1) + gc_summary = env.storage_scrubber.pageserver_physical_gc(min_age_secs=1) assert gc_summary["remote_storage_errors"] == 0 assert gc_summary["indices_deleted"] == (expect_indices_per_shard - 2) * shard_count + + +@pytest.mark.parametrize("shard_count", [None, 2]) +def test_scrubber_physical_gc_ancestors( + neon_env_builder: NeonEnvBuilder, shard_count: Optional[int] +): + neon_env_builder.enable_pageserver_remote_storage(s3_storage()) + neon_env_builder.num_pageservers = 2 + + env = neon_env_builder.init_configs() + env.start() + + tenant_id = TenantId.generate() + timeline_id = TimelineId.generate() + env.neon_cli.create_tenant( + tenant_id, + timeline_id, + shard_count=shard_count, + conf={ + # Small layers and low compaction thresholds, so that when we split we can expect some to + # be dropped by child shards + "checkpoint_distance": f"{1024 * 1024}", + "compaction_threshold": "1", + "compaction_target_size": f"{1024 * 1024}", + "image_creation_threshold": "2", + "image_layer_creation_check_threshold": "0", + # Disable background compaction, we will do it explicitly + "compaction_period": "0s", + # No PITR, so that as soon as child shards generate an image layer, it covers ancestor deltas + # and makes them GC'able + "pitr_interval": "0s", + }, + ) + + # Make sure the original shard has some layers + workload = Workload(env, tenant_id, timeline_id) + workload.init() + workload.write_rows(100) + + new_shard_count = 4 + assert shard_count is None or new_shard_count > shard_count + shards = 
env.storage_controller.tenant_shard_split(tenant_id, shard_count=new_shard_count) + + # Make sure child shards have some layers + workload.write_rows(100) + + # Flush deletion queue so that we don't leave any orphan layers in the parent that will confuse subsequent checks: once + # a shard is split, any layers in its prefix that aren't referenced by a child will be considered GC'able, even + # if they were logically deleted before the shard split, just not physically deleted yet because of the queue. + for ps in env.pageservers: + ps.http_client().deletion_queue_flush(execute=True) + + # Before compacting, all the layers in the ancestor should still be referenced by the children: the scrubber + # should not erase any ancestor layers + gc_summary = env.storage_scrubber.pageserver_physical_gc(min_age_secs=1, mode="full") + assert gc_summary["remote_storage_errors"] == 0 + assert gc_summary["indices_deleted"] == 0 + assert gc_summary["ancestor_layers_deleted"] == 0 + + # Write some data and compact: compacting, some ancestor layers should no longer be needed by children + # (the compaction is part of the checkpoint that Workload does for us) + workload.churn_rows(100) + workload.churn_rows(100) + workload.churn_rows(100) + for shard in shards: + ps = env.get_tenant_pageserver(shard) + ps.http_client().timeline_compact(shard, timeline_id) + ps.http_client().timeline_gc(shard, timeline_id, 0) + + # We will use a min_age_secs=1 threshold for deletion, let it pass + time.sleep(2) + + # Our time threshold should be respected: check that with a high threshold we delete nothing + gc_summary = env.storage_scrubber.pageserver_physical_gc(min_age_secs=3600, mode="full") + assert gc_summary["remote_storage_errors"] == 0 + assert gc_summary["indices_deleted"] == 0 + assert gc_summary["ancestor_layers_deleted"] == 0 + + # Now run with a low time threshold: deletions of ancestor layers should be executed + gc_summary = env.storage_scrubber.pageserver_physical_gc(min_age_secs=1, mode="full") + assert gc_summary["remote_storage_errors"] == 0 + assert gc_summary["indices_deleted"] == 0 + assert gc_summary["ancestor_layers_deleted"] > 0 + + # We deleted some layers: now check we didn't corrupt the tenant by doing so. Detach and + # attach it, to drop any local state, then check it's still readable. + workload.stop() + drop_local_state(env, tenant_id) + + workload.validate() + + +def test_scrubber_physical_gc_ancestors_split(neon_env_builder: NeonEnvBuilder): + """ + Exercise ancestor GC while a tenant is partly split: this test ensures that if we have some child shards + which don't reference an ancestor, but some child shards that don't exist yet, then we do not incorrectly + GC any ancestor layers. 
+ """ + neon_env_builder.enable_pageserver_remote_storage(s3_storage()) + neon_env_builder.num_pageservers = 2 + + env = neon_env_builder.init_configs() + env.start() + + tenant_id = TenantId.generate() + timeline_id = TimelineId.generate() + initial_shard_count = 2 + env.neon_cli.create_tenant( + tenant_id, + timeline_id, + shard_count=initial_shard_count, + conf={ + # Small layers and low compaction thresholds, so that when we split we can expect some to + # be dropped by child shards + "checkpoint_distance": f"{1024 * 1024}", + "compaction_threshold": "1", + "compaction_target_size": f"{1024 * 1024}", + "image_creation_threshold": "2", + "image_layer_creation_check_threshold": "0", + # Disable background compaction, we will do it explicitly + "compaction_period": "0s", + # No PITR, so that as soon as child shards generate an image layer, it covers ancestor deltas + # and makes them GC'able + "pitr_interval": "0s", + }, + ) + + unstuck = threading.Event() + + def stuck_split(): + # Pause our shard split after the first shard but before the second, such that when we run + # the scrub, the S3 bucket contains shards 0002, 0101, 0004, 0204 (but not 0104, 0304). + env.storage_controller.configure_failpoints( + ("shard-split-post-remote-sleep", "return(3600000)") + ) + try: + split_response = env.storage_controller.tenant_shard_split(tenant_id, shard_count=4) + except Exception as e: + log.info(f"Split failed with {e}") + else: + if not unstuck.is_set(): + raise RuntimeError(f"Split succeeded unexpectedly ({split_response})") + + with ThreadPoolExecutor(max_workers=1) as threads: + log.info("Starting hung shard split") + stuck_split_fut = threads.submit(stuck_split) + + # Let the controller reach the failpoint + wait_until( + 10, + 1, + lambda: env.storage_controller.assert_log_contains( + 'failpoint "shard-split-post-remote-sleep": sleeping' + ), + ) + + # Run compaction on the new child shards, so that they drop some refs to their parent + child_shards = [ + TenantShardId(tenant_id, 0, 4), + TenantShardId(tenant_id, 2, 4), + ] + log.info("Compacting first two children") + for child in child_shards: + env.get_tenant_pageserver( + TenantShardId(tenant_id, 0, initial_shard_count) + ).http_client().timeline_compact(child, timeline_id) + + # Check that the other child shards weren't created + assert env.get_tenant_pageserver(TenantShardId(tenant_id, 1, 4)) is None + assert env.get_tenant_pageserver(TenantShardId(tenant_id, 3, 4)) is None + + # Run scrubber: it should not incorrectly interpret the **04 shards' lack of refs to all + # ancestor layers as a reason to GC them, because it should realize that a split is in progress. + # (GC requires that controller does not indicate split in progress, and that if we see the highest + # shard count N, then there are N shards present with that shard count). 
+ gc_output = env.storage_scrubber.pageserver_physical_gc(min_age_secs=0, mode="full") + log.info(f"Ran physical GC partway through split: {gc_output}") + assert gc_output["ancestor_layers_deleted"] == 0 + assert gc_output["remote_storage_errors"] == 0 + assert gc_output["controller_api_errors"] == 0 + + # Storage controller shutdown lets our split request client complete + log.info("Stopping storage controller") + unstuck.set() + env.storage_controller.allowed_errors.append(".*Timed out joining HTTP server task.*") + env.storage_controller.stop() + stuck_split_fut.result() + + # Restart the controller and retry the split with the failpoint disabled, this should + # complete successfully and result in an S3 state that allows the scrubber to proceed with removing ancestor layers + log.info("Starting & retrying split") + env.storage_controller.start() + env.storage_controller.tenant_shard_split(tenant_id, shard_count=4) + + # The other child shards exist now, we can compact them to drop refs to ancestor + log.info("Compacting second two children") + for child in [ + TenantShardId(tenant_id, 1, 4), + TenantShardId(tenant_id, 3, 4), + ]: + env.get_tenant_pageserver(child).http_client().timeline_compact(child, timeline_id) + + gc_output = env.storage_scrubber.pageserver_physical_gc(min_age_secs=0, mode="full") + log.info(f"Ran physical GC after split completed: {gc_output}") + assert gc_output["ancestor_layers_deleted"] > 0 + assert gc_output["remote_storage_errors"] == 0 + assert gc_output["controller_api_errors"] == 0 diff --git a/test_runner/regress/test_tenant_conf.py b/test_runner/regress/test_tenant_conf.py index 80fb2b55b8..9fb7324fa1 100644 --- a/test_runner/regress/test_tenant_conf.py +++ b/test_runner/regress/test_tenant_conf.py @@ -1,16 +1,14 @@ import json -from contextlib import closing from typing import Any, Dict -import psycopg2.extras from fixtures.common_types import Lsn -from fixtures.log_helper import log from fixtures.neon_fixtures import ( NeonEnvBuilder, ) from fixtures.pageserver.utils import assert_tenant_state, wait_for_upload from fixtures.remote_storage import LocalFsStorage, RemoteStorageKind from fixtures.utils import wait_until +from fixtures.workload import Workload def test_tenant_config(neon_env_builder: NeonEnvBuilder): @@ -63,25 +61,6 @@ def test_tenant_config(neon_env_builder: NeonEnvBuilder): # check the configuration of the default tenant # it should match global configuration - with closing(env.pageserver.connect()) as psconn: - with psconn.cursor(cursor_factory=psycopg2.extras.RealDictCursor) as pscur: - log.info(f"show {env.initial_tenant}") - pscur.execute(f"show {env.initial_tenant}") - res = pscur.fetchone() - assert res is not None - assert all( - i in res.items() - for i in { - "checkpoint_distance": 10000, - "compaction_target_size": 1048576, - "compaction_period": 20, - "compaction_threshold": 10, - "gc_horizon": 67108864, - "gc_period": 60 * 60, - "image_creation_threshold": 3, - "pitr_interval": 604800, # 7 days - }.items() - ), f"Unexpected res: {res}" default_tenant_config = http_client.tenant_config(tenant_id=env.initial_tenant) assert ( not default_tenant_config.tenant_specific_overrides @@ -103,25 +82,6 @@ def test_tenant_config(neon_env_builder: NeonEnvBuilder): } # check the configuration of the new tenant - with closing(env.pageserver.connect()) as psconn: - with psconn.cursor(cursor_factory=psycopg2.extras.RealDictCursor) as pscur: - pscur.execute(f"show {tenant}") - res = pscur.fetchone() - log.info(f"res: {res}") - assert res is not 
None - assert all( - i in res.items() - for i in { - "checkpoint_distance": 20000, - "compaction_target_size": 1048576, - "compaction_period": 20, - "compaction_threshold": 10, - "gc_horizon": 67108864, - "gc_period": 30, - "image_creation_threshold": 3, - "pitr_interval": 604800, - }.items() - ), f"Unexpected res: {res}" new_tenant_config = http_client.tenant_config(tenant_id=tenant) new_specific_config = new_tenant_config.tenant_specific_overrides assert new_specific_config["checkpoint_distance"] == 20000 @@ -166,25 +126,6 @@ def test_tenant_config(neon_env_builder: NeonEnvBuilder): conf=conf_update, ) - with closing(env.pageserver.connect()) as psconn: - with psconn.cursor(cursor_factory=psycopg2.extras.RealDictCursor) as pscur: - pscur.execute(f"show {tenant}") - res = pscur.fetchone() - log.info(f"after config res: {res}") - assert res is not None - assert all( - i in res.items() - for i in { - "checkpoint_distance": 15000, - "compaction_target_size": 1048576, - "compaction_period": 80, - "compaction_threshold": 10, - "gc_horizon": 67108864, - "gc_period": 80, - "image_creation_threshold": 2, - "pitr_interval": 604800, - }.items() - ), f"Unexpected res: {res}" updated_tenant_config = http_client.tenant_config(tenant_id=tenant) updated_specific_config = updated_tenant_config.tenant_specific_overrides assert updated_specific_config["checkpoint_distance"] == 15000 @@ -222,25 +163,6 @@ def test_tenant_config(neon_env_builder: NeonEnvBuilder): env.pageserver.stop() env.pageserver.start() - with closing(env.pageserver.connect()) as psconn: - with psconn.cursor(cursor_factory=psycopg2.extras.RealDictCursor) as pscur: - pscur.execute(f"show {tenant}") - res = pscur.fetchone() - log.info(f"after restart res: {res}") - assert res is not None - assert all( - i in res.items() - for i in { - "checkpoint_distance": 15000, - "compaction_target_size": 1048576, - "compaction_period": 80, - "compaction_threshold": 10, - "gc_horizon": 67108864, - "gc_period": 80, - "image_creation_threshold": 2, - "pitr_interval": 604800, - }.items() - ), f"Unexpected res: {res}" restarted_tenant_config = http_client.tenant_config(tenant_id=tenant) assert ( restarted_tenant_config == updated_tenant_config @@ -283,19 +205,10 @@ def test_tenant_config(neon_env_builder: NeonEnvBuilder): env.pageserver.stop() env.pageserver.start() - with closing(env.pageserver.connect()) as psconn: - with psconn.cursor(cursor_factory=psycopg2.extras.RealDictCursor) as pscur: - pscur.execute(f"show {tenant}") - res = pscur.fetchone() - log.info(f"after restart res: {res}") - assert res is not None - assert all( - i in res.items() - for i in { - "compaction_period": 20, - "pitr_interval": 60, - }.items() - ), f"Unexpected res: {res}" + restarted_final_tenant_config = http_client.tenant_config(tenant_id=tenant) + assert ( + restarted_final_tenant_config == final_tenant_config + ), "Updated config should not change after the restart" def test_creating_tenant_conf_after_attach(neon_env_builder: NeonEnvBuilder): @@ -353,6 +266,13 @@ def test_live_reconfig_get_evictions_low_residence_duration_metric_threshold( (tenant_id, timeline_id) = env.initial_tenant, env.initial_timeline ps_http = env.pageserver.http_client() + # When we evict/download layers, we will use this Workload to generate getpage requests + # that touch some layers, as otherwise the pageserver doesn't report totally unused layers + # as problems when they have short residence duration. 
+ workload = Workload(env, tenant_id, timeline_id) + workload.init() + workload.write_rows(100) + def get_metric(): metrics = ps_http.get_metrics() metric = metrics.query_one( @@ -373,6 +293,7 @@ def test_live_reconfig_get_evictions_low_residence_duration_metric_threshold( assert default_value == "1day" ps_http.download_all_layers(tenant_id, timeline_id) + workload.validate() ps_http.evict_all_layers(tenant_id, timeline_id) metric = get_metric() assert int(metric.value) > 0, "metric is updated" @@ -393,6 +314,7 @@ def test_live_reconfig_get_evictions_low_residence_duration_metric_threshold( assert int(metric.value) == 0 ps_http.download_all_layers(tenant_id, timeline_id) + workload.validate() ps_http.evict_all_layers(tenant_id, timeline_id) metric = get_metric() assert int(metric.labels["low_threshold_secs"]) == 2 * 24 * 60 * 60 @@ -406,6 +328,7 @@ def test_live_reconfig_get_evictions_low_residence_duration_metric_threshold( assert int(metric.value) == 0, "value resets if label changes" ps_http.download_all_layers(tenant_id, timeline_id) + workload.validate() ps_http.evict_all_layers(tenant_id, timeline_id) metric = get_metric() assert int(metric.labels["low_threshold_secs"]) == 2 * 60 * 60 diff --git a/test_runner/regress/test_tenant_delete.py b/test_runner/regress/test_tenant_delete.py index 1d7c8b8e31..6d20b3d0de 100644 --- a/test_runner/regress/test_tenant_delete.py +++ b/test_runner/regress/test_tenant_delete.py @@ -5,7 +5,6 @@ from fixtures.common_types import Lsn, TenantId, TimelineId from fixtures.neon_fixtures import ( NeonEnvBuilder, PgBin, - StorageScrubber, wait_for_last_flush_lsn, ) from fixtures.pageserver.http import PageserverApiException @@ -325,7 +324,6 @@ def test_tenant_delete_scrubber(pg_bin: PgBin, neon_env_builder: NeonEnvBuilder) remote_storage_kind = RemoteStorageKind.MOCK_S3 neon_env_builder.enable_pageserver_remote_storage(remote_storage_kind) - scrubber = StorageScrubber(neon_env_builder) env = neon_env_builder.init_start(initial_tenant_conf=MANY_SMALL_LAYERS_TENANT_CONFIG) ps_http = env.pageserver.http_client() @@ -340,7 +338,7 @@ def test_tenant_delete_scrubber(pg_bin: PgBin, neon_env_builder: NeonEnvBuilder) wait_for_upload(ps_http, tenant_id, timeline_id, last_flush_lsn) env.stop() - result = scrubber.scan_metadata() + result = env.storage_scrubber.scan_metadata() assert result["with_warnings"] == [] env.start() @@ -348,5 +346,5 @@ def test_tenant_delete_scrubber(pg_bin: PgBin, neon_env_builder: NeonEnvBuilder) ps_http.tenant_delete(tenant_id) env.stop() - scrubber.scan_metadata() + env.storage_scrubber.scan_metadata() assert result["with_warnings"] == [] diff --git a/test_runner/regress/test_tenants.py b/test_runner/regress/test_tenants.py index 04b3fdd80f..0ebf714de0 100644 --- a/test_runner/regress/test_tenants.py +++ b/test_runner/regress/test_tenants.py @@ -45,17 +45,10 @@ def test_tenant_creation_fails(neon_simple_env: NeonEnv): # Failure to write a config to local disk makes the pageserver assume that local disk is bad and abort the process pageserver_http.configure_failpoints(("tenant-config-before-write", "return")) - # Storage controller will see a torn TCP connection when the crash point is reached, and follow an unclean 500 error path - neon_simple_env.storage_controller.allowed_errors.extend( - [ - ".*Reconcile not done yet while creating tenant.*", - ".*Reconcile error: receive body: error sending request.*", - ".*Error processing HTTP request: InternalServerError.*", - ] - ) + tenant_id = TenantId.generate() - with pytest.raises(Exception, 
match="error sending request"): - _ = neon_simple_env.neon_cli.create_tenant() + with pytest.raises(requests.exceptions.ConnectionError, match="Connection aborted"): + neon_simple_env.pageserver.http_client().tenant_attach(tenant_id=tenant_id, generation=1) # Any files left behind on disk during failed creation do not prevent # a retry from succeeding. Restart pageserver with no failpoints. diff --git a/test_runner/regress/test_timeline_detach_ancestor.py b/test_runner/regress/test_timeline_detach_ancestor.py index 606ce203cd..38f8dfa885 100644 --- a/test_runner/regress/test_timeline_detach_ancestor.py +++ b/test_runner/regress/test_timeline_detach_ancestor.py @@ -1,5 +1,7 @@ import datetime import enum +import threading +import time from concurrent.futures import ThreadPoolExecutor from queue import Empty, Queue from threading import Barrier @@ -9,14 +11,17 @@ import pytest from fixtures.common_types import Lsn, TimelineId from fixtures.log_helper import log from fixtures.neon_fixtures import ( + LogCursor, NeonEnvBuilder, PgBin, + flush_ep_to_pageserver, wait_for_last_flush_lsn, ) from fixtures.pageserver.http import HistoricLayerInfo, PageserverApiException -from fixtures.pageserver.utils import wait_timeline_detail_404 -from fixtures.remote_storage import LocalFsStorage -from fixtures.utils import assert_pageserver_backups_equal +from fixtures.pageserver.utils import wait_for_last_record_lsn, wait_timeline_detail_404 +from fixtures.remote_storage import LocalFsStorage, RemoteStorageKind +from fixtures.utils import assert_pageserver_backups_equal, wait_until +from requests import ReadTimeout def by_end_lsn(info: HistoricLayerInfo) -> Lsn: @@ -160,7 +165,7 @@ def test_ancestor_detach_branched_from( ) all_reparented = client.detach_ancestor(env.initial_tenant, timeline_id) - assert all_reparented == set() + assert all_reparented == [] if restart_after: env.pageserver.stop() @@ -269,7 +274,7 @@ def test_ancestor_detach_reparents_earlier(neon_env_builder: NeonEnvBuilder): after = env.neon_cli.create_branch("after", "main", env.initial_tenant, ancestor_start_lsn=None) all_reparented = client.detach_ancestor(env.initial_tenant, timeline_id) - assert all_reparented == {reparented, same_branchpoint} + assert set(all_reparented) == {reparented, same_branchpoint} env.pageserver.quiesce_tenants() @@ -529,7 +534,7 @@ def test_compaction_induced_by_detaches_in_history( for _, timeline_id in skip_main: reparented = client.detach_ancestor(env.initial_tenant, timeline_id) - assert reparented == set(), "we have no earlier branches at any level" + assert reparented == [], "we have no earlier branches at any level" post_detach_l0s = list(filter(lambda x: x.l0, delta_layers(branch_timeline_id))) assert len(post_detach_l0s) == 5, "should had inherited 4 L0s, have 5 in total" @@ -559,23 +564,49 @@ def test_compaction_induced_by_detaches_in_history( assert_pageserver_backups_equal(fullbackup_before, fullbackup_after, set()) -def test_timeline_ancestor_errors(neon_env_builder: NeonEnvBuilder): - env = neon_env_builder.init_start() - env.pageserver.allowed_errors.extend(SHUTDOWN_ALLOWED_ERRORS) +@pytest.mark.parametrize("sharded", [True, False]) +def test_timeline_ancestor_detach_idempotent_success( + neon_env_builder: NeonEnvBuilder, sharded: bool +): + shards = 2 if sharded else 1 - client = env.pageserver.http_client() + neon_env_builder.num_pageservers = shards + env = neon_env_builder.init_start(initial_tenant_shard_count=shards if sharded else None) - with pytest.raises(PageserverApiException, match=".* 
no ancestors") as info: - client.detach_ancestor(env.initial_tenant, env.initial_timeline) - assert info.value.status_code == 409 + pageservers = dict((int(p.id), p) for p in env.pageservers) + + for ps in pageservers.values(): + ps.allowed_errors.extend(SHUTDOWN_ALLOWED_ERRORS) + + if sharded: + # FIXME: should this be in the neon_env_builder.init_start? + env.storage_controller.reconcile_until_idle() + client = env.storage_controller.pageserver_api() + else: + client = env.pageserver.http_client() first_branch = env.neon_cli.create_branch("first_branch") - second_branch = env.neon_cli.create_branch("second_branch", ancestor_branch_name="first_branch") - # funnily enough this does not have a prefix - with pytest.raises(PageserverApiException, match="too many ancestors") as info: - client.detach_ancestor(env.initial_tenant, second_branch) - assert info.value.status_code == 400 + _ = env.neon_cli.create_branch("second_branch", ancestor_branch_name="first_branch") + + # these two will be reparented, and they should be returned in stable order + # from pageservers OR otherwise there will be an `error!` logging from + # storage controller + reparented1 = env.neon_cli.create_branch("first_reparented", ancestor_branch_name="main") + reparented2 = env.neon_cli.create_branch("second_reparented", ancestor_branch_name="main") + + first_reparenting_response = client.detach_ancestor(env.initial_tenant, first_branch) + assert set(first_reparenting_response) == {reparented1, reparented2} + + # FIXME: this should be done by the http req handler + for ps in pageservers.values(): + ps.quiesce_tenants() + + for _ in range(5): + # once completed, we can retry this how many times + assert ( + client.detach_ancestor(env.initial_tenant, first_branch) == first_reparenting_response + ) client.tenant_delete(env.initial_tenant) @@ -584,10 +615,446 @@ def test_timeline_ancestor_errors(neon_env_builder: NeonEnvBuilder): assert e.value.status_code == 404 +@pytest.mark.parametrize("sharded", [True, False]) +def test_timeline_ancestor_detach_errors(neon_env_builder: NeonEnvBuilder, sharded: bool): + # the test is split from test_timeline_ancestor_detach_idempotent_success as only these error cases should create "request was dropped before completing", + # given the current first error handling + shards = 2 if sharded else 1 + + neon_env_builder.num_pageservers = shards + env = neon_env_builder.init_start(initial_tenant_shard_count=shards if sharded else None) + + pageservers = dict((int(p.id), p) for p in env.pageservers) + + for ps in pageservers.values(): + ps.allowed_errors.extend(SHUTDOWN_ALLOWED_ERRORS) + ps.allowed_errors.append( + ".* WARN .* path=/v1/tenant/.*/timeline/.*/detach_ancestor request_id=.*: request was dropped before completing" + ) + + client = ( + env.pageserver.http_client() if not sharded else env.storage_controller.pageserver_api() + ) + + with pytest.raises(PageserverApiException, match=".* no ancestors") as info: + client.detach_ancestor(env.initial_tenant, env.initial_timeline) + assert info.value.status_code == 409 + + _ = env.neon_cli.create_branch("first_branch") + + second_branch = env.neon_cli.create_branch("second_branch", ancestor_branch_name="first_branch") + + # funnily enough this does not have a prefix + with pytest.raises(PageserverApiException, match="too many ancestors") as info: + client.detach_ancestor(env.initial_tenant, second_branch) + assert info.value.status_code == 400 + + +def test_sharded_timeline_detach_ancestor(neon_env_builder: NeonEnvBuilder): + """ + Sharded 
timeline detach ancestor; 4 nodes: 1 stuck, 1 restarted, 2 normal. + + Stuck node gets stuck on a pause failpoint for first storage controller request. + Restarted node remains stuck until explicit restart from test code. + + We retry the request until storage controller gets 200 OK from all nodes. + """ + branch_name = "soon_detached" + shard_count = 4 + neon_env_builder.num_pageservers = shard_count + neon_env_builder.enable_pageserver_remote_storage(RemoteStorageKind.MOCK_S3) + + env = neon_env_builder.init_start(initial_tenant_shard_count=shard_count) + for ps in env.pageservers: + ps.allowed_errors.extend(SHUTDOWN_ALLOWED_ERRORS) + + # FIXME: should this be in the neon_env_builder.init_start? + env.storage_controller.reconcile_until_idle() + # as we will stop a node, make sure there is no clever rebalancing + env.storage_controller.tenant_policy_update(env.initial_tenant, body={"scheduling": "Stop"}) + env.storage_controller.allowed_errors.append(".*: Scheduling is disabled by policy Stop .*") + + shards = env.storage_controller.locate(env.initial_tenant) + + utilized_pageservers = {x["node_id"] for x in shards} + assert len(utilized_pageservers) > 1, "all shards got placed on single pageserver?" + + branch_timeline_id = env.neon_cli.create_branch(branch_name, tenant_id=env.initial_tenant) + + with env.endpoints.create_start(branch_name, tenant_id=env.initial_tenant) as ep: + ep.safe_psql( + "create table foo as select 1::bigint, i::bigint from generate_series(1, 10000) v(i)" + ) + lsn = flush_ep_to_pageserver(env, ep, env.initial_tenant, branch_timeline_id) + + pageservers = dict((int(p.id), p) for p in env.pageservers) + + for shard_info in shards: + node_id = int(shard_info["node_id"]) + shard_id = shard_info["shard_id"] + detail = pageservers[node_id].http_client().timeline_detail(shard_id, branch_timeline_id) + + assert Lsn(detail["last_record_lsn"]) >= lsn + assert Lsn(detail["initdb_lsn"]) < lsn + assert TimelineId(detail["ancestor_timeline_id"]) == env.initial_timeline + + # make one of the nodes get stuck, but continue the initial operation + # make another of the nodes get stuck, then restart + + stuck = pageservers[int(shards[0]["node_id"])] + log.info(f"stuck pageserver is id={stuck.id}") + stuck_http = stuck.http_client() + stuck_http.configure_failpoints( + ("timeline-detach-ancestor::before_starting_after_locking_pausable", "pause") + ) + + restarted = pageservers[int(shards[1]["node_id"])] + log.info(f"restarted pageserver is id={restarted.id}") + # this might be hit; see `restart_restarted` + restarted.allowed_errors.append(".*: Cancelled request finished with an error: ShuttingDown") + assert restarted.id != stuck.id + restarted_http = restarted.http_client() + restarted_http.configure_failpoints( + [ + ("timeline-detach-ancestor::before_starting_after_locking_pausable", "pause"), + ] + ) + + for info in shards: + pageserver = pageservers[int(info["node_id"])] + # the first request can cause these, but does not repeatedly + pageserver.allowed_errors.append(".*: request was dropped before completing") + + # first request again + env.storage_controller.allowed_errors.append(".*: request was dropped before completing") + + target = env.storage_controller.pageserver_api() + + with pytest.raises(ReadTimeout): + target.detach_ancestor(env.initial_tenant, branch_timeline_id, timeout=1) + + stuck_http.configure_failpoints( + ("timeline-detach-ancestor::before_starting_after_locking_pausable", "off") + ) + + barrier = threading.Barrier(2) + + def restart_restarted(): + 
barrier.wait() + # graceful shutdown should just work, because simultaneously unpaused + restarted.stop() + # this does not happen always, depends how fast we exit after unpausing + # restarted.assert_log_contains("Cancelled request finished with an error: ShuttingDown") + restarted.start() + + with ThreadPoolExecutor(max_workers=1) as pool: + fut = pool.submit(restart_restarted) + barrier.wait() + # we have 10s, lets use 1/2 of that to help the shutdown start + time.sleep(5) + restarted_http.configure_failpoints( + ("timeline-detach-ancestor::before_starting_after_locking_pausable", "off") + ) + fut.result() + + # detach ancestor request handling is not sensitive to http cancellation. + # this means that the "stuck" is on its way to complete the detach, but the restarted is off + # now it can either be complete on all nodes, or still in progress with + # one. + without_retrying = target.without_status_retrying() + + # this retry loop will be long enough that the tenant can always activate + reparented = None + for _ in range(10): + try: + reparented = without_retrying.detach_ancestor(env.initial_tenant, branch_timeline_id) + except PageserverApiException as info: + assert info.status_code == 503 + time.sleep(2) + else: + break + + assert reparented == [], "too many retries (None) or unexpected reparentings" + + for shard_info in shards: + node_id = int(shard_info["node_id"]) + shard_id = shard_info["shard_id"] + + # TODO: ensure quescing is done on pageserver? + pageservers[node_id].quiesce_tenants() + detail = pageservers[node_id].http_client().timeline_detail(shard_id, branch_timeline_id) + wait_for_last_record_lsn( + pageservers[node_id].http_client(), shard_id, branch_timeline_id, lsn + ) + assert detail.get("ancestor_timeline_id") is None + + with env.endpoints.create_start(branch_name, tenant_id=env.initial_tenant) as ep: + count = int(ep.safe_psql("select count(*) from foo")[0][0]) + assert count == 10000 + + +@pytest.mark.parametrize("mode", ["delete_timeline", "delete_tenant"]) +@pytest.mark.parametrize("sharded", [False, True]) +def test_timeline_detach_ancestor_interrupted_by_deletion( + neon_env_builder: NeonEnvBuilder, mode: str, sharded: bool +): + """ + Timeline ancestor detach interrupted by deleting either: + - the detached timeline + - the whole tenant + + after starting the detach. + + What remains not tested by this: + - shutdown winning over complete + + Shutdown winning over complete needs gc blocking and reparenting any left-overs on retry. 
+ """ + + if sharded and mode == "delete_tenant": + # the shared/exclusive lock for tenant is blocking this: + # timeline detach ancestor takes shared, delete tenant takes exclusive + pytest.skip( + "tenant deletion while timeline ancestor detach is underway is not supported yet" + ) + + shard_count = 2 if sharded else 1 + + neon_env_builder.num_pageservers = shard_count + + env = neon_env_builder.init_start(initial_tenant_shard_count=shard_count if sharded else None) + + for ps in env.pageservers: + ps.allowed_errors.extend(SHUTDOWN_ALLOWED_ERRORS) + + pageservers = dict((int(p.id), p) for p in env.pageservers) + + detached_timeline = env.neon_cli.create_branch("detached soon", "main") + + failpoint = "timeline-detach-ancestor::before_starting_after_locking_pausable" + + env.storage_controller.reconcile_until_idle() + shards = env.storage_controller.locate(env.initial_tenant) + + assert len(set(info["node_id"] for info in shards)) == shard_count + + target = env.storage_controller.pageserver_api() if sharded else env.pageserver.http_client() + target = target.without_status_retrying() + + victim = pageservers[int(shards[-1]["node_id"])] + victim_http = victim.http_client() + victim_http.configure_failpoints((failpoint, "pause")) + + def detach_ancestor(): + target.detach_ancestor(env.initial_tenant, detached_timeline) + + def at_failpoint() -> Tuple[str, LogCursor]: + return victim.assert_log_contains(f"at failpoint {failpoint}") + + def start_delete(): + if mode == "delete_timeline": + target.timeline_delete(env.initial_tenant, detached_timeline) + elif mode == "delete_tenant": + target.tenant_delete(env.initial_tenant) + else: + raise RuntimeError(f"unimplemented mode {mode}") + + def at_waiting_on_gate_close(start_offset: LogCursor) -> LogCursor: + _, offset = victim.assert_log_contains( + "closing is taking longer than expected", offset=start_offset + ) + return offset + + def is_deleted(): + try: + if mode == "delete_timeline": + target.timeline_detail(env.initial_tenant, detached_timeline) + elif mode == "delete_tenant": + target.tenant_status(env.initial_tenant) + else: + return False + except PageserverApiException as e: + assert e.status_code == 404 + return True + else: + raise RuntimeError("waiting for 404") + + with ThreadPoolExecutor(max_workers=2) as pool: + try: + fut = pool.submit(detach_ancestor) + _, offset = wait_until(10, 1.0, at_failpoint) + + delete = pool.submit(start_delete) + + wait_until(10, 1.0, lambda: at_waiting_on_gate_close(offset)) + + victim_http.configure_failpoints((failpoint, "off")) + + delete.result() + + assert wait_until(10, 1.0, is_deleted), f"unimplemented mode {mode}" + + with pytest.raises(PageserverApiException) as exc: + fut.result() + assert exc.value.status_code == 503 + finally: + victim_http.configure_failpoints((failpoint, "off")) + + +@pytest.mark.parametrize("mode", ["delete_reparentable_timeline"]) +def test_sharded_tad_interleaved_after_partial_success(neon_env_builder: NeonEnvBuilder, mode: str): + """ + Technically possible storage controller concurrent interleaving timeline + deletion with timeline detach. + + Deletion is fine, as any sharded pageservers reach the same end state, but + creating reparentable timeline would create an issue as the two nodes would + never agree. There is a solution though: the created reparentable timeline + must be detached. 
+ """ + + assert ( + mode == "delete_reparentable_timeline" + ), "only one now, but we could have the create just as well, need gc blocking" + + shard_count = 2 + neon_env_builder.num_pageservers = shard_count + env = neon_env_builder.init_start(initial_tenant_shard_count=shard_count) + + for ps in env.pageservers: + ps.allowed_errors.extend(SHUTDOWN_ALLOWED_ERRORS) + + pageservers = dict((int(p.id), p) for p in env.pageservers) + + env.storage_controller.reconcile_until_idle() + shards = env.storage_controller.locate(env.initial_tenant) + assert len(set(x["node_id"] for x in shards)) == shard_count + + with env.endpoints.create_start("main") as ep: + ep.safe_psql("create table foo as select i::bigint from generate_series(1, 1000) t(i)") + + # as the interleaved operation, we will delete this timeline, which was reparenting candidate + first_branch_lsn = wait_for_last_flush_lsn( + env, ep, env.initial_tenant, env.initial_timeline + ) + for ps, shard_id in [(pageservers[int(x["node_id"])], x["shard_id"]) for x in shards]: + ps.http_client().timeline_checkpoint(shard_id, env.initial_timeline) + + ep.safe_psql("create table bar as select i::bigint from generate_series(1, 2000) t(i)") + detached_branch_lsn = flush_ep_to_pageserver( + env, ep, env.initial_tenant, env.initial_timeline + ) + + for ps, shard_id in [(pageservers[int(x["node_id"])], x["shard_id"]) for x in shards]: + ps.http_client().timeline_checkpoint(shard_id, env.initial_timeline) + + first_branch = env.neon_cli.create_branch( + "first_branch", ancestor_branch_name="main", ancestor_start_lsn=first_branch_lsn + ) + detached_branch = env.neon_cli.create_branch( + "detached_branch", ancestor_branch_name="main", ancestor_start_lsn=detached_branch_lsn + ) + + pausepoint = "timeline-detach-ancestor::before_starting_after_locking_pausable" + + stuck = pageservers[int(shards[0]["node_id"])] + stuck_http = stuck.http_client().without_status_retrying() + stuck_http.configure_failpoints((pausepoint, "pause")) + + victim = pageservers[int(shards[-1]["node_id"])] + victim_http = victim.http_client().without_status_retrying() + victim_http.configure_failpoints( + (pausepoint, "pause"), + ) + + # noticed a surprising 409 if the other one would fail instead + # victim_http.configure_failpoints([ + # (pausepoint, "pause"), + # ("timeline-detach-ancestor::before_starting_after_locking", "return"), + # ]) + + # interleaving a create_timeline which could be reparented will produce two + # permanently different reparentings: one node has reparented, other has + # not + # + # with deletion there is no such problem + def detach_timeline(): + env.storage_controller.pageserver_api().detach_ancestor(env.initial_tenant, detached_branch) + + def paused_at_failpoint(): + stuck.assert_log_contains(f"at failpoint {pausepoint}") + victim.assert_log_contains(f"at failpoint {pausepoint}") + + def first_completed(): + detail = stuck_http.timeline_detail(shards[0]["shard_id"], detached_branch) + log.info(detail) + assert detail.get("ancestor_lsn") is None + + def first_branch_gone(): + try: + env.storage_controller.pageserver_api().timeline_detail( + env.initial_tenant, first_branch + ) + except PageserverApiException as e: + log.info(f"error {e}") + assert e.status_code == 404 + else: + log.info("still ok") + raise RuntimeError("not done yet") + + with ThreadPoolExecutor(max_workers=1) as pool: + try: + fut = pool.submit(detach_timeline) + wait_until(10, 1.0, paused_at_failpoint) + + # let stuck complete + stuck_http.configure_failpoints((pausepoint, "off")) + 
wait_until(10, 1.0, first_completed) + + # if we would let victim fail, for some reason there'd be a 409 response instead of 500 + # victim_http.configure_failpoints((pausepoint, "off")) + # with pytest.raises(PageserverApiException, match=".* 500 Internal Server Error failpoint: timeline-detach-ancestor::before_starting_after_locking") as exc: + # fut.result() + # assert exc.value.status_code == 409 + + env.storage_controller.pageserver_api().timeline_delete( + env.initial_tenant, first_branch + ) + victim_http.configure_failpoints((pausepoint, "off")) + wait_until(10, 1.0, first_branch_gone) + + # it now passes, and we should get an error messages about mixed reparenting as the stuck still had something to reparent + fut.result() + + msg, offset = env.storage_controller.assert_log_contains( + ".*/timeline/\\S+/detach_ancestor.*: shards returned different results matching=0 .*" + ) + log.info(f"expected error message: {msg}") + env.storage_controller.allowed_errors.append( + ".*: shards returned different results matching=0 .*" + ) + + detach_timeline() + + # FIXME: perhaps the above should be automatically retried, if we get mixed results? + not_found = env.storage_controller.log_contains( + ".*/timeline/\\S+/detach_ancestor.*: shards returned different results matching=0 .*", + offset=offset, + ) + + assert not_found is None + finally: + stuck_http.configure_failpoints((pausepoint, "off")) + victim_http.configure_failpoints((pausepoint, "off")) + + # TODO: -# - after starting the operation, tenant is deleted # - after starting the operation, pageserver is shutdown, restarted # - after starting the operation, bottom-most timeline is deleted, pageserver is restarted, gc is inhibited # - deletion of reparented while reparenting should fail once, then succeed (?) # - branch near existing L1 boundary, image layers? # - investigate: why are layers started at uneven lsn? not just after branching, but in general. +# +# TEST: 1. tad which partially succeeds, one returns 500 +# 2. create branch below timeline? ~or delete reparented timeline~ (done) +# 3. 
on retry all should report the same reparented timelines diff --git a/test_runner/regress/test_wal_acceptor.py b/test_runner/regress/test_wal_acceptor.py index e0ad4fdd5c..f02f19c588 100644 --- a/test_runner/regress/test_wal_acceptor.py +++ b/test_runner/regress/test_wal_acceptor.py @@ -2065,6 +2065,11 @@ def test_timeline_copy(neon_env_builder: NeonEnvBuilder, insert_rows: int): log.info(f"Original digest: {orig_digest}") for sk in env.safekeepers: + wait( + partial(is_flush_lsn_caught_up, sk, tenant_id, timeline_id, lsn), + f"sk_id={sk.id} to flush {lsn}", + ) + sk.http_client().copy_timeline( tenant_id, timeline_id, @@ -2237,6 +2242,8 @@ def test_s3_eviction( check_values = [0] * n_timelines + event_metrics_seen = False + n_iters = 20 for _ in range(n_iters): if log.isEnabledFor(logging.DEBUG): @@ -2261,6 +2268,27 @@ def test_s3_eviction( # update remote_consistent_lsn on pageserver ps_client.timeline_checkpoint(env.initial_tenant, timelines[i], wait_until_uploaded=True) + # Do metrics check before restarts, since these will reset to zero across a restart + event_metrics_seen |= any( + sk.http_client().get_metric_value( + "safekeeper_eviction_events_started_total", {"kind": "evict"} + ) + or 0 > 0 + and sk.http_client().get_metric_value( + "safekeeper_eviction_events_completed_total", {"kind": "evict"} + ) + or 0 > 0 + and sk.http_client().get_metric_value( + "safekeeper_eviction_events_started_total", {"kind": "restore"} + ) + or 0 > 0 + and sk.http_client().get_metric_value( + "safekeeper_eviction_events_completed_total", {"kind": "restore"} + ) + or 0 > 0 + for sk in env.safekeepers + ) + # restarting random safekeepers for sk in env.safekeepers: if random.random() < restart_chance: @@ -2275,22 +2303,4 @@ def test_s3_eviction( for sk in env.safekeepers ) - assert any( - sk.http_client().get_metric_value( - "safekeeper_eviction_events_started_total", {"kind": "evict"} - ) - or 0 > 0 - and sk.http_client().get_metric_value( - "safekeeper_eviction_events_completed_total", {"kind": "evict"} - ) - or 0 > 0 - and sk.http_client().get_metric_value( - "safekeeper_eviction_events_started_total", {"kind": "restore"} - ) - or 0 > 0 - and sk.http_client().get_metric_value( - "safekeeper_eviction_events_completed_total", {"kind": "restore"} - ) - or 0 > 0 - for sk in env.safekeepers - ) + assert event_metrics_seen
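Note on the eviction-metrics check consolidated above: the chained `or 0 > 0` comparisons lean on Python operator precedence (`>` binds tighter than `and`/`or`), which makes the intended grouping easy to misread. A minimal sketch of one explicit reading, assuming the intent is that some safekeeper reports all four counters as positive; the helper names below (`eviction_counter_positive`, `saw_eviction_activity`) are hypothetical and not part of this patch:

def eviction_counter_positive(client, name: str, kind: str) -> bool:
    # get_metric_value() can return None before the counter is first emitted;
    # treat a missing value as zero before comparing.
    return (client.get_metric_value(name, {"kind": kind}) or 0) > 0


def saw_eviction_activity(safekeepers) -> bool:
    # True if any safekeeper reports started and completed events for both
    # the evict and restore kinds.
    return any(
        eviction_counter_positive(sk.http_client(), "safekeeper_eviction_events_started_total", "evict")
        and eviction_counter_positive(sk.http_client(), "safekeeper_eviction_events_completed_total", "evict")
        and eviction_counter_positive(sk.http_client(), "safekeeper_eviction_events_started_total", "restore")
        and eviction_counter_positive(sk.http_client(), "safekeeper_eviction_events_completed_total", "restore")
        for sk in safekeepers
    )

Under that reading, `event_metrics_seen |= saw_eviction_activity(env.safekeepers)` would accumulate the observation across iterations before a safekeeper restart resets the counters, matching the reason the check was moved ahead of the restart loop.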