From 0322e2720f3be63f9f7c7455f6ff3d52cdbef15f Mon Sep 17 00:00:00 2001 From: Alexander Bayandin Date: Tue, 16 May 2023 12:46:28 +0100 Subject: [PATCH 01/27] Nightly Benchmarks: add neonvm to pgbench-compare (#4225) --- .github/workflows/benchmarking.yml | 22 ++++++++++++---------- 1 file changed, 12 insertions(+), 10 deletions(-) diff --git a/.github/workflows/benchmarking.yml b/.github/workflows/benchmarking.yml index 11363b2407..08b74a2656 100644 --- a/.github/workflows/benchmarking.yml +++ b/.github/workflows/benchmarking.yml @@ -16,12 +16,12 @@ on: workflow_dispatch: # adds ability to run this manually inputs: region_id: - description: 'Use a particular region. If not set the default region will be used' + description: 'Project region id. If not set, the default region will be used' required: false default: 'aws-us-east-2' save_perf_report: type: boolean - description: 'Publish perf report or not. If not set, the report is published only for the main branch' + description: 'Publish perf report. If not set, the report will be published only for the main branch' required: false defaults: @@ -125,13 +125,14 @@ jobs: matrix='{ "platform": [ "neon-captest-new", - "neon-captest-reuse" + "neon-captest-reuse", + "neonvm-captest-new" ], "db_size": [ "10gb" ], - "include": [ - { "platform": "neon-captest-freetier", "db_size": "3gb" }, - { "platform": "neon-captest-new", "db_size": "50gb" } - ] + "include": [{ "platform": "neon-captest-freetier", "db_size": "3gb" }, + { "platform": "neon-captest-new", "db_size": "50gb" }, + { "platform": "neonvm-captest-freetier", "db_size": "3gb" }, + { "platform": "neonvm-captest-new", "db_size": "50gb" }] }' if [ "$(date +%A)" = "Saturday" ]; then @@ -197,7 +198,7 @@ jobs: echo "${POSTGRES_DISTRIB_DIR}/v${DEFAULT_PG_VERSION}/bin" >> $GITHUB_PATH - name: Create Neon Project - if: contains(fromJson('["neon-captest-new", "neon-captest-freetier"]'), matrix.platform) + if: contains(fromJson('["neon-captest-new", "neon-captest-freetier", "neonvm-captest-new", "neonvm-captest-freetier"]'), matrix.platform) id: create-neon-project uses: ./.github/actions/neon-project-create with: @@ -205,6 +206,7 @@ jobs: postgres_version: ${{ env.DEFAULT_PG_VERSION }} api_key: ${{ secrets.NEON_STAGING_API_KEY }} compute_units: ${{ (matrix.platform == 'neon-captest-freetier' && '[0.25, 0.25]') || '[1, 1]' }} + provisioner: ${{ (contains(matrix.platform, 'neonvm-') && 'k8s-neonvm') || 'k8s-pod' }} - name: Set up Connection String id: set-up-connstr @@ -213,7 +215,7 @@ jobs: neon-captest-reuse) CONNSTR=${{ secrets.BENCHMARK_CAPTEST_CONNSTR }} ;; - neon-captest-new | neon-captest-freetier) + neon-captest-new | neon-captest-freetier | neonvm-captest-new | neonvm-captest-freetier) CONNSTR=${{ steps.create-neon-project.outputs.dsn }} ;; rds-aurora) @@ -223,7 +225,7 @@ jobs: CONNSTR=${{ secrets.BENCHMARK_RDS_POSTGRES_CONNSTR }} ;; *) - echo >&2 "Unknown PLATFORM=${PLATFORM}. Allowed only 'neon-captest-reuse', 'neon-captest-new', 'neon-captest-freetier', 'rds-aurora', or 'rds-postgres'" + echo >&2 "Unknown PLATFORM=${PLATFORM}" exit 1 ;; esac From fdc1c12fb0cd7c85f96380bfa779fb851b39a319 Mon Sep 17 00:00:00 2001 From: bojanserafimov Date: Tue, 16 May 2023 08:13:54 -0400 Subject: [PATCH 02/27] Simplify github PR template (#4241) --- .github/pull_request_template.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/pull_request_template.md b/.github/pull_request_template.md index 816c5ee711..22c025dd89 100644 --- a/.github/pull_request_template.md +++ b/.github/pull_request_template.md @@ -1,6 +1,6 @@ -## Describe your changes +## Problem -## Issue ticket number and link +## Summary of changes ## Checklist before requesting a review From a0b34e8c49cd367c6066d6aeb34373048e1b2cf8 Mon Sep 17 00:00:00 2001 From: Dmitry Rodionov Date: Tue, 16 May 2023 15:15:29 +0300 Subject: [PATCH 03/27] add create tenant metric to storage operations (#4231) Add a metric to track time spent in create tenant requests Originated from https://github.com/neondatabase/neon/pull/4204 --- pageserver/src/http/routes.rs | 7 +++++++ pageserver/src/metrics.rs | 1 + test_runner/regress/test_tenants.py | 10 ++++++++++ 3 files changed, 18 insertions(+) diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs index 91f4fda5eb..361c7850d6 100644 --- a/pageserver/src/http/routes.rs +++ b/pageserver/src/http/routes.rs @@ -19,6 +19,7 @@ use super::models::{ }; use crate::context::{DownloadBehavior, RequestContext}; use crate::disk_usage_eviction_task; +use crate::metrics::STORAGE_TIME_GLOBAL; use crate::pgdatadir_mapping::LsnForTimestamp; use crate::task_mgr::TaskKind; use crate::tenant::config::TenantConfOpt; @@ -708,6 +709,11 @@ pub fn html_response(status: StatusCode, data: String) -> Result, async fn tenant_create_handler(mut request: Request) -> Result, ApiError> { check_permission(&request, None)?; + let _timer = STORAGE_TIME_GLOBAL + .get_metric_with_label_values(&["create tenant"]) + .expect("bug") + .start_timer(); + let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Warn); let request_data: TenantCreateRequest = json_request(&mut request).await?; @@ -743,6 +749,7 @@ async fn tenant_create_handler(mut request: Request) -> Result = Lazy::new(|| { diff --git a/test_runner/regress/test_tenants.py b/test_runner/regress/test_tenants.py index 8026d7f5c6..5642449ce6 100644 --- a/test_runner/regress/test_tenants.py +++ b/test_runner/regress/test_tenants.py @@ -217,6 +217,16 @@ def test_metrics_normal_work(neon_env_builder: NeonEnvBuilder): labels = ",".join([f'{key}="{value}"' for key, value in sample.labels.items()]) log.info(f"{sample.name}{{{labels}}} {sample.value}") + # Test that we gather tenant create metric + storage_operation_metrics = [ + "pageserver_storage_operations_seconds_global_bucket", + "pageserver_storage_operations_seconds_global_sum", + "pageserver_storage_operations_seconds_global_count", + ] + for metric in storage_operation_metrics: + value = ps_metrics.query_all(metric, filter={"operation": "create tenant"}) + assert value + @pytest.mark.parametrize( "remote_storage_kind", From a65e0774a50214616efb24149d8e43c1cc6bd9bb Mon Sep 17 00:00:00 2001 From: Alexander Bayandin Date: Tue, 16 May 2023 14:06:47 +0100 Subject: [PATCH 04/27] Increase shared memory size for regression test run (#4232) Should fix flakiness caused by the error ``` FATAL: could not resize shared memory segment "/PostgreSQL.3944613150" to 1048576 bytes: No space left on device ``` --- .github/workflows/build_and_test.yml | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index 5a09f0b4aa..07134678f6 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -324,7 +324,8 @@ jobs: runs-on: [ self-hosted, gen3, large ] container: image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned - options: --init + # Default shared memory is 64mb + options: --init --shm-size=512mb needs: [ build-neon ] strategy: fail-fast: false @@ -363,7 +364,8 @@ jobs: runs-on: [ self-hosted, gen3, small ] container: image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned - options: --init + # Default shared memory is 64mb + options: --init --shm-size=512mb needs: [ build-neon ] if: github.ref_name == 'main' || contains(github.event.pull_request.labels.*.name, 'run-benchmarks') strategy: From 4a67f60a3b572176e33c7873f9f391f85460c1f2 Mon Sep 17 00:00:00 2001 From: Alex Chi Z Date: Tue, 16 May 2023 09:09:50 -0400 Subject: [PATCH 05/27] bump aws dep version (#4237) This PR is simply the patch from https://github.com/neondatabase/neon/issues/4008 except we enabled `force_path_style` for custom endpoints. This is because at some version, the s3 sdk by default uses the virtual-host style access, which is not supported by MinIO in the default configuration. By enforcing path style access for custom endpoints, we can pass all e2e test cases. SDK 0.55 is not the latest version and we can bump it further later when all flaky tests in this PR are resolved. This PR also (hopefully) fixes flaky test `test_ondemand_download_timetravel`. close https://github.com/neondatabase/neon/issues/4008 Signed-off-by: Alex Chi --- Cargo.lock | 339 ++++++------------ Cargo.toml | 7 +- libs/remote_storage/Cargo.toml | 1 + libs/remote_storage/src/s3_bucket.rs | 47 +-- test_runner/regress/test_ondemand_download.py | 5 +- 5 files changed, 145 insertions(+), 254 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 167acf0e69..55418473d5 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -230,40 +230,38 @@ checksum = "d468802bab17cbc0cc575e9b053f41e72aa36bfa6b7f55e3529ffa43161b97fa" [[package]] name = "aws-config" -version = "0.51.0" +version = "0.55.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "56a636c44c77fa18bdba56126a34d30cfe5538fe88f7d34988fa731fee143ddd" +checksum = "fc00553f5f3c06ffd4510a9d576f92143618706c45ea6ff81e84ad9be9588abd" dependencies = [ + "aws-credential-types", "aws-http", - "aws-sdk-sso", "aws-sdk-sts", - "aws-smithy-async 0.51.0", - "aws-smithy-client 0.51.0", - "aws-smithy-http 0.51.0", - "aws-smithy-http-tower 0.51.0", + "aws-smithy-async", + "aws-smithy-client", + "aws-smithy-http", + "aws-smithy-http-tower", "aws-smithy-json", - "aws-smithy-types 0.51.0", - "aws-types 0.51.0", + "aws-smithy-types", + "aws-types", "bytes", - "hex", + "fastrand", "http", "hyper", - "ring", "time", "tokio", "tower", "tracing", - "zeroize", ] [[package]] name = "aws-credential-types" -version = "0.55.1" +version = "0.55.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f4232d3729eefc287adc0d5a8adc97b7d94eefffe6bbe94312cc86c7ab6b06ce" +checksum = "4cb57ac6088805821f78d282c0ba8aec809f11cbee10dda19a97b03ab040ccc2" dependencies = [ - "aws-smithy-async 0.55.1", - "aws-smithy-types 0.55.1", + "aws-smithy-async", + "aws-smithy-types", "fastrand", "tokio", "tracing", @@ -272,13 +270,13 @@ dependencies = [ [[package]] name = "aws-endpoint" -version = "0.51.0" +version = "0.55.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6ca8f374874f6459aaa88dc861d7f5d834ca1ff97668eae190e97266b5f6c3fb" +checksum = "9c5f6f84a4f46f95a9bb71d9300b73cd67eb868bc43ae84f66ad34752299f4ac" dependencies = [ - "aws-smithy-http 0.51.0", - "aws-smithy-types 0.51.0", - "aws-types 0.51.0", + "aws-smithy-http", + "aws-smithy-types", + "aws-types", "http", "regex", "tracing", @@ -286,13 +284,14 @@ dependencies = [ [[package]] name = "aws-http" -version = "0.51.0" +version = "0.55.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "78d41e19e779b73463f5f0c21b3aacc995f4ba783ab13a7ae9f5dfb159a551b4" +checksum = "a754683c322f7dc5167484266489fdebdcd04d26e53c162cad1f3f949f2c5671" dependencies = [ - "aws-smithy-http 0.51.0", - "aws-smithy-types 0.51.0", - "aws-types 0.51.0", + "aws-credential-types", + "aws-smithy-http", + "aws-smithy-types", + "aws-types", "bytes", "http", "http-body", @@ -304,127 +303,104 @@ dependencies = [ [[package]] name = "aws-sdk-s3" -version = "0.21.0" +version = "0.25.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a9f08665c8e03aca8cb092ef01e617436ebfa977fddc1240e1b062488ab5d48a" +checksum = "392b9811ca489747ac84349790e49deaa1f16631949e7dd4156000251c260eae" dependencies = [ + "aws-credential-types", "aws-endpoint", "aws-http", "aws-sig-auth", "aws-sigv4", - "aws-smithy-async 0.51.0", + "aws-smithy-async", "aws-smithy-checksums", - "aws-smithy-client 0.51.0", + "aws-smithy-client", "aws-smithy-eventstream", - "aws-smithy-http 0.51.0", - "aws-smithy-http-tower 0.51.0", - "aws-smithy-types 0.51.0", + "aws-smithy-http", + "aws-smithy-http-tower", + "aws-smithy-json", + "aws-smithy-types", "aws-smithy-xml", - "aws-types 0.51.0", + "aws-types", "bytes", - "bytes-utils", "http", "http-body", + "once_cell", + "percent-encoding", + "regex", "tokio-stream", "tower", "tracing", + "url", +] + +[[package]] +name = "aws-sdk-sts" +version = "0.27.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2d0fbe3c2c342bc8dfea4bb43937405a8ec06f99140a0dcb9c7b59e54dfa93a1" +dependencies = [ + "aws-credential-types", + "aws-endpoint", + "aws-http", + "aws-sig-auth", + "aws-smithy-async", + "aws-smithy-client", + "aws-smithy-http", + "aws-smithy-http-tower", + "aws-smithy-json", + "aws-smithy-query", + "aws-smithy-types", + "aws-smithy-xml", + "aws-types", + "bytes", + "http", + "regex", + "tower", + "tracing", ] -[[package]] -name = "aws-sdk-sso" -version = "0.21.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "86dcb1cb71aa8763b327542ead410424515cff0cde5b753eedd2917e09c63734" -dependencies = [ - "aws-endpoint", - "aws-http", - "aws-sig-auth", - "aws-smithy-async 0.51.0", - "aws-smithy-client 0.51.0", - "aws-smithy-http 0.51.0", - "aws-smithy-http-tower 0.51.0", - "aws-smithy-json", - "aws-smithy-types 0.51.0", - "aws-types 0.51.0", - "bytes", - "http", - "tokio-stream", - "tower", -] - -[[package]] -name = "aws-sdk-sts" -version = "0.21.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fdfcf584297c666f6b472d5368a78de3bc714b6e0a53d7fbf76c3e347c292ab1" -dependencies = [ - "aws-endpoint", - "aws-http", - "aws-sig-auth", - "aws-smithy-async 0.51.0", - "aws-smithy-client 0.51.0", - "aws-smithy-http 0.51.0", - "aws-smithy-http-tower 0.51.0", - "aws-smithy-query", - "aws-smithy-types 0.51.0", - "aws-smithy-xml", - "aws-types 0.51.0", - "bytes", - "http", - "tower", -] - [[package]] name = "aws-sig-auth" -version = "0.51.0" +version = "0.55.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "12cbe7b2be9e185c1fbce27fc9c41c66b195b32d89aa099f98768d9544221308" +checksum = "84dc92a63ede3c2cbe43529cb87ffa58763520c96c6a46ca1ced80417afba845" dependencies = [ + "aws-credential-types", "aws-sigv4", "aws-smithy-eventstream", - "aws-smithy-http 0.51.0", - "aws-types 0.51.0", + "aws-smithy-http", + "aws-types", "http", "tracing", ] [[package]] name = "aws-sigv4" -version = "0.51.1" +version = "0.55.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5c0b2658d2cb66dbf02f0e8dee80810ef1e0ca3530ede463e0ef994c301087d1" +checksum = "392fefab9d6fcbd76d518eb3b1c040b84728ab50f58df0c3c53ada4bea9d327e" dependencies = [ "aws-smithy-eventstream", - "aws-smithy-http 0.51.0", + "aws-smithy-http", "bytes", "form_urlencoded", "hex", + "hmac", "http", "once_cell", "percent-encoding", "regex", - "ring", + "sha2", "time", "tracing", ] [[package]] name = "aws-smithy-async" -version = "0.51.0" +version = "0.55.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7b3442b4c5d3fc39891a2e5e625735fba6b24694887d49c6518460fde98247a9" -dependencies = [ - "futures-util", - "pin-project-lite", - "tokio", - "tokio-stream", -] - -[[package]] -name = "aws-smithy-async" -version = "0.55.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "88573bcfbe1dcfd54d4912846df028b42d6255cbf9ce07be216b1bbfd11fc4b9" +checksum = "ae23b9fe7a07d0919000116c4c5c0578303fbce6fc8d32efca1f7759d4c20faf" dependencies = [ "futures-util", "pin-project-lite", @@ -434,12 +410,12 @@ dependencies = [ [[package]] name = "aws-smithy-checksums" -version = "0.51.0" +version = "0.55.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "cc227e36e346f45298288359f37123e1a92628d1cec6b11b5eb335553278bd9e" +checksum = "a6367acbd6849b8c7c659e166955531274ae147bf83ab4312885991f6b6706cb" dependencies = [ - "aws-smithy-http 0.51.0", - "aws-smithy-types 0.51.0", + "aws-smithy-http", + "aws-smithy-types", "bytes", "crc32c", "crc32fast", @@ -455,14 +431,14 @@ dependencies = [ [[package]] name = "aws-smithy-client" -version = "0.51.0" +version = "0.55.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ff28d553714f8f54cd921227934fc13a536a1c03f106e56b362fd57e16d450ad" +checksum = "5230d25d244a51339273b8870f0f77874cd4449fb4f8f629b21188ae10cfc0ba" dependencies = [ - "aws-smithy-async 0.51.0", - "aws-smithy-http 0.51.0", - "aws-smithy-http-tower 0.51.0", - "aws-smithy-types 0.51.0", + "aws-smithy-async", + "aws-smithy-http", + "aws-smithy-http-tower", + "aws-smithy-types", "bytes", "fastrand", "http", @@ -471,26 +447,7 @@ dependencies = [ "hyper-rustls", "lazy_static", "pin-project-lite", - "tokio", - "tower", - "tracing", -] - -[[package]] -name = "aws-smithy-client" -version = "0.55.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b2f52352bae50d3337d5d6151b695d31a8c10ebea113eca5bead531f8301b067" -dependencies = [ - "aws-smithy-async 0.55.1", - "aws-smithy-http 0.55.1", - "aws-smithy-http-tower 0.55.1", - "aws-smithy-types 0.55.1", - "bytes", - "fastrand", - "http", - "http-body", - "pin-project-lite", + "rustls 0.20.8", "tokio", "tower", "tracing", @@ -498,23 +455,23 @@ dependencies = [ [[package]] name = "aws-smithy-eventstream" -version = "0.51.0" +version = "0.55.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d7ea0df7161ce65b5c8ca6eb709a1a907376fa18226976e41c748ce02ccccf24" +checksum = "22d2a2bcc16e5c4d949ffd2b851da852b9bbed4bb364ed4ae371b42137ca06d9" dependencies = [ - "aws-smithy-types 0.51.0", + "aws-smithy-types", "bytes", "crc32fast", ] [[package]] name = "aws-smithy-http" -version = "0.51.0" +version = "0.55.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bf58ed4fefa61dbf038e5421a521cbc2c448ef69deff0ab1d915d8a10eda5664" +checksum = "b60e2133beb9fe6ffe0b70deca57aaeff0a35ad24a9c6fab2fd3b4f45b99fdb5" dependencies = [ "aws-smithy-eventstream", - "aws-smithy-types 0.51.0", + "aws-smithy-types", "bytes", "bytes-utils", "futures-core", @@ -530,49 +487,14 @@ dependencies = [ "tracing", ] -[[package]] -name = "aws-smithy-http" -version = "0.55.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "03bcc02d7ed9649d855c8ce4a735e9848d7b8f7568aad0504c158e3baa955df8" -dependencies = [ - "aws-smithy-types 0.55.1", - "bytes", - "bytes-utils", - "futures-core", - "http", - "http-body", - "hyper", - "once_cell", - "percent-encoding", - "pin-project-lite", - "pin-utils", - "tracing", -] - [[package]] name = "aws-smithy-http-tower" -version = "0.51.0" +version = "0.55.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "20c96d7bd35e7cf96aca1134b2f81b1b59ffe493f7c6539c051791cbbf7a42d3" +checksum = "3a4d94f556c86a0dd916a5d7c39747157ea8cb909ca469703e20fee33e448b67" dependencies = [ - "aws-smithy-http 0.51.0", - "bytes", - "http", - "http-body", - "pin-project-lite", - "tower", - "tracing", -] - -[[package]] -name = "aws-smithy-http-tower" -version = "0.55.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "da88b3a860f65505996c29192d800f1aeb9480440f56d63aad33a3c12045017a" -dependencies = [ - "aws-smithy-http 0.55.1", - "aws-smithy-types 0.55.1", + "aws-smithy-http", + "aws-smithy-types", "bytes", "http", "http-body", @@ -583,40 +505,28 @@ dependencies = [ [[package]] name = "aws-smithy-json" -version = "0.51.0" +version = "0.55.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d8324ba98c8a94187723cc16c37aefa09504646ee65c3d2c3af495bab5ea701b" +checksum = "5ce3d6e6ebb00b2cce379f079ad5ec508f9bcc3a9510d9b9c1840ed1d6f8af39" dependencies = [ - "aws-smithy-types 0.51.0", + "aws-smithy-types", ] [[package]] name = "aws-smithy-query" -version = "0.51.0" +version = "0.55.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "83834ed2ff69ea6f6657baf205267dc2c0abe940703503a3e5d60ce23be3d306" +checksum = "d58edfca32ef9bfbc1ca394599e17ea329cb52d6a07359827be74235b64b3298" dependencies = [ - "aws-smithy-types 0.51.0", + "aws-smithy-types", "urlencoding", ] [[package]] name = "aws-smithy-types" -version = "0.51.0" +version = "0.55.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8b02e06ea63498c43bc0217ea4d16605d4e58d85c12fc23f6572ff6d0a840c61" -dependencies = [ - "itoa", - "num-integer", - "ryu", - "time", -] - -[[package]] -name = "aws-smithy-types" -version = "0.55.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "cd0afc731fd1417d791f9145a1e0c30e23ae0beaab9b4814017708ead2fc20f1" +checksum = "58db46fc1f4f26be01ebdb821751b4e2482cd43aa2b64a0348fb89762defaffa" dependencies = [ "base64-simd", "itoa", @@ -627,40 +537,24 @@ dependencies = [ [[package]] name = "aws-smithy-xml" -version = "0.51.0" +version = "0.55.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "246e9f83dd1fdf5d347fa30ae4ad30a9d1d42ce4cd74a93d94afa874646f94cd" +checksum = "fb557fe4995bd9ec87fb244bbb254666a971dc902a783e9da8b7711610e9664c" dependencies = [ "xmlparser", ] [[package]] name = "aws-types" -version = "0.51.0" +version = "0.55.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "05701d32da168b44f7ee63147781aed8723e792cc131cb9b18363b5393f17f70" -dependencies = [ - "aws-smithy-async 0.51.0", - "aws-smithy-client 0.51.0", - "aws-smithy-http 0.51.0", - "aws-smithy-types 0.51.0", - "http", - "rustc_version", - "tracing", - "zeroize", -] - -[[package]] -name = "aws-types" -version = "0.55.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b9b082e329d9a304d39e193ad5c7ab363a0d6507aca6965e0673a746686fb0cc" +checksum = "de0869598bfe46ec44ffe17e063ed33336e59df90356ca8ff0e8da6f7c1d994b" dependencies = [ "aws-credential-types", - "aws-smithy-async 0.55.1", - "aws-smithy-client 0.55.1", - "aws-smithy-http 0.55.1", - "aws-smithy-types 0.55.1", + "aws-smithy-async", + "aws-smithy-client", + "aws-smithy-http", + "aws-smithy-types", "http", "rustc_version", "tracing", @@ -3367,9 +3261,10 @@ dependencies = [ "anyhow", "async-trait", "aws-config", + "aws-credential-types", "aws-sdk-s3", - "aws-smithy-http 0.51.0", - "aws-types 0.55.1", + "aws-smithy-http", + "aws-types", "hyper", "metrics", "once_cell", diff --git a/Cargo.toml b/Cargo.toml index b73e29ef6c..c901532f86 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -21,9 +21,10 @@ anyhow = { version = "1.0", features = ["backtrace"] } async-stream = "0.3" async-trait = "0.1" atty = "0.2.14" -aws-config = { version = "0.51.0", default-features = false, features=["rustls"] } -aws-sdk-s3 = "0.21.0" -aws-smithy-http = "0.51.0" +aws-config = { version = "0.55", default-features = false, features=["rustls"] } +aws-sdk-s3 = "0.25" +aws-smithy-http = "0.55" +aws-credential-types = "0.55" aws-types = "0.55" base64 = "0.13.0" bincode = "1.3" diff --git a/libs/remote_storage/Cargo.toml b/libs/remote_storage/Cargo.toml index da15823b69..0877a38dd9 100644 --- a/libs/remote_storage/Cargo.toml +++ b/libs/remote_storage/Cargo.toml @@ -12,6 +12,7 @@ aws-smithy-http.workspace = true aws-types.workspace = true aws-config.workspace = true aws-sdk-s3.workspace = true +aws-credential-types.workspace = true hyper = { workspace = true, features = ["stream"] } serde.workspace = true serde_json.workspace = true diff --git a/libs/remote_storage/src/s3_bucket.rs b/libs/remote_storage/src/s3_bucket.rs index e6c1e19ad5..0be8c72fe0 100644 --- a/libs/remote_storage/src/s3_bucket.rs +++ b/libs/remote_storage/src/s3_bucket.rs @@ -9,14 +9,15 @@ use std::sync::Arc; use anyhow::Context; use aws_config::{ environment::credentials::EnvironmentVariableCredentialsProvider, - imds::credentials::ImdsCredentialsProvider, - meta::credentials::{CredentialsProviderChain, LazyCachingCredentialsProvider}, + imds::credentials::ImdsCredentialsProvider, meta::credentials::CredentialsProviderChain, }; +use aws_credential_types::cache::CredentialsCache; use aws_sdk_s3::{ - config::Config, - error::{GetObjectError, GetObjectErrorKind}, - types::{ByteStream, SdkError}, - Client, Endpoint, Region, + config::{Config, Region}, + error::SdkError, + operation::get_object::GetObjectError, + primitives::ByteStream, + Client, }; use aws_smithy_http::body::SdkBody; use hyper::Body; @@ -125,28 +126,23 @@ impl S3Bucket { let credentials_provider = { // uses "AWS_ACCESS_KEY_ID", "AWS_SECRET_ACCESS_KEY" - let env_creds = EnvironmentVariableCredentialsProvider::new(); + CredentialsProviderChain::first_try( + "env", + EnvironmentVariableCredentialsProvider::new(), + ) // uses imds v2 - let imds = ImdsCredentialsProvider::builder().build(); - - // finally add caching. - // this might change in future, see https://github.com/awslabs/aws-sdk-rust/issues/629 - LazyCachingCredentialsProvider::builder() - .load(CredentialsProviderChain::first_try("env", env_creds).or_else("imds", imds)) - .build() + .or_else("imds", ImdsCredentialsProvider::builder().build()) }; let mut config_builder = Config::builder() .region(Region::new(aws_config.bucket_region.clone())) + .credentials_cache(CredentialsCache::lazy()) .credentials_provider(credentials_provider); if let Some(custom_endpoint) = aws_config.endpoint.clone() { - let endpoint = Endpoint::immutable( - custom_endpoint - .parse() - .expect("Failed to parse S3 custom endpoint"), - ); - config_builder.set_endpoint_resolver(Some(Arc::new(endpoint))); + config_builder = config_builder + .endpoint_url(custom_endpoint) + .force_path_style(true); } let client = Client::from_conf(config_builder.build()); @@ -229,14 +225,9 @@ impl S3Bucket { ))), }) } - Err(SdkError::ServiceError { - err: - GetObjectError { - kind: GetObjectErrorKind::NoSuchKey(..), - .. - }, - .. - }) => Err(DownloadError::NotFound), + Err(SdkError::ServiceError(e)) if matches!(e.err(), GetObjectError::NoSuchKey(_)) => { + Err(DownloadError::NotFound) + } Err(e) => { metrics::inc_get_object_fail(); Err(DownloadError::Other(anyhow::anyhow!( diff --git a/test_runner/regress/test_ondemand_download.py b/test_runner/regress/test_ondemand_download.py index 5c02708457..31f6c1f3d9 100644 --- a/test_runner/regress/test_ondemand_download.py +++ b/test_runner/regress/test_ondemand_download.py @@ -20,6 +20,7 @@ from fixtures.pageserver.utils import ( assert_tenant_state, wait_for_last_record_lsn, wait_for_upload, + wait_for_upload_queue_empty, wait_until_tenant_state, ) from fixtures.types import Lsn @@ -149,6 +150,7 @@ def test_ondemand_download_timetravel( ##### First start, insert data and upload it to the remote storage env = neon_env_builder.init_start() + pageserver_http = env.pageserver.http_client() # Override defaults, to create more layers tenant, _ = env.neon_cli.create_tenant( @@ -225,7 +227,8 @@ def test_ondemand_download_timetravel( assert filled_current_physical == filled_size, "we don't yet do layer eviction" # Wait until generated image layers are uploaded to S3 - time.sleep(3) + if remote_storage_kind is not None: + wait_for_upload_queue_empty(pageserver_http, env.initial_tenant, timeline_id) env.pageserver.stop() From efe9e131a706d789d8d294574ce03f99c41e4895 Mon Sep 17 00:00:00 2001 From: MMeent Date: Tue, 16 May 2023 15:23:50 +0200 Subject: [PATCH 06/27] Update vendored PostgreSQL to latest patch releases (#4208) Conflicts: - Changes in PG15's xlogrecovery.c resulted in non-substantial conflicts between ecb01e6ebb5a67f3fc00840695682a8b1ba40461 and aee72b7be903e52d9bdc6449aa4c17fb852d8708 Fixes #4207 --- vendor/postgres-v14 | 2 +- vendor/postgres-v15 | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/vendor/postgres-v14 b/vendor/postgres-v14 index a2daebc6b4..1144aee166 160000 --- a/vendor/postgres-v14 +++ b/vendor/postgres-v14 @@ -1 +1 @@ -Subproject commit a2daebc6b445dcbcca9c18e1711f47c1db7ffb04 +Subproject commit 1144aee1661c79eec65e784a8dad8bd450d9df79 diff --git a/vendor/postgres-v15 b/vendor/postgres-v15 index 2df2ce3744..1984832c74 160000 --- a/vendor/postgres-v15 +++ b/vendor/postgres-v15 @@ -1 +1 @@ -Subproject commit 2df2ce374464a7449e15dfa46c956b73b4f4098b +Subproject commit 1984832c740a7fa0e468bb720f40c525b652835d From b7db62411b6376ecd9318d3dc74081e96839a523 Mon Sep 17 00:00:00 2001 From: Dmitry Rodionov Date: Tue, 16 May 2023 16:54:29 +0300 Subject: [PATCH 07/27] Make storage time operations an enum instead of an array (#4238) Use an enum instead of an array. Before that there was no connection between definition of the metric and point where it was used aside from matching string literals. Now its possible to use IDE features to check for references. Also this allows to avoid mismatch between set of metrics that was defined and set of metrics that was actually used What is interesting is that `init logical size` case is not used. I think `LogicalSize` is a duplicate of `InitLogicalSize`. So removed the latter. --- pageserver/src/http/routes.rs | 4 +-- pageserver/src/metrics.rs | 68 ++++++++++++++++++++++++----------- 2 files changed, 49 insertions(+), 23 deletions(-) diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs index 361c7850d6..26cd02e5ed 100644 --- a/pageserver/src/http/routes.rs +++ b/pageserver/src/http/routes.rs @@ -19,7 +19,7 @@ use super::models::{ }; use crate::context::{DownloadBehavior, RequestContext}; use crate::disk_usage_eviction_task; -use crate::metrics::STORAGE_TIME_GLOBAL; +use crate::metrics::{StorageTimeOperation, STORAGE_TIME_GLOBAL}; use crate::pgdatadir_mapping::LsnForTimestamp; use crate::task_mgr::TaskKind; use crate::tenant::config::TenantConfOpt; @@ -710,7 +710,7 @@ async fn tenant_create_handler(mut request: Request) -> Result = Lazy::new(|| { register_counter_vec!( @@ -673,7 +690,9 @@ pub struct StorageTimeMetrics { } impl StorageTimeMetrics { - pub fn new(operation: &str, tenant_id: &str, timeline_id: &str) -> Self { + pub fn new(operation: StorageTimeOperation, tenant_id: &str, timeline_id: &str) -> Self { + let operation: &'static str = operation.into(); + let timeline_sum = STORAGE_TIME_SUM_PER_TIMELINE .get_metric_with_label_values(&[operation, tenant_id, timeline_id]) .unwrap(); @@ -737,16 +756,23 @@ impl TimelineMetrics { let materialized_page_cache_hit_counter = MATERIALIZED_PAGE_CACHE_HIT .get_metric_with_label_values(&[&tenant_id, &timeline_id]) .unwrap(); - let flush_time_histo = StorageTimeMetrics::new("layer flush", &tenant_id, &timeline_id); - let compact_time_histo = StorageTimeMetrics::new("compact", &tenant_id, &timeline_id); + let flush_time_histo = + StorageTimeMetrics::new(StorageTimeOperation::LayerFlush, &tenant_id, &timeline_id); + let compact_time_histo = + StorageTimeMetrics::new(StorageTimeOperation::Compact, &tenant_id, &timeline_id); let create_images_time_histo = - StorageTimeMetrics::new("create images", &tenant_id, &timeline_id); - let logical_size_histo = StorageTimeMetrics::new("logical size", &tenant_id, &timeline_id); - let imitate_logical_size_histo = - StorageTimeMetrics::new("imitate logical size", &tenant_id, &timeline_id); + StorageTimeMetrics::new(StorageTimeOperation::CreateImages, &tenant_id, &timeline_id); + let logical_size_histo = + StorageTimeMetrics::new(StorageTimeOperation::LogicalSize, &tenant_id, &timeline_id); + let imitate_logical_size_histo = StorageTimeMetrics::new( + StorageTimeOperation::ImitateLogicalSize, + &tenant_id, + &timeline_id, + ); let load_layer_map_histo = - StorageTimeMetrics::new("load layer map", &tenant_id, &timeline_id); - let garbage_collect_histo = StorageTimeMetrics::new("gc", &tenant_id, &timeline_id); + StorageTimeMetrics::new(StorageTimeOperation::LoadLayerMap, &tenant_id, &timeline_id); + let garbage_collect_histo = + StorageTimeMetrics::new(StorageTimeOperation::Gc, &tenant_id, &timeline_id); let last_record_gauge = LAST_RECORD_LSN .get_metric_with_label_values(&[&tenant_id, &timeline_id]) .unwrap(); @@ -814,7 +840,7 @@ impl Drop for TimelineMetrics { .write() .unwrap() .remove(tenant_id, timeline_id); - for op in STORAGE_TIME_OPERATIONS { + for op in StorageTimeOperation::VARIANTS { let _ = STORAGE_TIME_SUM_PER_TIMELINE.remove_label_values(&[op, tenant_id, timeline_id]); let _ = From 511b0945c3010d3ebb44869014337ca2d1ccca90 Mon Sep 17 00:00:00 2001 From: Joseph Koshakow Date: Tue, 16 May 2023 10:38:39 -0400 Subject: [PATCH 08/27] Replace usages of wait_for_active_timeline (#4243) This commit replaces all usages of connection_manager.rs: wait_for_active_timeline with Timeline::wait_to_become_active. wait_to_become_active is better and in the right module. close https://github.com/neondatabase/neon/issues/4189 Co-authored-by: Shany Pozin --- pageserver/src/tenant/timeline.rs | 2 +- .../walreceiver/connection_manager.rs | 46 +++++-------------- 2 files changed, 12 insertions(+), 36 deletions(-) diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index 2543764eca..86d50de132 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -945,7 +945,7 @@ impl Timeline { pub async fn wait_to_become_active( &self, - _ctx: &RequestContext, /* Prepare for use by cancellation */ + _ctx: &RequestContext, // Prepare for use by cancellation ) -> Result<(), TimelineState> { let mut receiver = self.state.subscribe(); loop { diff --git a/pageserver/src/tenant/timeline/walreceiver/connection_manager.rs b/pageserver/src/tenant/timeline/walreceiver/connection_manager.rs index 9cb17ea799..2305844d75 100644 --- a/pageserver/src/tenant/timeline/walreceiver/connection_manager.rs +++ b/pageserver/src/tenant/timeline/walreceiver/connection_manager.rs @@ -28,8 +28,8 @@ use storage_broker::proto::SubscribeSafekeeperInfoRequest; use storage_broker::proto::TenantTimelineId as ProtoTenantTimelineId; use storage_broker::BrokerClientChannel; use storage_broker::Streaming; +use tokio::select; use tokio::sync::RwLock; -use tokio::{select, sync::watch}; use tracing::*; use crate::{exponential_backoff, DEFAULT_BASE_BACKOFF_SECONDS, DEFAULT_MAX_BACKOFF_SECONDS}; @@ -50,13 +50,13 @@ pub(super) async fn connection_manager_loop_step( ctx: &RequestContext, manager_status: &RwLock>, ) -> ControlFlow<(), ()> { - let mut timeline_state_updates = connection_manager_state + match connection_manager_state .timeline - .subscribe_for_state_updates(); - - match wait_for_active_timeline(&mut timeline_state_updates).await { - ControlFlow::Continue(()) => {} - ControlFlow::Break(()) => { + .wait_to_become_active(ctx) + .await + { + Ok(()) => {} + Err(_) => { info!("Timeline dropped state updates sender before becoming active, stopping wal connection manager loop"); return ControlFlow::Break(()); } @@ -72,6 +72,10 @@ pub(super) async fn connection_manager_loop_step( timeline_id: connection_manager_state.timeline.timeline_id, }; + let mut timeline_state_updates = connection_manager_state + .timeline + .subscribe_for_state_updates(); + // Subscribe to the broker updates. Stream shares underlying TCP connection // with other streams on this client (other connection managers). When // object goes out of scope, stream finishes in drop() automatically. @@ -195,34 +199,6 @@ pub(super) async fn connection_manager_loop_step( } } -async fn wait_for_active_timeline( - timeline_state_updates: &mut watch::Receiver, -) -> ControlFlow<(), ()> { - let current_state = *timeline_state_updates.borrow(); - if current_state == TimelineState::Active { - return ControlFlow::Continue(()); - } - - loop { - match timeline_state_updates.changed().await { - Ok(()) => { - let new_state = *timeline_state_updates.borrow(); - match new_state { - TimelineState::Active => { - debug!("Timeline state changed to active, continuing the walreceiver connection manager"); - return ControlFlow::Continue(()); - } - state => { - debug!("Not running the walreceiver connection manager, timeline is not active: {state:?}"); - continue; - } - } - } - Err(_sender_dropped_error) => return ControlFlow::Break(()), - } - } -} - /// Endlessly try to subscribe for broker updates for a given timeline. async fn subscribe_for_timeline_updates( broker_client: &mut BrokerClientChannel, From 131343ed45bcfe7287bfd6cb9e5617606826caf2 Mon Sep 17 00:00:00 2001 From: Alexander Bayandin Date: Tue, 16 May 2023 17:18:56 +0100 Subject: [PATCH 09/27] Fix regress-tests job for Postgres 15 on release branch (#4253) ## Problem Compatibility tests don't support Postgres 15 yet, but we're still trying to upload compatibility snapshot (which we do not collect). Ref https://github.com/neondatabase/neon/actions/runs/4991394158/jobs/8940369368#step:4:38129 ## Summary of changes Add `pg_version` parameter to `run-python-test-set` actions and do not upload compatibility snapshot for Postgres 15 --- .github/actions/run-python-test-set/action.yml | 11 ++++++++--- .github/workflows/build_and_test.yml | 2 +- 2 files changed, 9 insertions(+), 4 deletions(-) diff --git a/.github/actions/run-python-test-set/action.yml b/.github/actions/run-python-test-set/action.yml index d6c960bfda..bb120e9470 100644 --- a/.github/actions/run-python-test-set/action.yml +++ b/.github/actions/run-python-test-set/action.yml @@ -48,6 +48,10 @@ inputs: description: 'Whether to rerun flaky tests' required: false default: 'false' + pg_version: + description: 'Postgres version to use for tests' + required: false + default: 'v14' runs: using: "composite" @@ -68,7 +72,7 @@ runs: prefix: latest - name: Download compatibility snapshot for Postgres 14 - if: inputs.build_type != 'remote' + if: inputs.build_type != 'remote' && inputs.pg_version == 'v14' uses: ./.github/actions/download with: name: compatibility-snapshot-${{ inputs.build_type }}-pg14 @@ -106,13 +110,14 @@ runs: ALLOW_BACKWARD_COMPATIBILITY_BREAKAGE: contains(github.event.pull_request.labels.*.name, 'backward compatibility breakage') ALLOW_FORWARD_COMPATIBILITY_BREAKAGE: contains(github.event.pull_request.labels.*.name, 'forward compatibility breakage') RERUN_FLAKY: ${{ inputs.rerun_flaky }} + PG_VERSION: ${{ inputs.pg_version }} shell: bash -euxo pipefail {0} run: | # PLATFORM will be embedded in the perf test report # and it is needed to distinguish different environments export PLATFORM=${PLATFORM:-github-actions-selfhosted} export POSTGRES_DISTRIB_DIR=${POSTGRES_DISTRIB_DIR:-/tmp/neon/pg_install} - export DEFAULT_PG_VERSION=${DEFAULT_PG_VERSION:-14} + export DEFAULT_PG_VERSION=${PG_VERSION#v} if [ "${BUILD_TYPE}" = "remote" ]; then export REMOTE_ENV=1 @@ -193,7 +198,7 @@ runs: fi - name: Upload compatibility snapshot for Postgres 14 - if: github.ref_name == 'release' + if: github.ref_name == 'release' && inputs.pg_version == 'v14' uses: ./.github/actions/upload with: name: compatibility-snapshot-${{ inputs.build_type }}-pg14-${{ github.run_id }} diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index 07134678f6..3f5ad59cc3 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -351,8 +351,8 @@ jobs: real_s3_access_key_id: "${{ secrets.AWS_ACCESS_KEY_ID_CI_TESTS_S3 }}" real_s3_secret_access_key: "${{ secrets.AWS_SECRET_ACCESS_KEY_CI_TESTS_S3 }}" rerun_flaky: true + pg_version: ${{ matrix.pg_version }} env: - DEFAULT_PG_VERSION: ${{ matrix.pg_version }} TEST_RESULT_CONNSTR: ${{ secrets.REGRESS_TEST_RESULT_CONNSTR }} CHECK_ONDISK_DATA_COMPATIBILITY: nonempty From 4431779e32f1adc5f6a6a93524fb232eaaeecac1 Mon Sep 17 00:00:00 2001 From: Christian Schwarz Date: Tue, 16 May 2023 18:53:17 +0200 Subject: [PATCH 10/27] refactor: attach: use create_tenant_files + schedule_local_tenant_processing (#4235) With this patch, the attach handler now follows the same pattern as tenant create with regards to instantiation of the new tenant: 1. Prepare on-disk state using `create_tenant_files`. 2. Use the same code path as pageserver startup to load it into memory and start background loops (`schedule_local_tenant_processing`). It's a bit sad we can't use the `PageServerConfig::tenant_attaching_mark_file_path` method inside `create_tenant_files` because it operates in a temporary directory. However, it's a small price to pay for the gained simplicity. During implementation, I noticed that we don't handle failures post `create_tenant_files` well. I left TODO comments in the code linking to the issue that I created for this [^1]. Also, I'll dedupe the spawn_load and spawn_attach code in a future commit. refs https://github.com/neondatabase/neon/issues/1555 part of https://github.com/neondatabase/neon/issues/886 (Tenant Relocation) [^1]: https://github.com/neondatabase/neon/issues/4233 --- pageserver/src/http/routes.rs | 14 +++- pageserver/src/tenant.rs | 86 ++++++++++------------ pageserver/src/tenant/mgr.rs | 39 +++++++--- test_runner/regress/test_remote_storage.py | 4 +- test_runner/regress/test_tenant_detach.py | 12 +-- 5 files changed, 84 insertions(+), 71 deletions(-) diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs index 26cd02e5ed..6457d55dff 100644 --- a/pageserver/src/http/routes.rs +++ b/pageserver/src/http/routes.rs @@ -398,9 +398,17 @@ async fn tenant_attach_handler(request: Request) -> Result, let state = get_state(&request); if let Some(remote_storage) = &state.remote_storage { - mgr::attach_tenant(state.conf, tenant_id, remote_storage.clone(), &ctx) - .instrument(info_span!("tenant_attach", tenant = %tenant_id)) - .await?; + mgr::attach_tenant( + state.conf, + tenant_id, + // XXX: Attach should provide the config, especially during tenant migration. + // See https://github.com/neondatabase/neon/issues/1555 + TenantConfOpt::default(), + remote_storage.clone(), + &ctx, + ) + .instrument(info_span!("tenant_attach", tenant = %tenant_id)) + .await?; } else { return Err(ApiError::BadRequest(anyhow!( "attach_tenant is not possible because pageserver was configured without remote storage" diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index 9ab0262407..cccad3e4d5 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -602,12 +602,9 @@ impl Tenant { remote_storage: GenericRemoteStorage, ctx: &RequestContext, ) -> anyhow::Result> { - // XXX: Attach should provide the config, especially during tenant migration. - // See https://github.com/neondatabase/neon/issues/1555 - let tenant_conf = TenantConfOpt::default(); - - Self::attach_idempotent_create_marker_file(conf, tenant_id) - .context("create attach marker file")?; + // TODO dedup with spawn_load + let tenant_conf = + Self::load_tenant_config(conf, tenant_id).context("load tenant config")?; let wal_redo_manager = Arc::new(PostgresRedoManager::new(conf, tenant_id)); let tenant = Arc::new(Tenant::new( @@ -644,45 +641,6 @@ impl Tenant { Ok(tenant) } - fn attach_idempotent_create_marker_file( - conf: &'static PageServerConf, - tenant_id: TenantId, - ) -> anyhow::Result<()> { - // Create directory with marker file to indicate attaching state. - // The load_local_tenants() function in tenant::mgr relies on the marker file - // to determine whether a tenant has finished attaching. - let tenant_dir = conf.tenant_path(&tenant_id); - let marker_file = conf.tenant_attaching_mark_file_path(&tenant_id); - debug_assert_eq!(marker_file.parent().unwrap(), tenant_dir); - // TODO: should use tokio::fs here, but - // 1. caller is not async, for good reason (it holds tenants map lock) - // 2. we'd need to think about cancel safety. Turns out dropping a tokio::fs future - // doesn't wait for the activity in the fs thread pool. - crashsafe::create_dir_all(&tenant_dir).context("create tenant directory")?; - match fs::OpenOptions::new() - .write(true) - .create_new(true) - .open(&marker_file) - { - Ok(_) => {} - Err(e) if e.kind() == std::io::ErrorKind::AlreadyExists => { - // Either this is a retry of attach or there is a concurrent task also doing attach for this tenant. - // We cannot distinguish this here. - // The caller is responsible for ensuring there's no concurrent attach for a tenant. - {} // fsync again, we don't know if that already happened - } - err => { - err.context("create tenant attaching marker file")?; - unreachable!("we covered the Ok() case above"); - } - } - crashsafe::fsync_file_and_parent(&marker_file) - .context("fsync tenant attaching marker file and parent")?; - debug_assert!(tenant_dir.is_dir()); - debug_assert!(marker_file.is_file()); - Ok(()) - } - /// /// Background task that downloads all data for a tenant and brings it to Active state. /// @@ -2118,6 +2076,7 @@ impl Tenant { // enough to just fsync it always. crashsafe::fsync(target_config_parent)?; + // XXX we're not fsyncing the parent dir, need to do that in case `creating_tenant` Ok(()) }; @@ -2761,15 +2720,23 @@ fn remove_timeline_and_uninit_mark(timeline_dir: &Path, uninit_mark: &Path) -> a Ok(()) } +pub(crate) enum CreateTenantFilesMode { + Create, + Attach, +} + pub(crate) fn create_tenant_files( conf: &'static PageServerConf, tenant_conf: TenantConfOpt, tenant_id: TenantId, + mode: CreateTenantFilesMode, ) -> anyhow::Result { let target_tenant_directory = conf.tenant_path(&tenant_id); anyhow::ensure!( - !target_tenant_directory.exists(), - "cannot create new tenant repo: '{tenant_id}' directory already exists", + !target_tenant_directory + .try_exists() + .context("check existence of tenant directory")?, + "tenant directory already exists", ); let temporary_tenant_dir = @@ -2791,6 +2758,7 @@ pub(crate) fn create_tenant_files( conf, tenant_conf, tenant_id, + mode, &temporary_tenant_dir, &target_tenant_directory, ); @@ -2815,9 +2783,28 @@ fn try_create_target_tenant_dir( conf: &'static PageServerConf, tenant_conf: TenantConfOpt, tenant_id: TenantId, + mode: CreateTenantFilesMode, temporary_tenant_dir: &Path, target_tenant_directory: &Path, ) -> Result<(), anyhow::Error> { + match mode { + CreateTenantFilesMode::Create => {} // needs no attach marker, writing tenant conf + atomic rename of dir is good enough + CreateTenantFilesMode::Attach => { + let attach_marker_path = temporary_tenant_dir.join(TENANT_ATTACHING_MARKER_FILENAME); + let file = std::fs::OpenOptions::new() + .create_new(true) + .write(true) + .open(&attach_marker_path) + .with_context(|| { + format!("could not create attach marker file {attach_marker_path:?}") + })?; + file.sync_all().with_context(|| { + format!("could not sync attach marker file: {attach_marker_path:?}") + })?; + // fsync of the directory in which the file resides comes later in this function + } + } + let temporary_tenant_timelines_dir = rebase_directory( &conf.timelines_path(&tenant_id), target_tenant_directory, @@ -2844,6 +2831,11 @@ fn try_create_target_tenant_dir( anyhow::bail!("failpoint tenant-creation-before-tmp-rename"); }); + // Make sure the current tenant directory entries are durable before renaming. + // Without this, a crash may reorder any of the directory entry creations above. + crashsafe::fsync(temporary_tenant_dir) + .with_context(|| format!("sync temporary tenant directory {temporary_tenant_dir:?}"))?; + fs::rename(temporary_tenant_dir, target_tenant_directory).with_context(|| { format!( "move tenant {} temporary directory {} into the permanent one {}", diff --git a/pageserver/src/tenant/mgr.rs b/pageserver/src/tenant/mgr.rs index 8191880bb5..1542d34a66 100644 --- a/pageserver/src/tenant/mgr.rs +++ b/pageserver/src/tenant/mgr.rs @@ -19,7 +19,7 @@ use crate::config::PageServerConf; use crate::context::{DownloadBehavior, RequestContext}; use crate::task_mgr::{self, TaskKind}; use crate::tenant::config::TenantConfOpt; -use crate::tenant::{Tenant, TenantState}; +use crate::tenant::{create_tenant_files, CreateTenantFilesMode, Tenant, TenantState}; use crate::IGNORED_TENANT_FILE_NAME; use utils::fs_ext::PathExt; @@ -282,9 +282,15 @@ pub async fn create_tenant( // We're holding the tenants lock in write mode while doing local IO. // If this section ever becomes contentious, introduce a new `TenantState::Creating` // and do the work in that state. - let tenant_directory = super::create_tenant_files(conf, tenant_conf, tenant_id)?; + let tenant_directory = super::create_tenant_files(conf, tenant_conf, tenant_id, CreateTenantFilesMode::Create)?; + // TODO: tenant directory remains on disk if we bail out from here on. + // See https://github.com/neondatabase/neon/issues/4233 + let created_tenant = schedule_local_tenant_processing(conf, &tenant_directory, remote_storage, ctx)?; + // TODO: tenant object & its background loops remain, untracked in tenant map, if we fail here. + // See https://github.com/neondatabase/neon/issues/4233 + let crated_tenant_id = created_tenant.tenant_id(); anyhow::ensure!( tenant_id == crated_tenant_id, @@ -466,19 +472,32 @@ pub async fn list_tenants() -> Result, TenantMapLis pub async fn attach_tenant( conf: &'static PageServerConf, tenant_id: TenantId, + tenant_conf: TenantConfOpt, remote_storage: GenericRemoteStorage, ctx: &RequestContext, ) -> Result<(), TenantMapInsertError> { tenant_map_insert(tenant_id, |vacant_entry| { - let tenant_path = conf.tenant_path(&tenant_id); - anyhow::ensure!( - !tenant_path.exists(), - "Cannot attach tenant {tenant_id}, local tenant directory already exists" - ); + let tenant_dir = create_tenant_files(conf, tenant_conf, tenant_id, CreateTenantFilesMode::Attach)?; + // TODO: tenant directory remains on disk if we bail out from here on. + // See https://github.com/neondatabase/neon/issues/4233 - let tenant = - Tenant::spawn_attach(conf, tenant_id, remote_storage, ctx).context("spawn_attach")?; - vacant_entry.insert(tenant); + // Without the attach marker, schedule_local_tenant_processing will treat the attached tenant as fully attached + let marker_file_exists = conf + .tenant_attaching_mark_file_path(&tenant_id) + .try_exists() + .context("check for attach marker file existence")?; + anyhow::ensure!(marker_file_exists, "create_tenant_files should have created the attach marker file"); + + let attached_tenant = schedule_local_tenant_processing(conf, &tenant_dir, Some(remote_storage), ctx)?; + // TODO: tenant object & its background loops remain, untracked in tenant map, if we fail here. + // See https://github.com/neondatabase/neon/issues/4233 + + let attached_tenant_id = attached_tenant.tenant_id(); + anyhow::ensure!( + tenant_id == attached_tenant_id, + "loaded created tenant has unexpected tenant id (expect {tenant_id} != actual {attached_tenant_id})", + ); + vacant_entry.insert(Arc::clone(&attached_tenant)); Ok(()) }) .await diff --git a/test_runner/regress/test_remote_storage.py b/test_runner/regress/test_remote_storage.py index cce9cdc175..02f1aac99c 100644 --- a/test_runner/regress/test_remote_storage.py +++ b/test_runner/regress/test_remote_storage.py @@ -83,9 +83,7 @@ def test_remote_storage_backup_and_restore( env.pageserver.allowed_errors.append(".*failed to load remote timeline.*") # we have a bunch of pytest.raises for these below env.pageserver.allowed_errors.append(".*tenant .*? already exists, state:.*") - env.pageserver.allowed_errors.append( - ".*Cannot attach tenant .*?, local tenant directory already exists.*" - ) + env.pageserver.allowed_errors.append(".*tenant directory already exists.*") env.pageserver.allowed_errors.append(".*simulated failure of remote operation.*") pageserver_http = env.pageserver.http_client() diff --git a/test_runner/regress/test_tenant_detach.py b/test_runner/regress/test_tenant_detach.py index 847ae4b2b8..82664cff94 100644 --- a/test_runner/regress/test_tenant_detach.py +++ b/test_runner/regress/test_tenant_detach.py @@ -685,12 +685,10 @@ def test_load_attach_negatives( pageserver_http.tenant_ignore(tenant_id) - env.pageserver.allowed_errors.append( - ".*Cannot attach tenant .*?, local tenant directory already exists.*" - ) + env.pageserver.allowed_errors.append(".*tenant directory already exists.*") with pytest.raises( expected_exception=PageserverApiException, - match=f"Cannot attach tenant {tenant_id}, local tenant directory already exists", + match="tenant directory already exists", ): pageserver_http.tenant_attach(tenant_id) @@ -734,12 +732,10 @@ def test_ignore_while_attaching( pageserver_http.tenant_ignore(tenant_id) # Cannot attach it due to some local files existing - env.pageserver.allowed_errors.append( - ".*Cannot attach tenant .*?, local tenant directory already exists.*" - ) + env.pageserver.allowed_errors.append(".*tenant directory already exists.*") with pytest.raises( expected_exception=PageserverApiException, - match=f"Cannot attach tenant {tenant_id}, local tenant directory already exists", + match="tenant directory already exists", ): pageserver_http.tenant_attach(tenant_id) From 1bceceac5afd88b2e367ada318136d068ebfe78a Mon Sep 17 00:00:00 2001 From: Christian Schwarz Date: Wed, 17 May 2023 11:03:46 +0200 Subject: [PATCH 11/27] add helper to debug_assert that current span has a TenantId (#4248) We already have `debug_assert_current_span_has_tenant_and_timeline_id`. Have the same for just TenantId. --- pageserver/src/tenant.rs | 31 +++++++++++++++++++++++++++++++ pageserver/src/tenant/timeline.rs | 8 +------- 2 files changed, 32 insertions(+), 7 deletions(-) diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index cccad3e4d5..8349e1993f 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -792,6 +792,8 @@ impl Tenant { remote_client: RemoteTimelineClient, ctx: &RequestContext, ) -> anyhow::Result<()> { + debug_assert_current_span_has_tenant_id(); + info!("downloading index file for timeline {}", timeline_id); tokio::fs::create_dir_all(self.conf.timeline_path(&timeline_id, &self.tenant_id)) .await @@ -1056,6 +1058,8 @@ impl Tenant { local_metadata: TimelineMetadata, ctx: &RequestContext, ) -> anyhow::Result<()> { + debug_assert_current_span_has_tenant_id(); + let remote_client = self.remote_storage.as_ref().map(|remote_storage| { RemoteTimelineClient::new( remote_storage.clone(), @@ -1585,6 +1589,8 @@ impl Tenant { /// Changes tenant status to active, unless shutdown was already requested. fn activate(&self, ctx: &RequestContext) -> anyhow::Result<()> { + debug_assert_current_span_has_tenant_id(); + let mut result = Ok(()); self.state.send_modify(|current_state| { match &*current_state { @@ -3971,3 +3977,28 @@ mod tests { Ok(()) } } + +#[cfg(not(debug_assertions))] +#[inline] +pub(crate) fn debug_assert_current_span_has_tenant_id() {} + +#[cfg(debug_assertions)] +pub static TENANT_ID_EXTRACTOR: once_cell::sync::Lazy< + utils::tracing_span_assert::MultiNameExtractor<2>, +> = once_cell::sync::Lazy::new(|| { + utils::tracing_span_assert::MultiNameExtractor::new("TenantId", ["tenant_id", "tenant"]) +}); + +#[cfg(debug_assertions)] +#[inline] +pub(crate) fn debug_assert_current_span_has_tenant_id() { + use utils::tracing_span_assert; + + match tracing_span_assert::check_fields_present([&*TENANT_ID_EXTRACTOR]) { + Ok(()) => (), + Err(missing) => panic!( + "missing extractors: {:?}", + missing.into_iter().map(|e| e.name()).collect::>() + ), + } +} diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index 86d50de132..c47f4444f5 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -4416,12 +4416,6 @@ pub(crate) fn debug_assert_current_span_has_tenant_and_timeline_id() {} pub(crate) fn debug_assert_current_span_has_tenant_and_timeline_id() { use utils::tracing_span_assert; - pub static TENANT_ID_EXTRACTOR: once_cell::sync::Lazy< - tracing_span_assert::MultiNameExtractor<2>, - > = once_cell::sync::Lazy::new(|| { - tracing_span_assert::MultiNameExtractor::new("TenantId", ["tenant_id", "tenant"]) - }); - pub static TIMELINE_ID_EXTRACTOR: once_cell::sync::Lazy< tracing_span_assert::MultiNameExtractor<2>, > = once_cell::sync::Lazy::new(|| { @@ -4429,7 +4423,7 @@ pub(crate) fn debug_assert_current_span_has_tenant_and_timeline_id() { }); match tracing_span_assert::check_fields_present([ - &*TENANT_ID_EXTRACTOR, + &*super::TENANT_ID_EXTRACTOR, &*TIMELINE_ID_EXTRACTOR, ]) { Ok(()) => (), From ef41b63db7ea815a224aa91bd8e5a7230fa5afd4 Mon Sep 17 00:00:00 2001 From: 0x29a Date: Wed, 17 May 2023 17:25:01 +0800 Subject: [PATCH 12/27] docs: add links to the doc for better read experience (#4258) add links to the doc and refine links for better read experience --- CONTRIBUTING.md | 2 +- README.md | 10 +++++----- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 43ebefc477..c5b3ff7459 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -2,7 +2,7 @@ Howdy! Usual good software engineering practices apply. Write tests. Write comments. Follow standard Rust coding practices where -possible. Use 'cargo fmt' and 'clippy' to tidy up formatting. +possible. Use `cargo fmt` and `cargo clippy` to tidy up formatting. There are soft spots in the code, which could use cleanup, refactoring, additional comments, and so forth. Let's try to raise the diff --git a/README.md b/README.md index ce6ec09d24..1b0164d5c0 100644 --- a/README.md +++ b/README.md @@ -17,7 +17,7 @@ The Neon storage engine consists of two major components: - Pageserver. Scalable storage backend for the compute nodes. - Safekeepers. The safekeepers form a redundant WAL service that received WAL from the compute node, and stores it durably until it has been processed by the pageserver and uploaded to cloud storage. -See developer documentation in [/docs/SUMMARY.md](/docs/SUMMARY.md) for more information. +See developer documentation in [SUMMARY.md](/docs/SUMMARY.md) for more information. ## Running local installation @@ -238,9 +238,9 @@ CARGO_BUILD_FLAGS="--features=testing" make ## Documentation -[/docs/](/docs/) Contains a top-level overview of all available markdown documentation. +[docs](/docs) Contains a top-level overview of all available markdown documentation. -- [/docs/sourcetree.md](/docs/sourcetree.md) contains overview of source tree layout. +- [sourcetree.md](/docs/sourcetree.md) contains overview of source tree layout. To view your `rustdoc` documentation in a browser, try running `cargo doc --no-deps --open` @@ -265,6 +265,6 @@ To get more familiar with this aspect, refer to: ## Join the development -- Read `CONTRIBUTING.md` to learn about project code style and practices. -- To get familiar with a source tree layout, use [/docs/sourcetree.md](/docs/sourcetree.md). +- Read [CONTRIBUTING.md](/CONTRIBUTING.md) to learn about project code style and practices. +- To get familiar with a source tree layout, use [sourcetree.md](/docs/sourcetree.md). - To learn more about PostgreSQL internals, check http://www.interdb.jp/pg/index.html From 30fe3106023175d9d1dcc79da51929cdc24ec48b Mon Sep 17 00:00:00 2001 From: Alexander Bayandin Date: Wed, 17 May 2023 11:30:07 +0100 Subject: [PATCH 13/27] Code Coverage: upload reports to S3 (#4256) ## Problem `neondatabase/zenith-coverage-data` is too big: - It takes ~6 minutes to clone and push the repo - GitHub fails to publish an HTML report to github.io Part of https://github.com/neondatabase/neon/issues/3543 ## Summary of changes Replace pushing code coverage report to `neondatabase/zenith-coverage-data` with uploading it to S3 --- .github/workflows/build_and_test.yml | 52 ++++++++++++++++------------ 1 file changed, 29 insertions(+), 23 deletions(-) diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index 3f5ad59cc3..9114e02622 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -492,37 +492,43 @@ jobs: - name: Merge coverage data run: scripts/coverage "--profraw-prefix=$GITHUB_JOB" --dir=/tmp/coverage merge - - name: Build and upload coverage report + - name: Build coverage report + env: + COMMIT_URL: ${{ github.server_url }}/${{ github.repository }}/commit/${{ github.event.pull_request.head.sha || github.sha }} run: | - COMMIT_SHA=${{ github.event.pull_request.head.sha }} - COMMIT_SHA=${COMMIT_SHA:-${{ github.sha }}} - COMMIT_URL=https://github.com/${{ github.repository }}/commit/$COMMIT_SHA - scripts/coverage \ --dir=/tmp/coverage report \ --input-objects=/tmp/coverage/binaries.list \ - --commit-url=$COMMIT_URL \ + --commit-url=${COMMIT_URL} \ --format=github - REPORT_URL=https://${{ github.repository_owner }}.github.io/zenith-coverage-data/$COMMIT_SHA + - name: Upload coverage report + id: upload-coverage-report + env: + BUCKET: neon-github-public-dev + COMMIT_SHA: ${{ github.event.pull_request.head.sha || github.sha }} + run: | + aws s3 cp --only-show-errors --recursive /tmp/coverage/report s3://neon-github-public-dev/code-coverage/${COMMIT_SHA} - scripts/git-upload \ - --repo=https://${{ secrets.VIP_VAP_ACCESS_TOKEN }}@github.com/${{ github.repository_owner }}/zenith-coverage-data.git \ - --message="Add code coverage for $COMMIT_URL" \ - copy /tmp/coverage/report $COMMIT_SHA # COPY FROM TO_RELATIVE + REPORT_URL=https://${BUCKET}.s3.amazonaws.com/code-coverage/${COMMIT_SHA}/index.html + echo "report-url=${REPORT_URL}" >> $GITHUB_OUTPUT - # Add link to the coverage report to the commit - curl -f -X POST \ - https://api.github.com/repos/${{ github.repository }}/statuses/$COMMIT_SHA \ - -H "Accept: application/vnd.github.v3+json" \ - --user "${{ secrets.CI_ACCESS_TOKEN }}" \ - --data \ - "{ - \"state\": \"success\", - \"context\": \"neon-coverage\", - \"description\": \"Coverage report is ready\", - \"target_url\": \"$REPORT_URL\" - }" + - uses: actions/github-script@v6 + env: + REPORT_URL: ${{ steps.upload-coverage-report.outputs.report-url }} + COMMIT_SHA: ${{ github.event.pull_request.head.sha || github.sha }} + with: + script: | + const { REPORT_URL, COMMIT_SHA } = process.env + + await github.rest.repos.createCommitStatus({ + owner: context.repo.owner, + repo: context.repo.repo, + sha: `${COMMIT_SHA}`, + state: 'success', + target_url: `${REPORT_URL}`, + context: 'Code coverage report', + }) trigger-e2e-tests: runs-on: [ self-hosted, gen3, small ] From 89307822b01453fa6b560d9f3bcd943d681e4267 Mon Sep 17 00:00:00 2001 From: Christian Schwarz Date: Wed, 17 May 2023 12:31:17 +0200 Subject: [PATCH 14/27] mgmt api: share a single tenant config model struct in Rust and OpenAPI (#4252) This is prep for https://github.com/neondatabase/neon/pull/4255 [1/X] OpenAPI: share a single definition of TenantConfig DRYs up the pageserver OpenAPI YAML's representation of tenant config. All the fields of tenant config are now located in a model schema called TenantConfig. The tenant create & config-change endpoints have separate schemas, TenantCreateInfo and TenantConfigureArg, respectively. These schemas inherit from TenantConfig, using allOf 1. The tenant config-GET handler's response was previously named TenantConfig. It's now named TenantConfigResponse. None of these changes affect how the request looks on the wire. The generated Go code will change for Console because the OpenAPI code generator maps `allOf` to a Go struct embedding. Luckily, usage of tenant config in Console is still very lightweigt, but that will change in the near future. So, this is a good chance to set things straight. The console changes are tracked in https://github.com/neondatabase/cloud/pull/5046 [2/x]: extract the tenant config parts of create & config requests [3/x]: code movement: move TenantConfigRequestConfig next to TenantCreateRequestConfig [4/x] type-alias TenantConfigRequestConfig = TenantCreateRequestConfig; They are exactly the same. [5/x] switch to qualified use for tenant create/config request api models [6/x] rename models::TenantConfig{RequestConfig,} and remove the alias [7/x] OpenAPI: sync tenant create & configure body names from Rust code [8/x]: dedupe the two TryFrom<...> for TenantConfOpt impls The only difference is that the TenantConfigRequest impl does ``` tenant_conf.max_lsn_wal_lag = request_data.max_lsn_wal_lag; tenant_conf.trace_read_requests = request_data.trace_read_requests; ``` and the TenantCreateRequest impl does ``` if let Some(max_lsn_wal_lag) = request_data.max_lsn_wal_lag { tenant_conf.max_lsn_wal_lag = Some(max_lsn_wal_lag); } if let Some(trace_read_requests) = request_data.trace_read_requests { tenant_conf.trace_read_requests = Some(trace_read_requests); } ``` As far as I can tell, these are identical. --- control_plane/src/pageserver.rs | 26 ++++--- libs/pageserver_api/src/models.rs | 51 +++++++------ pageserver/src/http/openapi_spec.yml | 58 +++++++------- pageserver/src/http/routes.rs | 6 +- pageserver/src/tenant/config.rs | 108 ++++++--------------------- vendor/postgres-v14 | 2 +- vendor/postgres-v15 | 2 +- 7 files changed, 95 insertions(+), 158 deletions(-) diff --git a/control_plane/src/pageserver.rs b/control_plane/src/pageserver.rs index 75991045a4..f022be3910 100644 --- a/control_plane/src/pageserver.rs +++ b/control_plane/src/pageserver.rs @@ -8,9 +8,7 @@ use std::process::{Child, Command}; use std::{io, result}; use anyhow::{bail, Context}; -use pageserver_api::models::{ - TenantConfigRequest, TenantCreateRequest, TenantInfo, TimelineCreateRequest, TimelineInfo, -}; +use pageserver_api::models::{self, TenantInfo, TimelineInfo}; use postgres_backend::AuthType; use postgres_connection::{parse_host_port, PgConnectionConfig}; use reqwest::blocking::{Client, RequestBuilder, Response}; @@ -316,8 +314,8 @@ impl PageServerNode { settings: HashMap<&str, &str>, ) -> anyhow::Result { let mut settings = settings.clone(); - let request = TenantCreateRequest { - new_tenant_id, + + let config = models::TenantConfig { checkpoint_distance: settings .remove("checkpoint_distance") .map(|x| x.parse::()) @@ -372,6 +370,10 @@ impl PageServerNode { .remove("evictions_low_residence_duration_metric_threshold") .map(|x| x.to_string()), }; + let request = models::TenantCreateRequest { + new_tenant_id, + config, + }; if !settings.is_empty() { bail!("Unrecognized tenant settings: {settings:?}") } @@ -392,9 +394,9 @@ impl PageServerNode { } pub fn tenant_config(&self, tenant_id: TenantId, settings: HashMap<&str, &str>) -> Result<()> { - self.http_request(Method::PUT, format!("{}/tenant/config", self.http_base_url))? - .json(&TenantConfigRequest { - tenant_id, + let config = { + // Braces to make the diff easier to read + models::TenantConfig { checkpoint_distance: settings .get("checkpoint_distance") .map(|x| x.parse::()) @@ -451,7 +453,11 @@ impl PageServerNode { evictions_low_residence_duration_metric_threshold: settings .get("evictions_low_residence_duration_metric_threshold") .map(|x| x.to_string()), - }) + } + }; + + self.http_request(Method::PUT, format!("{}/tenant/config", self.http_base_url))? + .json(&models::TenantConfigRequest { tenant_id, config }) .send()? .error_from_body()?; @@ -483,7 +489,7 @@ impl PageServerNode { Method::POST, format!("{}/tenant/{}/timeline", self.http_base_url, tenant_id), )? - .json(&TimelineCreateRequest { + .json(&models::TimelineCreateRequest { new_timeline_id, ancestor_start_lsn, ancestor_timeline_id, diff --git a/libs/pageserver_api/src/models.rs b/libs/pageserver_api/src/models.rs index e4df81c9ad..0bcdb3c3a8 100644 --- a/libs/pageserver_api/src/models.rs +++ b/libs/pageserver_api/src/models.rs @@ -136,6 +136,20 @@ pub struct TenantCreateRequest { #[serde(default)] #[serde_as(as = "Option")] pub new_tenant_id: Option, + #[serde(flatten)] + pub config: TenantConfig, +} + +impl std::ops::Deref for TenantCreateRequest { + type Target = TenantConfig; + + fn deref(&self) -> &Self::Target { + &self.config + } +} + +#[derive(Serialize, Deserialize, Default)] +pub struct TenantConfig { pub checkpoint_distance: Option, pub checkpoint_timeout: Option, pub compaction_target_size: Option, @@ -182,33 +196,21 @@ impl TenantCreateRequest { pub struct TenantConfigRequest { #[serde_as(as = "DisplayFromStr")] pub tenant_id: TenantId, - #[serde(default)] - pub checkpoint_distance: Option, - pub checkpoint_timeout: Option, - pub compaction_target_size: Option, - pub compaction_period: Option, - pub compaction_threshold: Option, - pub gc_horizon: Option, - pub gc_period: Option, - pub image_creation_threshold: Option, - pub pitr_interval: Option, - pub walreceiver_connect_timeout: Option, - pub lagging_wal_timeout: Option, - pub max_lsn_wal_lag: Option, - pub trace_read_requests: Option, - // We defer the parsing of the eviction_policy field to the request handler. - // Otherwise we'd have to move the types for eviction policy into this package. - // We might do that once the eviction feature has stabilizied. - // For now, this field is not even documented in the openapi_spec.yml. - pub eviction_policy: Option, - pub min_resident_size_override: Option, - pub evictions_low_residence_duration_metric_threshold: Option, + #[serde(flatten)] + pub config: TenantConfig, +} + +impl std::ops::Deref for TenantConfigRequest { + type Target = TenantConfig; + + fn deref(&self) -> &Self::Target { + &self.config + } } impl TenantConfigRequest { pub fn new(tenant_id: TenantId) -> TenantConfigRequest { - TenantConfigRequest { - tenant_id, + let config = TenantConfig { checkpoint_distance: None, checkpoint_timeout: None, compaction_target_size: None, @@ -225,7 +227,8 @@ impl TenantConfigRequest { eviction_policy: None, min_resident_size_override: None, evictions_low_residence_duration_metric_threshold: None, - } + }; + TenantConfigRequest { tenant_id, config } } } diff --git a/pageserver/src/http/openapi_spec.yml b/pageserver/src/http/openapi_spec.yml index 330587310f..62664733ea 100644 --- a/pageserver/src/http/openapi_spec.yml +++ b/pageserver/src/http/openapi_spec.yml @@ -747,7 +747,7 @@ paths: content: application/json: schema: - $ref: "#/components/schemas/TenantCreateInfo" + $ref: "#/components/schemas/TenantCreateRequest" responses: "201": description: New tenant created successfully @@ -794,7 +794,7 @@ paths: content: application/json: schema: - $ref: "#/components/schemas/TenantConfigInfo" + $ref: "#/components/schemas/TenantConfigRequest" responses: "200": description: OK @@ -846,7 +846,7 @@ paths: content: application/json: schema: - $ref: "#/components/schemas/TenantConfig" + $ref: "#/components/schemas/TenantConfigResponse" "400": description: Malformed get tenanant config request content: @@ -909,35 +909,27 @@ components: See the tenant `/attach` endpoint for more information. type: string enum: [ "maybe", "attached" ] - TenantCreateInfo: + TenantCreateRequest: + allOf: + - $ref: '#/components/schemas/TenantConfig' + - type: object + properties: + new_tenant_id: + type: string + format: hex + TenantConfigRequest: + allOf: + - $ref: '#/components/schemas/TenantConfig' + - type: object + required: + - tenant_id + properties: + tenant_id: + type: string + format: hex + TenantConfig: type: object properties: - new_tenant_id: - type: string - format: hex - tenant_id: - type: string - format: hex - gc_period: - type: string - gc_horizon: - type: integer - pitr_interval: - type: string - checkpoint_distance: - type: integer - checkpoint_timeout: - type: string - compaction_period: - type: string - compaction_threshold: - type: string - TenantConfigInfo: - type: object - properties: - tenant_id: - type: string - format: hex gc_period: type: string gc_horizon: @@ -964,13 +956,13 @@ components: type: integer trace_read_requests: type: boolean - TenantConfig: + TenantConfigResponse: type: object properties: tenant_specific_overrides: - $ref: "#/components/schemas/TenantConfigInfo" + $ref: "#/components/schemas/TenantConfig" effective_config: - $ref: "#/components/schemas/TenantConfigInfo" + $ref: "#/components/schemas/TenantConfig" TimelineInfo: type: object required: diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs index 6457d55dff..7d60d3568a 100644 --- a/pageserver/src/http/routes.rs +++ b/pageserver/src/http/routes.rs @@ -726,7 +726,8 @@ async fn tenant_create_handler(mut request: Request) -> Result(field_name: &'static str, value: &'a str) -> impl 'a + Fn() move || format!("Cannot parse `{field_name}` duration {value:?}") } -impl TenantConfOpt { - #[allow(clippy::too_many_arguments)] - fn from_request( - checkpoint_distance: Option, - checkpoint_timeout: &Option, - compaction_target_size: Option, - compaction_period: &Option, - compaction_threshold: Option, - gc_horizon: Option, - gc_period: &Option, - image_creation_threshold: Option, - pitr_interval: &Option, - walreceiver_connect_timeout: &Option, - lagging_wal_timeout: &Option, - max_lsn_wal_lag: Option, - trace_read_requests: Option, - eviction_policy: &Option, - min_resident_size_override: Option, - evictions_low_residence_duration_metric_threshold: &Option, - ) -> Result { +impl TryFrom<&'_ models::TenantConfig> for TenantConfOpt { + type Error = anyhow::Error; + + fn try_from(request_data: &'_ models::TenantConfig) -> Result { let mut tenant_conf = TenantConfOpt::default(); - if let Some(gc_period) = &gc_period { + if let Some(gc_period) = &request_data.gc_period { tenant_conf.gc_period = Some( humantime::parse_duration(gc_period) .with_context(bad_duration("gc_period", gc_period))?, ); } - tenant_conf.gc_horizon = gc_horizon; - tenant_conf.image_creation_threshold = image_creation_threshold; + tenant_conf.gc_horizon = request_data.gc_horizon; + tenant_conf.image_creation_threshold = request_data.image_creation_threshold; - if let Some(pitr_interval) = &pitr_interval { + if let Some(pitr_interval) = &request_data.pitr_interval { tenant_conf.pitr_interval = Some( humantime::parse_duration(pitr_interval) .with_context(bad_duration("pitr_interval", pitr_interval))?, ); } - if let Some(walreceiver_connect_timeout) = &walreceiver_connect_timeout { + if let Some(walreceiver_connect_timeout) = &request_data.walreceiver_connect_timeout { tenant_conf.walreceiver_connect_timeout = Some( humantime::parse_duration(walreceiver_connect_timeout).with_context( bad_duration("walreceiver_connect_timeout", walreceiver_connect_timeout), )?, ); } - if let Some(lagging_wal_timeout) = &lagging_wal_timeout { + if let Some(lagging_wal_timeout) = &request_data.lagging_wal_timeout { tenant_conf.lagging_wal_timeout = Some( humantime::parse_duration(lagging_wal_timeout) .with_context(bad_duration("lagging_wal_timeout", lagging_wal_timeout))?, ); } - if let Some(max_lsn_wal_lag) = max_lsn_wal_lag { + if let Some(max_lsn_wal_lag) = request_data.max_lsn_wal_lag { tenant_conf.max_lsn_wal_lag = Some(max_lsn_wal_lag); } - if let Some(trace_read_requests) = trace_read_requests { + if let Some(trace_read_requests) = request_data.trace_read_requests { tenant_conf.trace_read_requests = Some(trace_read_requests); } - tenant_conf.checkpoint_distance = checkpoint_distance; - if let Some(checkpoint_timeout) = &checkpoint_timeout { + tenant_conf.checkpoint_distance = request_data.checkpoint_distance; + if let Some(checkpoint_timeout) = &request_data.checkpoint_timeout { tenant_conf.checkpoint_timeout = Some( humantime::parse_duration(checkpoint_timeout) .with_context(bad_duration("checkpoint_timeout", checkpoint_timeout))?, ); } - tenant_conf.compaction_target_size = compaction_target_size; - tenant_conf.compaction_threshold = compaction_threshold; + tenant_conf.compaction_target_size = request_data.compaction_target_size; + tenant_conf.compaction_threshold = request_data.compaction_threshold; - if let Some(compaction_period) = &compaction_period { + if let Some(compaction_period) = &request_data.compaction_period { tenant_conf.compaction_period = Some( humantime::parse_duration(compaction_period) .with_context(bad_duration("compaction_period", compaction_period))?, ); } - if let Some(eviction_policy) = &eviction_policy { + if let Some(eviction_policy) = &request_data.eviction_policy { tenant_conf.eviction_policy = Some( serde::Deserialize::deserialize(eviction_policy) .context("parse field `eviction_policy`")?, ); } - tenant_conf.min_resident_size_override = min_resident_size_override; + tenant_conf.min_resident_size_override = request_data.min_resident_size_override; if let Some(evictions_low_residence_duration_metric_threshold) = - &evictions_low_residence_duration_metric_threshold + &request_data.evictions_low_residence_duration_metric_threshold { tenant_conf.evictions_low_residence_duration_metric_threshold = Some( humantime::parse_duration(evictions_low_residence_duration_metric_threshold) @@ -393,56 +377,6 @@ impl TenantConfOpt { } } -impl TryFrom<&'_ TenantCreateRequest> for TenantConfOpt { - type Error = anyhow::Error; - - fn try_from(request_data: &TenantCreateRequest) -> Result { - Self::from_request( - request_data.checkpoint_distance, - &request_data.checkpoint_timeout, - request_data.compaction_target_size, - &request_data.compaction_period, - request_data.compaction_threshold, - request_data.gc_horizon, - &request_data.gc_period, - request_data.image_creation_threshold, - &request_data.pitr_interval, - &request_data.walreceiver_connect_timeout, - &request_data.lagging_wal_timeout, - request_data.max_lsn_wal_lag, - request_data.trace_read_requests, - &request_data.eviction_policy, - request_data.min_resident_size_override, - &request_data.evictions_low_residence_duration_metric_threshold, - ) - } -} - -impl TryFrom<&'_ TenantConfigRequest> for TenantConfOpt { - type Error = anyhow::Error; - - fn try_from(request_data: &TenantConfigRequest) -> Result { - Self::from_request( - request_data.checkpoint_distance, - &request_data.checkpoint_timeout, - request_data.compaction_target_size, - &request_data.compaction_period, - request_data.compaction_threshold, - request_data.gc_horizon, - &request_data.gc_period, - request_data.image_creation_threshold, - &request_data.pitr_interval, - &request_data.walreceiver_connect_timeout, - &request_data.lagging_wal_timeout, - request_data.max_lsn_wal_lag, - request_data.trace_read_requests, - &request_data.eviction_policy, - request_data.min_resident_size_override, - &request_data.evictions_low_residence_duration_metric_threshold, - ) - } -} - #[cfg(test)] mod tests { use super::*; diff --git a/vendor/postgres-v14 b/vendor/postgres-v14 index 1144aee166..a2daebc6b4 160000 --- a/vendor/postgres-v14 +++ b/vendor/postgres-v14 @@ -1 +1 @@ -Subproject commit 1144aee1661c79eec65e784a8dad8bd450d9df79 +Subproject commit a2daebc6b445dcbcca9c18e1711f47c1db7ffb04 diff --git a/vendor/postgres-v15 b/vendor/postgres-v15 index 1984832c74..2df2ce3744 160000 --- a/vendor/postgres-v15 +++ b/vendor/postgres-v15 @@ -1 +1 @@ -Subproject commit 1984832c740a7fa0e468bb720f40c525b652835d +Subproject commit 2df2ce374464a7449e15dfa46c956b73b4f4098b From 7b9e8be6e47a022779d8c4af6a73284e50417018 Mon Sep 17 00:00:00 2001 From: Alexander Bayandin Date: Wed, 17 May 2023 11:38:41 +0100 Subject: [PATCH 15/27] GitHub Autocomment: add a command to run all failed tests (#4200) - Group tests by Postgres version - Merge different build types - Add a command to GitHub comment on how to rerun all failed tests (different command for different Postgres versions) - Restore a link to a test report in the build summary --- .../actions/allure-report-generate/action.yml | 2 + scripts/pr-comment-test-report.js | 65 ++++++++++--------- 2 files changed, 36 insertions(+), 31 deletions(-) diff --git a/.github/actions/allure-report-generate/action.yml b/.github/actions/allure-report-generate/action.yml index 07120c4c8a..7f7fa9e7a1 100644 --- a/.github/actions/allure-report-generate/action.yml +++ b/.github/actions/allure-report-generate/action.yml @@ -147,6 +147,8 @@ runs: echo "report-url=${REPORT_URL}" >> $GITHUB_OUTPUT echo "report-json-url=${REPORT_URL%/index.html}/data/suites.json" >> $GITHUB_OUTPUT + echo "[Allure Report](${REPORT_URL})" >> ${GITHUB_STEP_SUMMARY} + - name: Release lock if: always() shell: bash -euxo pipefail {0} diff --git a/scripts/pr-comment-test-report.js b/scripts/pr-comment-test-report.js index 287a1ea8e5..3a7bba0daa 100644 --- a/scripts/pr-comment-test-report.js +++ b/scripts/pr-comment-test-report.js @@ -36,11 +36,9 @@ module.exports = async ({ github, context, fetch, report }) => { // Marker to find the comment in the subsequent runs const startMarker = `` // Let users know that the comment is updated automatically - const autoupdateNotice = `
The comment gets automatically updated with the latest test results :recycle:
` + const autoupdateNotice = `
The comment gets automatically updated with the latest test results
${context.payload.pull_request.head.sha} at ${new Date().toISOString()} :recycle:
` // GitHub bot id taken from (https://api.github.com/users/github-actions[bot]) const githubActionsBotId = 41898282 - // The latest commit in the PR URL - const commitUrl = `${context.serverUrl}/${context.repo.owner}/${context.repo.repo}/pull/${context.payload.number}/commits/${context.payload.pull_request.head.sha}` // Commend body itself let commentBody = `${startMarker}\n` @@ -74,7 +72,6 @@ module.exports = async ({ github, context, fetch, report }) => { let flakyTestsCount = 0 const pgVersions = new Set() - const buildTypes = new Set() for (const parentSuite of suites.children) { for (const suite of parentSuite.children) { @@ -92,28 +89,29 @@ module.exports = async ({ github, context, fetch, report }) => { } pgVersions.add(pgVersion) - buildTypes.add(buildType) // Removing build type and PostgreSQL version from the test name to make it shorter const testName = test.name.replace(new RegExp(`${buildType}-pg${pgVersion}-?`), "").replace("[]", "") test.pytestName = `${parentSuite.name.replace(".", "/")}/${suite.name}.py::${testName}` + test.pgVersion = pgVersion + test.buildType = buildType if (test.status === "passed") { - passedTests[pgVersion][buildType].push(test) + passedTests[pgVersion][testName].push(test) passedTestsCount += 1 } else if (test.status === "failed" || test.status === "broken") { - failedTests[pgVersion][buildType].push(test) + failedTests[pgVersion][testName].push(test) failedTestsCount += 1 } else if (test.status === "skipped") { - skippedTests[pgVersion][buildType].push(test) + skippedTests[pgVersion][testName].push(test) skippedTestsCount += 1 } if (test.retriesCount > 0) { - retriedTests[pgVersion][buildType].push(test) + retriedTests[pgVersion][testName].push(test) if (test.retriesStatusChange) { - flakyTests[pgVersion][buildType].push(test) + flakyTests[pgVersion][testName].push(test) flakyTestsCount += 1 } } @@ -122,39 +120,44 @@ module.exports = async ({ github, context, fetch, report }) => { } const totalTestsCount = failedTestsCount + passedTestsCount + skippedTestsCount - commentBody += `### ${totalTestsCount} tests run: ${passedTestsCount} passed, ${failedTestsCount} failed, ${skippedTestsCount} skipped ([full report](${reportUrl}) for ${commitUrl})\n___\n` + commentBody += `### ${totalTestsCount} tests run: ${passedTestsCount} passed, ${failedTestsCount} failed, ${skippedTestsCount} skipped ([full report](${reportUrl}))\n___\n` - // Print test resuls from the newest to the oldest PostgreSQL version for release and debug builds. + // Print test resuls from the newest to the oldest Postgres version for release and debug builds. for (const pgVersion of Array.from(pgVersions).sort().reverse()) { - for (const buildType of Array.from(buildTypes).sort().reverse()) { - if (failedTests[pgVersion][buildType].length > 0) { - commentBody += `#### PostgreSQL ${pgVersion} (${buildType} build)\n\n` - commentBody += `Failed tests:\n` - for (const test of failedTests[pgVersion][buildType]) { + if (Object.keys(failedTests[pgVersion]).length > 0) { + commentBody += `#### Failures on Posgres ${pgVersion}\n\n` + for (const [testName, tests] of Object.entries(failedTests[pgVersion])) { + const links = [] + for (const test of tests) { const allureLink = `${reportUrl}#suites/${test.parentUid}/${test.uid}` - - commentBody += `- [\`${test.pytestName}\`](${allureLink})` - if (test.retriesCount > 0) { - commentBody += ` (ran [${test.retriesCount + 1} times](${allureLink}/retries))` - } - commentBody += "\n" + links.push(`[${test.buildType}](${allureLink})`) } - commentBody += "\n" + commentBody += `- \`${testName}\`: ${links.join(", ")}\n` } + + const testsToRerun = Object.values(failedTests[pgVersion]).map(x => x[0].name) + const command = `DEFAULT_PG_VERSION=${pgVersion} scripts/pytest -k "${testsToRerun.join(" or ")}"` + + commentBody += "```\n" + commentBody += `# Run failed on Postgres ${pgVersion} tests locally:\n` + commentBody += `${command}\n` + commentBody += "```\n" } } if (flakyTestsCount > 0) { - commentBody += "
\nFlaky tests\n\n" + commentBody += `
\nFlaky tests (${flakyTestsCount})\n\n` for (const pgVersion of Array.from(pgVersions).sort().reverse()) { - for (const buildType of Array.from(buildTypes).sort().reverse()) { - if (flakyTests[pgVersion][buildType].length > 0) { - commentBody += `#### PostgreSQL ${pgVersion} (${buildType} build)\n\n` - for (const test of flakyTests[pgVersion][buildType]) { + if (Object.keys(flakyTests[pgVersion]).length > 0) { + commentBody += `#### Postgres ${pgVersion}\n\n` + for (const [testName, tests] of Object.entries(flakyTests[pgVersion])) { + const links = [] + for (const test of tests) { + const allureLink = `${reportUrl}#suites/${test.parentUid}/${test.uid}/retries` const status = test.status === "passed" ? ":white_check_mark:" : ":x:" - commentBody += `- ${status} [\`${test.pytestName}\`](${reportUrl}#suites/${test.parentUid}/${test.uid}/retries)\n` + links.push(`[${status} ${test.buildType}](${allureLink})`) } - commentBody += "\n" + commentBody += `- \`${testName}\`: ${links.join(", ")}\n` } } } From 0c4dc55a3988dd549f88059c1e862acbb4fafc0c Mon Sep 17 00:00:00 2001 From: Anastasia Lubennikova Date: Mon, 8 May 2023 21:39:24 +0100 Subject: [PATCH 16/27] Disable recovery_prefetch for Neon hot standby. Prefetching of blocks referenced in WAL doesn't make sense for us, because Neon hot standby anyway ignores pages that are not in the shared_buffers. --- control_plane/src/endpoint.rs | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/control_plane/src/endpoint.rs b/control_plane/src/endpoint.rs index 6431f66e1c..cc5a7a4168 100644 --- a/control_plane/src/endpoint.rs +++ b/control_plane/src/endpoint.rs @@ -134,6 +134,7 @@ pub struct Endpoint { // port and address of the Postgres server pub address: SocketAddr, + // postgres major version in the format: 14, 15, etc. pg_version: u32, // These are not part of the endpoint as such, but the environment @@ -381,6 +382,11 @@ impl Endpoint { conf.append("primary_conninfo", connstr.as_str()); conf.append("primary_slot_name", slot_name.as_str()); conf.append("hot_standby", "on"); + // prefetching of blocks referenced in WAL doesn't make sense for us + // Neon hot standby ignores pages that are not in the shared_buffers + if self.pg_version >= 15 { + conf.append("recovery_prefetch", "off"); + } } } From 9767432cff96360a2085a47d596f98ed27b5ec46 Mon Sep 17 00:00:00 2001 From: Alex Chi Z Date: Wed, 17 May 2023 09:48:00 -0400 Subject: [PATCH 17/27] add `cargo neon` shortcut for neon_local (#4240) Add `cargo neon` as a shortcut for compiling and running `neon_local`. --------- Signed-off-by: Alex Chi --- .cargo/config.toml | 1 + README.md | 20 ++++++++++---------- 2 files changed, 11 insertions(+), 10 deletions(-) diff --git a/.cargo/config.toml b/.cargo/config.toml index c40783bc1b..8fddaa2dd4 100644 --- a/.cargo/config.toml +++ b/.cargo/config.toml @@ -14,3 +14,4 @@ opt-level = 1 [alias] build_testing = ["build", "--features", "testing"] +neon = ["run", "--bin", "neon_local"] diff --git a/README.md b/README.md index 1b0164d5c0..8e6f2cda81 100644 --- a/README.md +++ b/README.md @@ -130,11 +130,11 @@ Python (3.9 or higher), and install python3 packages using `./scripts/pysync` (r ```sh # Create repository in .neon with proper paths to binaries and data # Later that would be responsibility of a package install script -> ./target/debug/neon_local init +> cargo neon init Starting pageserver at '127.0.0.1:64000' in '.neon'. # start pageserver, safekeeper, and broker for their intercommunication -> ./target/debug/neon_local start +> cargo neon start Starting neon broker at 127.0.0.1:50051 storage_broker started, pid: 2918372 Starting pageserver at '127.0.0.1:64000' in '.neon'. @@ -143,19 +143,19 @@ Starting safekeeper at '127.0.0.1:5454' in '.neon/safekeepers/sk1'. safekeeper 1 started, pid: 2918437 # create initial tenant and use it as a default for every future neon_local invocation -> ./target/debug/neon_local tenant create --set-default +> cargo neon tenant create --set-default tenant 9ef87a5bf0d92544f6fafeeb3239695c successfully created on the pageserver Created an initial timeline 'de200bd42b49cc1814412c7e592dd6e9' at Lsn 0/16B5A50 for tenant: 9ef87a5bf0d92544f6fafeeb3239695c Setting tenant 9ef87a5bf0d92544f6fafeeb3239695c as a default one # start postgres compute node -> ./target/debug/neon_local endpoint start main +> cargo neon endpoint start main Starting new endpoint main (PostgreSQL v14) on timeline de200bd42b49cc1814412c7e592dd6e9 ... Extracting base backup to create postgres instance: path=.neon/pgdatadirs/tenants/9ef87a5bf0d92544f6fafeeb3239695c/main port=55432 Starting postgres at 'host=127.0.0.1 port=55432 user=cloud_admin dbname=postgres' # check list of running postgres instances -> ./target/debug/neon_local endpoint list +> cargo neon endpoint list ENDPOINT ADDRESS TIMELINE BRANCH NAME LSN STATUS main 127.0.0.1:55432 de200bd42b49cc1814412c7e592dd6e9 main 0/16B5BA8 running ``` @@ -177,22 +177,22 @@ postgres=# select * from t; 3. And create branches and run postgres on them: ```sh # create branch named migration_check -> ./target/debug/neon_local timeline branch --branch-name migration_check +> cargo neon timeline branch --branch-name migration_check Created timeline 'b3b863fa45fa9e57e615f9f2d944e601' at Lsn 0/16F9A00 for tenant: 9ef87a5bf0d92544f6fafeeb3239695c. Ancestor timeline: 'main' # check branches tree -> ./target/debug/neon_local timeline list +> cargo neon timeline list (L) main [de200bd42b49cc1814412c7e592dd6e9] (L) ┗━ @0/16F9A00: migration_check [b3b863fa45fa9e57e615f9f2d944e601] # start postgres on that branch -> ./target/debug/neon_local endpoint start migration_check --branch-name migration_check +> cargo neon endpoint start migration_check --branch-name migration_check Starting new endpoint migration_check (PostgreSQL v14) on timeline b3b863fa45fa9e57e615f9f2d944e601 ... Extracting base backup to create postgres instance: path=.neon/pgdatadirs/tenants/9ef87a5bf0d92544f6fafeeb3239695c/migration_check port=55433 Starting postgres at 'host=127.0.0.1 port=55433 user=cloud_admin dbname=postgres' # check the new list of running postgres instances -> ./target/debug/neon_local endpoint list +> cargo neon endpoint list ENDPOINT ADDRESS TIMELINE BRANCH NAME LSN STATUS main 127.0.0.1:55432 de200bd42b49cc1814412c7e592dd6e9 main 0/16F9A38 running migration_check 127.0.0.1:55433 b3b863fa45fa9e57e615f9f2d944e601 migration_check 0/16F9A70 running @@ -221,7 +221,7 @@ postgres=# select * from t; 4. If you want to run tests afterward (see below), you must stop all the running of the pageserver, safekeeper, and postgres instances you have just started. You can terminate them all with one command: ```sh -> ./target/debug/neon_local stop +> cargo neon stop ``` ## Running tests From 918cd25453e8b57b11eb2ef36d07de73a4d3a9c4 Mon Sep 17 00:00:00 2001 From: Joonas Koivunen Date: Wed, 17 May 2023 17:19:02 +0300 Subject: [PATCH 18/27] ondemand_download_large_rel: solve flakyness (#3697) Disable background tasks to not get compaction downloading all layers but also stop safekeepers before checkpointing, use a readonly endpoint. Fixes: #3666 Co-authored-by: Christian Schwarz --- test_runner/regress/test_ondemand_download.py | 23 +++++++++++++++---- 1 file changed, 19 insertions(+), 4 deletions(-) diff --git a/test_runner/regress/test_ondemand_download.py b/test_runner/regress/test_ondemand_download.py index 31f6c1f3d9..1414b4ed8e 100644 --- a/test_runner/regress/test_ondemand_download.py +++ b/test_runner/regress/test_ondemand_download.py @@ -64,12 +64,15 @@ def test_ondemand_download_large_rel( tenant, _ = env.neon_cli.create_tenant( conf={ # disable background GC - "gc_period": "10 m", + "gc_period": "0s", "gc_horizon": f"{10 * 1024 ** 3}", # 10 GB # small checkpoint distance to create more delta layer files "checkpoint_distance": f"{10 * 1024 ** 2}", # 10 MB + # allow compaction with the checkpoint "compaction_threshold": "3", "compaction_target_size": f"{10 * 1024 ** 2}", # 10 MB + # but don't run compaction in background or on restart + "compaction_period": "0s", } ) env.initial_tenant = tenant @@ -96,9 +99,17 @@ def test_ondemand_download_large_rel( current_lsn = Lsn(query_scalar(cur, "SELECT pg_current_wal_flush_lsn()")) - # wait until pageserver receives that data wait_for_last_record_lsn(client, tenant_id, timeline_id, current_lsn) + # stop endpoint before checkpoint to stop wal generation + endpoint.stop() + + # stopping of safekeepers now will help us not to calculate logical size + # after startup, so page requests should be the only one on-demand + # downloading the layers + for sk in env.safekeepers: + sk.stop() + # run checkpoint manually to be sure that data landed in remote storage client.timeline_checkpoint(tenant_id, timeline_id) @@ -107,7 +118,6 @@ def test_ondemand_download_large_rel( log.info("uploads have finished") ##### Stop the first pageserver instance, erase all its data - endpoint.stop() env.pageserver.stop() # remove all the layer files @@ -118,8 +128,13 @@ def test_ondemand_download_large_rel( ##### Second start, restore the data and ensure it's the same env.pageserver.start() - endpoint.start() + # start a readonly endpoint which we'll use to check the database. + # readonly (with lsn=) is required so that we don't try to connect to + # safekeepers, that have now been shut down. + endpoint = env.endpoints.create_start("main", lsn=current_lsn) + before_downloads = get_num_downloaded_layers(client, tenant_id, timeline_id) + assert before_downloads != 0, "basebackup should on-demand non-zero layers" # Probe in the middle of the table. There's a high chance that the beginning # and end of the table was stored together in the same layer files with data From 72346e102d719f9a57b285c291277c4a30713a36 Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Wed, 17 May 2023 17:29:54 +0300 Subject: [PATCH 19/27] Document that our code is mostly not async cancellation-safe. We had a hot debate on whether we should try to make our code cancellation-safe, or just accept that it's not, and make sure that our Futures are driven to completion. The decision is that we drive Futures to completion. This documents the decision, and summarizes the reasoning for that. Discussion that sparked this: https://github.com/neondatabase/neon/pull/4198#discussion_r1190209316 --- docs/pageserver-thread-mgmt.md | 98 +++++++++++++++++++++++++++++----- libs/utils/src/seqwait.rs | 4 ++ 2 files changed, 89 insertions(+), 13 deletions(-) diff --git a/docs/pageserver-thread-mgmt.md b/docs/pageserver-thread-mgmt.md index e351c972cb..0cc897f154 100644 --- a/docs/pageserver-thread-mgmt.md +++ b/docs/pageserver-thread-mgmt.md @@ -4,6 +4,11 @@ The pageserver uses Tokio for handling concurrency. Everything runs in Tokio tasks, although some parts are written in blocking style and use spawn_blocking(). +We currently use std blocking functions for disk I/O, however. The +current model is that we consider disk I/Os to be short enough that we +perform them while running in a Tokio task. Changing all the disk I/O +calls to async is a TODO. + Each Tokio task is tracked by the `task_mgr` module. It maintains a registry of tasks, and which tenant or timeline they are operating on. @@ -21,19 +26,86 @@ also a `shudown_watcher()` Future that can be used with `tokio::select!` or similar, to wake up on shutdown. -### Sync vs async +### Async cancellation safety -We use async to wait for incoming data on network connections, and to -perform other long-running operations. For example, each WAL receiver -connection is handled by a tokio Task. Once a piece of WAL has been -received from the network, the task calls the blocking functions in -the Repository to process the WAL. +In async Rust, futures can be "cancelled" at any await point, by +dropping the Future. For example, `tokio::select!` returns as soon as +one of the Futures returns, and drops the others. `tokio::timeout!` is +another example. In the Rust ecosystem, some functions are +cancellation-safe, meaning they can be safely dropped without +side-effects, while others are not. See documentation of +`tokio::select!` for examples. -The core storage code in `layered_repository/` is synchronous, with -blocking locks and I/O calls. The current model is that we consider -disk I/Os to be short enough that we perform them while running in a -Tokio task. If that becomes a problem, we should use `spawn_blocking` -before entering the synchronous parts of the code, or switch to using -tokio I/O functions. +In the pageserver and safekeeper, async code is *not* +cancellation-safe by default. Unless otherwise marked, any async +function that you call cannot be assumed to be async +cancellation-safe, and must be polled to completion. -Be very careful when mixing sync and async code! +The downside of non-cancellation safe code is that you have to be very +careful when using `tokio::select!`, `tokio::timeout!`, and other such +functions that can cause a Future to be dropped. They can only be used +with functions that are explicitly documented to be cancellation-safe, +or you need to spawn a separate task to shield from the cancellation. + +At the entry points to the code, we also take care to poll futures to +completion, or shield the rest of the code from surprise cancellations +by spawning a separate task. The code that handles incoming HTTP +requests, for example, spawns a separate task for each request, +because Hyper will drop the request-handling Future if the HTTP +connection is lost. (FIXME: our HTTP handlers do not do that +currently, but we should fix that. See [issue +3478](https://github.com/neondatabase/neon/issues/3478)). + + +#### How to cancel, then? + +If our code is not cancellation-safe, how do you cancel long-running +tasks? Use CancellationTokens. + +TODO: More details on that. And we have an ongoing discussion on what +to do if cancellations might come from multiple sources. + +#### Exceptions +Some library functions are cancellation-safe, and are explicitly marked +as such. For example, `utils::seqwait`. + +#### Rationale + +The alternative would be to make all async code cancellation-safe, +unless otherwise marked. That way, you could use `tokio::select!` more +liberally. The reasons we didn't choose that are explained in this +section. + +Writing code in a cancellation-safe manner is tedious, as you need to +scrutinize every `.await` and ensure that if the `.await` call never +returns, the system is in a safe, consistent state. In some ways, you +need to do that with `?` and early `returns`, too, but `.await`s are +easier to miss. It is also easier to perform cleanup tasks when a +function returns an `Err` than when an `.await` simply never +returns. You can use `scopeguard` and Drop guards to perform cleanup +tasks, but it is more tedious. An `.await` that never returns is more +similar to a panic. + +Note that even if you only use building blocks that themselves are +cancellation-safe, it doesn't mean that the code as whole is +cancellation-safe. For example, consider the following code: + +``` +while let Some(i) = work_inbox.recv().await { + if let Err(_) = results_outbox.send(i).await { + println!("receiver dropped"); + return; + } + } +} +``` + +It reads messages from one channel, sends them to another channel. If +this code is cancelled at the `results_outbox.send(i).await`, the +message read from the receiver is lost. That may or may not be OK, +depending on the context. + +Another reason to not require cancellation-safety is historical: we +already had a lot of async code that was not scrutinized for +cancellation-safety when this issue was raised. Scrutinizing all +existing code is no fun. diff --git a/libs/utils/src/seqwait.rs b/libs/utils/src/seqwait.rs index e3f0b505da..70cf4a1ce9 100644 --- a/libs/utils/src/seqwait.rs +++ b/libs/utils/src/seqwait.rs @@ -144,6 +144,8 @@ where /// /// This call won't complete until someone has called `advance` /// with a number greater than or equal to the one we're waiting for. + /// + /// This function is async cancellation-safe. pub async fn wait_for(&self, num: V) -> Result<(), SeqWaitError> { match self.queue_for_wait(num) { Ok(None) => Ok(()), @@ -159,6 +161,8 @@ where /// /// If that hasn't happened after the specified timeout duration, /// [`SeqWaitError::Timeout`] will be returned. + /// + /// This function is async cancellation-safe. pub async fn wait_for_timeout( &self, num: V, From fc886dc8c0bf34f4921bdbee2e8e35f3a4f97788 Mon Sep 17 00:00:00 2001 From: Vadim Kharitonov Date: Wed, 17 May 2023 16:50:27 +0200 Subject: [PATCH 20/27] Compile pg_cron extension --- Dockerfile.compute-node | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/Dockerfile.compute-node b/Dockerfile.compute-node index c18470c5e2..3a3dee8a8a 100644 --- a/Dockerfile.compute-node +++ b/Dockerfile.compute-node @@ -415,6 +415,23 @@ RUN apt-get update && \ make -j $(getconf _NPROCESSORS_ONLN) install && \ echo 'trusted = true' >> /usr/local/pgsql/share/extension/kq_imcx.control +######################################################################################### +# +# Layer "pg-cron-pg-build" +# compile pg_cron extension +# +######################################################################################### +FROM build-deps AS pg-cron-pg-build +COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ + +ENV PATH "/usr/local/pgsql/bin/:$PATH" +RUN wget https://github.com/citusdata/pg_cron/archive/refs/tags/v1.5.2.tar.gz -O pg_cron.tar.gz && \ + echo "6f7f0980c03f1e2a6a747060e67bf4a303ca2a50e941e2c19daeed2b44dec744 pg_cron.tar.gz" | sha256sum --check && \ + mkdir pg_cron-src && cd pg_cron-src && tar xvzf ../pg_cron.tar.gz --strip-components=1 -C . && \ + make -j $(getconf _NPROCESSORS_ONLN) && \ + make -j $(getconf _NPROCESSORS_ONLN) install && \ + echo 'trusted = true' >> /usr/local/pgsql/share/extension/pg_cron.control + ######################################################################################### # # Layer "rust extensions" @@ -529,6 +546,7 @@ COPY --from=plpgsql-check-pg-build /usr/local/pgsql/ /usr/local/pgsql/ COPY --from=timescaledb-pg-build /usr/local/pgsql/ /usr/local/pgsql/ COPY --from=pg-hint-plan-pg-build /usr/local/pgsql/ /usr/local/pgsql/ COPY --from=kq-imcx-pg-build /usr/local/pgsql/ /usr/local/pgsql/ +COPY --from=pg-cron-pg-build /usr/local/pgsql/ /usr/local/pgsql/ COPY pgxn/ pgxn/ RUN make -j $(getconf _NPROCESSORS_ONLN) \ From 8ebae74c6fe59332f32da4080f920ab89d90c85f Mon Sep 17 00:00:00 2001 From: Anastasia Lubennikova Date: Thu, 11 May 2023 15:50:22 +0100 Subject: [PATCH 21/27] Fix handling of XLOG_XACT_COMMIT/ABORT: Previously we didn't handle XACT_XINFO_HAS_INVALS and XACT_XINFO_HAS_DROPPED_STAT correctly, which led to getting incorrect value of twophase_xid for records with XACT_XINFO_HAS_TWOPHASE. This caused 'twophase file for xid {} does not exist' errors in test_isolation --- pageserver/src/walrecord.rs | 29 +++++++++++++++++------------ 1 file changed, 17 insertions(+), 12 deletions(-) diff --git a/pageserver/src/walrecord.rs b/pageserver/src/walrecord.rs index 7581140934..1a34168fed 100644 --- a/pageserver/src/walrecord.rs +++ b/pageserver/src/walrecord.rs @@ -379,17 +379,6 @@ impl XlXactParsedRecord { }); } } - if xinfo & pg_constants::XACT_XINFO_HAS_INVALS != 0 { - let nmsgs = buf.get_i32_le(); - for _i in 0..nmsgs { - let sizeof_shared_invalidation_message = 0; - buf.advance(sizeof_shared_invalidation_message); - } - } - if xinfo & pg_constants::XACT_XINFO_HAS_TWOPHASE != 0 { - xid = buf.get_u32_le(); - trace!("XLOG_XACT_COMMIT-XACT_XINFO_HAS_TWOPHASE"); - } if xinfo & postgres_ffi::v15::bindings::XACT_XINFO_HAS_DROPPED_STATS != 0 { let nitems = buf.get_i32_le(); @@ -397,7 +386,23 @@ impl XlXactParsedRecord { "XLOG_XACT_COMMIT-XACT_XINFO_HAS_DROPPED_STAT nitems {}", nitems ); - //FIXME: do we need to handle dropped stats here? + let sizeof_xl_xact_stats_item = 12; + buf.advance((nitems * sizeof_xl_xact_stats_item).try_into().unwrap()); + } + + if xinfo & pg_constants::XACT_XINFO_HAS_INVALS != 0 { + let nmsgs = buf.get_i32_le(); + let sizeof_shared_invalidation_message = 16; + buf.advance( + (nmsgs * sizeof_shared_invalidation_message) + .try_into() + .unwrap(), + ); + } + + if xinfo & pg_constants::XACT_XINFO_HAS_TWOPHASE != 0 { + xid = buf.get_u32_le(); + debug!("XLOG_XACT_COMMIT-XACT_XINFO_HAS_TWOPHASE xid {}", xid); } XlXactParsedRecord { From 1b2ece37152b7b376510575037d753f399ef74e7 Mon Sep 17 00:00:00 2001 From: Alexander Bayandin Date: Thu, 18 May 2023 19:56:09 +0100 Subject: [PATCH 22/27] Re-enable compatibility tests on Postgres 15 (#4274) - Enable compatibility tests for Postgres 15 - Also add `PgVersion::v_prefixed` property to return the version number with, _guess what,_ v-prefix! --- .github/actions/run-python-test-set/action.yml | 18 +++++++++--------- .github/workflows/build_and_test.yml | 12 +++++++----- test_runner/fixtures/neon_fixtures.py | 6 +++--- test_runner/fixtures/pg_version.py | 6 ++++++ test_runner/regress/test_compatibility.py | 18 ++++++++++-------- test_runner/regress/test_pg_regress.py | 8 ++++---- 6 files changed, 39 insertions(+), 29 deletions(-) diff --git a/.github/actions/run-python-test-set/action.yml b/.github/actions/run-python-test-set/action.yml index bb120e9470..4493985587 100644 --- a/.github/actions/run-python-test-set/action.yml +++ b/.github/actions/run-python-test-set/action.yml @@ -71,12 +71,12 @@ runs: path: /tmp/neon-previous prefix: latest - - name: Download compatibility snapshot for Postgres 14 - if: inputs.build_type != 'remote' && inputs.pg_version == 'v14' + - name: Download compatibility snapshot + if: inputs.build_type != 'remote' uses: ./.github/actions/download with: - name: compatibility-snapshot-${{ inputs.build_type }}-pg14 - path: /tmp/compatibility_snapshot_pg14 + name: compatibility-snapshot-${{ inputs.build_type }}-pg${{ inputs.pg_version }} + path: /tmp/compatibility_snapshot_pg${{ inputs.pg_version }} prefix: latest - name: Checkout @@ -106,7 +106,7 @@ runs: BUILD_TYPE: ${{ inputs.build_type }} AWS_ACCESS_KEY_ID: ${{ inputs.real_s3_access_key_id }} AWS_SECRET_ACCESS_KEY: ${{ inputs.real_s3_secret_access_key }} - COMPATIBILITY_SNAPSHOT_DIR: /tmp/compatibility_snapshot_pg14 + COMPATIBILITY_SNAPSHOT_DIR: /tmp/compatibility_snapshot_pg${{ inputs.pg_version }} ALLOW_BACKWARD_COMPATIBILITY_BREAKAGE: contains(github.event.pull_request.labels.*.name, 'backward compatibility breakage') ALLOW_FORWARD_COMPATIBILITY_BREAKAGE: contains(github.event.pull_request.labels.*.name, 'forward compatibility breakage') RERUN_FLAKY: ${{ inputs.rerun_flaky }} @@ -197,13 +197,13 @@ runs: scripts/generate_and_push_perf_report.sh fi - - name: Upload compatibility snapshot for Postgres 14 - if: github.ref_name == 'release' && inputs.pg_version == 'v14' + - name: Upload compatibility snapshot + if: github.ref_name == 'release' uses: ./.github/actions/upload with: - name: compatibility-snapshot-${{ inputs.build_type }}-pg14-${{ github.run_id }} + name: compatibility-snapshot-${{ inputs.build_type }}-pg${{ inputs.pg_version }}-${{ github.run_id }} # Directory is created by test_compatibility.py::test_create_snapshot, keep the path in sync with the test - path: /tmp/test_output/compatibility_snapshot_pg14/ + path: /tmp/test_output/compatibility_snapshot_pg${{ inputs.pg_version }}/ prefix: latest - name: Upload test results diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index 9114e02622..5d588aaa85 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -957,7 +957,7 @@ jobs: promote-compatibility-data: runs-on: [ self-hosted, gen3, small ] container: - image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned + image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/base:pinned options: --init needs: [ promote-images, tag, regress-tests ] if: github.ref_name == 'release' && github.event_name != 'workflow_dispatch' @@ -968,11 +968,13 @@ jobs: PREFIX: artifacts/latest run: | # Update compatibility snapshot for the release - for build_type in debug release; do - OLD_FILENAME=compatibility-snapshot-${build_type}-pg14-${GITHUB_RUN_ID}.tar.zst - NEW_FILENAME=compatibility-snapshot-${build_type}-pg14.tar.zst + for pg_version in v14 v15; do + for build_type in debug release; do + OLD_FILENAME=compatibility-snapshot-${build_type}-pg${pg_version}-${GITHUB_RUN_ID}.tar.zst + NEW_FILENAME=compatibility-snapshot-${build_type}-pg${pg_version}.tar.zst - time aws s3 mv --only-show-errors s3://${BUCKET}/${PREFIX}/${OLD_FILENAME} s3://${BUCKET}/${PREFIX}/${NEW_FILENAME} + time aws s3 mv --only-show-errors s3://${BUCKET}/${PREFIX}/${OLD_FILENAME} s3://${BUCKET}/${PREFIX}/${NEW_FILENAME} + done done # Update Neon artifact for the release (reuse already uploaded artifact) diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index 1a480e1b04..8ec17834ac 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -149,7 +149,7 @@ def top_output_dir(base_dir: Path) -> Iterator[Path]: @pytest.fixture(scope="session") def versioned_pg_distrib_dir(pg_distrib_dir: Path, pg_version: PgVersion) -> Iterator[Path]: - versioned_dir = pg_distrib_dir / f"v{pg_version}" + versioned_dir = pg_distrib_dir / pg_version.v_prefixed psql_bin_path = versioned_dir / "bin/psql" postgres_bin_path = versioned_dir / "bin/postgres" @@ -1745,8 +1745,8 @@ class PgBin: def __init__(self, log_dir: Path, pg_distrib_dir: Path, pg_version: PgVersion): self.log_dir = log_dir self.pg_version = pg_version - self.pg_bin_path = pg_distrib_dir / f"v{pg_version}" / "bin" - self.pg_lib_dir = pg_distrib_dir / f"v{pg_version}" / "lib" + self.pg_bin_path = pg_distrib_dir / pg_version.v_prefixed / "bin" + self.pg_lib_dir = pg_distrib_dir / pg_version.v_prefixed / "lib" self.env = os.environ.copy() self.env["LD_LIBRARY_PATH"] = str(self.pg_lib_dir) diff --git a/test_runner/fixtures/pg_version.py b/test_runner/fixtures/pg_version.py index 554f841d14..d67f088365 100644 --- a/test_runner/fixtures/pg_version.py +++ b/test_runner/fixtures/pg_version.py @@ -27,6 +27,12 @@ class PgVersion(str, enum.Enum): def __repr__(self) -> str: return f"'{self.value}'" + # In GitHub workflows we use Postgres version with v-prefix (e.g. v14 instead of just 14), + # sometime we need to do so in tests. + @property + def v_prefixed(self) -> str: + return f"v{self.value}" + @classmethod def _missing_(cls, value) -> Optional["PgVersion"]: known_values = {v.value for _, v in cls.__members__.items()} diff --git a/test_runner/regress/test_compatibility.py b/test_runner/regress/test_compatibility.py index 7bc12847b7..fe8dc293c1 100644 --- a/test_runner/regress/test_compatibility.py +++ b/test_runner/regress/test_compatibility.py @@ -16,7 +16,7 @@ from fixtures.neon_fixtures import ( ) from fixtures.pageserver.http import PageserverHttpClient from fixtures.pageserver.utils import wait_for_last_record_lsn, wait_for_upload -from fixtures.pg_version import PgVersion, skip_on_postgres +from fixtures.pg_version import PgVersion from fixtures.types import Lsn from pytest import FixtureRequest @@ -41,7 +41,6 @@ check_ondisk_data_compatibility_if_enabled = pytest.mark.skipif( ) -@skip_on_postgres(PgVersion.V15, "Compatibility tests doesn't support Postgres 15 yet") @pytest.mark.xdist_group("compatibility") @pytest.mark.order(before="test_forward_compatibility") def test_create_snapshot( @@ -49,12 +48,13 @@ def test_create_snapshot( pg_bin: PgBin, top_output_dir: Path, test_output_dir: Path, + pg_version: PgVersion, ): # The test doesn't really test anything # it creates a new snapshot for releases after we tested the current version against the previous snapshot in `test_backward_compatibility`. # # There's no cleanup here, it allows to adjust the data in `test_backward_compatibility` itself without re-collecting it. - neon_env_builder.pg_version = PgVersion.V14 + neon_env_builder.pg_version = pg_version neon_env_builder.num_safekeepers = 3 neon_env_builder.enable_local_fs_remote_storage() neon_env_builder.preserve_database_files = True @@ -90,13 +90,14 @@ def test_create_snapshot( env.pageserver.stop() # Directory `compatibility_snapshot_dir` is uploaded to S3 in a workflow, keep the name in sync with it - compatibility_snapshot_dir = top_output_dir / "compatibility_snapshot_pg14" + compatibility_snapshot_dir = ( + top_output_dir / f"compatibility_snapshot_pg{pg_version.v_prefixed}" + ) if compatibility_snapshot_dir.exists(): shutil.rmtree(compatibility_snapshot_dir) shutil.copytree(test_output_dir, compatibility_snapshot_dir) -@skip_on_postgres(PgVersion.V15, "Compatibility tests doesn't support Postgres 15 yet") @check_ondisk_data_compatibility_if_enabled @pytest.mark.xdist_group("compatibility") @pytest.mark.order(after="test_create_snapshot") @@ -115,7 +116,7 @@ def test_backward_compatibility( compatibility_snapshot_dir_env = os.environ.get("COMPATIBILITY_SNAPSHOT_DIR") assert ( compatibility_snapshot_dir_env is not None - ), "COMPATIBILITY_SNAPSHOT_DIR is not set. It should be set to `compatibility_snapshot_pg14` path generateted by test_create_snapshot (ideally generated by the previous version of Neon)" + ), f"COMPATIBILITY_SNAPSHOT_DIR is not set. It should be set to `compatibility_snapshot_pg{pg_version.v_prefixed}` path generateted by test_create_snapshot (ideally generated by the previous version of Neon)" compatibility_snapshot_dir = Path(compatibility_snapshot_dir_env).resolve() breaking_changes_allowed = ( @@ -155,7 +156,6 @@ def test_backward_compatibility( ), "Breaking changes are allowed by ALLOW_BACKWARD_COMPATIBILITY_BREAKAGE, but the test has passed without any breakage" -@skip_on_postgres(PgVersion.V15, "Compatibility tests doesn't support Postgres 15 yet") @check_ondisk_data_compatibility_if_enabled @pytest.mark.xdist_group("compatibility") @pytest.mark.order(after="test_create_snapshot") @@ -183,7 +183,9 @@ def test_forward_compatibility( ), "COMPATIBILITY_POSTGRES_DISTRIB_DIR is not set. It should be set to a pg_install directrory (ideally generated by the previous version of Neon)" compatibility_postgres_distrib_dir = Path(compatibility_postgres_distrib_dir_env).resolve() - compatibility_snapshot_dir = top_output_dir / "compatibility_snapshot_pg14" + compatibility_snapshot_dir = ( + top_output_dir / f"compatibility_snapshot_pg{pg_version.v_prefixed}" + ) breaking_changes_allowed = ( os.environ.get("ALLOW_FORWARD_COMPATIBILITY_BREAKAGE", "false").lower() == "true" diff --git a/test_runner/regress/test_pg_regress.py b/test_runner/regress/test_pg_regress.py index a1d2a56d8a..505d8d4129 100644 --- a/test_runner/regress/test_pg_regress.py +++ b/test_runner/regress/test_pg_regress.py @@ -33,8 +33,8 @@ def test_pg_regress( (runpath / "testtablespace").mkdir(parents=True) # Compute all the file locations that pg_regress will need. - build_path = pg_distrib_dir / f"build/v{env.pg_version}/src/test/regress" - src_path = base_dir / f"vendor/postgres-v{env.pg_version}/src/test/regress" + build_path = pg_distrib_dir / f"build/{env.pg_version.v_prefixed}/src/test/regress" + src_path = base_dir / f"vendor/postgres-{env.pg_version.v_prefixed}/src/test/regress" bindir = pg_distrib_dir / f"v{env.pg_version}/bin" schedule = src_path / "parallel_schedule" pg_regress = build_path / "pg_regress" @@ -97,8 +97,8 @@ def test_isolation( (runpath / "testtablespace").mkdir(parents=True) # Compute all the file locations that pg_isolation_regress will need. - build_path = pg_distrib_dir / f"build/v{env.pg_version}/src/test/isolation" - src_path = base_dir / f"vendor/postgres-v{env.pg_version}/src/test/isolation" + build_path = pg_distrib_dir / f"build/{env.pg_version.v_prefixed}/src/test/isolation" + src_path = base_dir / f"vendor/postgres-{env.pg_version.v_prefixed}/src/test/isolation" bindir = pg_distrib_dir / f"v{env.pg_version}/bin" schedule = src_path / "isolation_schedule" pg_isolation_regress = build_path / "pg_isolation_regress" From 5abc4514b7698e903c7b8b151f6af5450b2385c4 Mon Sep 17 00:00:00 2001 From: Alexander Bayandin Date: Thu, 18 May 2023 22:38:33 +0100 Subject: [PATCH 23/27] Un-xfail fixed tests on Postgres 15 (#4275) - https://github.com/neondatabase/neon/pull/4182 - https://github.com/neondatabase/neon/pull/4213 --- test_runner/regress/test_hot_standby.py | 2 -- test_runner/regress/test_pg_regress.py | 2 -- 2 files changed, 4 deletions(-) diff --git a/test_runner/regress/test_hot_standby.py b/test_runner/regress/test_hot_standby.py index 582ac1b17e..12e034cea2 100644 --- a/test_runner/regress/test_hot_standby.py +++ b/test_runner/regress/test_hot_standby.py @@ -1,9 +1,7 @@ import pytest from fixtures.neon_fixtures import NeonEnv -from fixtures.pg_version import PgVersion, xfail_on_postgres -@xfail_on_postgres(PgVersion.V15, reason="https://github.com/neondatabase/neon/pull/4182") @pytest.mark.timeout(1800) def test_hot_standby(neon_simple_env: NeonEnv): env = neon_simple_env diff --git a/test_runner/regress/test_pg_regress.py b/test_runner/regress/test_pg_regress.py index 505d8d4129..d765316174 100644 --- a/test_runner/regress/test_pg_regress.py +++ b/test_runner/regress/test_pg_regress.py @@ -5,7 +5,6 @@ from pathlib import Path import pytest from fixtures.neon_fixtures import NeonEnv, check_restored_datadir_content -from fixtures.pg_version import PgVersion, xfail_on_postgres # Run the main PostgreSQL regression tests, in src/test/regress. @@ -72,7 +71,6 @@ def test_pg_regress( # # This runs for a long time, especially in debug mode, so use a larger-than-default # timeout. -@xfail_on_postgres(PgVersion.V15, reason="https://github.com/neondatabase/neon/pull/4213") @pytest.mark.timeout(1800) def test_isolation( neon_simple_env: NeonEnv, From b391c94440de63a05d4f97e5e9d2a2ed18d74559 Mon Sep 17 00:00:00 2001 From: Christian Schwarz Date: Fri, 19 May 2023 03:16:09 +0200 Subject: [PATCH 24/27] tenant create / update-config: reject unknown fields (#4267) This PR enforces that the tenant create / update-config APIs reject requests with unknown fields. This is a desirable property because some tenant config settings control the lifetime of user data (e.g., GC horizon or PITR interval). Suppose we inadvertently rename the `pitr_interval` field in the Rust code. Then, right now, a client that still uses the old name will send a tenant config request to configure a new PITR interval. Before this PR, we would accept such a request, ignore the old name field, and use the pageserver.toml default value for what the new PITR interval is. With this PR, we will instead reject such a request. One might argue that the client could simply check whether the config it sent has been applied, using the `/v1/tenant/.../config` endpoint. That is correct for tenant create and update-config. But, attach will soon [^1] grow the ability to have attach-time config as well. If we ignore unknown fields and fall back to global defaults in that case, we risk data loss. Example: 1. Default PITR in pageservers is 7 days. 2. Create a tenant and set its PITR to 30 days. 3. For 30 days, fill the tenant continuously with data. 4. Detach the tenant. 5. Attach tenant. Attach must use the 30-day PITR setting in this scenario. If it were to fall back to the 7-day default value, we would lose 23 days of PITR capability for the tenant. So, the PR that adds attach-time tenant config will build on the (clunky) infrastructure added in this PR [^1]: https://github.com/neondatabase/neon/pull/4255 Implementation Notes ==================== This could have been a simple `#[serde(deny_unknown_fields)]` but sadly, that is documented- but silent-at-compile-time-incompatible with `#[serde(flatten)]`. But we are still using this by adding on outer struct and use unit tests to ensure it is correct. `neon_local tenant config` now uses the `.remove()` pattern + bail if there are leftover config args. That's in line with what `neon_local tenant create` does. We should dedupe that logic in a future PR. --------- Signed-off-by: Alex Chi Co-authored-by: Alex Chi --- control_plane/src/pageserver.rs | 46 +++++++++++++++---------- libs/pageserver_api/src/models.rs | 39 ++++++++++++++++++--- pageserver/src/http/openapi_spec.yml | 7 +++- test_runner/fixtures/pageserver/http.py | 7 +++- 4 files changed, 74 insertions(+), 25 deletions(-) diff --git a/control_plane/src/pageserver.rs b/control_plane/src/pageserver.rs index f022be3910..6309494b71 100644 --- a/control_plane/src/pageserver.rs +++ b/control_plane/src/pageserver.rs @@ -393,69 +393,79 @@ impl PageServerNode { }) } - pub fn tenant_config(&self, tenant_id: TenantId, settings: HashMap<&str, &str>) -> Result<()> { + pub fn tenant_config( + &self, + tenant_id: TenantId, + mut settings: HashMap<&str, &str>, + ) -> anyhow::Result<()> { let config = { // Braces to make the diff easier to read models::TenantConfig { checkpoint_distance: settings - .get("checkpoint_distance") + .remove("checkpoint_distance") .map(|x| x.parse::()) .transpose() .context("Failed to parse 'checkpoint_distance' as an integer")?, - checkpoint_timeout: settings.get("checkpoint_timeout").map(|x| x.to_string()), + checkpoint_timeout: settings.remove("checkpoint_timeout").map(|x| x.to_string()), compaction_target_size: settings - .get("compaction_target_size") + .remove("compaction_target_size") .map(|x| x.parse::()) .transpose() .context("Failed to parse 'compaction_target_size' as an integer")?, - compaction_period: settings.get("compaction_period").map(|x| x.to_string()), + compaction_period: settings.remove("compaction_period").map(|x| x.to_string()), compaction_threshold: settings - .get("compaction_threshold") + .remove("compaction_threshold") .map(|x| x.parse::()) .transpose() .context("Failed to parse 'compaction_threshold' as an integer")?, gc_horizon: settings - .get("gc_horizon") + .remove("gc_horizon") .map(|x| x.parse::()) .transpose() .context("Failed to parse 'gc_horizon' as an integer")?, - gc_period: settings.get("gc_period").map(|x| x.to_string()), + gc_period: settings.remove("gc_period").map(|x| x.to_string()), image_creation_threshold: settings - .get("image_creation_threshold") + .remove("image_creation_threshold") .map(|x| x.parse::()) .transpose() .context("Failed to parse 'image_creation_threshold' as non zero integer")?, - pitr_interval: settings.get("pitr_interval").map(|x| x.to_string()), + pitr_interval: settings.remove("pitr_interval").map(|x| x.to_string()), walreceiver_connect_timeout: settings - .get("walreceiver_connect_timeout") + .remove("walreceiver_connect_timeout") + .map(|x| x.to_string()), + lagging_wal_timeout: settings + .remove("lagging_wal_timeout") .map(|x| x.to_string()), - lagging_wal_timeout: settings.get("lagging_wal_timeout").map(|x| x.to_string()), max_lsn_wal_lag: settings - .get("max_lsn_wal_lag") + .remove("max_lsn_wal_lag") .map(|x| x.parse::()) .transpose() .context("Failed to parse 'max_lsn_wal_lag' as non zero integer")?, trace_read_requests: settings - .get("trace_read_requests") + .remove("trace_read_requests") .map(|x| x.parse::()) .transpose() .context("Failed to parse 'trace_read_requests' as bool")?, eviction_policy: settings - .get("eviction_policy") - .map(|x| serde_json::from_str(x)) + .remove("eviction_policy") + .map(serde_json::from_str) .transpose() .context("Failed to parse 'eviction_policy' json")?, min_resident_size_override: settings - .get("min_resident_size_override") + .remove("min_resident_size_override") .map(|x| x.parse::()) .transpose() .context("Failed to parse 'min_resident_size_override' as an integer")?, evictions_low_residence_duration_metric_threshold: settings - .get("evictions_low_residence_duration_metric_threshold") + .remove("evictions_low_residence_duration_metric_threshold") .map(|x| x.to_string()), } }; + if !settings.is_empty() { + bail!("Unrecognized tenant settings: {settings:?}") + } + self.http_request(Method::PUT, format!("{}/tenant/config", self.http_base_url))? .json(&models::TenantConfigRequest { tenant_id, config }) .send()? diff --git a/libs/pageserver_api/src/models.rs b/libs/pageserver_api/src/models.rs index 0bcdb3c3a8..3bfedd14ea 100644 --- a/libs/pageserver_api/src/models.rs +++ b/libs/pageserver_api/src/models.rs @@ -131,13 +131,14 @@ pub struct TimelineCreateRequest { } #[serde_as] -#[derive(Serialize, Deserialize, Default)] +#[derive(Serialize, Deserialize, Debug, Default)] +#[serde(deny_unknown_fields)] pub struct TenantCreateRequest { #[serde(default)] #[serde_as(as = "Option")] pub new_tenant_id: Option, #[serde(flatten)] - pub config: TenantConfig, + pub config: TenantConfig, // as we have a flattened field, we should reject all unknown fields in it } impl std::ops::Deref for TenantCreateRequest { @@ -148,7 +149,7 @@ impl std::ops::Deref for TenantCreateRequest { } } -#[derive(Serialize, Deserialize, Default)] +#[derive(Serialize, Deserialize, Debug, Default)] pub struct TenantConfig { pub checkpoint_distance: Option, pub checkpoint_timeout: Option, @@ -192,12 +193,13 @@ impl TenantCreateRequest { } #[serde_as] -#[derive(Serialize, Deserialize)] +#[derive(Serialize, Deserialize, Debug)] +#[serde(deny_unknown_fields)] pub struct TenantConfigRequest { #[serde_as(as = "DisplayFromStr")] pub tenant_id: TenantId, #[serde(flatten)] - pub config: TenantConfig, + pub config: TenantConfig, // as we have a flattened field, we should reject all unknown fields in it } impl std::ops::Deref for TenantConfigRequest { @@ -768,4 +770,31 @@ mod tests { assert!(format!("{:?}", &original_broken.state).contains("reason")); assert!(format!("{:?}", &original_broken.state).contains("backtrace info")); } + + #[test] + fn test_reject_unknown_field() { + let id = TenantId::generate(); + let create_request = json!({ + "new_tenant_id": id.to_string(), + "unknown_field": "unknown_value".to_string(), + }); + let err = serde_json::from_value::(create_request).unwrap_err(); + assert!( + err.to_string().contains("unknown field `unknown_field`"), + "expect unknown field `unknown_field` error, got: {}", + err + ); + + let id = TenantId::generate(); + let config_request = json!({ + "tenant_id": id.to_string(), + "unknown_field": "unknown_value".to_string(), + }); + let err = serde_json::from_value::(config_request).unwrap_err(); + assert!( + err.to_string().contains("unknown field `unknown_field`"), + "expect unknown field `unknown_field` error, got: {}", + err + ); + } } diff --git a/pageserver/src/http/openapi_spec.yml b/pageserver/src/http/openapi_spec.yml index 62664733ea..0d09603650 100644 --- a/pageserver/src/http/openapi_spec.yml +++ b/pageserver/src/http/openapi_spec.yml @@ -741,8 +741,11 @@ paths: $ref: "#/components/schemas/Error" post: description: | - Create a tenant. Returns new tenant id on success.\ + Create a tenant. Returns new tenant id on success. + If no new tenant id is specified in parameters, it would be generated. It's an error to recreate the same tenant. + + Invalid fields in the tenant config will cause the request to be rejected with status 400. requestBody: content: application/json: @@ -790,6 +793,8 @@ paths: put: description: | Update tenant's config. + + Invalid fields in the tenant config will cause the request to be rejected with status 400. requestBody: content: application/json: diff --git a/test_runner/fixtures/pageserver/http.py b/test_runner/fixtures/pageserver/http.py index 1ff057fae2..1349923cc4 100644 --- a/test_runner/fixtures/pageserver/http.py +++ b/test_runner/fixtures/pageserver/http.py @@ -149,11 +149,16 @@ class PageserverHttpClient(requests.Session): assert isinstance(res_json, list) return res_json - def tenant_create(self, new_tenant_id: Optional[TenantId] = None) -> TenantId: + def tenant_create( + self, new_tenant_id: Optional[TenantId] = None, conf: Optional[Dict[str, Any]] = None + ) -> TenantId: + if conf is not None: + assert "new_tenant_id" not in conf.keys() res = self.post( f"http://localhost:{self.port}/v1/tenant", json={ "new_tenant_id": str(new_tenant_id) if new_tenant_id else None, + **(conf or {}), }, ) self.verbose_error(res) From 7529ee2ec709778ee90144ca154edf7fd74851d9 Mon Sep 17 00:00:00 2001 From: Dmitry Rodionov Date: Fri, 19 May 2023 14:35:33 +0300 Subject: [PATCH 25/27] rfc: the state of pageserver tenant relocation (#3868) Summarize current state of tenant relocation related activities and implementation ideas --- ...e-state-of-pageserver-tenant-relocation.md | 232 ++++++++++++++++++ 1 file changed, 232 insertions(+) create mode 100644 docs/rfcs/023-the-state-of-pageserver-tenant-relocation.md diff --git a/docs/rfcs/023-the-state-of-pageserver-tenant-relocation.md b/docs/rfcs/023-the-state-of-pageserver-tenant-relocation.md new file mode 100644 index 0000000000..9f22fc1ee4 --- /dev/null +++ b/docs/rfcs/023-the-state-of-pageserver-tenant-relocation.md @@ -0,0 +1,232 @@ +# The state of pageserver tenant relocation + +Created on 17.03.23 + +## Motivation + +There were previous write ups on the subject. The design of tenant relocation was planned at the time when we had quite different landscape. I e there was no on-demand download/eviction. They were on the horizon but we still planned for cases when they were not available. Some other things have changed. Now safekeepers offload wal to s3 so we're not risking overflowing their disks. Having all of the above, it makes sense to recap and take a look at the options we have now, which adjustments we'd like to make to original process, etc. + +Related (in chronological order): + +- Tracking issue with initial discussion: [#886](https://github.com/neondatabase/neon/issues/886) +- [015. Storage Messaging](015-storage-messaging.md) +- [020. Pageserver S3 Coordination](020-pageserver-s3-coordination.md) + +## Summary + +The RFC consists of a walkthrough of prior art on tenant relocation and corresponding problems. It describes 3 approaches. + +1. Simplistic approach that uses ignore and is the fastest to implement. The main downside is a requirement of short downtime. +2. More complicated approach that avoids even short downtime. +3. Even more complicated approach that will allow multiple pageservers to operate concurrently on the same tenant possibly allowing for HA cluster topologies and horizontal scaling of reads (i e compute talks to multiple pageservers). + +The order in which solutions are described is a bit different. We start from 2, then move to possible compromises (aka simplistic approach) and then move to discussing directions for solving HA/Pageserver replica case with 3. + +## Components + +pageserver, control-plane, safekeepers (a bit) + +## Requirements + +Relocation procedure should move tenant from one pageserver to another without downtime introduced by storage side. For now restarting compute for applying new configuration is fine. + +- component restarts +- component outage +- pageserver loss + +## The original proposed implementation + +The starting point is this sequence: + +```mermaid +sequenceDiagram + autonumber + participant CP as Control Plane + participant PS1 as Pageserver 1 + participant PS2 as Pageserver 2 + participant S3 + + CP->>PS2: Attach tenant X + PS2->>S3: Fetch timelines, indexes for them + PS2->>CP: Accepted + CP->>CP: Change pageserver id in project + CP->>PS1: Detach +``` + +Which problems do we have with naive approach? + +### Concurrent GC and Compaction + +The problem is that they can run on both, PS1 and PS2. Consider this example from [Pageserver S3 Coordination RFC](020-pageserver-s3-coordination.md) + +```mermaid +sequenceDiagram + autonumber + participant PS1 + participant S3 + participant PS2 + + PS1->>S3: Uploads L1, L2
Index contains L1 L2 + PS2->>S3: Attach called, sees L1, L2 + PS1->>S3: Compaction comes
Removes L1, adds L3 + note over S3: Index now L2, L3 + PS2->>S3: Uploads new layer L4
(added to previous view of the index) + note over S3: Index now L1, L2, L4 +``` + +At this point it is not possible to restore the state from index, it contains L2 which +is no longer available in s3 and doesnt contain L3 added by compaction by the +first pageserver. So if any of the pageservers restart, initial sync will fail +(or in on-demand world it will fail a bit later during page request from +missing layer) + +The problem lies in shared index_part.json. Having intersecting layers from append only edits is expected to work, though this is an uncharted territory without tests. + +#### Options + +There are several options on how to restrict concurrent access to index file. + +First and the simplest one is external orchestration. Control plane which runs migration can use special api call on pageserver to stop background processes (gc, compaction), and even possibly all uploads. + +So the sequence becomes: + +```mermaid +sequenceDiagram + autonumber + participant CP as Control Plane + participant PS1 as Pageserver 1 + participant PS2 as Pageserver 2 + participant S3 + + CP->>PS1: Pause background jobs, pause uploading new layers. + CP->>PS2: Attach tenant X. + PS2->>S3: Fetch timelines, index, start background operations + PS2->>CP: Accepted + CP->>CP: Monitor PS2 last record lsn, ensure OK lag + CP->>CP: Change pageserver id in project + CP->>PS1: Detach +``` + +The downside of this sequence is the potential rollback process. What if something goes wrong on new pageserver? Can we safely roll back to source pageserver? + +There are two questions: + +#### How can we detect that something went wrong? + +We can run usual availability check (consists of compute startup and an update of one row). +Note that we cant run separate compute for that before touching compute that client runs actual workload on, because we cant have two simultaneous computes running in read-write mode on the same timeline (enforced by safekeepers consensus algorithm). So we can either run some readonly check first (basebackup) and then change pageserver id and run availability check. If it failed we can roll it back to the old one. + +#### What can go wrong? And how we can safely roll-back? + +In the sequence above during attach we start background processes/uploads. They change state in remote storage so it is possible that after rollback remote state will be different from one that was observed by source pageserver. So if target pageserver goes wild then source pageserver may fail to start with changed remote state. + +Proposed option would be to implement a barrier (read-only) mode when pageserver does not update remote state. + +So the sequence for happy path becomes this one: + +```mermaid +sequenceDiagram + autonumber + participant CP as Control Plane + participant PS1 as Pageserver 1 + participant PS2 as Pageserver 2 + participant S3 + + CP->>PS1: Pause background jobs, pause uploading new layers. + CP->>PS2: Attach tenant X in remote readonly mode. + PS2->>S3: Fetch timelines, index + PS2->>CP: Accepted + CP->>CP: Monitor PS2 last record lsn, ensure OK lag + CP->>CP: Change pageserver id in project + CP->>CP: Run successful availability check + CP->>PS2: Start uploads, background tasks + CP->>PS1: Detach +``` + +With this sequence we restrict any changes to remote storage to one pageserver. So there is no concurrent access at all, not only for index_part.json, but for everything else too. This approach makes it possible to roll back after failure on new pageserver. + +The sequence with roll back process: + +```mermaid +sequenceDiagram + autonumber + participant CP as Control Plane + participant PS1 as Pageserver 1 + participant PS2 as Pageserver 2 + participant S3 + + CP->>PS1: Pause background jobs, pause uploading new layers. + CP->>PS2: Attach tenant X in remote readonly mode. + PS2->>S3: Fetch timelines, index + PS2->>CP: Accepted + CP->>CP: Monitor PS2 last record lsn, ensure OK lag + CP->>CP: Change pageserver id in project + CP->>CP: Availability check Failed + CP->>CP: Change pageserver id back + CP->>PS1: Resume remote operations + CP->>PS2: Ignore (instead of detach for investigation purposes) +``` + +## Concurrent branch creation + +Another problem is a possibility of concurrent branch creation calls. + +I e during migration create_branch can be called on old pageserver and newly created branch wont be seen on new pageserver. Prior art includes prototyping an approach of trying to mirror such branches, but currently it lost its importance, because now attach is fast because we dont need to download all data, and additionally to the best of my knowledge of control plane internals (cc @ololobus to confirm) operations on one project are executed sequentially, so it is not possible to have such case. So branch create operation will be executed only when relocation is completed. As a safety measure we can forbid branch creation for tenants that are in readonly remote state. + +## Simplistic approach + +The difference of simplistic approach from one described above is that it calls ignore on source tenant first and then calls attach on target pageserver. Approach above does it in opposite order thus opening a possibility for race conditions we strive to avoid. + +The approach largely follows this guide: + +The happy path sequence: + +```mermaid +sequenceDiagram + autonumber + participant CP as Control Plane + participant PS1 as Pageserver 1 + participant PS2 as Pageserver 2 + participant SK as Safekeeper + participant S3 + + CP->>CP: Enable maintenance mode + CP->>PS1: Ignore + CP->>PS2: Attach + PS2->>CP: Accepted + loop Delete layers for each timeline + CP->>PS2: Get last record lsn + CP->>SK: Get commit lsn + CP->>CP: OK? Timed out? + end + CP->>CP: Change pageserver id in project + CP->>CP: Run successful availability check + CP->>CP: Disable maintenance mode + CP->>PS1: Detach ignored +``` + +The sequence contains exactly the same rollback problems as in previous approach described above. They can be resolved the same way. + +Most probably we'd like to move forward without this safety measure and implement it on top of this approach to make progress towards the downtime-less one. + +## Lease based approach + +In order to allow for concurrent operation on the same data on remote storage for multiple pageservers we need to go further than external orchestration. + +NOTE: [020. Pageserver S3 Coordination](020-pageserver-s3-coordination.md) discusses one more approach that relies on duplication of index_part.json for each pageserver operating on the timeline. This approach still requires external coordination which makes certain things easier but requires additional bookkeeping to account for multiple index_part.json files. Discussion/comparison with proposed lease based approach + +The problems are outlined in [020. Pageserver S3 Coordination](020-pageserver-s3-coordination.md) and suggested solution includes [Coordination based approach](020-pageserver-s3-coordination.md#coordination-based-approach). This way it will allow to do basic leader election for pageservers so they can decide which node will be responsible for running GC and compaction. The process is based on extensive communication via storage broker and consists of a lease that is taken by one of the pageservers that extends it to continue serving a leader role. + +There are two options for ingesting new data into pageserver in follower role. One option is to avoid WAL ingestion at all and rely on notifications from leader to discover new layers on s3. Main downside of this approach is that follower will always lag behind the primary node because it wont have the last layer until it is uploaded to remote storage. In case of a primary failure follower will be required to reingest last segment (up to 256Mb of WAL currently) which slows down recovery. Additionally if compute is connected to follower pageserver it will observe latest data with a delay. Queries from compute will likely experience bigger delays when recent lsn is required. + +The second option is to consume WAL stream on both pageservers. In this case the only problem is non deterministic layer generation. Additional bookkeeping will be required to deduplicate layers from primary with local ones. Some process needs to somehow merge them to remove duplicated data. Additionally we need to have good testing coverage to ensure that our implementation of `get_page@lsn` properly handles intersecting layers. + +There is another tradeoff. Approaches may be different in amount of traffic between system components. With first approach there can be increased traffic between follower and remote storage. But only in case follower has some activity that actually requests pages (!). With other approach traffic increase will be permanent and will be caused by two WAL streams instead of one. + +## Summary + +Proposed implementation strategy: + +Go with the simplest approach for now. Then work on tech debt, increase test coverage. Then gradually move forward to second approach by implementing safety measures first, finishing with switch of order between ignore and attach operation. + +And only then go to lease based approach to solve HA/Pageserver replica use cases. From 3837fca7a2ee85be6f395557fb52d3fe17daf7e7 Mon Sep 17 00:00:00 2001 From: Alexander Bayandin Date: Fri, 19 May 2023 15:34:22 +0100 Subject: [PATCH 26/27] compute-node-image: fix postgis download (#4280) ## Problem `osgeo.org` is experiencing some problems with DNS resolving which breaks `compute-node-image` (because it can't download postgis) ## Summary of changes - Add `140.211.15.30 download.osgeo.org` to /etc/hosts by passing it via the container option --- .github/workflows/build_and_test.yml | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index 5d588aaa85..564251ef8f 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -711,7 +711,11 @@ jobs: compute-node-image: runs-on: [ self-hosted, gen3, large ] - container: gcr.io/kaniko-project/executor:v1.9.2-debug + container: + image: gcr.io/kaniko-project/executor:v1.9.2-debug + # Workaround for "Resolving download.osgeo.org (download.osgeo.org)... failed: Temporary failure in name resolution."" + # Should be prevented by https://github.com/neondatabase/neon/issues/4281 + options: --add-host=download.osgeo.org:140.211.15.30 needs: [ tag ] strategy: fail-fast: false From 63884543758118b1e12b3654ae9bed57be570b39 Mon Sep 17 00:00:00 2001 From: Joonas Koivunen Date: Mon, 22 May 2023 11:59:54 +0300 Subject: [PATCH 27/27] test: allow benign warning in relation to startup ordering (#4262) Allow the warning which happens because the disk usage based eviction runs before tenants are loaded. Example failure: https://neon-github-public-dev.s3.amazonaws.com/reports/main/5001582237/index.html#suites/0e58fb04d9998963e98e45fe1880af7d/a711f5baf8f8bd8d/ --- test_runner/regress/test_disk_usage_eviction.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/test_runner/regress/test_disk_usage_eviction.py b/test_runner/regress/test_disk_usage_eviction.py index e8ec657683..ab67518092 100644 --- a/test_runner/regress/test_disk_usage_eviction.py +++ b/test_runner/regress/test_disk_usage_eviction.py @@ -118,6 +118,11 @@ class EvictionEnv: wait_until(10, 1, statvfs_called) + # these can sometimes happen during startup before any tenants have been + # loaded, so nothing can be evicted, we just wait for next iteration which + # is able to evict. + self.neon_env.pageserver.allowed_errors.append(".*WARN.* disk usage still high.*") + @pytest.fixture def eviction_env(request, neon_env_builder: NeonEnvBuilder, pg_bin: PgBin) -> EvictionEnv: