From 78aca668d08190934708f864fbddd14da316e8a8 Mon Sep 17 00:00:00 2001 From: Joonas Koivunen Date: Tue, 21 Feb 2023 19:31:53 +0200 Subject: [PATCH 1/6] fix: log download failed error (#3661) Fixes #3659 --- pageserver/src/tenant/tasks.rs | 2 +- pageserver/src/tenant/timeline.rs | 1 + test_runner/regress/test_tenants.py | 4 ++++ 3 files changed, 6 insertions(+), 1 deletion(-) diff --git a/pageserver/src/tenant/tasks.rs b/pageserver/src/tenant/tasks.rs index e9ce52d1ab..20d1d2bfb6 100644 --- a/pageserver/src/tenant/tasks.rs +++ b/pageserver/src/tenant/tasks.rs @@ -74,7 +74,7 @@ async fn compaction_loop(tenant_id: TenantId) { let period = tenant.get_compaction_period(); // TODO: we shouldn't need to await to find tenant and this could be moved outside of - // loop + // loop, #3501. There are also additional "allowed_errors" in tests. if first { first = false; if random_init_delay(period, &cancel).await.is_err() { diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index f2b0a98509..8bc02cd10a 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -3745,6 +3745,7 @@ impl Timeline { remote_layer.ongoing_download.close(); } else { // Keep semaphore open. We'll drop the permit at the end of the function. + info!("on-demand download failed: {:?}", result.as_ref().unwrap_err()); } // Don't treat it as an error if the task that triggered the download diff --git a/test_runner/regress/test_tenants.py b/test_runner/regress/test_tenants.py index e56bb1b469..9e75396799 100644 --- a/test_runner/regress/test_tenants.py +++ b/test_runner/regress/test_tenants.py @@ -250,6 +250,10 @@ def test_pageserver_with_empty_tenants( env.pageserver.allowed_errors.append( ".*could not load tenant.*Failed to list timelines directory.*" ) + # this is until #3501 + env.pageserver.allowed_errors.append( + ".*Compaction failed, retrying in 2s: Cannot run compaction iteration on inactive tenant" + ) client = env.pageserver.http_client() From 15273a9b669c75b8a5bcda186715e7c1a940bfd6 Mon Sep 17 00:00:00 2001 From: Joonas Koivunen Date: Tue, 21 Feb 2023 20:20:13 +0200 Subject: [PATCH 2/6] chore: ignore all compaction inactive tenant errors (#3665) these are happening in tests because of #3655 but they sure took some time to appear. makes the `Compaction failed, retrying in 2s: Cannot run compaction iteration on inactive tenant` into a globally allowed error, because it has been seen failing on different test cases. --- test_runner/fixtures/neon_fixtures.py | 2 ++ test_runner/regress/test_tenants.py | 4 ---- 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index 63196609cc..73f224039e 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -2080,6 +2080,8 @@ class NeonPageserver(PgProtocol): ".*query handler for 'pagestream.*failed: Timeline .* was not found", # postgres reconnects while timeline_delete doesn't hold the tenant's timelines.lock() ".*query handler for 'pagestream.*failed: Timeline .* is not active", # timeline delete in progress ".*task iteration took longer than the configured period.*", + # this is until #3501 + ".*Compaction failed, retrying in [^:]+: Cannot run compaction iteration on inactive tenant", ] def start( diff --git a/test_runner/regress/test_tenants.py b/test_runner/regress/test_tenants.py index 9e75396799..e56bb1b469 100644 --- a/test_runner/regress/test_tenants.py +++ b/test_runner/regress/test_tenants.py @@ -250,10 +250,6 @@ def test_pageserver_with_empty_tenants( env.pageserver.allowed_errors.append( ".*could not load tenant.*Failed to list timelines directory.*" ) - # this is until #3501 - env.pageserver.allowed_errors.append( - ".*Compaction failed, retrying in 2s: Cannot run compaction iteration on inactive tenant" - ) client = env.pageserver.http_client() From 43bf6d0a0f2695074f360bbc3b1deea2065f6321 Mon Sep 17 00:00:00 2001 From: Joonas Koivunen Date: Tue, 21 Feb 2023 21:09:31 +0200 Subject: [PATCH 3/6] calculate_logical_size: no longer use spawn_blocking (#3664) Calculation of logical size is now async because of layer downloads, so we shouldn't use spawn_blocking for it. Use of `spawn_blocking` exhausted resources which are needed by `tokio::io::copy` when copying from a stream to a file which lead to deadlock. Fixes: #3657 --- pageserver/src/tenant/timeline.rs | 14 ++++---------- 1 file changed, 4 insertions(+), 10 deletions(-) diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index 8bc02cd10a..176eb61ff3 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -1770,15 +1770,9 @@ impl Timeline { let calculation = async { let cancel = cancel.child_token(); let ctx = ctx.attached_child(); - tokio::task::spawn_blocking(move || { - // Run in a separate thread since this can do a lot of - // synchronous file IO without .await inbetween - // if there are no RemoteLayers that would require downloading. - let h = tokio::runtime::Handle::current(); - h.block_on(self_calculation.calculate_logical_size(init_lsn, cancel, &ctx)) - }) - .await - .context("Failed to spawn calculation result task")? + self_calculation + .calculate_logical_size(init_lsn, cancel, &ctx) + .await }; let timeline_state_cancellation = async { loop { @@ -1811,7 +1805,7 @@ impl Timeline { tokio::pin!(calculation); loop { tokio::select! { - res = &mut calculation => { return res } + res = &mut calculation => { return res } reason = timeline_state_cancellation => { debug!(reason = reason, "cancelling calculation"); cancel.cancel(); From a51b269f1502b69c5b2720d0a0e20b1e702ebd58 Mon Sep 17 00:00:00 2001 From: Joonas Koivunen Date: Tue, 21 Feb 2023 21:14:08 +0200 Subject: [PATCH 4/6] fix: hold permit until GetObject eof (#3663) previously we applied the ratelimiting only up to receiving the headers from s3, or somewhere near it. the commit adds an adapter which carries the permit until the AsyncRead has been disposed. fixes #3662. --- Cargo.lock | 1 + libs/remote_storage/Cargo.toml | 2 +- libs/remote_storage/src/s3_bucket.rs | 45 +++++++++++++++++++++++----- 3 files changed, 40 insertions(+), 8 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index d154b4eaea..dab3d12263 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -3054,6 +3054,7 @@ dependencies = [ "hyper", "metrics", "once_cell", + "pin-project-lite", "serde", "serde_json", "tempfile", diff --git a/libs/remote_storage/Cargo.toml b/libs/remote_storage/Cargo.toml index 4382fbac32..15812e8439 100644 --- a/libs/remote_storage/Cargo.toml +++ b/libs/remote_storage/Cargo.toml @@ -21,7 +21,7 @@ toml_edit.workspace = true tracing.workspace = true metrics.workspace = true utils.workspace = true - +pin-project-lite.workspace = true workspace_hack.workspace = true [dev-dependencies] diff --git a/libs/remote_storage/src/s3_bucket.rs b/libs/remote_storage/src/s3_bucket.rs index 18a2c5dedd..93f5e0596e 100644 --- a/libs/remote_storage/src/s3_bucket.rs +++ b/libs/remote_storage/src/s3_bucket.rs @@ -20,7 +20,10 @@ use aws_sdk_s3::{ }; use aws_smithy_http::body::SdkBody; use hyper::Body; -use tokio::{io, sync::Semaphore}; +use tokio::{ + io::{self, AsyncRead}, + sync::Semaphore, +}; use tokio_util::io::ReaderStream; use tracing::debug; @@ -102,7 +105,7 @@ pub struct S3Bucket { // Every request to S3 can be throttled or cancelled, if a certain number of requests per second is exceeded. // Same goes to IAM, which is queried before every S3 request, if enabled. IAM has even lower RPS threshold. // The helps to ensure we don't exceed the thresholds. - concurrency_limiter: Semaphore, + concurrency_limiter: Arc, } #[derive(Default)] @@ -162,7 +165,7 @@ impl S3Bucket { client, bucket_name: aws_config.bucket_name.clone(), prefix_in_bucket, - concurrency_limiter: Semaphore::new(aws_config.concurrency_limit.get()), + concurrency_limiter: Arc::new(Semaphore::new(aws_config.concurrency_limit.get())), }) } @@ -194,9 +197,10 @@ impl S3Bucket { } async fn download_object(&self, request: GetObjectRequest) -> Result { - let _guard = self + let permit = self .concurrency_limiter - .acquire() + .clone() + .acquire_owned() .await .context("Concurrency limiter semaphore got closed during S3 download") .map_err(DownloadError::Other)?; @@ -217,9 +221,10 @@ impl S3Bucket { let metadata = object_output.metadata().cloned().map(StorageMetadata); Ok(Download { metadata, - download_stream: Box::pin(io::BufReader::new( + download_stream: Box::pin(io::BufReader::new(RatelimitedAsyncRead::new( + permit, object_output.body.into_async_read(), - )), + ))), }) } Err(SdkError::ServiceError { @@ -240,6 +245,32 @@ impl S3Bucket { } } +pin_project_lite::pin_project! { + /// An `AsyncRead` adapter which carries a permit for the lifetime of the value. + struct RatelimitedAsyncRead { + permit: tokio::sync::OwnedSemaphorePermit, + #[pin] + inner: S, + } +} + +impl RatelimitedAsyncRead { + fn new(permit: tokio::sync::OwnedSemaphorePermit, inner: S) -> Self { + RatelimitedAsyncRead { permit, inner } + } +} + +impl AsyncRead for RatelimitedAsyncRead { + fn poll_read( + self: std::pin::Pin<&mut Self>, + cx: &mut std::task::Context<'_>, + buf: &mut io::ReadBuf<'_>, + ) -> std::task::Poll> { + let this = self.project(); + this.inner.poll_read(cx, buf) + } +} + #[async_trait::async_trait] impl RemoteStorage for S3Bucket { async fn list(&self) -> anyhow::Result> { From 38cd90dd0c85ff6286e689cae33a95fe095ffca0 Mon Sep 17 00:00:00 2001 From: Sergey Melnikov Date: Tue, 21 Feb 2023 21:11:52 +0100 Subject: [PATCH 5/6] Add -v to ansible invocations (#3670) To get more debug output on failures --- .github/workflows/deploy-dev.yml | 2 +- .github/workflows/deploy-prod.yml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/deploy-dev.yml b/.github/workflows/deploy-dev.yml index 409517bf63..b080a29f7c 100644 --- a/.github/workflows/deploy-dev.yml +++ b/.github/workflows/deploy-dev.yml @@ -67,7 +67,7 @@ jobs: ./get_binaries.sh ansible-galaxy collection install sivel.toiletwater - ansible-playbook deploy.yaml -i staging.${{ matrix.target_region }}.hosts.yaml -e @ssm_config -e CONSOLE_API_TOKEN=${{ secrets.NEON_STAGING_API_KEY }} -e SENTRY_URL_PAGESERVER=${{ secrets.SENTRY_URL_PAGESERVER }} -e SENTRY_URL_SAFEKEEPER=${{ secrets.SENTRY_URL_SAFEKEEPER }} + ansible-playbook -v deploy.yaml -i staging.${{ matrix.target_region }}.hosts.yaml -e @ssm_config -e CONSOLE_API_TOKEN=${{ secrets.NEON_STAGING_API_KEY }} -e SENTRY_URL_PAGESERVER=${{ secrets.SENTRY_URL_PAGESERVER }} -e SENTRY_URL_SAFEKEEPER=${{ secrets.SENTRY_URL_SAFEKEEPER }} rm -f neon_install.tar.gz .neon_current_version - name: Cleanup ansible folder diff --git a/.github/workflows/deploy-prod.yml b/.github/workflows/deploy-prod.yml index 540d187274..6096ac8ab9 100644 --- a/.github/workflows/deploy-prod.yml +++ b/.github/workflows/deploy-prod.yml @@ -68,7 +68,7 @@ jobs: ./get_binaries.sh ansible-galaxy collection install sivel.toiletwater - ansible-playbook deploy.yaml -i prod.${{ matrix.target_region }}.hosts.yaml -e @ssm_config -e CONSOLE_API_TOKEN=${{ secrets.NEON_PRODUCTION_API_KEY }} -e SENTRY_URL_PAGESERVER=${{ secrets.SENTRY_URL_PAGESERVER }} -e SENTRY_URL_SAFEKEEPER=${{ secrets.SENTRY_URL_SAFEKEEPER }} + ansible-playbook -v deploy.yaml -i prod.${{ matrix.target_region }}.hosts.yaml -e @ssm_config -e CONSOLE_API_TOKEN=${{ secrets.NEON_PRODUCTION_API_KEY }} -e SENTRY_URL_PAGESERVER=${{ secrets.SENTRY_URL_PAGESERVER }} -e SENTRY_URL_SAFEKEEPER=${{ secrets.SENTRY_URL_SAFEKEEPER }} rm -f neon_install.tar.gz .neon_current_version deploy-proxy-prod-new: From 46cc8b7982b270796d6266861801b2e466319968 Mon Sep 17 00:00:00 2001 From: Arthur Petukhovsky Date: Wed, 22 Feb 2023 12:55:41 +0300 Subject: [PATCH 6/6] Remove safekeeper-1.ap-southeast-1.aws.neon.tech (#3671) We migrated all timelines to `safekeeper-3.ap-southeast-1.aws.neon.tech`, now old instance can be removed. --- .github/ansible/prod.ap-southeast-1.hosts.yaml | 2 -- 1 file changed, 2 deletions(-) diff --git a/.github/ansible/prod.ap-southeast-1.hosts.yaml b/.github/ansible/prod.ap-southeast-1.hosts.yaml index 13b44f4052..8ccb67b04a 100644 --- a/.github/ansible/prod.ap-southeast-1.hosts.yaml +++ b/.github/ansible/prod.ap-southeast-1.hosts.yaml @@ -32,8 +32,6 @@ storage: hosts: safekeeper-0.ap-southeast-1.aws.neon.tech: ansible_host: i-0d6f1dc5161eef894 - safekeeper-1.ap-southeast-1.aws.neon.tech: - ansible_host: i-0e338adda8eb2d19f safekeeper-2.ap-southeast-1.aws.neon.tech: ansible_host: i-04fb63634e4679eb9 safekeeper-3.ap-southeast-1.aws.neon.tech: