From 768a580373f1a60fff77a8e385fef040b9c261ef Mon Sep 17 00:00:00 2001 From: Vlad Lazar Date: Tue, 29 Apr 2025 15:07:23 +0100 Subject: [PATCH 01/40] pageserver: add not modified since lsn to get page span (#11774) It's useful when debugging. --- pageserver/src/page_service.rs | 20 ++++++++++++++++++-- 1 file changed, 18 insertions(+), 2 deletions(-) diff --git a/pageserver/src/page_service.rs b/pageserver/src/page_service.rs index d1a210a786..0ce1a99681 100644 --- a/pageserver/src/page_service.rs +++ b/pageserver/src/page_service.rs @@ -1035,10 +1035,25 @@ impl PageServerHandler { // avoid a somewhat costly Span::record() by constructing the entire span in one go. macro_rules! mkspan { (before shard routing) => {{ - tracing::info_span!(parent: &parent_span, "handle_get_page_request", rel = %req.rel, blkno = %req.blkno, req_lsn = %req.hdr.request_lsn) + tracing::info_span!( + parent: &parent_span, + "handle_get_page_request", + rel = %req.rel, + blkno = %req.blkno, + req_lsn = %req.hdr.request_lsn, + not_modified_since_lsn = %req.hdr.not_modified_since + ) }}; ($shard_id:expr) => {{ - tracing::info_span!(parent: &parent_span, "handle_get_page_request", rel = %req.rel, blkno = %req.blkno, req_lsn = %req.hdr.request_lsn, shard_id = %$shard_id) + tracing::info_span!( + parent: &parent_span, + "handle_get_page_request", + rel = %req.rel, + blkno = %req.blkno, + req_lsn = %req.hdr.request_lsn, + not_modified_since_lsn = %req.hdr.not_modified_since, + shard_id = %$shard_id + ) }}; } @@ -1102,6 +1117,7 @@ impl PageServerHandler { shard_id = %shard.get_shard_identity().shard_slug(), timeline_id = %timeline_id, lsn = %req.hdr.request_lsn, + not_modified_since_lsn = %req.hdr.not_modified_since, request_id = %req.hdr.reqid, key = %key, ) From a2adc7dbd380fadaa60255eac2c070243a01a7fe Mon Sep 17 00:00:00 2001 From: John Spray Date: Tue, 29 Apr 2025 16:31:52 +0100 Subject: [PATCH 02/40] storcon: avoid multiple initdbs when shard 0 has stale locations (#11760) ## Problem In #11727 I overlooked the case of multiple attached locations for shard 0. I misread the code and thought `create_one` acts on one location, but it actually acts on one _shard_, which is potentially multiple locations. This was not a regression, but it meant that the fix was incomplete. ## Summary of changes - In `create_one`, when updating shard zero, have any "other" locations use the initdb from shard 0 --- storage_controller/src/service.rs | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/storage_controller/src/service.rs b/storage_controller/src/service.rs index 50ce559cc0..72379f0810 100644 --- a/storage_controller/src/service.rs +++ b/storage_controller/src/service.rs @@ -3663,7 +3663,7 @@ impl Service { locations: ShardMutationLocations, http_client: reqwest::Client, jwt: Option, - create_req: TimelineCreateRequest, + mut create_req: TimelineCreateRequest, ) -> Result { let latest = locations.latest.node; @@ -3682,6 +3682,15 @@ impl Service { .await .map_err(|e| passthrough_api_error(&latest, e))?; + // If we are going to create the timeline on some stale locations for shard 0, then ask them to re-use + // the initdb generated by the latest location, rather than generating their own. This avoids racing uploads + // of initdb to S3 which might not be binary-identical if different pageservers have different postgres binaries. + if tenant_shard_id.is_shard_zero() { + if let models::TimelineCreateRequestMode::Bootstrap { existing_initdb_timeline_id, .. 
} = &mut create_req.mode { + *existing_initdb_timeline_id = Some(create_req.new_timeline_id); + } + } + // We propagate timeline creations to all attached locations such that a compute // for the new timeline is able to start regardless of the current state of the // tenant shard reconciliation. From a08c1a23eb378be1b5b22a19737b0c7f855d30a2 Mon Sep 17 00:00:00 2001 From: Elizabeth Murray <52375559+bizwark@users.noreply.github.com> Date: Tue, 29 Apr 2025 09:50:18 -0700 Subject: [PATCH 03/40] Upgrade the pgrag version in the compute Dockerfile. (#11687) Update the compute Dockerfile to use a new version of pgrag. The new version of pgrag uses the latest pgrx, and has a fix that terminates background workers on postmaster exit. --- compute/compute-node.Dockerfile | 23 +++++++++++++++++------ 1 file changed, 17 insertions(+), 6 deletions(-) diff --git a/compute/compute-node.Dockerfile b/compute/compute-node.Dockerfile index b9299eee90..267940a405 100644 --- a/compute/compute-node.Dockerfile +++ b/compute/compute-node.Dockerfile @@ -1084,7 +1084,18 @@ RUN cargo install --locked --version 0.12.9 cargo-pgrx && \ /bin/bash -c 'cargo pgrx init --pg${PG_VERSION:1}=/usr/local/pgsql/bin/pg_config' USER root +######################################################################################### +# +# Layer "rust extensions pgrx14" +# +######################################################################################### +FROM pg-build-nonroot-with-cargo AS rust-extensions-build-pgrx14 +ARG PG_VERSION +RUN cargo install --locked --version 0.14.1 cargo-pgrx && \ + /bin/bash -c 'cargo pgrx init --pg${PG_VERSION:1}=/usr/local/pgsql/bin/pg_config' + +USER root ######################################################################################### # # Layers "pg-onnx-build" and "pgrag-build" @@ -1100,11 +1111,11 @@ RUN wget https://github.com/microsoft/onnxruntime/archive/refs/tags/v1.18.1.tar. mkdir onnxruntime-src && cd onnxruntime-src && tar xzf ../onnxruntime.tar.gz --strip-components=1 -C . && \ echo "#nothing to test here" > neon-test.sh -RUN wget https://github.com/neondatabase-labs/pgrag/archive/refs/tags/v0.0.0.tar.gz -O pgrag.tar.gz && \ - echo "2cbe394c1e74fc8bcad9b52d5fbbfb783aef834ca3ce44626cfd770573700bb4 pgrag.tar.gz" | sha256sum --check && \ +RUN wget https://github.com/neondatabase-labs/pgrag/archive/refs/tags/v0.1.1.tar.gz -O pgrag.tar.gz && \ + echo "087b2ecd11ba307dc968042ef2e9e43dc04d9ba60e8306e882c407bbe1350a50 pgrag.tar.gz" | sha256sum --check && \ mkdir pgrag-src && cd pgrag-src && tar xzf ../pgrag.tar.gz --strip-components=1 -C . -FROM rust-extensions-build-pgrx12 AS pgrag-build +FROM rust-extensions-build-pgrx14 AS pgrag-build COPY --from=pgrag-src /ext-src/ /ext-src/ # Install build-time dependencies @@ -1124,19 +1135,19 @@ RUN . 
venv/bin/activate && \ WORKDIR /ext-src/pgrag-src RUN cd exts/rag && \ - sed -i 's/pgrx = "0.12.6"/pgrx = { version = "0.12.9", features = [ "unsafe-postgres" ] }/g' Cargo.toml && \ + sed -i 's/pgrx = "0.14.1"/pgrx = { version = "0.14.1", features = [ "unsafe-postgres" ] }/g' Cargo.toml && \ cargo pgrx install --release && \ echo "trusted = true" >> /usr/local/pgsql/share/extension/rag.control RUN cd exts/rag_bge_small_en_v15 && \ - sed -i 's/pgrx = "0.12.6"/pgrx = { version = "0.12.9", features = [ "unsafe-postgres" ] }/g' Cargo.toml && \ + sed -i 's/pgrx = "0.14.1"/pgrx = { version = "0.14.1", features = [ "unsafe-postgres" ] }/g' Cargo.toml && \ ORT_LIB_LOCATION=/ext-src/onnxruntime-src/build/Linux \ REMOTE_ONNX_URL=http://pg-ext-s3-gateway/pgrag-data/bge_small_en_v15.onnx \ cargo pgrx install --release --features remote_onnx && \ echo "trusted = true" >> /usr/local/pgsql/share/extension/rag_bge_small_en_v15.control RUN cd exts/rag_jina_reranker_v1_tiny_en && \ - sed -i 's/pgrx = "0.12.6"/pgrx = { version = "0.12.9", features = [ "unsafe-postgres" ] }/g' Cargo.toml && \ + sed -i 's/pgrx = "0.14.1"/pgrx = { version = "0.14.1", features = [ "unsafe-postgres" ] }/g' Cargo.toml && \ ORT_LIB_LOCATION=/ext-src/onnxruntime-src/build/Linux \ REMOTE_ONNX_URL=http://pg-ext-s3-gateway/pgrag-data/jina_reranker_v1_tiny_en.onnx \ cargo pgrx install --release --features remote_onnx && \ From 1d06172d59caa81e36dce0a17610d80067be4c54 Mon Sep 17 00:00:00 2001 From: "devin-ai-integration[bot]" <158243242+devin-ai-integration[bot]@users.noreply.github.com> Date: Tue, 29 Apr 2025 19:34:56 +0100 Subject: [PATCH 04/40] pageserver: remove resident size from billing metrics (#11699) This is a rebase of PR #10739 by @henryliu2014 on the current main branch. ## Problem pageserver: remove resident size from billing metrics Fixes #10388 ## Summary of changes The following changes have been made to remove resident size from billing metrics: * removed the metric "resident_size" and related codes in consumption_metrics/metrics.rs * removed the item of the description of metric "resident_size" in consumption_metrics.md * refactored the metric "resident_size" related test case Requested by: John Spray (john@neon.tech) --------- Co-authored-by: liuheqing Co-authored-by: Devin AI <158243242+devin-ai-integration[bot]@users.noreply.github.com> Co-authored-by: John Spray --- docs/consumption_metrics.md | 5 --- pageserver/src/consumption_metrics/metrics.rs | 42 ++----------------- .../src/consumption_metrics/metrics/tests.rs | 10 +---- pageserver/src/consumption_metrics/upload.rs | 8 +--- .../test_pageserver_metric_collection.py | 1 - 5 files changed, 7 insertions(+), 59 deletions(-) diff --git a/docs/consumption_metrics.md b/docs/consumption_metrics.md index 6bcd28ab10..eb211af646 100644 --- a/docs/consumption_metrics.md +++ b/docs/consumption_metrics.md @@ -38,11 +38,6 @@ Currently, the following metrics are collected: Amount of WAL produced , by a timeline, i.e. last_record_lsn This is an absolute, per-timeline metric. -- `resident_size` - -Size of all the layer files in the tenant's directory on disk on the pageserver. -This is an absolute, per-tenant metric. - - `remote_storage_size` Size of the remote storage (S3) directory. 
diff --git a/pageserver/src/consumption_metrics/metrics.rs b/pageserver/src/consumption_metrics/metrics.rs index 08ab69f349..acdf514101 100644 --- a/pageserver/src/consumption_metrics/metrics.rs +++ b/pageserver/src/consumption_metrics/metrics.rs @@ -30,9 +30,6 @@ pub(super) enum Name { /// Tenant remote size #[serde(rename = "remote_storage_size")] RemoteSize, - /// Tenant resident size - #[serde(rename = "resident_size")] - ResidentSize, /// Tenant synthetic size #[serde(rename = "synthetic_storage_size")] SyntheticSize, @@ -187,18 +184,6 @@ impl MetricsKey { .absolute_values() } - /// Sum of [`Timeline::resident_physical_size`] for each `Tenant`. - /// - /// [`Timeline::resident_physical_size`]: crate::tenant::Timeline::resident_physical_size - const fn resident_size(tenant_id: TenantId) -> AbsoluteValueFactory { - MetricsKey { - tenant_id, - timeline_id: None, - metric: Name::ResidentSize, - } - .absolute_values() - } - /// [`TenantShard::cached_synthetic_size`] as refreshed by [`calculate_synthetic_size_worker`]. /// /// [`TenantShard::cached_synthetic_size`]: crate::tenant::TenantShard::cached_synthetic_size @@ -261,10 +246,7 @@ where let mut tenants = std::pin::pin!(tenants); while let Some((tenant_id, tenant)) = tenants.next().await { - let mut tenant_resident_size = 0; - let timelines = tenant.list_timelines(); - let timelines_len = timelines.len(); for timeline in timelines { let timeline_id = timeline.timeline_id; @@ -287,16 +269,9 @@ where continue; } } - - tenant_resident_size += timeline.resident_physical_size(); } - if timelines_len == 0 { - // Force set it to 1 byte to avoid not being reported -- all timelines are offloaded. - tenant_resident_size = 1; - } - - let snap = TenantSnapshot::collect(&tenant, tenant_resident_size); + let snap = TenantSnapshot::collect(&tenant); snap.to_metrics(tenant_id, Utc::now(), cache, &mut current_metrics); } @@ -305,19 +280,14 @@ where /// In-between abstraction to allow testing metrics without actual Tenants. struct TenantSnapshot { - resident_size: u64, remote_size: u64, synthetic_size: u64, } impl TenantSnapshot { /// Collect tenant status to have metrics created out of it. - /// - /// `resident_size` is calculated of the timelines we had access to for other metrics, so we - /// cannot just list timelines here. 
- fn collect(t: &Arc, resident_size: u64) -> Self { + fn collect(t: &Arc) -> Self { TenantSnapshot { - resident_size, remote_size: t.remote_size(), // Note that this metric is calculated in a separate bgworker // Here we only use cached value, which may lag behind the real latest one @@ -334,8 +304,6 @@ impl TenantSnapshot { ) { let remote_size = MetricsKey::remote_storage_size(tenant_id).at(now, self.remote_size); - let resident_size = MetricsKey::resident_size(tenant_id).at(now, self.resident_size); - let synthetic_size = { let factory = MetricsKey::synthetic_size(tenant_id); let mut synthetic_size = self.synthetic_size; @@ -355,11 +323,7 @@ impl TenantSnapshot { } }; - metrics.extend( - [Some(remote_size), Some(resident_size), synthetic_size] - .into_iter() - .flatten(), - ); + metrics.extend([Some(remote_size), synthetic_size].into_iter().flatten()); } } diff --git a/pageserver/src/consumption_metrics/metrics/tests.rs b/pageserver/src/consumption_metrics/metrics/tests.rs index 52b4fb8680..5cfb361e40 100644 --- a/pageserver/src/consumption_metrics/metrics/tests.rs +++ b/pageserver/src/consumption_metrics/metrics/tests.rs @@ -224,7 +224,6 @@ fn post_restart_synthetic_size_uses_cached_if_available() { let tenant_id = TenantId::generate(); let ts = TenantSnapshot { - resident_size: 1000, remote_size: 1000, // not yet calculated synthetic_size: 0, @@ -245,7 +244,6 @@ fn post_restart_synthetic_size_uses_cached_if_available() { metrics, &[ MetricsKey::remote_storage_size(tenant_id).at(now, 1000), - MetricsKey::resident_size(tenant_id).at(now, 1000), MetricsKey::synthetic_size(tenant_id).at(now, 1000), ] ); @@ -256,7 +254,6 @@ fn post_restart_synthetic_size_is_not_sent_when_not_cached() { let tenant_id = TenantId::generate(); let ts = TenantSnapshot { - resident_size: 1000, remote_size: 1000, // not yet calculated synthetic_size: 0, @@ -274,7 +271,6 @@ fn post_restart_synthetic_size_is_not_sent_when_not_cached() { metrics, &[ MetricsKey::remote_storage_size(tenant_id).at(now, 1000), - MetricsKey::resident_size(tenant_id).at(now, 1000), // no synthetic size here ] ); @@ -295,14 +291,13 @@ pub(crate) const fn metric_examples_old( timeline_id: TimelineId, now: DateTime, before: DateTime, -) -> [RawMetric; 6] { +) -> [RawMetric; 5] { [ MetricsKey::written_size(tenant_id, timeline_id).at_old_format(now, 0), MetricsKey::written_size_delta(tenant_id, timeline_id) .from_until_old_format(before, now, 0), MetricsKey::timeline_logical_size(tenant_id, timeline_id).at_old_format(now, 0), MetricsKey::remote_storage_size(tenant_id).at_old_format(now, 0), - MetricsKey::resident_size(tenant_id).at_old_format(now, 0), MetricsKey::synthetic_size(tenant_id).at_old_format(now, 1), ] } @@ -312,13 +307,12 @@ pub(crate) const fn metric_examples( timeline_id: TimelineId, now: DateTime, before: DateTime, -) -> [NewRawMetric; 6] { +) -> [NewRawMetric; 5] { [ MetricsKey::written_size(tenant_id, timeline_id).at(now, 0), MetricsKey::written_size_delta(tenant_id, timeline_id).from_until(before, now, 0), MetricsKey::timeline_logical_size(tenant_id, timeline_id).at(now, 0), MetricsKey::remote_storage_size(tenant_id).at(now, 0), - MetricsKey::resident_size(tenant_id).at(now, 0), MetricsKey::synthetic_size(tenant_id).at(now, 1), ] } diff --git a/pageserver/src/consumption_metrics/upload.rs b/pageserver/src/consumption_metrics/upload.rs index 59e0145a5b..19c5aec5b3 100644 --- a/pageserver/src/consumption_metrics/upload.rs +++ b/pageserver/src/consumption_metrics/upload.rs @@ -521,10 +521,6 @@ mod tests { line!(), 
r#"{"type":"absolute","time":"2023-09-15T00:00:00.123456789Z","metric":"remote_storage_size","idempotency_key":"2023-09-15 00:00:00.123456789 UTC-1-0000","value":0,"tenant_id":"00000000000000000000000000000000"}"#, ), - ( - line!(), - r#"{"type":"absolute","time":"2023-09-15T00:00:00.123456789Z","metric":"resident_size","idempotency_key":"2023-09-15 00:00:00.123456789 UTC-1-0000","value":0,"tenant_id":"00000000000000000000000000000000"}"#, - ), ( line!(), r#"{"type":"absolute","time":"2023-09-15T00:00:00.123456789Z","metric":"synthetic_storage_size","idempotency_key":"2023-09-15 00:00:00.123456789 UTC-1-0000","value":1,"tenant_id":"00000000000000000000000000000000"}"#, @@ -564,7 +560,7 @@ mod tests { assert_eq!(upgraded_samples, new_samples); } - fn metric_samples_old() -> [RawMetric; 6] { + fn metric_samples_old() -> [RawMetric; 5] { let tenant_id = TenantId::from_array([0; 16]); let timeline_id = TimelineId::from_array([0xff; 16]); @@ -576,7 +572,7 @@ mod tests { super::super::metrics::metric_examples_old(tenant_id, timeline_id, now, before) } - fn metric_samples() -> [NewRawMetric; 6] { + fn metric_samples() -> [NewRawMetric; 5] { let tenant_id = TenantId::from_array([0; 16]); let timeline_id = TimelineId::from_array([0xff; 16]); diff --git a/test_runner/regress/test_pageserver_metric_collection.py b/test_runner/regress/test_pageserver_metric_collection.py index acec0ba44a..ffde08a73f 100644 --- a/test_runner/regress/test_pageserver_metric_collection.py +++ b/test_runner/regress/test_pageserver_metric_collection.py @@ -506,7 +506,6 @@ class SyntheticSizeVerifier: PER_METRIC_VERIFIERS = { "remote_storage_size": CannotVerifyAnything, - "resident_size": CannotVerifyAnything, "written_size": WrittenDataVerifier, "written_data_bytes_delta": WrittenDataDeltaVerifier, "timeline_logical_size": CannotVerifyAnything, From b48404952d9658f24f88441b43fb4df7d222b159 Mon Sep 17 00:00:00 2001 From: Em Sharnoff Date: Wed, 30 Apr 2025 04:32:25 -0700 Subject: [PATCH 05/40] Bump vm-builder: v0.42.2 -> v0.46.0 (#11782) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Bumped to pick up the changes from neondatabase/autoscaling#1366 — specifically including `uname` in the logs. 
Other changes included: * neondatabase/autoscaling#1301 * neondatabase/autoscaling#1296 --- .github/workflows/build_and_test.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index 6c025ad2a9..18bec1b461 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -824,7 +824,7 @@ jobs: - pg: v17 debian: bookworm env: - VM_BUILDER_VERSION: v0.42.2 + VM_BUILDER_VERSION: v0.46.0 steps: - name: Harden the runner (Audit all outbound calls) From 8da4ec9740c8f292170babecfdc07148a60cd9f9 Mon Sep 17 00:00:00 2001 From: Mikhail Kot Date: Wed, 30 Apr 2025 13:01:41 +0100 Subject: [PATCH 06/40] Postgres metrics for stuck getpage requests (#11710) https://github.com/neondatabase/neon/issues/10327 Resolves: #11720 New metrics: - `compute_getpage_stuck_requests_total` - `compute_getpage_max_inflight_stuck_time_ms` --- compute/etc/neon_collector.jsonnet | 2 ++ ...pute_getpage_max_inflight_stuck_time_ms.libsonnet | 9 +++++++++ .../compute_getpage_stuck_requests_total.libsonnet | 9 +++++++++ compute/etc/sql_exporter/neon_perf_counters.sql | 2 ++ pgxn/neon/libpagestore.c | 6 ++++++ pgxn/neon/neon_perf_counters.c | 9 ++++++++- pgxn/neon/neon_perf_counters.h | 12 ++++++++++++ 7 files changed, 48 insertions(+), 1 deletion(-) create mode 100644 compute/etc/sql_exporter/compute_getpage_max_inflight_stuck_time_ms.libsonnet create mode 100644 compute/etc/sql_exporter/compute_getpage_stuck_requests_total.libsonnet diff --git a/compute/etc/neon_collector.jsonnet b/compute/etc/neon_collector.jsonnet index 449e1199d0..e64d907fe4 100644 --- a/compute/etc/neon_collector.jsonnet +++ b/compute/etc/neon_collector.jsonnet @@ -23,6 +23,8 @@ import 'sql_exporter/getpage_prefetch_requests_total.libsonnet', import 'sql_exporter/getpage_prefetches_buffered.libsonnet', import 'sql_exporter/getpage_sync_requests_total.libsonnet', + import 'sql_exporter/compute_getpage_stuck_requests_total.libsonnet', + import 'sql_exporter/compute_getpage_max_inflight_stuck_time_ms.libsonnet', import 'sql_exporter/getpage_wait_seconds_bucket.libsonnet', import 'sql_exporter/getpage_wait_seconds_count.libsonnet', import 'sql_exporter/getpage_wait_seconds_sum.libsonnet', diff --git a/compute/etc/sql_exporter/compute_getpage_max_inflight_stuck_time_ms.libsonnet b/compute/etc/sql_exporter/compute_getpage_max_inflight_stuck_time_ms.libsonnet new file mode 100644 index 0000000000..bc1100c832 --- /dev/null +++ b/compute/etc/sql_exporter/compute_getpage_max_inflight_stuck_time_ms.libsonnet @@ -0,0 +1,9 @@ +{ + metric_name: 'compute_getpage_max_inflight_stuck_time_ms', + type: 'gauge', + help: 'Max wait time for stuck requests among all backends. 
Includes only active stuck requests, terminated or disconnected ones are not accounted for', + values: [ + 'compute_getpage_max_inflight_stuck_time_ms', + ], + query_ref: 'neon_perf_counters', +} diff --git a/compute/etc/sql_exporter/compute_getpage_stuck_requests_total.libsonnet b/compute/etc/sql_exporter/compute_getpage_stuck_requests_total.libsonnet new file mode 100644 index 0000000000..5f72f43254 --- /dev/null +++ b/compute/etc/sql_exporter/compute_getpage_stuck_requests_total.libsonnet @@ -0,0 +1,9 @@ +{ + metric_name: 'compute_getpage_stuck_requests_total', + type: 'counter', + help: 'Total number of Getpage requests left without an answer for more than pageserver_response_log_timeout but less than pageserver_response_disconnect_timeout', + values: [ + 'compute_getpage_stuck_requests_total', + ], + query_ref: 'neon_perf_counters', +} diff --git a/compute/etc/sql_exporter/neon_perf_counters.sql b/compute/etc/sql_exporter/neon_perf_counters.sql index 4a36f3bf2f..39a9d03412 100644 --- a/compute/etc/sql_exporter/neon_perf_counters.sql +++ b/compute/etc/sql_exporter/neon_perf_counters.sql @@ -9,6 +9,8 @@ SELECT d.* FROM pg_catalog.jsonb_to_record((SELECT jb FROM c)) AS d( getpage_wait_seconds_sum numeric, getpage_prefetch_requests_total numeric, getpage_sync_requests_total numeric, + compute_getpage_stuck_requests_total numeric, + compute_getpage_max_inflight_stuck_time_ms numeric, getpage_prefetch_misses_total numeric, getpage_prefetch_discards_total numeric, getpage_prefetches_buffered numeric, diff --git a/pgxn/neon/libpagestore.c b/pgxn/neon/libpagestore.c index 5287c12a84..e758841beb 100644 --- a/pgxn/neon/libpagestore.c +++ b/pgxn/neon/libpagestore.c @@ -877,6 +877,7 @@ retry: int port; int sndbuf; int recvbuf; + uint64* max_wait; get_local_port(PQsocket(pageserver_conn), &port); get_socket_stats(PQsocket(pageserver_conn), &sndbuf, &recvbuf); @@ -887,7 +888,10 @@ retry: shard->nrequests_sent, shard->nresponses_received, port, sndbuf, recvbuf, pageserver_conn->inStart, pageserver_conn->inEnd); shard->receive_last_log_time = now; + MyNeonCounters->compute_getpage_stuck_requests_total += !shard->receive_logged; shard->receive_logged = true; + max_wait = &MyNeonCounters->compute_getpage_max_inflight_stuck_time_ms; + *max_wait = Max(*max_wait, INSTR_TIME_GET_MILLISEC(since_start)); } /* @@ -910,6 +914,7 @@ retry: get_local_port(PQsocket(pageserver_conn), &port); neon_shard_log(shard_no, LOG, "no response from pageserver for %0.3f s, disconnecting (socket port=%d)", INSTR_TIME_GET_DOUBLE(since_start), port); + MyNeonCounters->compute_getpage_max_inflight_stuck_time_ms = 0; pageserver_disconnect(shard_no); return -1; } @@ -933,6 +938,7 @@ retry: INSTR_TIME_SET_ZERO(shard->receive_start_time); INSTR_TIME_SET_ZERO(shard->receive_last_log_time); shard->receive_logged = false; + MyNeonCounters->compute_getpage_max_inflight_stuck_time_ms = 0; return ret; } diff --git a/pgxn/neon/neon_perf_counters.c b/pgxn/neon/neon_perf_counters.c index 05db187076..c77d99d636 100644 --- a/pgxn/neon/neon_perf_counters.c +++ b/pgxn/neon/neon_perf_counters.c @@ -148,7 +148,7 @@ histogram_to_metrics(IOHistogram histogram, static metric_t * neon_perf_counters_to_metrics(neon_per_backend_counters *counters) { -#define NUM_METRICS ((2 + NUM_IO_WAIT_BUCKETS) * 3 + 10) +#define NUM_METRICS ((2 + NUM_IO_WAIT_BUCKETS) * 3 + 12) metric_t *metrics = palloc((NUM_METRICS + 1) * sizeof(metric_t)); int i = 0; @@ -166,6 +166,8 @@ neon_perf_counters_to_metrics(neon_per_backend_counters *counters) 
APPEND_METRIC(getpage_prefetch_requests_total); APPEND_METRIC(getpage_sync_requests_total); + APPEND_METRIC(compute_getpage_stuck_requests_total); + APPEND_METRIC(compute_getpage_max_inflight_stuck_time_ms); APPEND_METRIC(getpage_prefetch_misses_total); APPEND_METRIC(getpage_prefetch_discards_total); APPEND_METRIC(pageserver_requests_sent_total); @@ -294,6 +296,11 @@ neon_get_perf_counters(PG_FUNCTION_ARGS) totals.file_cache_hits_total += counters->file_cache_hits_total; histogram_merge_into(&totals.file_cache_read_hist, &counters->file_cache_read_hist); histogram_merge_into(&totals.file_cache_write_hist, &counters->file_cache_write_hist); + + totals.compute_getpage_stuck_requests_total += counters->compute_getpage_stuck_requests_total; + totals.compute_getpage_max_inflight_stuck_time_ms = Max( + totals.compute_getpage_max_inflight_stuck_time_ms, + counters->compute_getpage_max_inflight_stuck_time_ms); } metrics = neon_perf_counters_to_metrics(&totals); diff --git a/pgxn/neon/neon_perf_counters.h b/pgxn/neon/neon_perf_counters.h index 5f5330bb69..10cf094d4a 100644 --- a/pgxn/neon/neon_perf_counters.h +++ b/pgxn/neon/neon_perf_counters.h @@ -57,6 +57,18 @@ typedef struct uint64 getpage_prefetch_requests_total; uint64 getpage_sync_requests_total; + /* + * Total number of Getpage requests left without an answer for more than + * pageserver_response_log_timeout but less than pageserver_response_disconnect_timeout + */ + uint64 compute_getpage_stuck_requests_total; + + /* + * Longest waiting time for active stuck requests. If a stuck request gets a + * response or disconnects, this metric is updated + */ + uint64 compute_getpage_max_inflight_stuck_time_ms; + /* * Total number of readahead misses; consisting of either prefetches that * don't satisfy the LSN bounds, or cases where no readahead was issued From 60f63c076f9b6b362600d65e01f3f1f8a0f4a5dd Mon Sep 17 00:00:00 2001 From: Arseny Sher Date: Wed, 30 Apr 2025 15:23:20 +0300 Subject: [PATCH 07/40] Make safekeeper proto version 3 default (#11518) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ## Problem We have been running compute <-> sk protocol version 3 for a while on staging with no issues observed, and want to fully migrate to it eventually. ## Summary of changes Let's make v3 the default. ref https://github.com/neondatabase/neon/issues/10326 --------- Co-authored-by: Arpad Müller --- pgxn/neon/walproposer_pg.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pgxn/neon/walproposer_pg.c b/pgxn/neon/walproposer_pg.c index a061639815..17582405db 100644 --- a/pgxn/neon/walproposer_pg.c +++ b/pgxn/neon/walproposer_pg.c @@ -63,7 +63,7 @@ char *wal_acceptors_list = ""; int wal_acceptor_reconnect_timeout = 1000; int wal_acceptor_connection_timeout = 10000; -int safekeeper_proto_version = 2; +int safekeeper_proto_version = 3; /* Set to true in the walproposer bgw. 
*/ static bool am_walproposer; @@ -228,7 +228,7 @@ nwp_register_gucs(void) "Version of compute <-> safekeeper protocol.", "Used while migrating from 2 to 3.", &safekeeper_proto_version, - 2, 0, INT_MAX, + 3, 0, INT_MAX, PGC_POSTMASTER, 0, NULL, NULL, NULL); From 1d68577fbd3c08496dbe7dd716dd5562bc51ca7a Mon Sep 17 00:00:00 2001 From: Konstantin Knizhnik Date: Wed, 30 Apr 2025 15:44:59 +0300 Subject: [PATCH 08/40] Check target slot state in prefetch_wait_for (#11779) ## Problem See https://neondb.slack.com/archives/C04DGM6SMTM/p1745599814030679 Assume the following scenario: prefetch_wait_for is doing `CHECK_FOR_INTERRUPTS` which tries to load prefetch responses. In case of error is calls pageserver_disconnect which aborts all in-flight requests. But such failure is not detected by `prefetch_wait_for` which returns true. As a result `communicator_read_at_lsnv` assumes that slot is received, but as far as asserts are disables at prod, it is not actually checked. Then it tries to interpret response and ... *SIGSEGV* ## Summary of changes Check target slot state in `prefetch_wait_for`. Resolves https://github.com/neondatabase/cloud/issues/28258 Co-authored-by: Konstantin Knizhnik --- pgxn/neon/communicator.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/pgxn/neon/communicator.c b/pgxn/neon/communicator.c index 61bb3206e7..818a149499 100644 --- a/pgxn/neon/communicator.c +++ b/pgxn/neon/communicator.c @@ -687,8 +687,14 @@ prefetch_wait_for(uint64 ring_index) END_PREFETCH_RECEIVE_WORK(); CHECK_FOR_INTERRUPTS(); } - - return result; + if (result) + { + /* Check that slot is actually received (srver can be disconnected in prefetch_pump_state called from CHECK_FOR_INTERRUPTS */ + PrefetchRequest *slot = GetPrfSlot(ring_index); + return slot->status == PRFS_RECEIVED; + } + return false; +; } /* From 6b4b8e0d8be55c7224cbf113fb1717179b53f0e9 Mon Sep 17 00:00:00 2001 From: "Alex Chi Z." <4198311+skyzh@users.noreply.github.com> Date: Wed, 30 Apr 2025 11:50:12 -0400 Subject: [PATCH 09/40] fix(pageserver): do not increase basebackup err counter when shutdown (#11778) ## Problem We occasionally see basebackup errors alerts but there were no errors logged. Looking at the code, the only codepath that will cause this is shutting down. ## Summary of changes Do not increase any counter (ok/err) when basebackup request gets cancelled due to shutdowns. Signed-off-by: Alex Chi Z --- pageserver/src/metrics.rs | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/pageserver/src/metrics.rs b/pageserver/src/metrics.rs index 9a6c3f2378..a68b6acca1 100644 --- a/pageserver/src/metrics.rs +++ b/pageserver/src/metrics.rs @@ -2180,6 +2180,10 @@ impl BasebackupQueryTimeOngoingRecording<'_> { // If you want to change categorize of a specific error, also change it in `log_query_error`. let metric = match res { Ok(_) => &self.parent.ok, + Err(QueryError::Shutdown) => { + // Do not observe ok/err for shutdown + return; + } Err(QueryError::Disconnected(ConnectionError::Io(io_error))) if is_expected_io_error(io_error) => { From e2db76b9be5fc0de8f953dc4ba9f039ce05cdd95 Mon Sep 17 00:00:00 2001 From: "Alex Chi Z." <4198311+skyzh@users.noreply.github.com> Date: Wed, 30 Apr 2025 12:04:00 -0400 Subject: [PATCH 10/40] feat(pageserver): ondemand download reason observability (#11780) ## Problem Part of https://github.com/neondatabase/neon/issues/11615 ## Summary of changes We don't understand the root cause of why we get resident size surge every now and then. 
This patch adds observability for that, and in the next week, we might have a better understanding of what's going on. --------- Signed-off-by: Alex Chi Z --- pageserver/src/metrics.rs | 18 ++++++++++++++++++ pageserver/src/tenant/storage_layer/layer.rs | 9 +++++++++ 2 files changed, 27 insertions(+) diff --git a/pageserver/src/metrics.rs b/pageserver/src/metrics.rs index a68b6acca1..8e4dbd6c3e 100644 --- a/pageserver/src/metrics.rs +++ b/pageserver/src/metrics.rs @@ -497,6 +497,24 @@ pub(crate) static WAIT_LSN_IN_PROGRESS_GLOBAL_MICROS: Lazy = Lazy::n .expect("failed to define a metric") }); +pub(crate) static ONDEMAND_DOWNLOAD_BYTES: Lazy = Lazy::new(|| { + register_int_counter_vec!( + "pageserver_ondemand_download_bytes_total", + "Total bytes of layers on-demand downloaded", + &["task_kind"] + ) + .expect("failed to define a metric") +}); + +pub(crate) static ONDEMAND_DOWNLOAD_COUNT: Lazy = Lazy::new(|| { + register_int_counter_vec!( + "pageserver_ondemand_download_count", + "Total count of layers on-demand downloaded", + &["task_kind"] + ) + .expect("failed to define a metric") +}); + pub(crate) mod wait_ondemand_download_time { use super::*; const WAIT_ONDEMAND_DOWNLOAD_TIME_BUCKETS: &[f64] = &[ diff --git a/pageserver/src/tenant/storage_layer/layer.rs b/pageserver/src/tenant/storage_layer/layer.rs index b7f6e5dc77..50810cb154 100644 --- a/pageserver/src/tenant/storage_layer/layer.rs +++ b/pageserver/src/tenant/storage_layer/layer.rs @@ -4,6 +4,7 @@ use std::sync::{Arc, Weak}; use std::time::{Duration, SystemTime}; use crate::PERF_TRACE_TARGET; +use crate::metrics::{ONDEMAND_DOWNLOAD_BYTES, ONDEMAND_DOWNLOAD_COUNT}; use anyhow::Context; use camino::{Utf8Path, Utf8PathBuf}; use pageserver_api::keyspace::KeySpace; @@ -1255,6 +1256,14 @@ impl LayerInner { self.access_stats.record_residence_event(); + let task_kind: &'static str = ctx.task_kind().into(); + ONDEMAND_DOWNLOAD_BYTES + .with_label_values(&[task_kind]) + .inc_by(self.desc.file_size); + ONDEMAND_DOWNLOAD_COUNT + .with_label_values(&[task_kind]) + .inc(); + Ok(self.initialize_after_layer_is_on_disk(permit)) } Err(e) => { From bec7427d9e4d84b6bcb6a74338cfb711e1748e8a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Arpad=20M=C3=BCller?= Date: Wed, 30 Apr 2025 18:24:01 +0200 Subject: [PATCH 11/40] pull_timeline and sk logging fixes (#11786) This patch contains some fixes of issues I ran into for #11712: * make `pull_timeline` return success for timeline that already exists. This follows general API design of storage components: API endpoints are retryable and converge to a status code, instead of starting to error. We change the `pull_timeline`'s return type a little bit, because we might not actually have a source sk to pull from. Note that the fix is not enough, there is still a race when two `pull_timeline` instances happen in parallel: we might try to enter both pulled timelines at the same time. That can be fixed later. * make `pull_timeline` support one safekeeper being down. In general, if one safekeeper is down, that's not a problem. the added comment explains a potential situation (found in the `test_lagging_sk` test for example) * don't log very long errors when computes try to connect to safekeepers that don't have the timeline yet, if `allow_timeline_creation` is false. That flag is enabled when a sk connection string with generation numbers is passed to the compute, so we'll hit this code path more often. E.g. 
when a safekeeper missed a timeline creation, but the compute connects to it first before the `pull_timeline` gets requested by the storcon reconciler: this is a perfectly normal situation. So don't log the whole error backtrace, and don't log it on the error log level, but only on info. part of #11670 --- libs/postgres_backend/src/lib.rs | 6 ++++ libs/safekeeper_api/src/models.rs | 5 ++-- safekeeper/src/pull_timeline.rs | 28 ++++++++++++++++--- safekeeper/src/receive_wal.rs | 13 ++++++--- .../src/service/safekeeper_reconciler.rs | 9 +++--- 5 files changed, 47 insertions(+), 14 deletions(-) diff --git a/libs/postgres_backend/src/lib.rs b/libs/postgres_backend/src/lib.rs index 654dde8da6..714d8ac403 100644 --- a/libs/postgres_backend/src/lib.rs +++ b/libs/postgres_backend/src/lib.rs @@ -841,6 +841,10 @@ impl PostgresBackend { let expected_end = match &end { ServerInitiated(_) | CopyDone | CopyFail | Terminate | EOF | Cancelled => true, + // The timeline doesn't exist and we have been requested to not auto-create it. + // Compute requests for timelines that haven't been created yet + // might reach us before the storcon request to create those timelines. + TimelineNoCreate => true, CopyStreamHandlerEnd::Disconnected(ConnectionError::Io(io_error)) if is_expected_io_error(io_error) => { @@ -1059,6 +1063,8 @@ pub enum CopyStreamHandlerEnd { Terminate, #[error("EOF on COPY stream")] EOF, + #[error("timeline not found, and allow_timeline_creation is false")] + TimelineNoCreate, /// The connection was lost #[error("connection error: {0}")] Disconnected(#[from] ConnectionError), diff --git a/libs/safekeeper_api/src/models.rs b/libs/safekeeper_api/src/models.rs index 51f88625da..cc31b38fe7 100644 --- a/libs/safekeeper_api/src/models.rs +++ b/libs/safekeeper_api/src/models.rs @@ -303,7 +303,8 @@ pub struct PullTimelineRequest { #[derive(Debug, Serialize, Deserialize)] pub struct PullTimelineResponse { - // Donor safekeeper host - pub safekeeper_host: String, + /// Donor safekeeper host. + /// None if no pull happened because the timeline already exists. + pub safekeeper_host: Option, // TODO: add more fields? } diff --git a/safekeeper/src/pull_timeline.rs b/safekeeper/src/pull_timeline.rs index 653b084ad8..1510a51019 100644 --- a/safekeeper/src/pull_timeline.rs +++ b/safekeeper/src/pull_timeline.rs @@ -401,7 +401,10 @@ pub async fn handle_request( request.timeline_id, )); if existing_tli.is_ok() { - bail!("Timeline {} already exists", request.timeline_id); + info!("Timeline {} already exists", request.timeline_id); + return Ok(PullTimelineResponse { + safekeeper_host: None, + }); } let mut http_client = reqwest::Client::builder(); @@ -425,8 +428,25 @@ pub async fn handle_request( let mut statuses = Vec::new(); for (i, response) in responses.into_iter().enumerate() { - let status = response.context(format!("fetching status from {}", http_hosts[i]))?; - statuses.push((status, i)); + match response { + Ok(status) => { + statuses.push((status, i)); + } + Err(e) => { + info!("error fetching status from {}: {e}", http_hosts[i]); + } + } + } + + // Allow missing responses from up to one safekeeper (say due to downtime) + // e.g. if we created a timeline on PS A and B, with C being offline. Then B goes + // offline and C comes online. Then we want a pull on C with A and B as hosts to work. + let min_required_successful = (http_hosts.len() - 1).max(1); + if statuses.len() < min_required_successful { + bail!( + "only got {} successful status responses. 
required: {min_required_successful}", + statuses.len() + ) } // Find the most advanced safekeeper @@ -536,6 +556,6 @@ async fn pull_timeline( .await?; Ok(PullTimelineResponse { - safekeeper_host: host, + safekeeper_host: Some(host), }) } diff --git a/safekeeper/src/receive_wal.rs b/safekeeper/src/receive_wal.rs index 9975153f6c..eb8eee6ab8 100644 --- a/safekeeper/src/receive_wal.rs +++ b/safekeeper/src/receive_wal.rs @@ -32,7 +32,7 @@ use crate::metrics::{ WAL_RECEIVERS, }; use crate::safekeeper::{AcceptorProposerMessage, ProposerAcceptorMessage}; -use crate::timeline::WalResidentTimeline; +use crate::timeline::{TimelineError, WalResidentTimeline}; const DEFAULT_FEEDBACK_CAPACITY: usize = 8; @@ -357,9 +357,14 @@ impl NetworkReader<'_, IO> { .await .context("create timeline")? } else { - self.global_timelines - .get(self.ttid) - .context("get timeline")? + let timeline_res = self.global_timelines.get(self.ttid); + match timeline_res { + Ok(tl) => tl, + Err(TimelineError::NotFound(_)) => { + return Err(CopyStreamHandlerEnd::TimelineNoCreate); + } + other => other.context("get_timeline")?, + } }; tli.wal_residence_guard().await? } diff --git a/storage_controller/src/service/safekeeper_reconciler.rs b/storage_controller/src/service/safekeeper_reconciler.rs index 74308cabff..71c73a0112 100644 --- a/storage_controller/src/service/safekeeper_reconciler.rs +++ b/storage_controller/src/service/safekeeper_reconciler.rs @@ -306,10 +306,11 @@ impl SafekeeperReconcilerInner { req, async |client| client.pull_timeline(&pull_req).await, |resp| { - tracing::info!( - "pulled timeline from {} onto {req_host}", - resp.safekeeper_host, - ); + if let Some(host) = resp.safekeeper_host { + tracing::info!("pulled timeline from {host} onto {req_host}"); + } else { + tracing::info!("timeline already present on safekeeper on {req_host}"); + } }, req_cancel, ) From 1b789e8d7c917898d4069eda8c999f96c5ac0eeb Mon Sep 17 00:00:00 2001 From: Dmitrii Kovalkov <34828390+DimasKovas@users.noreply.github.com> Date: Wed, 30 Apr 2025 20:50:21 +0400 Subject: [PATCH 12/40] fix(pgxn/neon): Use proper member size in TermsCollectedMset and VotesCollectedMset (#11785) ## Problem `TermsCollectedMset` and `VotesCollectedMset` accept a MemberSet argument to find a quorum in. It may be either `wp->mconf.members` or `wp->mconf.new_members`. But the loops inside always use `wp->mconf.members.len`. If the sizes of member sets are different, it may lead to these functions not scanning all the safekeepers from `mset`. We are not planning to change the member set size dynamically now, but it's worth fixing anyway. 
- Part of https://github.com/neondatabase/neon/issues/11669 ## Summary of changes - Use proper size of member set in `TermsCollectedMset` and `VotesCollectedMset` --- pgxn/neon/walproposer.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pgxn/neon/walproposer.c b/pgxn/neon/walproposer.c index b95b1451e4..f4f1398375 100644 --- a/pgxn/neon/walproposer.c +++ b/pgxn/neon/walproposer.c @@ -836,7 +836,7 @@ TermsCollectedMset(WalProposer *wp, MemberSet *mset, Safekeeper **msk, StringInf { uint32 n_greeted = 0; - for (uint32 i = 0; i < wp->mconf.members.len; i++) + for (uint32 i = 0; i < mset->len; i++) { Safekeeper *sk = msk[i]; @@ -1106,7 +1106,7 @@ VotesCollectedMset(WalProposer *wp, MemberSet *mset, Safekeeper **msk, StringInf { uint32 n_votes = 0; - for (uint32 i = 0; i < wp->mconf.members.len; i++) + for (uint32 i = 0; i < mset->len; i++) { Safekeeper *sk = msk[i]; From 5bd850d15a3aa1034c580521b15b16ab71d961a0 Mon Sep 17 00:00:00 2001 From: Shockingly Good Date: Thu, 1 May 2025 11:09:10 +0200 Subject: [PATCH 13/40] Fix the leaked tracing context for the "compute_monitor:run". (#11791) Removes the leaked tracing context for the "compute_monitor:run" log, which either inherited the "start_compute" span or also the HTTP request context. ## Problem The problem is that the context of the monitor's trace is unnecessarily populated with the span data inherited from previously within the same thread. ## Summary of changes The context is completely reset by moving the span from the thread spawning the monitor into the thread where the monitor will actually start working. Addresses https://github.com/neondatabase/cloud/issues/28145 ## Examples ### Before ``` 2025-04-30T16:39:05.840298Z INFO start_compute:compute_monitor:run: compute is not running, waiting before monitoring activity ``` ### After ``` 2025-04-30T16:39:05.840298Z INFO compute_monitor:run: compute is not running, waiting before monitoring activity ``` --- compute_tools/src/monitor.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/compute_tools/src/monitor.rs b/compute_tools/src/monitor.rs index 5a07eec833..3311ee47b3 100644 --- a/compute_tools/src/monitor.rs +++ b/compute_tools/src/monitor.rs @@ -424,10 +424,10 @@ pub fn launch_monitor(compute: &Arc) -> thread::JoinHandle<()> { experimental, }; - let span = span!(Level::INFO, "compute_monitor"); thread::Builder::new() .name("compute-monitor".into()) .spawn(move || { + let span = span!(Level::INFO, "compute_monitor"); let _enter = span.enter(); monitor.run(); }) From f999632327f108fc497d4c3f467cdf4349f4027a Mon Sep 17 00:00:00 2001 From: Suhas Thalanki <54014218+thesuhas@users.noreply.github.com> Date: Thu, 1 May 2025 11:22:01 -0400 Subject: [PATCH 14/40] Adding `anon` v2 support to the dockerfile (#11313) ## Problem Removed `anon` v1 support as described here: https://github.com/neondatabase/cloud/issues/22663 Adding `anon` v2 support to re-introduce the `pg_anon` extension. 
Related Issues: https://github.com/neondatabase/cloud/issues/20456 ## Summary of changes Adding `anon` v2 support by building it in the dockerfile --- compute/compute-node.Dockerfile | 51 +++++++++++++ compute/patches/anon_v2.patch | 129 ++++++++++++++++++++++++++++++++ 2 files changed, 180 insertions(+) create mode 100644 compute/patches/anon_v2.patch diff --git a/compute/compute-node.Dockerfile b/compute/compute-node.Dockerfile index 267940a405..cc338cec6a 100644 --- a/compute/compute-node.Dockerfile +++ b/compute/compute-node.Dockerfile @@ -1096,6 +1096,23 @@ RUN cargo install --locked --version 0.14.1 cargo-pgrx && \ /bin/bash -c 'cargo pgrx init --pg${PG_VERSION:1}=/usr/local/pgsql/bin/pg_config' USER root +######################################################################################### +# +# Layer "rust extensions pgrx14" +# +# Version 14 is now required by a few +# This layer should be used as a base for new pgrx extensions, +# and eventually get merged with `rust-extensions-build` +# +######################################################################################### +FROM pg-build-nonroot-with-cargo AS rust-extensions-build-pgrx14 +ARG PG_VERSION + +RUN cargo install --locked --version 0.14.1 cargo-pgrx && \ + /bin/bash -c 'cargo pgrx init --pg${PG_VERSION:1}=/usr/local/pgsql/bin/pg_config' + +USER root + ######################################################################################### # # Layers "pg-onnx-build" and "pgrag-build" @@ -1330,6 +1347,39 @@ COPY --from=pg_session_jwt-src /ext-src/ /ext-src/ WORKDIR /ext-src/pg_session_jwt-src RUN cargo pgrx install --release +######################################################################################### +# +# Layer "pg-anon-pg-build" +# compile anon extension +# +######################################################################################### +FROM pg-build AS pg_anon-src +ARG PG_VERSION +COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ +WORKDIR /ext-src +COPY compute/patches/anon_v2.patch . + +# This is an experimental extension, never got to real production. +# !Do not remove! It can be present in shared_preload_libraries and compute will fail to start if library is not found. +ENV PATH="/usr/local/pgsql/bin/:$PATH" +RUN wget https://gitlab.com/dalibo/postgresql_anonymizer/-/archive/latest/postgresql_anonymizer-latest.tar.gz -O pg_anon.tar.gz && \ + mkdir pg_anon-src && cd pg_anon-src && tar xzf ../pg_anon.tar.gz --strip-components=1 -C . 
&& \ + find /usr/local/pgsql -type f | sed 's|^/usr/local/pgsql/||' > /before.txt && \ + sed -i 's/pgrx = "0.14.1"/pgrx = { version = "=0.14.1", features = [ "unsafe-postgres" ] }/g' Cargo.toml && \ + patch -p1 < /ext-src/anon_v2.patch + +FROM rust-extensions-build-pgrx14 AS pg-anon-pg-build +ARG PG_VERSION +COPY --from=pg_anon-src /ext-src/ /ext-src/ +WORKDIR /ext-src +RUN cd pg_anon-src && \ + make -j $(getconf _NPROCESSORS_ONLN) extension PG_CONFIG=/usr/local/pgsql/bin/pg_config PGVER=pg$(echo "$PG_VERSION" | sed 's/^v//') && \ + make -j $(getconf _NPROCESSORS_ONLN) install PG_CONFIG=/usr/local/pgsql/bin/pg_config PGVER=pg$(echo "$PG_VERSION" | sed 's/^v//') && \ + chmod -R a+r ../pg_anon-src && \ + echo 'trusted = true' >> /usr/local/pgsql/share/extension/anon.control; + +######################################################################################## + ######################################################################################### # # Layer "wal2json-build" @@ -1626,6 +1676,7 @@ COPY --from=pg_uuidv7-build /usr/local/pgsql/ /usr/local/pgsql/ COPY --from=pg_roaringbitmap-build /usr/local/pgsql/ /usr/local/pgsql/ COPY --from=pg_semver-build /usr/local/pgsql/ /usr/local/pgsql/ COPY --from=wal2json-build /usr/local/pgsql /usr/local/pgsql +COPY --from=pg-anon-pg-build /usr/local/pgsql/ /usr/local/pgsql/ COPY --from=pg_ivm-build /usr/local/pgsql/ /usr/local/pgsql/ COPY --from=pg_partman-build /usr/local/pgsql/ /usr/local/pgsql/ COPY --from=pg_mooncake-build /usr/local/pgsql/ /usr/local/pgsql/ diff --git a/compute/patches/anon_v2.patch b/compute/patches/anon_v2.patch new file mode 100644 index 0000000000..e833a6dfd3 --- /dev/null +++ b/compute/patches/anon_v2.patch @@ -0,0 +1,129 @@ +diff --git a/sql/anon.sql b/sql/anon.sql +index 0cdc769..f6cc950 100644 +--- a/sql/anon.sql ++++ b/sql/anon.sql +@@ -1141,3 +1141,8 @@ $$ + -- TODO : https://en.wikipedia.org/wiki/L-diversity + + -- TODO : https://en.wikipedia.org/wiki/T-closeness ++ ++-- NEON Patches ++ ++GRANT ALL ON SCHEMA anon to neon_superuser; ++GRANT ALL ON ALL TABLES IN SCHEMA anon TO neon_superuser; +diff --git a/sql/init.sql b/sql/init.sql +index 7da6553..9b6164b 100644 +--- a/sql/init.sql ++++ b/sql/init.sql +@@ -74,50 +74,49 @@ $$ + + SECURITY LABEL FOR anon ON FUNCTION anon.load_csv IS 'UNTRUSTED'; + +--- load fake data from a given path +-CREATE OR REPLACE FUNCTION anon.init( +- datapath TEXT +-) ++CREATE OR REPLACE FUNCTION anon.load_fake_data() + RETURNS BOOLEAN + AS $$ + DECLARE +- datapath_check TEXT; + success BOOLEAN; ++ sharedir TEXT; ++ datapath TEXT; + BEGIN + +- IF anon.is_initialized() THEN +- RAISE NOTICE 'The anon extension is already initialized.'; +- RETURN TRUE; +- END IF; ++ datapath := '/extension/anon/'; ++ -- find the local extension directory ++ SELECT setting INTO sharedir ++ FROM pg_catalog.pg_config ++ WHERE name = 'SHAREDIR'; + + SELECT bool_or(results) INTO success + FROM unnest(array[ +- anon.load_csv('anon.identifiers_category',datapath||'/identifiers_category.csv'), +- anon.load_csv('anon.identifier',datapath ||'/identifier.csv'), +- anon.load_csv('anon.address',datapath ||'/address.csv'), +- anon.load_csv('anon.city',datapath ||'/city.csv'), +- anon.load_csv('anon.company',datapath ||'/company.csv'), +- anon.load_csv('anon.country',datapath ||'/country.csv'), +- anon.load_csv('anon.email', datapath ||'/email.csv'), +- anon.load_csv('anon.first_name',datapath ||'/first_name.csv'), +- anon.load_csv('anon.iban',datapath ||'/iban.csv'), +- anon.load_csv('anon.last_name',datapath 
||'/last_name.csv'), +- anon.load_csv('anon.postcode',datapath ||'/postcode.csv'), +- anon.load_csv('anon.siret',datapath ||'/siret.csv'), +- anon.load_csv('anon.lorem_ipsum',datapath ||'/lorem_ipsum.csv') ++ anon.load_csv('anon.identifiers_category',sharedir || datapath || '/identifiers_category.csv'), ++ anon.load_csv('anon.identifier',sharedir || datapath || '/identifier.csv'), ++ anon.load_csv('anon.address',sharedir || datapath || '/address.csv'), ++ anon.load_csv('anon.city',sharedir || datapath || '/city.csv'), ++ anon.load_csv('anon.company',sharedir || datapath || '/company.csv'), ++ anon.load_csv('anon.country',sharedir || datapath || '/country.csv'), ++ anon.load_csv('anon.email', sharedir || datapath || '/email.csv'), ++ anon.load_csv('anon.first_name',sharedir || datapath || '/first_name.csv'), ++ anon.load_csv('anon.iban',sharedir || datapath || '/iban.csv'), ++ anon.load_csv('anon.last_name',sharedir || datapath || '/last_name.csv'), ++ anon.load_csv('anon.postcode',sharedir || datapath || '/postcode.csv'), ++ anon.load_csv('anon.siret',sharedir || datapath || '/siret.csv'), ++ anon.load_csv('anon.lorem_ipsum',sharedir || datapath || '/lorem_ipsum.csv') + ]) results; + RETURN success; +- + END; + $$ +- LANGUAGE PLPGSQL ++ LANGUAGE plpgsql + VOLATILE + RETURNS NULL ON NULL INPUT +- PARALLEL UNSAFE -- because load_csv is unsafe +- SECURITY INVOKER ++ PARALLEL UNSAFE -- because of the EXCEPTION ++ SECURITY DEFINER + SET search_path='' + ; +-SECURITY LABEL FOR anon ON FUNCTION anon.init(TEXT) IS 'UNTRUSTED'; ++ ++SECURITY LABEL FOR anon ON FUNCTION anon.load_fake_data IS 'UNTRUSTED'; + + -- People tend to forget the anon.init() step + -- This is a friendly notice for them +@@ -144,7 +143,7 @@ SECURITY LABEL FOR anon ON FUNCTION anon.notice_if_not_init IS 'UNTRUSTED'; + CREATE OR REPLACE FUNCTION anon.load(TEXT) + RETURNS BOOLEAN AS + $$ +- SELECT anon.init($1); ++ SELECT anon.init(); + $$ + LANGUAGE SQL + VOLATILE +@@ -159,16 +158,16 @@ SECURITY LABEL FOR anon ON FUNCTION anon.load(TEXT) IS 'UNTRUSTED'; + CREATE OR REPLACE FUNCTION anon.init() + RETURNS BOOLEAN + AS $$ +- WITH conf AS ( +- -- find the local extension directory +- SELECT setting AS sharedir +- FROM pg_catalog.pg_config +- WHERE name = 'SHAREDIR' +- ) +- SELECT anon.init(conf.sharedir || '/extension/anon/') +- FROM conf; ++BEGIN ++ IF anon.is_initialized() THEN ++ RAISE NOTICE 'The anon extension is already initialized.'; ++ RETURN TRUE; ++ END IF; ++ ++ RETURN anon.load_fake_data(); ++END; + $$ +- LANGUAGE SQL ++ LANGUAGE plpgsql + VOLATILE + PARALLEL UNSAFE -- because init is unsafe + SECURITY INVOKER From 16d594b7b37733fd56d3cc4767b119f1dd391fb7 Mon Sep 17 00:00:00 2001 From: Vlad Lazar Date: Thu, 1 May 2025 16:56:43 +0100 Subject: [PATCH 15/40] pagectl: list layers for given key in decreasing LSN order (#11799) Adds an extra key CLI arg to `pagectl layer list-layer`. When provided, only layers with key ranges containing the key will be listed in decreasing LSN order (indices are preserved for `dump-layer`). 
--- pageserver/ctl/src/layers.rs | 37 +++++++++++++++++++++++++++--------- 1 file changed, 28 insertions(+), 9 deletions(-) diff --git a/pageserver/ctl/src/layers.rs b/pageserver/ctl/src/layers.rs index 293c01eff0..79f56a5a51 100644 --- a/pageserver/ctl/src/layers.rs +++ b/pageserver/ctl/src/layers.rs @@ -10,6 +10,7 @@ use pageserver::tenant::storage_layer::{DeltaLayer, ImageLayer, delta_layer, ima use pageserver::tenant::{TENANTS_SEGMENT_NAME, TIMELINES_SEGMENT_NAME}; use pageserver::virtual_file::api::IoMode; use pageserver::{page_cache, virtual_file}; +use pageserver_api::key::Key; use utils::id::{TenantId, TimelineId}; use crate::layer_map_analyzer::parse_filename; @@ -27,6 +28,7 @@ pub(crate) enum LayerCmd { path: PathBuf, tenant: String, timeline: String, + key: Option, }, /// Dump all information of a layer file DumpLayer { @@ -100,6 +102,7 @@ pub(crate) async fn main(cmd: &LayerCmd) -> Result<()> { path, tenant, timeline, + key, } => { let timeline_path = path .join(TENANTS_SEGMENT_NAME) @@ -107,21 +110,37 @@ pub(crate) async fn main(cmd: &LayerCmd) -> Result<()> { .join(TIMELINES_SEGMENT_NAME) .join(timeline); let mut idx = 0; + let mut to_print = Vec::default(); for layer in fs::read_dir(timeline_path)? { let layer = layer?; if let Ok(layer_file) = parse_filename(&layer.file_name().into_string().unwrap()) { - println!( - "[{:3}] key:{}-{}\n lsn:{}-{}\n delta:{}", - idx, - layer_file.key_range.start, - layer_file.key_range.end, - layer_file.lsn_range.start, - layer_file.lsn_range.end, - layer_file.is_delta, - ); + if let Some(key) = key { + if layer_file.key_range.start <= *key && *key < layer_file.key_range.end { + to_print.push((idx, layer_file)); + } + } else { + to_print.push((idx, layer_file)); + } idx += 1; } } + + if key.is_some() { + to_print + .sort_by_key(|(_idx, layer_file)| std::cmp::Reverse(layer_file.lsn_range.end)); + } + + for (idx, layer_file) in to_print { + println!( + "[{:3}] key:{}-{}\n lsn:{}-{}\n delta:{}", + idx, + layer_file.key_range.start, + layer_file.key_range.end, + layer_file.lsn_range.start, + layer_file.lsn_range.end, + layer_file.is_delta, + ); + } Ok(()) } LayerCmd::DumpLayer { From ae2c3ac12ff1b21f92a0d81b8988ff255c3dd8cf Mon Sep 17 00:00:00 2001 From: "Alex Chi Z." <4198311+skyzh@users.noreply.github.com> Date: Thu, 1 May 2025 13:51:10 -0400 Subject: [PATCH 16/40] test: revert relsizev2 config (#11759) ## Problem part of https://github.com/neondatabase/neon/issues/9516 One thing I realized in the past few months is that "no-way-back" things like this are scary to roll out without a fine-grained rollout infra. The plan was to flip the flag in the repo and roll it out soon, but I don't think rolling out would happen in the near future. So I'd rather revert the flag to avoid creating a discrepancy between staging and the regress tests. ## Summary of changes Not using rel_size_v2 by default in unit tests; we still have a few tests to explicitly test the new format so we still get some test coverages. 
--------- Signed-off-by: Alex Chi Z --- test_runner/fixtures/neon_fixtures.py | 3 ++- test_runner/regress/test_attach_tenant_config.py | 2 +- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index b93df4ede4..47d1228c61 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -1279,7 +1279,8 @@ class NeonEnv: ) tenant_config = ps_cfg.setdefault("tenant_config", {}) - tenant_config["rel_size_v2_enabled"] = True # Enable relsize_v2 by default in tests + # This feature is pending rollout. + # tenant_config["rel_size_v2_enabled"] = True if self.pageserver_remote_storage is not None: ps_cfg["remote_storage"] = remote_storage_to_toml_dict( diff --git a/test_runner/regress/test_attach_tenant_config.py b/test_runner/regress/test_attach_tenant_config.py index ee408e3c65..3616467c00 100644 --- a/test_runner/regress/test_attach_tenant_config.py +++ b/test_runner/regress/test_attach_tenant_config.py @@ -186,7 +186,7 @@ def test_fully_custom_config(positive_env: NeonEnv): "type": "interpreted", "args": {"format": "bincode", "compression": {"zstd": {"level": 1}}}, }, - "rel_size_v2_enabled": False, # test suite enables it by default as of https://github.com/neondatabase/neon/issues/11081, so, custom config means disabling it + "rel_size_v2_enabled": True, "gc_compaction_enabled": True, "gc_compaction_verification": False, "gc_compaction_initial_threshold_kb": 1024000, From bbc35e10b877f7bbb260595c90375f4e5a3bfbdb Mon Sep 17 00:00:00 2001 From: "Alex Chi Z." <4198311+skyzh@users.noreply.github.com> Date: Thu, 1 May 2025 14:36:26 -0400 Subject: [PATCH 17/40] fix(test): increase timeouts for some tests (#11781) ## Problem Those tests are timing out more frequently after https://github.com/neondatabase/neon/pull/11585 ## Summary of changes Increase timeout for `test_pageserver_gc_compaction_smoke` Increase rollback wait timeout for `test_tx_abort_with_many_relations` Signed-off-by: Alex Chi Z --- test_runner/regress/test_compaction.py | 2 +- test_runner/regress/test_pg_regress.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/test_runner/regress/test_compaction.py b/test_runner/regress/test_compaction.py index 53edf9f79e..0dfc665a1d 100644 --- a/test_runner/regress/test_compaction.py +++ b/test_runner/regress/test_compaction.py @@ -229,7 +229,7 @@ def test_pageserver_gc_compaction_preempt( @skip_in_debug_build("only run with release build") -@pytest.mark.timeout(600) # This test is slow with sanitizers enabled, especially on ARM +@pytest.mark.timeout(900) # This test is slow with sanitizers enabled, especially on ARM @pytest.mark.parametrize( "with_branches", ["with_branches", "no_branches"], diff --git a/test_runner/regress/test_pg_regress.py b/test_runner/regress/test_pg_regress.py index 0fea706888..474002353b 100644 --- a/test_runner/regress/test_pg_regress.py +++ b/test_runner/regress/test_pg_regress.py @@ -471,7 +471,7 @@ def test_tx_abort_with_many_relations( try: # Rollback phase should be fast: this is one WAL record that we should process efficiently fut = exec.submit(rollback_and_wait) - fut.result(timeout=15) + fut.result(timeout=15 if reldir_type == "v1" else 30) except: exec.shutdown(wait=False, cancel_futures=True) raise From 22290eb7ba14b1bd6efd0f6999f6b292684207a0 Mon Sep 17 00:00:00 2001 From: Alexander Bayandin Date: Fri, 2 May 2025 13:46:21 +0100 Subject: [PATCH 18/40] CI: notify relevant team about release deploy failures (#11797) ## 
Problem We notify only Storage team about failed deploys, but Compute and Proxy teams can also benefit from that ## Summary of changes - Adjust `notify-storage-release-deploy-failure` to notify the relevant team about failed deploy --- .github/actionlint.yml | 5 ++++ .github/workflows/build_and_test.yml | 37 +++++++++++++++++++++++----- 2 files changed, 36 insertions(+), 6 deletions(-) diff --git a/.github/actionlint.yml b/.github/actionlint.yml index 1d1b50e458..b7e0be761a 100644 --- a/.github/actionlint.yml +++ b/.github/actionlint.yml @@ -33,9 +33,14 @@ config-variables: - REMOTE_STORAGE_AZURE_CONTAINER - REMOTE_STORAGE_AZURE_REGION - SLACK_CICD_CHANNEL_ID + - SLACK_COMPUTE_CHANNEL_ID - SLACK_ON_CALL_DEVPROD_STREAM - SLACK_ON_CALL_QA_STAGING_STREAM - SLACK_ON_CALL_STORAGE_STAGING_STREAM + - SLACK_ONCALL_COMPUTE_GROUP + - SLACK_ONCALL_PROXY_GROUP + - SLACK_ONCALL_STORAGE_GROUP + - SLACK_PROXY_CHANNEL_ID - SLACK_RUST_CHANNEL_ID - SLACK_STORAGE_CHANNEL_ID - SLACK_UPCOMING_RELEASE_CHANNEL_ID diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index 18bec1b461..e0995218f9 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -1434,10 +1434,10 @@ jobs: ;; esac - notify-storage-release-deploy-failure: - needs: [ deploy ] + notify-release-deploy-failure: + needs: [ meta, deploy ] # We want this to run even if (transitive) dependencies are skipped, because deploy should really be successful on release branch workflow runs. - if: github.ref_name == 'release' && needs.deploy.result != 'success' && always() + if: contains(fromJSON('["storage-release", "compute-release", "proxy-release"]'), needs.meta.outputs.run-kind) && needs.deploy.result != 'success' && always() runs-on: ubuntu-22.04 steps: - name: Harden the runner (Audit all outbound calls) @@ -1445,15 +1445,40 @@ jobs: with: egress-policy: audit - - name: Post release-deploy failure to team-storage slack channel + - name: Post release-deploy failure to team slack channel uses: slackapi/slack-github-action@485a9d42d3a73031f12ec201c457e2162c45d02d # v2.0.0 + env: + TEAM_ONCALL: >- + ${{ + fromJSON(format('{ + "storage-release": "", + "compute-release": "", + "proxy-release": "" + }', + vars.SLACK_ONCALL_STORAGE_GROUP, + vars.SLACK_ONCALL_COMPUTE_GROUP, + vars.SLACK_ONCALL_PROXY_GROUP + ))[needs.meta.outputs.run-kind] + }} + CHANNEL: >- + ${{ + fromJSON(format('{ + "storage-release": "{0}", + "compute-release": "{1}", + "proxy-release": "{2}" + }', + vars.SLACK_STORAGE_CHANNEL_ID, + vars.SLACK_COMPUTE_CHANNEL_ID, + vars.SLACK_PROXY_CHANNEL_ID + ))[needs.meta.outputs.run-kind] + }} with: method: chat.postMessage token: ${{ secrets.SLACK_BOT_TOKEN }} payload: | - channel: ${{ vars.SLACK_STORAGE_CHANNEL_ID }} + channel: ${{ env.CHANNEL }} text: | - 🔴 : deploy job on release branch had unexpected status "${{ needs.deploy.result }}" <${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}|GitHub Run>. + 🔴 ${{ env.TEAM_ONCALL }}: deploy job on release branch had unexpected status "${{ needs.deploy.result }}" <${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}|GitHub Run>. 
# The job runs on `release` branch and copies compatibility data and Neon artifact from the last *release PR* to the latest directory promote-compatibility-data: From 79699aebc8ac886afea2ec45320d7129232007cb Mon Sep 17 00:00:00 2001 From: Konstantin Knizhnik Date: Fri, 2 May 2025 17:36:10 +0300 Subject: [PATCH 19/40] Reserve in file descriptor pool sockets used for connections to page servers (#11798) ## Problem See https://github.com/neondatabase/neon/issues/11790 The neon extension opens extensions to the pageservers, which consumes file descriptors. Postgres has a mechanism to count how many FDs are in use, but it doesn't know about those FDs. We should call ReserveExternalFD() or AcquireExternalFD() to account for them. ## Summary of changes Call `ReserveExternalFD()` for each shard --------- Co-authored-by: Konstantin Knizhnik Co-authored-by: Mikhail Kot --- pgxn/neon/libpagestore.c | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/pgxn/neon/libpagestore.c b/pgxn/neon/libpagestore.c index e758841beb..ee4e6ccc5b 100644 --- a/pgxn/neon/libpagestore.c +++ b/pgxn/neon/libpagestore.c @@ -26,6 +26,7 @@ #include "portability/instr_time.h" #include "postmaster/interrupt.h" #include "storage/buf_internals.h" +#include "storage/fd.h" #include "storage/ipc.h" #include "storage/lwlock.h" #include "storage/pg_shmem.h" @@ -79,6 +80,7 @@ int neon_protocol_version = 3; static int neon_compute_mode = 0; static int max_reconnect_attempts = 60; static int stripe_size; +static int max_sockets; static int pageserver_response_log_timeout = 10000; /* 2.5 minutes. A bit higher than highest default TCP retransmission timeout */ @@ -336,6 +338,13 @@ load_shard_map(shardno_t shard_no, char *connstr_p, shardno_t *num_shards_p) pageserver_disconnect(i); } pagestore_local_counter = end_update_counter; + + /* Reserve file descriptors for sockets */ + while (max_sockets < num_shards) + { + max_sockets += 1; + ReserveExternalFD(); + } } if (num_shards_p) From 4b9087651c2ef0d7888d76e8786066d00381388f Mon Sep 17 00:00:00 2001 From: Konstantin Knizhnik Date: Fri, 2 May 2025 22:27:59 +0300 Subject: [PATCH 20/40] Checked that stored LwLSN >= FirstNormalUnloggedLSN (#11750) ## Problem Undo unintended change 60b9fb1baf4cba732cff4792b9a97d755794b7e2 ## Summary of changes Add assert that we are not storing fake LSN in LwLSN. 
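Roughly, the invariant looks like the standalone sketch below. The constant values and the `cache_last_written_lsn` helper are illustrative assumptions only; the real code is C and uses `InvalidXLogRecPtr`, `WalSegMinSize` and `Assert` directly.

```rust
/// Illustrative mirrors of the PostgreSQL constants referenced above
/// (assumed values; see xlogdefs.h / xlog_internal.h for the real ones).
const INVALID_XLOG_REC_PTR: u64 = 0;
const WAL_SEG_MIN_SIZE: u64 = 1024 * 1024;

/// Invalid LSNs are skipped; anything below the first real WAL segment is a
/// fake LSN (e.g. for unlogged relations) and must never reach the cache.
fn cache_last_written_lsn(cache: &mut Vec<u64>, lsn: u64) {
    if lsn == INVALID_XLOG_REC_PTR {
        return;
    }
    debug_assert!(
        lsn >= WAL_SEG_MIN_SIZE,
        "fake LSN {lsn:#x} must not be stored in the last written LSN cache"
    );
    cache.push(lsn);
}

fn main() {
    let mut cache = Vec::new();
    cache_last_written_lsn(&mut cache, INVALID_XLOG_REC_PTR); // ignored
    cache_last_written_lsn(&mut cache, 0x1_0000_0000); // normal LSN, cached
    assert_eq!(cache, vec![0x1_0000_0000]);
}
```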
--------- Co-authored-by: Konstantin Knizhnik --- pgxn/neon/neon_lwlsncache.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/pgxn/neon/neon_lwlsncache.c b/pgxn/neon/neon_lwlsncache.c index 6959da55cb..a8cfa0f825 100644 --- a/pgxn/neon/neon_lwlsncache.c +++ b/pgxn/neon/neon_lwlsncache.c @@ -4,6 +4,7 @@ #include "miscadmin.h" #include "access/xlog.h" +#include "access/xlog_internal.h" #include "storage/ipc.h" #include "storage/shmem.h" #include "storage/buf_internals.h" @@ -396,9 +397,10 @@ SetLastWrittenLSNForBlockRangeInternal(XLogRecPtr lsn, XLogRecPtr neon_set_lwlsn_block_range(XLogRecPtr lsn, NRelFileInfo rlocator, ForkNumber forknum, BlockNumber from, BlockNumber n_blocks) { - if (lsn < FirstNormalUnloggedLSN || n_blocks == 0 || LwLsnCache->lastWrittenLsnCacheSize == 0) + if (lsn == InvalidXLogRecPtr || n_blocks == 0 || LwLsnCache->lastWrittenLsnCacheSize == 0) return lsn; + Assert(lsn >= WalSegMinSize); LWLockAcquire(LastWrittenLsnLock, LW_EXCLUSIVE); lsn = SetLastWrittenLSNForBlockRangeInternal(lsn, rlocator, forknum, from, n_blocks); LWLockRelease(LastWrittenLsnLock); @@ -435,7 +437,6 @@ neon_set_lwlsn_block_v(const XLogRecPtr *lsns, NRelFileInfo relfilenode, NInfoGetRelNumber(relfilenode) == InvalidOid) return InvalidXLogRecPtr; - BufTagInit(key, relNumber, forknum, blockno, spcOid, dbOid); LWLockAcquire(LastWrittenLsnLock, LW_EXCLUSIVE); @@ -444,6 +445,10 @@ neon_set_lwlsn_block_v(const XLogRecPtr *lsns, NRelFileInfo relfilenode, { XLogRecPtr lsn = lsns[i]; + if (lsn == InvalidXLogRecPtr) + continue; + + Assert(lsn >= WalSegMinSize); key.blockNum = blockno + i; entry = hash_search(lastWrittenLsnCache, &key, HASH_ENTER, &found); if (found) From 6131d86ec972f08febecc7b29a4273e84a408905 Mon Sep 17 00:00:00 2001 From: Conrad Ludgate Date: Mon, 5 May 2025 12:18:55 +0100 Subject: [PATCH 21/40] proxy: allow invalid SNI (#11792) ## Problem Some PrivateLink customers are unable to use Private DNS. As such they use an invalid domain name to address Neon. We currently are rejecting those connections because we cannot resolve the correct certificate. ## Summary of changes 1. Ensure a certificate is always returned. 2. If there is an SNI field, use endpoint fallback if it doesn't match. I suggest reviewing each commit separately. 
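To make the new behaviour concrete, here is a simplified, self-contained sketch of the resolution logic. It is not the proxy's actual `CertResolver` (which stores rustls `CertifiedKey`s keyed by certificate common name); the types below are invented for the sketch. The point is that lookup walks up parent domains for wildcard matching and falls back to the default certificate instead of failing when SNI is absent or unknown.

```rust
use std::collections::HashMap;

/// Hypothetical stand-in for a certified key plus its channel-binding data.
#[derive(Clone, Debug, PartialEq)]
struct Cert(&'static str);

struct Resolver {
    certs: HashMap<String, Cert>, // keyed by certificate common name
    default: Cert,                // always present, no longer an Option
}

impl Resolver {
    /// Never returns "no certificate": unknown or missing SNI falls back to
    /// the default certificate and lets the client's sslmode decide.
    fn resolve(&self, server_name: Option<&str>) -> Cert {
        let Some(mut sni) = server_name else {
            return self.default.clone();
        };
        loop {
            if let Some(cert) = self.certs.get(sni) {
                return cert.clone();
            }
            match sni.split_once('.') {
                // Strip one subdomain level to emulate wildcard matching.
                Some((_, rest)) => sni = rest,
                // Custom (PrivateLink-style) DNS name: use the default.
                None => return self.default.clone(),
            }
        }
    }
}

fn main() {
    let default = Cert("*.example.com");
    let mut certs = HashMap::new();
    certs.insert("example.com".to_string(), default.clone());
    let resolver = Resolver { certs, default };

    assert_eq!(resolver.resolve(Some("ep-foo-bar.example.com")).0, "*.example.com");
    // Previously a handshake error; now it resolves to the default cert.
    assert_eq!(resolver.resolve(Some("pg.customer-internal.net")).0, "*.example.com");
    assert_eq!(resolver.resolve(None).0, "*.example.com");
}
```

Clients behind such a custom DNS name can still pin the endpoint explicitly via the `options=endpoint=...` startup parameter, which is what the new `parse_unknown_sni_with_options` test exercises.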
--- proxy/src/auth/credentials.rs | 65 ++++----- proxy/src/proxy/handshake.rs | 15 +-- proxy/src/proxy/tests/mod.rs | 3 +- proxy/src/serverless/sql_over_http.rs | 3 +- proxy/src/tls/server_config.rs | 187 ++++++++++++++------------ 5 files changed, 138 insertions(+), 135 deletions(-) diff --git a/proxy/src/auth/credentials.rs b/proxy/src/auth/credentials.rs index c55af325e3..183976374a 100644 --- a/proxy/src/auth/credentials.rs +++ b/proxy/src/auth/credentials.rs @@ -32,12 +32,6 @@ pub(crate) enum ComputeUserInfoParseError { option: EndpointId, }, - #[error( - "Common name inferred from SNI ('{}') is not known", - .cn, - )] - UnknownCommonName { cn: String }, - #[error("Project name ('{0}') must contain only alphanumeric characters and hyphen.")] MalformedProjectName(EndpointId), } @@ -66,22 +60,15 @@ impl ComputeUserInfoMaybeEndpoint { } } -pub(crate) fn endpoint_sni( - sni: &str, - common_names: &HashSet, -) -> Result, ComputeUserInfoParseError> { - let Some((subdomain, common_name)) = sni.split_once('.') else { - return Err(ComputeUserInfoParseError::UnknownCommonName { cn: sni.into() }); - }; +pub(crate) fn endpoint_sni(sni: &str, common_names: &HashSet) -> Option { + let (subdomain, common_name) = sni.split_once('.')?; if !common_names.contains(common_name) { - return Err(ComputeUserInfoParseError::UnknownCommonName { - cn: common_name.into(), - }); + return None; } if subdomain == SERVERLESS_DRIVER_SNI { - return Ok(None); + return None; } - Ok(Some(EndpointId::from(subdomain))) + Some(EndpointId::from(subdomain)) } impl ComputeUserInfoMaybeEndpoint { @@ -113,15 +100,8 @@ impl ComputeUserInfoMaybeEndpoint { }) .map(|name| name.into()); - let endpoint_from_domain = if let Some(sni_str) = sni { - if let Some(cn) = common_names { - endpoint_sni(sni_str, cn)? - } else { - None - } - } else { - None - }; + let endpoint_from_domain = + sni.and_then(|sni_str| common_names.and_then(|cn| endpoint_sni(sni_str, cn))); let endpoint = match (endpoint_option, endpoint_from_domain) { // Invariant: if we have both project name variants, they should match. 
@@ -424,21 +404,34 @@ mod tests { } #[test] - fn parse_inconsistent_sni() { + fn parse_unknown_sni() { let options = StartupMessageParams::new([("user", "john_doe")]); let sni = Some("project.localhost"); let common_names = Some(["example.com".into()].into()); let ctx = RequestContext::test(); - let err = ComputeUserInfoMaybeEndpoint::parse(&ctx, &options, sni, common_names.as_ref()) - .expect_err("should fail"); - match err { - UnknownCommonName { cn } => { - assert_eq!(cn, "localhost"); - } - _ => panic!("bad error: {err:?}"), - } + let info = ComputeUserInfoMaybeEndpoint::parse(&ctx, &options, sni, common_names.as_ref()) + .unwrap(); + + assert!(info.endpoint_id.is_none()); + } + + #[test] + fn parse_unknown_sni_with_options() { + let options = StartupMessageParams::new([ + ("user", "john_doe"), + ("options", "endpoint=foo-bar-baz-1234"), + ]); + + let sni = Some("project.localhost"); + let common_names = Some(["example.com".into()].into()); + + let ctx = RequestContext::test(); + let info = ComputeUserInfoMaybeEndpoint::parse(&ctx, &options, sni, common_names.as_ref()) + .unwrap(); + + assert_eq!(info.endpoint_id.as_deref(), Some("foo-bar-baz-1234")); } #[test] diff --git a/proxy/src/proxy/handshake.rs b/proxy/src/proxy/handshake.rs index c05031ad97..54c02f2c15 100644 --- a/proxy/src/proxy/handshake.rs +++ b/proxy/src/proxy/handshake.rs @@ -24,9 +24,6 @@ pub(crate) enum HandshakeError { #[error("protocol violation")] ProtocolViolation, - #[error("missing certificate")] - MissingCertificate, - #[error("{0}")] StreamUpgradeError(#[from] StreamUpgradeError), @@ -42,10 +39,6 @@ impl ReportableError for HandshakeError { match self { HandshakeError::EarlyData => crate::error::ErrorKind::User, HandshakeError::ProtocolViolation => crate::error::ErrorKind::User, - // This error should not happen, but will if we have no default certificate and - // the client sends no SNI extension. - // If they provide SNI then we can be sure there is a certificate that matches. 
- HandshakeError::MissingCertificate => crate::error::ErrorKind::Service, HandshakeError::StreamUpgradeError(upgrade) => match upgrade { StreamUpgradeError::AlreadyTls => crate::error::ErrorKind::Service, StreamUpgradeError::Io(_) => crate::error::ErrorKind::ClientDisconnect, @@ -146,7 +139,7 @@ pub(crate) async fn handshake( // try parse endpoint let ep = conn_info .server_name() - .and_then(|sni| endpoint_sni(sni, &tls.common_names).ok().flatten()); + .and_then(|sni| endpoint_sni(sni, &tls.common_names)); if let Some(ep) = ep { ctx.set_endpoint_id(ep); } @@ -161,10 +154,8 @@ pub(crate) async fn handshake( } } - let (_, tls_server_end_point) = tls - .cert_resolver - .resolve(conn_info.server_name()) - .ok_or(HandshakeError::MissingCertificate)?; + let (_, tls_server_end_point) = + tls.cert_resolver.resolve(conn_info.server_name()); stream = PqStream { framed: Framed { diff --git a/proxy/src/proxy/tests/mod.rs b/proxy/src/proxy/tests/mod.rs index 9a6864c33e..f47636cd71 100644 --- a/proxy/src/proxy/tests/mod.rs +++ b/proxy/src/proxy/tests/mod.rs @@ -98,8 +98,7 @@ fn generate_tls_config<'a>( .with_no_client_auth() .with_single_cert(vec![cert.clone()], key.clone_key())?; - let mut cert_resolver = CertResolver::new(); - cert_resolver.add_cert(key, vec![cert], true)?; + let cert_resolver = CertResolver::new(key, vec![cert])?; let common_names = cert_resolver.get_common_names(); diff --git a/proxy/src/serverless/sql_over_http.rs b/proxy/src/serverless/sql_over_http.rs index 7fb39553f9..fee5942b7e 100644 --- a/proxy/src/serverless/sql_over_http.rs +++ b/proxy/src/serverless/sql_over_http.rs @@ -199,8 +199,7 @@ fn get_conn_info( let endpoint = match connection_url.host() { Some(url::Host::Domain(hostname)) => { if let Some(tls) = tls { - endpoint_sni(hostname, &tls.common_names)? - .ok_or(ConnInfoError::MalformedEndpoint)? + endpoint_sni(hostname, &tls.common_names).ok_or(ConnInfoError::MalformedEndpoint)? 
} else { hostname .split_once('.') diff --git a/proxy/src/tls/server_config.rs b/proxy/src/tls/server_config.rs index 5a95e69fde..8f8917ef62 100644 --- a/proxy/src/tls/server_config.rs +++ b/proxy/src/tls/server_config.rs @@ -5,6 +5,7 @@ use anyhow::{Context, bail}; use itertools::Itertools; use rustls::crypto::ring::{self, sign}; use rustls::pki_types::{CertificateDer, PrivateKeyDer}; +use rustls::sign::CertifiedKey; use x509_cert::der::{Reader, SliceReader}; use super::{PG_ALPN_PROTOCOL, TlsServerEndPoint}; @@ -25,10 +26,8 @@ pub fn configure_tls( certs_dir: Option<&String>, allow_tls_keylogfile: bool, ) -> anyhow::Result { - let mut cert_resolver = CertResolver::new(); - // add default certificate - cert_resolver.add_cert_path(key_path, cert_path, true)?; + let mut cert_resolver = CertResolver::parse_new(key_path, cert_path)?; // add extra certificates if let Some(certs_dir) = certs_dir { @@ -40,11 +39,8 @@ pub fn configure_tls( let key_path = path.join("tls.key"); let cert_path = path.join("tls.crt"); if key_path.exists() && cert_path.exists() { - cert_resolver.add_cert_path( - &key_path.to_string_lossy(), - &cert_path.to_string_lossy(), - false, - )?; + cert_resolver + .add_cert_path(&key_path.to_string_lossy(), &cert_path.to_string_lossy())?; } } } @@ -83,92 +79,42 @@ pub fn configure_tls( }) } -#[derive(Default, Debug)] +#[derive(Debug)] pub struct CertResolver { certs: HashMap, TlsServerEndPoint)>, - default: Option<(Arc, TlsServerEndPoint)>, + default: (Arc, TlsServerEndPoint), } impl CertResolver { - pub fn new() -> Self { - Self::default() + fn parse_new(key_path: &str, cert_path: &str) -> anyhow::Result { + let (priv_key, cert_chain) = parse_key_cert(key_path, cert_path)?; + Self::new(priv_key, cert_chain) } - fn add_cert_path( - &mut self, - key_path: &str, - cert_path: &str, - is_default: bool, - ) -> anyhow::Result<()> { - let priv_key = { - let key_bytes = std::fs::read(key_path) - .with_context(|| format!("Failed to read TLS keys at '{key_path}'"))?; - rustls_pemfile::private_key(&mut &key_bytes[..]) - .with_context(|| format!("Failed to parse TLS keys at '{key_path}'"))? - .with_context(|| format!("Failed to parse TLS keys at '{key_path}'"))? - }; + pub fn new( + priv_key: PrivateKeyDer<'static>, + cert_chain: Vec>, + ) -> anyhow::Result { + let (common_name, cert, tls_server_end_point) = process_key_cert(priv_key, cert_chain)?; - let cert_chain_bytes = std::fs::read(cert_path) - .context(format!("Failed to read TLS cert file at '{cert_path}.'"))?; - - let cert_chain = { - rustls_pemfile::certs(&mut &cert_chain_bytes[..]) - .try_collect() - .with_context(|| { - format!("Failed to read TLS certificate chain from bytes from file at '{cert_path}'.") - })? 
- }; - - self.add_cert(priv_key, cert_chain, is_default) + let mut certs = HashMap::new(); + let default = (cert.clone(), tls_server_end_point); + certs.insert(common_name, (cert, tls_server_end_point)); + Ok(Self { certs, default }) } - pub fn add_cert( + fn add_cert_path(&mut self, key_path: &str, cert_path: &str) -> anyhow::Result<()> { + let (priv_key, cert_chain) = parse_key_cert(key_path, cert_path)?; + self.add_cert(priv_key, cert_chain) + } + + fn add_cert( &mut self, priv_key: PrivateKeyDer<'static>, cert_chain: Vec>, - is_default: bool, ) -> anyhow::Result<()> { - let key = sign::any_supported_type(&priv_key).context("invalid private key")?; - - let first_cert = &cert_chain[0]; - let tls_server_end_point = TlsServerEndPoint::new(first_cert)?; - - let certificate = SliceReader::new(first_cert) - .context("Failed to parse cerficiate")? - .decode::() - .context("Failed to parse cerficiate")?; - - let common_name = certificate.tbs_certificate.subject.to_string(); - - // We need to get the canonical name for this certificate so we can match them against any domain names - // seen within the proxy codebase. - // - // In scram-proxy we use wildcard certificates only, with the database endpoint as the wildcard subdomain, taken from SNI. - // We need to remove the wildcard prefix for the purposes of certificate selection. - // - // auth-broker does not use SNI and instead uses the Neon-Connection-String header. - // Auth broker has the subdomain `apiauth` we need to remove for the purposes of validating the Neon-Connection-String. - // - // Console Redirect proxy does not use any wildcard domains and does not need any certificate selection or conn string - // validation, so let's we can continue with any common-name - let common_name = if let Some(s) = common_name.strip_prefix("CN=*.") { - s.to_string() - } else if let Some(s) = common_name.strip_prefix("CN=apiauth.") { - s.to_string() - } else if let Some(s) = common_name.strip_prefix("CN=") { - s.to_string() - } else { - bail!("Failed to parse common name from certificate") - }; - - let cert = Arc::new(rustls::sign::CertifiedKey::new(cert_chain, key)); - - if is_default { - self.default = Some((cert.clone(), tls_server_end_point)); - } - + let (common_name, cert, tls_server_end_point) = process_key_cert(priv_key, cert_chain)?; self.certs.insert(common_name, (cert, tls_server_end_point)); - Ok(()) } @@ -177,12 +123,82 @@ impl CertResolver { } } +fn parse_key_cert( + key_path: &str, + cert_path: &str, +) -> anyhow::Result<(PrivateKeyDer<'static>, Vec>)> { + let priv_key = { + let key_bytes = std::fs::read(key_path) + .with_context(|| format!("Failed to read TLS keys at '{key_path}'"))?; + rustls_pemfile::private_key(&mut &key_bytes[..]) + .with_context(|| format!("Failed to parse TLS keys at '{key_path}'"))? + .with_context(|| format!("Failed to parse TLS keys at '{key_path}'"))? + }; + + let cert_chain_bytes = std::fs::read(cert_path) + .context(format!("Failed to read TLS cert file at '{cert_path}.'"))?; + + let cert_chain = { + rustls_pemfile::certs(&mut &cert_chain_bytes[..]) + .try_collect() + .with_context(|| { + format!( + "Failed to read TLS certificate chain from bytes from file at '{cert_path}'." + ) + })? 
+ }; + + Ok((priv_key, cert_chain)) +} + +fn process_key_cert( + priv_key: PrivateKeyDer<'static>, + cert_chain: Vec>, +) -> anyhow::Result<(String, Arc, TlsServerEndPoint)> { + let key = sign::any_supported_type(&priv_key).context("invalid private key")?; + + let first_cert = &cert_chain[0]; + let tls_server_end_point = TlsServerEndPoint::new(first_cert)?; + + let certificate = SliceReader::new(first_cert) + .context("Failed to parse cerficiate")? + .decode::() + .context("Failed to parse cerficiate")?; + + let common_name = certificate.tbs_certificate.subject.to_string(); + + // We need to get the canonical name for this certificate so we can match them against any domain names + // seen within the proxy codebase. + // + // In scram-proxy we use wildcard certificates only, with the database endpoint as the wildcard subdomain, taken from SNI. + // We need to remove the wildcard prefix for the purposes of certificate selection. + // + // auth-broker does not use SNI and instead uses the Neon-Connection-String header. + // Auth broker has the subdomain `apiauth` we need to remove for the purposes of validating the Neon-Connection-String. + // + // Console Redirect proxy does not use any wildcard domains and does not need any certificate selection or conn string + // validation, so let's we can continue with any common-name + let common_name = if let Some(s) = common_name.strip_prefix("CN=*.") { + s.to_string() + } else if let Some(s) = common_name.strip_prefix("CN=apiauth.") { + s.to_string() + } else if let Some(s) = common_name.strip_prefix("CN=") { + s.to_string() + } else { + bail!("Failed to parse common name from certificate") + }; + + let cert = Arc::new(rustls::sign::CertifiedKey::new(cert_chain, key)); + + Ok((common_name, cert, tls_server_end_point)) +} + impl rustls::server::ResolvesServerCert for CertResolver { fn resolve( &self, client_hello: rustls::server::ClientHello<'_>, ) -> Option> { - self.resolve(client_hello.server_name()).map(|x| x.0) + Some(self.resolve(client_hello.server_name()).0) } } @@ -190,7 +206,7 @@ impl CertResolver { pub fn resolve( &self, server_name: Option<&str>, - ) -> Option<(Arc, TlsServerEndPoint)> { + ) -> (Arc, TlsServerEndPoint) { // loop here and cut off more and more subdomains until we find // a match to get a proper wildcard support. OTOH, we now do not // use nested domains, so keep this simple for now. @@ -200,12 +216,17 @@ impl CertResolver { if let Some(mut sni_name) = server_name { loop { if let Some(cert) = self.certs.get(sni_name) { - return Some(cert.clone()); + return cert.clone(); } if let Some((_, rest)) = sni_name.split_once('.') { sni_name = rest; } else { - return None; + // The customer has some custom DNS mapping - just return + // a default certificate. + // + // This will error if the customer uses anything stronger + // than sslmode=require. That's a choice they can make. + return self.default.clone(); } } } else { From 0b243242df493cf3a60de6d0ec1a0d1491c3cd4c Mon Sep 17 00:00:00 2001 From: "Alex Chi Z." <4198311+skyzh@users.noreply.github.com> Date: Mon, 5 May 2025 20:15:22 +0800 Subject: [PATCH 22/40] fix(test): allow flush error in gc-compaction tests (#11822) ## Problem Part of https://github.com/neondatabase/neon/issues/11762 ## Summary of changes While #11762 needs some work to refactor the error propagating thing, we can do a hacky fix for the gc-compaction tests to allow flush error during shutdown. It does not affect correctness. 
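For context, `allowed_errors` is just an allow-list of patterns that the post-test log check skips over. A standalone sketch of that idea (substring matching for brevity; the actual fixture uses regular expressions, and this is not its implementation):

```rust
/// Returns true if a pageserver log line should not fail the test.
fn is_allowed(allowed: &[&str], log_line: &str) -> bool {
    allowed.iter().any(|pattern| log_line.contains(*pattern))
}

fn main() {
    // Benign shutdown-time errors this test now tolerates.
    let allowed = [
        "The timeline or pageserver is shutting down",
        "flush task cancelled",
        "failed to pipe",
    ];
    assert!(is_allowed(&allowed, "ERROR ... flush task cancelled ..."));
    assert!(!is_allowed(&allowed, "ERROR ... corrupted layer file ..."));
}
```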
Signed-off-by: Alex Chi Z --- test_runner/regress/test_compaction.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/test_runner/regress/test_compaction.py b/test_runner/regress/test_compaction.py index 0dfc665a1d..370f57b19d 100644 --- a/test_runner/regress/test_compaction.py +++ b/test_runner/regress/test_compaction.py @@ -202,6 +202,8 @@ def test_pageserver_gc_compaction_preempt( env = neon_env_builder.init_start(initial_tenant_conf=conf) env.pageserver.allowed_errors.append(".*The timeline or pageserver is shutting down.*") + env.pageserver.allowed_errors.append(".*flush task cancelled.*") + env.pageserver.allowed_errors.append(".*failed to pipe.*") tenant_id = env.initial_tenant timeline_id = env.initial_timeline From baf425a2cde7a92873b548115847445f3a43f6b0 Mon Sep 17 00:00:00 2001 From: "devin-ai-integration[bot]" <158243242+devin-ai-integration[bot]@users.noreply.github.com> Date: Mon, 5 May 2025 15:06:37 +0200 Subject: [PATCH 23/40] [pageserver/virtual_file] impr: Improve OpenOptions API ergonomics (#11789) # Improve OpenOptions API ergonomics Closes #11787 This PR improves the OpenOptions API ergonomics by: 1. Making OpenOptions methods take and return owned Self instead of &mut self 2. Changing VirtualFile::open_with_options_v2 to take an owned OpenOptions 3. Removing unnecessary .clone() and .to_owned() calls These changes make the API more idiomatic Rust by leveraging the builder pattern with owned values, which is cleaner and more ergonomic than the previous approach. Link to Devin run: https://app.devin.ai/sessions/c2a4b24f7aca40a3b3777f4259bf8ee1 Requested by: christian@neon.tech --------- Co-authored-by: Devin AI <158243242+devin-ai-integration[bot]@users.noreply.github.com> Co-authored-by: christian@neon.tech --- pageserver/src/virtual_file.rs | 42 ++++++++------------- pageserver/src/virtual_file/open_options.rs | 17 ++++----- 2 files changed, 23 insertions(+), 36 deletions(-) diff --git a/pageserver/src/virtual_file.rs b/pageserver/src/virtual_file.rs index 58953407b1..f429e59ef3 100644 --- a/pageserver/src/virtual_file.rs +++ b/pageserver/src/virtual_file.rs @@ -14,8 +14,6 @@ use std::fs::File; use std::io::{Error, ErrorKind}; use std::os::fd::{AsRawFd, FromRawFd, IntoRawFd, OwnedFd, RawFd}; -#[cfg(target_os = "linux")] -use std::os::unix::fs::OpenOptionsExt; use std::sync::LazyLock; use std::sync::atomic::{AtomicBool, AtomicU8, AtomicUsize, Ordering}; @@ -99,7 +97,7 @@ impl VirtualFile { pub async fn open_with_options_v2>( path: P, - open_options: &OpenOptions, + #[cfg_attr(not(target_os = "linux"), allow(unused_mut))] mut open_options: OpenOptions, ctx: &RequestContext, ) -> Result { let mode = get_io_mode(); @@ -112,21 +110,16 @@ impl VirtualFile { #[cfg(target_os = "linux")] (IoMode::DirectRw, _) => true, }; - let open_options = open_options.clone(); - let open_options = if set_o_direct { + if set_o_direct { #[cfg(target_os = "linux")] { - let mut open_options = open_options; - open_options.custom_flags(nix::libc::O_DIRECT); - open_options + open_options = open_options.custom_flags(nix::libc::O_DIRECT); } #[cfg(not(target_os = "linux"))] unreachable!( "O_DIRECT is not supported on this platform, IoMode's that result in set_o_direct=true shouldn't even be defined" ); - } else { - open_options - }; + } let inner = VirtualFileInner::open_with_options(path, open_options, ctx).await?; Ok(VirtualFile { inner, _mode: mode }) } @@ -530,7 +523,7 @@ impl VirtualFileInner { path: P, ctx: &RequestContext, ) -> Result { - Self::open_with_options(path.as_ref(), 
OpenOptions::new().read(true).clone(), ctx).await + Self::open_with_options(path.as_ref(), OpenOptions::new().read(true), ctx).await } /// Open a file with given options. @@ -558,10 +551,11 @@ impl VirtualFileInner { // It would perhaps be nicer to check just for the read and write flags // explicitly, but OpenOptions doesn't contain any functions to read flags, // only to set them. - let mut reopen_options = open_options.clone(); - reopen_options.create(false); - reopen_options.create_new(false); - reopen_options.truncate(false); + let reopen_options = open_options + .clone() + .create(false) + .create_new(false) + .truncate(false); let vfile = VirtualFileInner { handle: RwLock::new(handle), @@ -1307,7 +1301,7 @@ mod tests { opts: OpenOptions, ctx: &RequestContext, ) -> Result { - let vf = VirtualFile::open_with_options_v2(&path, &opts, ctx).await?; + let vf = VirtualFile::open_with_options_v2(&path, opts, ctx).await?; Ok(MaybeVirtualFile::VirtualFile(vf)) } } @@ -1374,7 +1368,7 @@ mod tests { let _ = file_a.read_string_at(0, 1, &ctx).await.unwrap_err(); // Close the file and re-open for reading - let mut file_a = A::open(path_a, OpenOptions::new().read(true).to_owned(), &ctx).await?; + let mut file_a = A::open(path_a, OpenOptions::new().read(true), &ctx).await?; // cannot write to a file opened in read-only mode let _ = file_a @@ -1393,8 +1387,7 @@ mod tests { .read(true) .write(true) .create(true) - .truncate(true) - .to_owned(), + .truncate(true), &ctx, ) .await?; @@ -1412,12 +1405,7 @@ mod tests { let mut vfiles = Vec::new(); for _ in 0..100 { - let mut vfile = A::open( - path_b.clone(), - OpenOptions::new().read(true).to_owned(), - &ctx, - ) - .await?; + let mut vfile = A::open(path_b.clone(), OpenOptions::new().read(true), &ctx).await?; assert_eq!("FOOBAR", vfile.read_string_at(0, 6, &ctx).await?); vfiles.push(vfile); } @@ -1466,7 +1454,7 @@ mod tests { for _ in 0..VIRTUAL_FILES { let f = VirtualFileInner::open_with_options( &test_file_path, - OpenOptions::new().read(true).clone(), + OpenOptions::new().read(true), &ctx, ) .await?; diff --git a/pageserver/src/virtual_file/open_options.rs b/pageserver/src/virtual_file/open_options.rs index 7d323f3d8f..2a7bb693f2 100644 --- a/pageserver/src/virtual_file/open_options.rs +++ b/pageserver/src/virtual_file/open_options.rs @@ -1,6 +1,7 @@ //! 
Enum-dispatch to the `OpenOptions` type of the respective [`super::IoEngineKind`]; use std::os::fd::OwnedFd; +use std::os::unix::fs::OpenOptionsExt; use std::path::Path; use super::io_engine::IoEngine; @@ -43,7 +44,7 @@ impl OpenOptions { self.write } - pub fn read(&mut self, read: bool) -> &mut OpenOptions { + pub fn read(mut self, read: bool) -> Self { match &mut self.inner { Inner::StdFs(x) => { let _ = x.read(read); @@ -56,7 +57,7 @@ impl OpenOptions { self } - pub fn write(&mut self, write: bool) -> &mut OpenOptions { + pub fn write(mut self, write: bool) -> Self { self.write = write; match &mut self.inner { Inner::StdFs(x) => { @@ -70,7 +71,7 @@ impl OpenOptions { self } - pub fn create(&mut self, create: bool) -> &mut OpenOptions { + pub fn create(mut self, create: bool) -> Self { match &mut self.inner { Inner::StdFs(x) => { let _ = x.create(create); @@ -83,7 +84,7 @@ impl OpenOptions { self } - pub fn create_new(&mut self, create_new: bool) -> &mut OpenOptions { + pub fn create_new(mut self, create_new: bool) -> Self { match &mut self.inner { Inner::StdFs(x) => { let _ = x.create_new(create_new); @@ -96,7 +97,7 @@ impl OpenOptions { self } - pub fn truncate(&mut self, truncate: bool) -> &mut OpenOptions { + pub fn truncate(mut self, truncate: bool) -> Self { match &mut self.inner { Inner::StdFs(x) => { let _ = x.truncate(truncate); @@ -124,10 +125,8 @@ impl OpenOptions { } } } -} -impl std::os::unix::prelude::OpenOptionsExt for OpenOptions { - fn mode(&mut self, mode: u32) -> &mut OpenOptions { + pub fn mode(mut self, mode: u32) -> Self { match &mut self.inner { Inner::StdFs(x) => { let _ = x.mode(mode); @@ -140,7 +139,7 @@ impl std::os::unix::prelude::OpenOptionsExt for OpenOptions { self } - fn custom_flags(&mut self, flags: i32) -> &mut OpenOptions { + pub fn custom_flags(mut self, flags: i32) -> Self { match &mut self.inner { Inner::StdFs(x) => { let _ = x.custom_flags(flags); From cb67f9a6517652d21917a77f61b3e466f992d901 Mon Sep 17 00:00:00 2001 From: Peter Bendel Date: Mon, 5 May 2025 16:30:13 +0200 Subject: [PATCH 24/40] delete orphan left over projects (#11826) ## Problem sometimes our benchmarking GitHub workflow is terminated by side-effects beyond our control (e.g. 
GitHub runner looses connection to server) and then we have left-over Neon projects created during the workflow [Example where GitHub runner lost connection and project was not deleted](https://github.com/neondatabase/neon/actions/runs/14017400543/job/39244816485) Fixes https://github.com/neondatabase/cloud/issues/28546 ## Summary of changes - Add a cleanup step that cleans up left-over projects - also give each project created during workflows a name that references the testcase and GitHub runid ## Example run (test of new job steps) https://github.com/neondatabase/neon/actions/runs/14837092399/job/41650741922#step:6:63 --------- Co-authored-by: a-masterov <72613290+a-masterov@users.noreply.github.com> --- .github/workflows/benchmarking.yml | 71 +++++++++++++++++++ .../test_cumulative_statistics_persistence.py | 6 +- .../performance/test_physical_replication.py | 8 ++- 3 files changed, 82 insertions(+), 3 deletions(-) diff --git a/.github/workflows/benchmarking.yml b/.github/workflows/benchmarking.yml index 5107f457e2..220d7905b1 100644 --- a/.github/workflows/benchmarking.yml +++ b/.github/workflows/benchmarking.yml @@ -53,6 +53,77 @@ concurrency: cancel-in-progress: true jobs: + cleanup: + runs-on: [ self-hosted, us-east-2, x64 ] + container: + image: ghcr.io/neondatabase/build-tools:pinned-bookworm + credentials: + username: ${{ github.actor }} + password: ${{ secrets.GITHUB_TOKEN }} + options: --init + env: + ORG_ID: org-solitary-dew-09443886 + LIMIT: 100 + SEARCH: "Created by actions/neon-project-create; GITHUB_RUN_ID" + BASE_URL: https://console-stage.neon.build/api/v2 + DRY_RUN: "false" # Set to "true" to just test out the workflow + + steps: + - name: Harden the runner (Audit all outbound calls) + uses: step-security/harden-runner@4d991eb9b905ef189e4c376166672c3f2f230481 # v2.11.0 + with: + egress-policy: audit + + - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 + + - name: Cleanup inactive Neon projects left over from prior runs + env: + API_KEY: ${{ secrets.NEON_STAGING_API_KEY }} + run: | + set -euo pipefail + + NOW=$(date -u +%s) + DAYS_AGO=$((NOW - 5 * 86400)) + + REQUEST_URL="$BASE_URL/projects?limit=$LIMIT&search=$(printf '%s' "$SEARCH" | jq -sRr @uri)&org_id=$ORG_ID" + + echo "Requesting project list from:" + echo "$REQUEST_URL" + + response=$(curl -s -X GET "$REQUEST_URL" \ + --header "Accept: application/json" \ + --header "Content-Type: application/json" \ + --header "Authorization: Bearer ${API_KEY}" ) + + echo "Response:" + echo "$response" | jq . + + projects_to_delete=$(echo "$response" | jq --argjson cutoff "$DAYS_AGO" ' + .projects[] + | select(.compute_last_active_at != null) + | select((.compute_last_active_at | fromdateiso8601) < $cutoff) + | {id, name, compute_last_active_at} + ') + + if [ -z "$projects_to_delete" ]; then + echo "No projects eligible for deletion." + exit 0 + fi + + echo "Projects that will be deleted:" + echo "$projects_to_delete" | jq -r '.id' + + if [ "$DRY_RUN" = "false" ]; then + echo "$projects_to_delete" | jq -r '.id' | while read -r project_id; do + echo "Deleting project: $project_id" + curl -s -X DELETE "$BASE_URL/projects/$project_id" \ + --header "Accept: application/json" \ + --header "Content-Type: application/json" \ + --header "Authorization: Bearer ${API_KEY}" + done + else + echo "Dry run enabled — no projects were deleted." 
+ fi bench: if: ${{ github.event.inputs.run_only_pgvector_tests == 'false' || github.event.inputs.run_only_pgvector_tests == null }} permissions: diff --git a/test_runner/performance/test_cumulative_statistics_persistence.py b/test_runner/performance/test_cumulative_statistics_persistence.py index 061467bbad..5e9e55cb0f 100644 --- a/test_runner/performance/test_cumulative_statistics_persistence.py +++ b/test_runner/performance/test_cumulative_statistics_persistence.py @@ -1,4 +1,5 @@ import math # Add this import +import os import time import traceback from pathlib import Path @@ -87,7 +88,10 @@ def test_cumulative_statistics_persistence( - insert additional tuples that by itself are not enough to trigger auto-vacuum but in combination with the previous tuples are - verify that autovacuum is triggered by the combination of tuples inserted before and after endpoint suspension """ - project = neon_api.create_project(pg_version) + project = neon_api.create_project( + pg_version, + f"Test cumulative statistics persistence, GITHUB_RUN_ID={os.getenv('GITHUB_RUN_ID')}", + ) project_id = project["project"]["id"] neon_api.wait_for_operation_to_finish(project_id) endpoint_id = project["endpoints"][0]["id"] diff --git a/test_runner/performance/test_physical_replication.py b/test_runner/performance/test_physical_replication.py index bdafa2d657..c580bfcc14 100644 --- a/test_runner/performance/test_physical_replication.py +++ b/test_runner/performance/test_physical_replication.py @@ -62,7 +62,9 @@ def test_ro_replica_lag( pgbench_duration = f"-T{test_duration_min * 60 * 2}" - project = neon_api.create_project(pg_version) + project = neon_api.create_project( + pg_version, f"Test readonly replica lag, GITHUB_RUN_ID={os.getenv('GITHUB_RUN_ID')}" + ) project_id = project["project"]["id"] log.info("Project ID: %s", project_id) log.info("Primary endpoint ID: %s", project["endpoints"][0]["id"]) @@ -195,7 +197,9 @@ def test_replication_start_stop( pgbench_duration = f"-T{2**num_replicas * configuration_test_time_sec}" error_occurred = False - project = neon_api.create_project(pg_version) + project = neon_api.create_project( + pg_version, f"Test replication start stop, GITHUB_RUN_ID={os.getenv('GITHUB_RUN_ID')}" + ) project_id = project["project"]["id"] log.info("Project ID: %s", project_id) log.info("Primary endpoint ID: %s", project["endpoints"][0]["id"]) From 16ca74a3f41789a1136e1256a8b9e5ec59b33417 Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Tue, 6 May 2025 09:49:23 +0300 Subject: [PATCH 25/40] Add SAFETY comment on libc::sysconf() call (#11581) I got an 'undocumented_unsafe_blocks' clippy warning about it. Not sure why I got the warning now and not before, but in any case a comment is a good idea. --- libs/metrics/src/more_process_metrics.rs | 1 + 1 file changed, 1 insertion(+) diff --git a/libs/metrics/src/more_process_metrics.rs b/libs/metrics/src/more_process_metrics.rs index 13a745e031..f91800685f 100644 --- a/libs/metrics/src/more_process_metrics.rs +++ b/libs/metrics/src/more_process_metrics.rs @@ -16,6 +16,7 @@ pub struct Collector { const NMETRICS: usize = 2; static CLK_TCK_F64: Lazy = Lazy::new(|| { + // SAFETY: libc::sysconf is safe, it merely returns a value. 
let long = unsafe { libc::sysconf(libc::_SC_CLK_TCK) }; if long == -1 { panic!("sysconf(_SC_CLK_TCK) failed"); From c6ff18affccc73372078c393163f06a88b871820 Mon Sep 17 00:00:00 2001 From: Dmitrii Kovalkov <34828390+DimasKovas@users.noreply.github.com> Date: Tue, 6 May 2025 10:51:51 +0400 Subject: [PATCH 26/40] cosmetics(pgxn/neon): WP code small clean up (#11824) ## Problem Some small cosmetic changes I made while reading the code. Should not affect anything. ## Summary of changes - Remove `n_votes` field because it's not used anymore - Explicitly initialize `safekeepers_generation` with `INVALID_GENERATION` if the generation is not present (the struct is zero-initialized anyway, but the explicit initialization is better IMHO) - Access SafekeeperId via pointer `sk_id` created above --- pgxn/neon/neon_walreader.c | 2 +- pgxn/neon/walproposer.c | 6 +++--- pgxn/neon/walproposer.h | 3 --- 3 files changed, 4 insertions(+), 7 deletions(-) diff --git a/pgxn/neon/neon_walreader.c b/pgxn/neon/neon_walreader.c index be2c4ddf79..d5e3a38dbb 100644 --- a/pgxn/neon/neon_walreader.c +++ b/pgxn/neon/neon_walreader.c @@ -150,7 +150,7 @@ NeonWALReaderFree(NeonWALReader *state) * fetched from timeline 'tli'. * * Returns NEON_WALREAD_SUCCESS if succeeded, NEON_WALREAD_ERROR if an error - * occurs, in which case 'err' has the desciption. Error always closes remote + * occurs, in which case 'err' has the description. Error always closes remote * connection, if there was any, so socket subscription should be removed. * * NEON_WALREAD_WOULDBLOCK means caller should obtain socket to wait for with diff --git a/pgxn/neon/walproposer.c b/pgxn/neon/walproposer.c index f4f1398375..3befb42030 100644 --- a/pgxn/neon/walproposer.c +++ b/pgxn/neon/walproposer.c @@ -124,6 +124,7 @@ WalProposerCreate(WalProposerConfig *config, walproposer_api api) } else { + wp->safekeepers_generation = INVALID_GENERATION; host = wp->config->safekeepers_list; } wp_log(LOG, "safekeepers_generation=%u", wp->safekeepers_generation); @@ -756,7 +757,7 @@ UpdateMemberSafekeeperPtr(WalProposer *wp, Safekeeper *sk) { SafekeeperId *sk_id = &wp->mconf.members.m[i]; - if (wp->mconf.members.m[i].node_id == sk->greetResponse.nodeId) + if (sk_id->node_id == sk->greetResponse.nodeId) { /* * If mconf or list of safekeepers to connect to changed (the @@ -781,7 +782,7 @@ UpdateMemberSafekeeperPtr(WalProposer *wp, Safekeeper *sk) { SafekeeperId *sk_id = &wp->mconf.new_members.m[i]; - if (wp->mconf.new_members.m[i].node_id == sk->greetResponse.nodeId) + if (sk_id->node_id == sk->greetResponse.nodeId) { if (wp->new_members_safekeepers[i] != NULL && wp->new_members_safekeepers[i] != sk) { @@ -1071,7 +1072,6 @@ RecvVoteResponse(Safekeeper *sk) /* ready for elected message */ sk->state = SS_WAIT_ELECTED; - wp->n_votes++; /* Are we already elected? 
*/ if (wp->state == WPS_CAMPAIGN) { diff --git a/pgxn/neon/walproposer.h b/pgxn/neon/walproposer.h index 648b0015ad..83ef72d3d7 100644 --- a/pgxn/neon/walproposer.h +++ b/pgxn/neon/walproposer.h @@ -845,9 +845,6 @@ typedef struct WalProposer /* timeline globally starts at this LSN */ XLogRecPtr timelineStartLsn; - /* number of votes collected from safekeepers */ - int n_votes; - /* number of successful connections over the lifetime of walproposer */ int n_connected; From f0e7b3e0efc964539c9561b5976922e9c1cccbcb Mon Sep 17 00:00:00 2001 From: Konstantin Knizhnik Date: Tue, 6 May 2025 10:24:27 +0300 Subject: [PATCH 27/40] Use unlogged build for gist_indexsortbuild_flush_ready_pages (#11753) ## Problem See https://github.com/neondatabase/neon/issues/11718 GIST index can be constructed in two ways: GIST_SORTED_BUILD and GIST_BUFFERING. We used unlogged build in the second case but not in the first. ## Summary of changes Use unlogged build in `gist_indexsortbuild_flush_ready_pages` Correspondent Postgres PRsL: https://github.com/neondatabase/postgres/pull/624 https://github.com/neondatabase/postgres/pull/625 https://github.com/neondatabase/postgres/pull/626 --------- Co-authored-by: Konstantin Knizhnik Co-authored-by: Heikki Linnakangas --- pgxn/neon/pagestore_smgr.c | 6 ++++++ test_runner/regress/test_gist.py | 28 ++++++++++++++++++++++++++++ vendor/postgres-v14 | 2 +- vendor/postgres-v15 | 2 +- vendor/postgres-v16 | 2 +- vendor/revisions.json | 6 +++--- 6 files changed, 40 insertions(+), 6 deletions(-) create mode 100644 test_runner/regress/test_gist.py diff --git a/pgxn/neon/pagestore_smgr.c b/pgxn/neon/pagestore_smgr.c index 3bf0bedf99..87eb420717 100644 --- a/pgxn/neon/pagestore_smgr.c +++ b/pgxn/neon/pagestore_smgr.c @@ -1989,8 +1989,14 @@ neon_start_unlogged_build(SMgrRelation reln) neon_log(ERROR, "unknown relpersistence '%c'", reln->smgr_relpersistence); } +#if PG_MAJORVERSION_NUM >= 17 + /* + * We have to disable this check for pg14-16 because sorted build of GIST index requires + * to perform unlogged build several times + */ if (smgrnblocks(reln, MAIN_FORKNUM) != 0) neon_log(ERROR, "cannot perform unlogged index build, index is not empty "); +#endif unlogged_build_rel = reln; unlogged_build_phase = UNLOGGED_BUILD_PHASE_1; diff --git a/test_runner/regress/test_gist.py b/test_runner/regress/test_gist.py new file mode 100644 index 0000000000..89e3b9b2b1 --- /dev/null +++ b/test_runner/regress/test_gist.py @@ -0,0 +1,28 @@ +from __future__ import annotations + +from typing import TYPE_CHECKING + +if TYPE_CHECKING: + from fixtures.neon_fixtures import NeonEnv + + +# +# Test unlogged build for GIST index +# +def test_gist(neon_simple_env: NeonEnv): + env = neon_simple_env + endpoint = env.endpoints.create_start("main") + con = endpoint.connect() + cur = con.cursor() + iterations = 100 + + for _ in range(iterations): + cur.execute( + "CREATE TABLE pvactst (i INT, a INT[], p POINT) with (autovacuum_enabled = off)" + ) + cur.execute( + "INSERT INTO pvactst SELECT i, array[1,2,3], point(i, i+1) FROM generate_series(1,1000) i" + ) + cur.execute("CREATE INDEX gist_pvactst ON pvactst USING gist (p)") + cur.execute("VACUUM pvactst") + cur.execute("DROP TABLE pvactst") diff --git a/vendor/postgres-v14 b/vendor/postgres-v14 index d3c9d61fb7..c8dab02bfc 160000 --- a/vendor/postgres-v14 +++ b/vendor/postgres-v14 @@ -1 +1 @@ -Subproject commit d3c9d61fb7a362a165dac7060819dd9d6ad68c28 +Subproject commit c8dab02bfc003ae7bd59096919042d7840f3c194 diff --git a/vendor/postgres-v15 b/vendor/postgres-v15 
index 8ecb12f21d..b838c8969b 160000 --- a/vendor/postgres-v15 +++ b/vendor/postgres-v15 @@ -1 +1 @@ -Subproject commit 8ecb12f21d862dfa39f7204b8f5e1c00a2a225b3 +Subproject commit b838c8969b7c63f3e637a769656f5f36793b797c diff --git a/vendor/postgres-v16 b/vendor/postgres-v16 index 37496f87b5..05ddf212e2 160000 --- a/vendor/postgres-v16 +++ b/vendor/postgres-v16 @@ -1 +1 @@ -Subproject commit 37496f87b5324af53c56127e278ee5b1e8435253 +Subproject commit 05ddf212e2e07b788b5c8b88bdcf98630941f6ae diff --git a/vendor/revisions.json b/vendor/revisions.json index 90d878d0f7..74a6ff33d7 100644 --- a/vendor/revisions.json +++ b/vendor/revisions.json @@ -5,14 +5,14 @@ ], "v16": [ "16.8", - "37496f87b5324af53c56127e278ee5b1e8435253" + "05ddf212e2e07b788b5c8b88bdcf98630941f6ae" ], "v15": [ "15.12", - "8ecb12f21d862dfa39f7204b8f5e1c00a2a225b3" + "b838c8969b7c63f3e637a769656f5f36793b797c" ], "v14": [ "14.17", - "d3c9d61fb7a362a165dac7060819dd9d6ad68c28" + "c8dab02bfc003ae7bd59096919042d7840f3c194" ] } From 62ac5b94b3842322f267df28ff90ad3c9d4060a7 Mon Sep 17 00:00:00 2001 From: Folke Behrens Date: Tue, 6 May 2025 09:28:25 +0000 Subject: [PATCH 28/40] proxy: Include the exp/nbf timestamps in the errors (#11828) ## Problem It's difficult to tell when the JWT expired from current logs and error messages. ## Summary of changes Add exp/nbf timestamps to the respective error variants. Also use checked_add when deserializing a SystemTime from JWT. Related to INC-509 --- proxy/src/auth/backend/jwt.rs | 41 ++++++++++++++++++++++++----------- 1 file changed, 28 insertions(+), 13 deletions(-) diff --git a/proxy/src/auth/backend/jwt.rs b/proxy/src/auth/backend/jwt.rs index 44a6a42665..a48f67199a 100644 --- a/proxy/src/auth/backend/jwt.rs +++ b/proxy/src/auth/backend/jwt.rs @@ -409,14 +409,22 @@ impl JwkCacheEntryLock { if let Some(exp) = payload.expiration { if now >= exp + CLOCK_SKEW_LEEWAY { - return Err(JwtError::InvalidClaims(JwtClaimsError::JwtTokenHasExpired)); + return Err(JwtError::InvalidClaims(JwtClaimsError::JwtTokenHasExpired( + exp.duration_since(SystemTime::UNIX_EPOCH) + .unwrap_or_default() + .as_secs(), + ))); } } if let Some(nbf) = payload.not_before { if nbf >= now + CLOCK_SKEW_LEEWAY { return Err(JwtError::InvalidClaims( - JwtClaimsError::JwtTokenNotYetReadyToUse, + JwtClaimsError::JwtTokenNotYetReadyToUse( + nbf.duration_since(SystemTime::UNIX_EPOCH) + .unwrap_or_default() + .as_secs(), + ), )); } } @@ -534,10 +542,10 @@ struct JwtPayload<'a> { #[serde(rename = "aud", default)] audience: OneOrMany, /// Expiration - Time after which the JWT expires - #[serde(deserialize_with = "numeric_date_opt", rename = "exp", default)] + #[serde(rename = "exp", deserialize_with = "numeric_date_opt", default)] expiration: Option, - /// Not before - Time after which the JWT expires - #[serde(deserialize_with = "numeric_date_opt", rename = "nbf", default)] + /// Not before - Time before which the JWT is not valid + #[serde(rename = "nbf", deserialize_with = "numeric_date_opt", default)] not_before: Option, // the following entries are only extracted for the sake of debug logging. @@ -609,8 +617,15 @@ impl<'de> Deserialize<'de> for OneOrMany { } fn numeric_date_opt<'de, D: Deserializer<'de>>(d: D) -> Result, D::Error> { - let d = >::deserialize(d)?; - Ok(d.map(|n| SystemTime::UNIX_EPOCH + Duration::from_secs(n))) + >::deserialize(d)? 
+ .map(|t| { + SystemTime::UNIX_EPOCH + .checked_add(Duration::from_secs(t)) + .ok_or_else(|| { + serde::de::Error::custom(format_args!("timestamp out of bounds: {t}")) + }) + }) + .transpose() } struct JwkRenewalPermit<'a> { @@ -746,11 +761,11 @@ pub enum JwtClaimsError { #[error("invalid JWT token audience")] InvalidJwtTokenAudience, - #[error("JWT token has expired")] - JwtTokenHasExpired, + #[error("JWT token has expired (exp={0})")] + JwtTokenHasExpired(u64), - #[error("JWT token is not yet ready to use")] - JwtTokenNotYetReadyToUse, + #[error("JWT token is not yet ready to use (nbf={0})")] + JwtTokenNotYetReadyToUse(u64), } #[allow(dead_code, reason = "Debug use only")] @@ -1233,14 +1248,14 @@ X0n5X2/pBLJzxZc62ccvZYVnctBiFs6HbSnxpuMQCfkt/BcR/ttIepBQQIW86wHL "nbf": now + 60, "aud": "neon", }}, - error: JwtClaimsError::JwtTokenNotYetReadyToUse, + error: JwtClaimsError::JwtTokenNotYetReadyToUse(now + 60), }, Test { body: json! {{ "exp": now - 60, "aud": ["neon"], }}, - error: JwtClaimsError::JwtTokenHasExpired, + error: JwtClaimsError::JwtTokenHasExpired(now - 60), }, Test { body: json! {{ From 50dc2fae771c6720b77eeb431e90a2cb300b9b5c Mon Sep 17 00:00:00 2001 From: Alexander Bayandin Date: Tue, 6 May 2025 11:52:21 +0100 Subject: [PATCH 29/40] compute-node.Dockerfile: remove layer with duplicated name (#11807) ## Problem Two `rust-extensions-build-pgrx14` layers were added independently in two different PRs, and the layers are exactly the same ## Summary of changes - Remove one of `rust-extensions-build-pgrx14` layers --- compute/compute-node.Dockerfile | 15 ++------------- 1 file changed, 2 insertions(+), 13 deletions(-) diff --git a/compute/compute-node.Dockerfile b/compute/compute-node.Dockerfile index cc338cec6a..8766eb519e 100644 --- a/compute/compute-node.Dockerfile +++ b/compute/compute-node.Dockerfile @@ -1084,23 +1084,12 @@ RUN cargo install --locked --version 0.12.9 cargo-pgrx && \ /bin/bash -c 'cargo pgrx init --pg${PG_VERSION:1}=/usr/local/pgsql/bin/pg_config' USER root + ######################################################################################### # # Layer "rust extensions pgrx14" # -######################################################################################### -FROM pg-build-nonroot-with-cargo AS rust-extensions-build-pgrx14 -ARG PG_VERSION - -RUN cargo install --locked --version 0.14.1 cargo-pgrx && \ - /bin/bash -c 'cargo pgrx init --pg${PG_VERSION:1}=/usr/local/pgsql/bin/pg_config' - -USER root -######################################################################################### -# -# Layer "rust extensions pgrx14" -# -# Version 14 is now required by a few +# Version 14 is now required by a few # This layer should be used as a base for new pgrx extensions, # and eventually get merged with `rust-extensions-build` # From c82e363ed90742214de1bf5efb7a721019e89d42 Mon Sep 17 00:00:00 2001 From: Peter Bendel Date: Tue, 6 May 2025 14:26:13 +0200 Subject: [PATCH 30/40] cleanup orphan projects created by python tests, too (#11836) ## Problem - some projects are created during GitHub workflows but not by action project_create but by python test scripts. 
If the python test fails the project is not deleted ## Summary of changes - make sure we cleanup those python created projects a few days after they are no longer used, too --- .github/workflows/benchmarking.yml | 2 +- test_runner/random_ops/test_random_ops.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/benchmarking.yml b/.github/workflows/benchmarking.yml index 220d7905b1..79371ec704 100644 --- a/.github/workflows/benchmarking.yml +++ b/.github/workflows/benchmarking.yml @@ -64,7 +64,7 @@ jobs: env: ORG_ID: org-solitary-dew-09443886 LIMIT: 100 - SEARCH: "Created by actions/neon-project-create; GITHUB_RUN_ID" + SEARCH: "GITHUB_RUN_ID=" BASE_URL: https://console-stage.neon.build/api/v2 DRY_RUN: "false" # Set to "true" to just test out the workflow diff --git a/test_runner/random_ops/test_random_ops.py b/test_runner/random_ops/test_random_ops.py index 643151fa11..645c9b7b9d 100644 --- a/test_runner/random_ops/test_random_ops.py +++ b/test_runner/random_ops/test_random_ops.py @@ -206,7 +206,7 @@ class NeonProject: self.neon_api = neon_api self.pg_bin = pg_bin proj = self.neon_api.create_project( - pg_version, f"Automatic random API test {os.getenv('GITHUB_RUN_ID')}" + pg_version, f"Automatic random API test GITHUB_RUN_ID={os.getenv('GITHUB_RUN_ID')}" ) self.id: str = proj["project"]["id"] self.name: str = proj["project"]["name"] From 6827f2f58ccb8dec0922d0a2c7a413998cf2f539 Mon Sep 17 00:00:00 2001 From: "Alex Chi Z." <4198311+skyzh@users.noreply.github.com> Date: Tue, 6 May 2025 20:27:16 +0800 Subject: [PATCH 31/40] fix(pageserver): only keep `iter_with_options` API, improve docs in gc-compact (#11804) ## Problem Address comments in https://github.com/neondatabase/neon/pull/11709 ## Summary of changes - remove `iter` API, users always need to specify buffer size depending on the expected memory usage. - several doc improvements --------- Signed-off-by: Alex Chi Z Co-authored-by: Christian Schwarz --- .../src/tenant/storage_layer/delta_layer.rs | 15 +----- .../tenant/storage_layer/filter_iterator.rs | 4 +- .../src/tenant/storage_layer/image_layer.rs | 15 +----- .../tenant/storage_layer/merge_iterator.rs | 46 +++++++++++-------- pageserver/src/tenant/timeline/compaction.rs | 13 ++++-- 5 files changed, 42 insertions(+), 51 deletions(-) diff --git a/pageserver/src/tenant/storage_layer/delta_layer.rs b/pageserver/src/tenant/storage_layer/delta_layer.rs index 607b0d513c..11875ac653 100644 --- a/pageserver/src/tenant/storage_layer/delta_layer.rs +++ b/pageserver/src/tenant/storage_layer/delta_layer.rs @@ -1441,14 +1441,6 @@ impl DeltaLayerInner { offset } - pub fn iter<'a>(&'a self, ctx: &'a RequestContext) -> DeltaLayerIterator<'a> { - self.iter_with_options( - ctx, - 1024 * 8192, // The default value. Unit tests might use a different value. 1024 * 8K = 8MB buffer. - 1024, // The default value. 
Unit tests might use a different value - ) - } - pub fn iter_with_options<'a>( &'a self, ctx: &'a RequestContext, @@ -1634,7 +1626,6 @@ pub(crate) mod test { use crate::tenant::disk_btree::tests::TestDisk; use crate::tenant::harness::{TIMELINE_ID, TenantHarness}; use crate::tenant::storage_layer::{Layer, ResidentLayer}; - use crate::tenant::vectored_blob_io::StreamingVectoredReadPlanner; use crate::tenant::{TenantShard, Timeline}; /// Construct an index for a fictional delta layer and and then @@ -2311,8 +2302,7 @@ pub(crate) mod test { for batch_size in [1, 2, 4, 8, 3, 7, 13] { println!("running with batch_size={batch_size} max_read_size={max_read_size}"); // Test if the batch size is correctly determined - let mut iter = delta_layer.iter(&ctx); - iter.planner = StreamingVectoredReadPlanner::new(max_read_size, batch_size); + let mut iter = delta_layer.iter_with_options(&ctx, max_read_size, batch_size); let mut num_items = 0; for _ in 0..3 { iter.next_batch().await.unwrap(); @@ -2329,8 +2319,7 @@ pub(crate) mod test { iter.key_values_batch.clear(); } // Test if the result is correct - let mut iter = delta_layer.iter(&ctx); - iter.planner = StreamingVectoredReadPlanner::new(max_read_size, batch_size); + let mut iter = delta_layer.iter_with_options(&ctx, max_read_size, batch_size); assert_delta_iter_equal(&mut iter, &test_deltas).await; } } diff --git a/pageserver/src/tenant/storage_layer/filter_iterator.rs b/pageserver/src/tenant/storage_layer/filter_iterator.rs index 8d172a1c19..1a330ecfc2 100644 --- a/pageserver/src/tenant/storage_layer/filter_iterator.rs +++ b/pageserver/src/tenant/storage_layer/filter_iterator.rs @@ -157,7 +157,7 @@ mod tests { .await .unwrap(); - let merge_iter = MergeIterator::create( + let merge_iter = MergeIterator::create_for_testing( &[resident_layer_1.get_as_delta(&ctx).await.unwrap()], &[], &ctx, @@ -182,7 +182,7 @@ mod tests { result.extend(test_deltas1[90..100].iter().cloned()); assert_filter_iter_equal(&mut filter_iter, &result).await; - let merge_iter = MergeIterator::create( + let merge_iter = MergeIterator::create_for_testing( &[resident_layer_1.get_as_delta(&ctx).await.unwrap()], &[], &ctx, diff --git a/pageserver/src/tenant/storage_layer/image_layer.rs b/pageserver/src/tenant/storage_layer/image_layer.rs index 2f7c5715bb..d684230572 100644 --- a/pageserver/src/tenant/storage_layer/image_layer.rs +++ b/pageserver/src/tenant/storage_layer/image_layer.rs @@ -684,14 +684,6 @@ impl ImageLayerInner { } } - pub(crate) fn iter<'a>(&'a self, ctx: &'a RequestContext) -> ImageLayerIterator<'a> { - self.iter_with_options( - ctx, - 1024 * 8192, // The default value. Unit tests might use a different value. 1024 * 8K = 8MB buffer. - 1024, // The default value. 
Unit tests might use a different value - ) - } - pub(crate) fn iter_with_options<'a>( &'a self, ctx: &'a RequestContext, @@ -1240,7 +1232,6 @@ mod test { use crate::context::RequestContext; use crate::tenant::harness::{TIMELINE_ID, TenantHarness}; use crate::tenant::storage_layer::{Layer, ResidentLayer}; - use crate::tenant::vectored_blob_io::StreamingVectoredReadPlanner; use crate::tenant::{TenantShard, Timeline}; #[tokio::test] @@ -1507,8 +1498,7 @@ mod test { for batch_size in [1, 2, 4, 8, 3, 7, 13] { println!("running with batch_size={batch_size} max_read_size={max_read_size}"); // Test if the batch size is correctly determined - let mut iter = img_layer.iter(&ctx); - iter.planner = StreamingVectoredReadPlanner::new(max_read_size, batch_size); + let mut iter = img_layer.iter_with_options(&ctx, max_read_size, batch_size); let mut num_items = 0; for _ in 0..3 { iter.next_batch().await.unwrap(); @@ -1525,8 +1515,7 @@ mod test { iter.key_values_batch.clear(); } // Test if the result is correct - let mut iter = img_layer.iter(&ctx); - iter.planner = StreamingVectoredReadPlanner::new(max_read_size, batch_size); + let mut iter = img_layer.iter_with_options(&ctx, max_read_size, batch_size); assert_img_iter_equal(&mut iter, &test_imgs, Lsn(0x10)).await; } } diff --git a/pageserver/src/tenant/storage_layer/merge_iterator.rs b/pageserver/src/tenant/storage_layer/merge_iterator.rs index e084e3d567..ea3dea50c3 100644 --- a/pageserver/src/tenant/storage_layer/merge_iterator.rs +++ b/pageserver/src/tenant/storage_layer/merge_iterator.rs @@ -19,14 +19,6 @@ pub(crate) enum LayerRef<'a> { } impl<'a> LayerRef<'a> { - #[allow(dead_code)] - fn iter(self, ctx: &'a RequestContext) -> LayerIterRef<'a> { - match self { - Self::Image(x) => LayerIterRef::Image(x.iter(ctx)), - Self::Delta(x) => LayerIterRef::Delta(x.iter(ctx)), - } - } - fn iter_with_options( self, ctx: &'a RequestContext, @@ -322,6 +314,28 @@ impl MergeIteratorItem for ((Key, Lsn, Value), Arc) { } impl<'a> MergeIterator<'a> { + #[cfg(test)] + pub(crate) fn create_for_testing( + deltas: &[&'a DeltaLayerInner], + images: &[&'a ImageLayerInner], + ctx: &'a RequestContext, + ) -> Self { + Self::create_with_options(deltas, images, ctx, 1024 * 8192, 1024) + } + + /// Create a new merge iterator with custom options. + /// + /// Adjust `max_read_size` and `max_batch_size` to trade memory usage for performance. The size should scale + /// with the number of layers to compact. If there are a lot of layers, consider reducing the values, so that + /// the buffer does not take too much memory. 
+ /// + /// The default options for L0 compactions are: + /// - max_read_size: 1024 * 8192 (8MB) + /// - max_batch_size: 1024 + /// + /// The default options for gc-compaction are: + /// - max_read_size: 128 * 8192 (1MB) + /// - max_batch_size: 128 pub fn create_with_options( deltas: &[&'a DeltaLayerInner], images: &[&'a ImageLayerInner], @@ -351,14 +365,6 @@ impl<'a> MergeIterator<'a> { } } - pub fn create( - deltas: &[&'a DeltaLayerInner], - images: &[&'a ImageLayerInner], - ctx: &'a RequestContext, - ) -> Self { - Self::create_with_options(deltas, images, ctx, 1024 * 8192, 1024) - } - pub(crate) async fn next_inner(&mut self) -> anyhow::Result> { while let Some(mut iter) = self.heap.peek_mut() { if !iter.is_loaded() { @@ -477,7 +483,7 @@ mod tests { let resident_layer_2 = produce_delta_layer(&tenant, &tline, test_deltas2.clone(), &ctx) .await .unwrap(); - let mut merge_iter = MergeIterator::create( + let mut merge_iter = MergeIterator::create_for_testing( &[ resident_layer_2.get_as_delta(&ctx).await.unwrap(), resident_layer_1.get_as_delta(&ctx).await.unwrap(), @@ -549,7 +555,7 @@ mod tests { let resident_layer_3 = produce_delta_layer(&tenant, &tline, test_deltas3.clone(), &ctx) .await .unwrap(); - let mut merge_iter = MergeIterator::create( + let mut merge_iter = MergeIterator::create_for_testing( &[ resident_layer_1.get_as_delta(&ctx).await.unwrap(), resident_layer_2.get_as_delta(&ctx).await.unwrap(), @@ -670,7 +676,7 @@ mod tests { // Test with different layer order for MergeIterator::create to ensure the order // is stable. - let mut merge_iter = MergeIterator::create( + let mut merge_iter = MergeIterator::create_for_testing( &[ resident_layer_4.get_as_delta(&ctx).await.unwrap(), resident_layer_1.get_as_delta(&ctx).await.unwrap(), @@ -682,7 +688,7 @@ mod tests { ); assert_merge_iter_equal(&mut merge_iter, &expect).await; - let mut merge_iter = MergeIterator::create( + let mut merge_iter = MergeIterator::create_for_testing( &[ resident_layer_1.get_as_delta(&ctx).await.unwrap(), resident_layer_4.get_as_delta(&ctx).await.unwrap(), diff --git a/pageserver/src/tenant/timeline/compaction.rs b/pageserver/src/tenant/timeline/compaction.rs index 9086d29d50..d0c13d86ce 100644 --- a/pageserver/src/tenant/timeline/compaction.rs +++ b/pageserver/src/tenant/timeline/compaction.rs @@ -1994,7 +1994,13 @@ impl Timeline { let l = l.get_as_delta(ctx).await.map_err(CompactionError::Other)?; deltas.push(l); } - MergeIterator::create(&deltas, &[], ctx) + MergeIterator::create_with_options( + &deltas, + &[], + ctx, + 1024 * 8192, /* 8 MiB buffer per layer iterator */ + 1024, + ) }; // This iterator walks through all keys and is needed to calculate size used by each key @@ -2828,7 +2834,7 @@ impl Timeline { Ok(()) } - /// Check if the memory usage is within the limit. + /// Check to bail out of gc compaction early if it would use too much memory. async fn check_memory_usage( self: &Arc, layer_selection: &[Layer], @@ -2841,7 +2847,8 @@ impl Timeline { let layer_desc = layer.layer_desc(); if layer_desc.is_delta() { // Delta layers at most have 1MB buffer; 3x to make it safe (there're deltas as large as 16KB). - // Multiply the layer size so that tests can pass. + // Scale it by target_layer_size_bytes so that tests can pass (some tests, e.g., `test_pageserver_gc_compaction_preempt + // use 3MB layer size and we need to account for that). 
estimated_memory_usage_mb += 3.0 * (layer_desc.file_size / target_layer_size_bytes) as f64; num_delta_layers += 1; From 0e0ad073bf609fbc38e86f4030f1902c2632c5f7 Mon Sep 17 00:00:00 2001 From: Erik Grinaker Date: Tue, 6 May 2025 15:57:34 +0200 Subject: [PATCH 32/40] storcon: fix split aborts removing other tenants (#11837) ## Problem When aborting a split, the code accidentally removes all other tenant shards from the in-memory map that have the same shard count as the aborted split, causing "tenant not found" errors. It will recover on a storcon restart, when it loads the persisted state. This issue has been present for at least a year. Resolves https://github.com/neondatabase/cloud/issues/28589. ## Summary of changes Only remove shards belonging to the relevant tenant when aborting a split. Also adds a regression test. --- storage_controller/src/service.rs | 3 ++- test_runner/regress/test_sharding.py | 17 +++++++++++++++++ 2 files changed, 19 insertions(+), 1 deletion(-) diff --git a/storage_controller/src/service.rs b/storage_controller/src/service.rs index 72379f0810..21c693af97 100644 --- a/storage_controller/src/service.rs +++ b/storage_controller/src/service.rs @@ -5181,7 +5181,8 @@ impl Service { } // We don't expect any new_shard_count shards to exist here, but drop them just in case - tenants.retain(|_id, s| s.shard.count != *new_shard_count); + tenants + .retain(|id, s| !(id.tenant_id == *tenant_id && s.shard.count == *new_shard_count)); detach_locations }; diff --git a/test_runner/regress/test_sharding.py b/test_runner/regress/test_sharding.py index 0bfc4b1d8c..4c9887fb92 100644 --- a/test_runner/regress/test_sharding.py +++ b/test_runner/regress/test_sharding.py @@ -1334,6 +1334,13 @@ def test_sharding_split_failures( tenant_id, timeline_id, shard_count=initial_shard_count, placement_policy='{"Attached":1}' ) + # Create bystander tenants with various shard counts. They should not be affected by the aborted + # splits. Regression test for https://github.com/neondatabase/cloud/issues/28589. + bystanders = {} # id → shard_count + for bystander_shard_count in [1, 2, 4, 8]: + id, _ = env.create_tenant(shard_count=bystander_shard_count) + bystanders[id] = bystander_shard_count + env.storage_controller.allowed_errors.extend( [ # All split failures log a warning when then enqueue the abort operation @@ -1394,6 +1401,8 @@ def test_sharding_split_failures( locations = ps.http_client().tenant_list_locations()["tenant_shards"] for loc in locations: tenant_shard_id = TenantShardId.parse(loc[0]) + if tenant_shard_id.tenant_id != tenant_id: + continue # skip bystanders log.info(f"Shard {tenant_shard_id} seen on node {ps.id} in mode {loc[1]['mode']}") assert tenant_shard_id.shard_count == initial_shard_count if loc[1]["mode"] == "Secondary": @@ -1414,6 +1423,8 @@ def test_sharding_split_failures( locations = ps.http_client().tenant_list_locations()["tenant_shards"] for loc in locations: tenant_shard_id = TenantShardId.parse(loc[0]) + if tenant_shard_id.tenant_id != tenant_id: + continue # skip bystanders log.info(f"Shard {tenant_shard_id} seen on node {ps.id} in mode {loc[1]['mode']}") assert tenant_shard_id.shard_count == split_shard_count if loc[1]["mode"] == "Secondary": @@ -1496,6 +1507,12 @@ def test_sharding_split_failures( # the scheduler reaches an idle state env.storage_controller.reconcile_until_idle(timeout_secs=30) + # Check that all bystanders are still around. 
+ for bystander_id, bystander_shard_count in bystanders.items(): + response = env.storage_controller.tenant_describe(bystander_id) + assert TenantId(response["tenant_id"]) == bystander_id + assert len(response["shards"]) == bystander_shard_count + env.storage_controller.consistency_check() From 79ee78ea32754ed2b1cfa360c55a6a2aa11ad871 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jakub=20Ko=C5=82odziejczak?= <31549762+mrl5@users.noreply.github.com> Date: Tue, 6 May 2025 17:18:50 +0200 Subject: [PATCH 33/40] feat(compute): enable audit logs for pg_session_jwt extension (#11829) related to https://github.com/neondatabase/cloud/issues/28480 related to https://github.com/neondatabase/pg_session_jwt/pull/36 cc @MihaiBojin @conradludgate @lneves12 --- compute/compute-node.Dockerfile | 4 ++-- compute_tools/src/config.rs | 3 +++ .../ext-src/pg_session_jwt-src/expected/basic_functions.out | 1 + proxy/src/serverless/local_conn_pool.rs | 2 +- 4 files changed, 7 insertions(+), 3 deletions(-) diff --git a/compute/compute-node.Dockerfile b/compute/compute-node.Dockerfile index 8766eb519e..8bdf5cb7d1 100644 --- a/compute/compute-node.Dockerfile +++ b/compute/compute-node.Dockerfile @@ -1322,8 +1322,8 @@ ARG PG_VERSION # Do not update without approve from proxy team # Make sure the version is reflected in proxy/src/serverless/local_conn_pool.rs WORKDIR /ext-src -RUN wget https://github.com/neondatabase/pg_session_jwt/archive/refs/tags/v0.3.0.tar.gz -O pg_session_jwt.tar.gz && \ - echo "19be2dc0b3834d643706ed430af998bb4c2cdf24b3c45e7b102bb3a550e8660c pg_session_jwt.tar.gz" | sha256sum --check && \ +RUN wget https://github.com/neondatabase/pg_session_jwt/archive/refs/tags/v0.3.1.tar.gz -O pg_session_jwt.tar.gz && \ + echo "62fec9e472cb805c53ba24a0765afdb8ea2720cfc03ae7813e61687b36d1b0ad pg_session_jwt.tar.gz" | sha256sum --check && \ mkdir pg_session_jwt-src && cd pg_session_jwt-src && tar xzf ../pg_session_jwt.tar.gz --strip-components=1 -C . && \ sed -i 's/pgrx = "0.12.6"/pgrx = { version = "0.12.9", features = [ "unsafe-postgres" ] }/g' Cargo.toml && \ sed -i 's/version = "0.12.6"/version = "0.12.9"/g' pgrx-tests/Cargo.toml && \ diff --git a/compute_tools/src/config.rs b/compute_tools/src/config.rs index 71c6123c3b..42d245f55a 100644 --- a/compute_tools/src/config.rs +++ b/compute_tools/src/config.rs @@ -223,6 +223,9 @@ pub fn write_postgres_conf( // TODO: tune this after performance testing writeln!(file, "pgaudit.log_rotation_age=5")?; + // Enable audit logs for pg_session_jwt extension + writeln!(file, "pg_session_jwt.audit_log=on")?; + // Add audit shared_preload_libraries, if they are not present. 
// // The caller who sets the flag is responsible for ensuring that the necessary diff --git a/docker-compose/ext-src/pg_session_jwt-src/expected/basic_functions.out b/docker-compose/ext-src/pg_session_jwt-src/expected/basic_functions.out index ca54864ecd..ff6a7404cb 100644 --- a/docker-compose/ext-src/pg_session_jwt-src/expected/basic_functions.out +++ b/docker-compose/ext-src/pg_session_jwt-src/expected/basic_functions.out @@ -12,6 +12,7 @@ ERROR: invalid JWT encoding -- Test creating a session with an expired JWT SELECT auth.jwt_session_init('eyJhbGciOiJFZERTQSJ9.eyJleHAiOjE3NDI1NjQ0MzIsImlhdCI6MTc0MjU2NDI1MiwianRpIjo0MjQyNDIsInN1YiI6InVzZXIxMjMifQ.A6FwKuaSduHB9O7Gz37g0uoD_U9qVS0JNtT7YABGVgB7HUD1AMFc9DeyhNntWBqncg8k5brv-hrNTuUh5JYMAw'); ERROR: Token used after it has expired +DETAIL: exp=1742564432 -- Test creating a session with a valid JWT SELECT auth.jwt_session_init('eyJhbGciOiJFZERTQSJ9.eyJleHAiOjQ4OTYxNjQyNTIsImlhdCI6MTc0MjU2NDI1MiwianRpIjo0MzQzNDMsInN1YiI6InVzZXIxMjMifQ.2TXVgjb6JSUq6_adlvp-m_SdOxZSyGS30RS9TLB0xu2N83dMSs2NybwE1NMU8Fb0tcAZR_ET7M2rSxbTrphfCg'); jwt_session_init diff --git a/proxy/src/serverless/local_conn_pool.rs b/proxy/src/serverless/local_conn_pool.rs index 1d9b35f41d..bb5637cd5f 100644 --- a/proxy/src/serverless/local_conn_pool.rs +++ b/proxy/src/serverless/local_conn_pool.rs @@ -41,7 +41,7 @@ use crate::control_plane::messages::{ColdStartInfo, MetricsAuxInfo}; use crate::metrics::Metrics; pub(crate) const EXT_NAME: &str = "pg_session_jwt"; -pub(crate) const EXT_VERSION: &str = "0.3.0"; +pub(crate) const EXT_VERSION: &str = "0.3.1"; pub(crate) const EXT_SCHEMA: &str = "auth"; #[derive(Clone)] From f9b3a2e059c4c1fd764da011e7a5364d86e60491 Mon Sep 17 00:00:00 2001 From: Tristan Partin Date: Tue, 6 May 2025 14:51:10 -0500 Subject: [PATCH 34/40] Add scoping to compute_ctl JWT claims (#11639) Currently we only have an admin scope which allows a user to bypass the compute_id check. When the admin scope is provided, validate the audience of the JWT to be "compute". 
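As an illustration only (a hedged sketch, not code from this patch), the two claim shapes this change distinguishes look roughly like the following. The field and variant names follow the additions to `libs/compute_api/src/requests.rs` below; the types are re-declared locally so the snippet stands alone, and the compute ID is a made-up placeholder.

```rust
use serde::Serialize;

// Local stand-ins for the claim types this patch adds to compute_api.
#[derive(Serialize)]
#[serde(rename_all = "snake_case")]
enum ComputeClaimsScope {
    Admin,
}

#[derive(Serialize)]
struct ComputeClaims {
    compute_id: Option<String>,
    scope: Option<ComputeClaimsScope>,
    #[serde(rename = "aud")]
    audience: Option<String>,
}

fn main() {
    // Admin-scoped token: no compute_id, but the audience must be "compute".
    let admin = ComputeClaims {
        compute_id: None,
        scope: Some(ComputeClaimsScope::Admin),
        audience: Some("compute".to_string()),
    };

    // Regular token: bound to a single compute by its ID (placeholder value).
    let regular = ComputeClaims {
        compute_id: Some("compute-example-123456".to_string()),
        scope: None,
        audience: None,
    };

    println!("{}", serde_json::to_string_pretty(&admin).unwrap());
    println!("{}", serde_json::to_string_pretty(&regular).unwrap());
}
```

An admin token therefore identifies itself by scope and audience rather than by a compute ID, while a regular token is accepted only by the compute whose ID it names.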
Closes: https://github.com/neondatabase/cloud/issues/27614 Signed-off-by: Tristan Partin --- .../src/http/middleware/authorize.rs | 59 +++++++++++--- control_plane/src/bin/neon_local.rs | 20 +++-- control_plane/src/endpoint.rs | 20 +++-- libs/compute_api/src/requests.rs | 41 +++++++++- test_runner/fixtures/endpoint/http.py | 12 +++ test_runner/fixtures/neon_cli.py | 7 +- test_runner/fixtures/neon_fixtures.py | 12 ++- test_runner/regress/test_compute_http.py | 78 +++++++++++++++++++ 8 files changed, 222 insertions(+), 27 deletions(-) create mode 100644 test_runner/regress/test_compute_http.py diff --git a/compute_tools/src/http/middleware/authorize.rs b/compute_tools/src/http/middleware/authorize.rs index 2d0f411d7a..2afc57ad9c 100644 --- a/compute_tools/src/http/middleware/authorize.rs +++ b/compute_tools/src/http/middleware/authorize.rs @@ -1,12 +1,10 @@ -use std::collections::HashSet; - use anyhow::{Result, anyhow}; use axum::{RequestExt, body::Body}; use axum_extra::{ TypedHeader, headers::{Authorization, authorization::Bearer}, }; -use compute_api::requests::ComputeClaims; +use compute_api::requests::{COMPUTE_AUDIENCE, ComputeClaims, ComputeClaimsScope}; use futures::future::BoxFuture; use http::{Request, Response, StatusCode}; use jsonwebtoken::{Algorithm, DecodingKey, TokenData, Validation, jwk::JwkSet}; @@ -25,13 +23,14 @@ pub(in crate::http) struct Authorize { impl Authorize { pub fn new(compute_id: String, jwks: JwkSet) -> Self { let mut validation = Validation::new(Algorithm::EdDSA); - // Nothing is currently required - validation.required_spec_claims = HashSet::new(); validation.validate_exp = true; // Unused by the control plane - validation.validate_aud = false; - // Unused by the control plane validation.validate_nbf = false; + // Unused by the control plane + validation.validate_aud = false; + validation.set_audience(&[COMPUTE_AUDIENCE]); + // Nothing is currently required + validation.set_required_spec_claims(&[] as &[&str; 0]); Self { compute_id, @@ -64,11 +63,47 @@ impl AsyncAuthorizeRequest for Authorize { Err(e) => return Err(JsonResponse::error(StatusCode::UNAUTHORIZED, e)), }; - if data.claims.compute_id != compute_id { - return Err(JsonResponse::error( - StatusCode::UNAUTHORIZED, - "invalid compute ID in authorization token claims", - )); + match data.claims.scope { + // TODO: We should validate audience for every token, but + // instead of this ad-hoc validation, we should turn + // [`Validation::validate_aud`] on. This is merely a stopgap + // while we roll out `aud` deployment. We return a 401 + // Unauthorized because when we eventually do use + // [`Validation`], we will hit the above `Err` match arm which + // returns 401 Unauthorized. 
+ Some(ComputeClaimsScope::Admin) => { + let Some(ref audience) = data.claims.audience else { + return Err(JsonResponse::error( + StatusCode::UNAUTHORIZED, + "missing audience in authorization token claims", + )); + }; + + if audience != COMPUTE_AUDIENCE { + return Err(JsonResponse::error( + StatusCode::UNAUTHORIZED, + "invalid audience in authorization token claims", + )); + } + } + + // If the scope is not [`ComputeClaimsScope::Admin`], then we + // must validate the compute_id + _ => { + let Some(ref claimed_compute_id) = data.claims.compute_id else { + return Err(JsonResponse::error( + StatusCode::FORBIDDEN, + "missing compute_id in authorization token claims", + )); + }; + + if *claimed_compute_id != compute_id { + return Err(JsonResponse::error( + StatusCode::FORBIDDEN, + "invalid compute ID in authorization token claims", + )); + } + } } // Make claims available to any subsequent middleware or request diff --git a/control_plane/src/bin/neon_local.rs b/control_plane/src/bin/neon_local.rs index 6f55c0310f..44698f7b23 100644 --- a/control_plane/src/bin/neon_local.rs +++ b/control_plane/src/bin/neon_local.rs @@ -16,6 +16,7 @@ use std::time::Duration; use anyhow::{Context, Result, anyhow, bail}; use clap::Parser; +use compute_api::requests::ComputeClaimsScope; use compute_api::spec::ComputeMode; use control_plane::broker::StorageBroker; use control_plane::endpoint::ComputeControlPlane; @@ -705,6 +706,9 @@ struct EndpointStopCmdArgs { struct EndpointGenerateJwtCmdArgs { #[clap(help = "Postgres endpoint id")] endpoint_id: String, + + #[clap(short = 's', long, help = "Scope to generate the JWT with", value_parser = ComputeClaimsScope::from_str)] + scope: Option, } #[derive(clap::Subcommand)] @@ -1540,12 +1544,16 @@ async fn handle_endpoint(subcmd: &EndpointCmd, env: &local_env::LocalEnv) -> Res endpoint.stop(&args.mode, args.destroy)?; } EndpointCmd::GenerateJwt(args) => { - let endpoint_id = &args.endpoint_id; - let endpoint = cplane - .endpoints - .get(endpoint_id) - .with_context(|| format!("postgres endpoint {endpoint_id} is not found"))?; - let jwt = endpoint.generate_jwt()?; + let endpoint = { + let endpoint_id = &args.endpoint_id; + + cplane + .endpoints + .get(endpoint_id) + .with_context(|| format!("postgres endpoint {endpoint_id} is not found"))? + }; + + let jwt = endpoint.generate_jwt(args.scope)?; print!("{jwt}"); } diff --git a/control_plane/src/endpoint.rs b/control_plane/src/endpoint.rs index 4071b620d6..0b16339a6f 100644 --- a/control_plane/src/endpoint.rs +++ b/control_plane/src/endpoint.rs @@ -45,7 +45,9 @@ use std::sync::Arc; use std::time::{Duration, Instant}; use anyhow::{Context, Result, anyhow, bail}; -use compute_api::requests::{ComputeClaims, ConfigurationRequest}; +use compute_api::requests::{ + COMPUTE_AUDIENCE, ComputeClaims, ComputeClaimsScope, ConfigurationRequest, +}; use compute_api::responses::{ ComputeConfig, ComputeCtlConfig, ComputeStatus, ComputeStatusResponse, TlsConfig, }; @@ -630,9 +632,17 @@ impl Endpoint { } /// Generate a JWT with the correct claims. 
- pub fn generate_jwt(&self) -> Result { + pub fn generate_jwt(&self, scope: Option) -> Result { self.env.generate_auth_token(&ComputeClaims { - compute_id: self.endpoint_id.clone(), + audience: match scope { + Some(ComputeClaimsScope::Admin) => Some(COMPUTE_AUDIENCE.to_owned()), + _ => Some(self.endpoint_id.clone()), + }, + compute_id: match scope { + Some(ComputeClaimsScope::Admin) => None, + _ => Some(self.endpoint_id.clone()), + }, + scope, }) } @@ -903,7 +913,7 @@ impl Endpoint { self.external_http_address.port() ), ) - .bearer_auth(self.generate_jwt()?) + .bearer_auth(self.generate_jwt(None::)?) .send() .await?; @@ -980,7 +990,7 @@ impl Endpoint { self.external_http_address.port() )) .header(CONTENT_TYPE.as_str(), "application/json") - .bearer_auth(self.generate_jwt()?) + .bearer_auth(self.generate_jwt(None::)?) .body( serde_json::to_string(&ConfigurationRequest { spec, diff --git a/libs/compute_api/src/requests.rs b/libs/compute_api/src/requests.rs index 98f2fc297c..40d34eccea 100644 --- a/libs/compute_api/src/requests.rs +++ b/libs/compute_api/src/requests.rs @@ -1,16 +1,55 @@ //! Structs representing the JSON formats used in the compute_ctl's HTTP API. +use std::str::FromStr; + use serde::{Deserialize, Serialize}; use crate::privilege::Privilege; use crate::responses::ComputeCtlConfig; use crate::spec::{ComputeSpec, ExtVersion, PgIdent}; +/// The value to place in the [`ComputeClaims::audience`] claim. +pub static COMPUTE_AUDIENCE: &str = "compute"; + +#[derive(Copy, Clone, Debug, Deserialize, Eq, PartialEq, Serialize)] +#[serde(rename_all = "snake_case")] +/// Available scopes for a compute's JWT. +pub enum ComputeClaimsScope { + /// An admin-scoped token allows access to all of `compute_ctl`'s authorized + /// facilities. + Admin, +} + +impl FromStr for ComputeClaimsScope { + type Err = anyhow::Error; + + fn from_str(s: &str) -> Result { + match s { + "admin" => Ok(ComputeClaimsScope::Admin), + _ => Err(anyhow::anyhow!("invalid compute claims scope \"{s}\"")), + } + } +} + /// When making requests to the `compute_ctl` external HTTP server, the client /// must specify a set of claims in `Authorization` header JWTs such that /// `compute_ctl` can authorize the request. #[derive(Clone, Debug, Deserialize, Serialize)] +#[serde(rename = "snake_case")] pub struct ComputeClaims { - pub compute_id: String, + /// The compute ID that will validate the token. The only case in which this + /// can be [`None`] is if [`Self::scope`] is + /// [`ComputeClaimsScope::Admin`]. + pub compute_id: Option, + + /// The scope of what the token authorizes. + pub scope: Option, + + /// The recipient the token is intended for. + /// + /// See [RFC 7519](https://www.rfc-editor.org/rfc/rfc7519#section-4.1.3) for + /// more information. + #[serde(rename = "aud")] + pub audience: Option, } /// Request of the /configure API diff --git a/test_runner/fixtures/endpoint/http.py b/test_runner/fixtures/endpoint/http.py index 652c38f5c3..beed1dcd93 100644 --- a/test_runner/fixtures/endpoint/http.py +++ b/test_runner/fixtures/endpoint/http.py @@ -1,6 +1,7 @@ from __future__ import annotations import urllib.parse +from enum import StrEnum from typing import TYPE_CHECKING, final import requests @@ -14,6 +15,17 @@ if TYPE_CHECKING: from requests import PreparedRequest +COMPUTE_AUDIENCE = "compute" +""" +The value to place in the `aud` claim. 
+""" + + +@final +class ComputeClaimsScope(StrEnum): + ADMIN = "admin" + + @final class BearerAuth(AuthBase): """ diff --git a/test_runner/fixtures/neon_cli.py b/test_runner/fixtures/neon_cli.py index b5d69b5ab6..3be78719d7 100644 --- a/test_runner/fixtures/neon_cli.py +++ b/test_runner/fixtures/neon_cli.py @@ -21,6 +21,7 @@ if TYPE_CHECKING: Any, ) + from fixtures.endpoint.http import ComputeClaimsScope from fixtures.pg_version import PgVersion @@ -535,12 +536,16 @@ class NeonLocalCli(AbstractNeonCli): res.check_returncode() return res - def endpoint_generate_jwt(self, endpoint_id: str) -> str: + def endpoint_generate_jwt( + self, endpoint_id: str, scope: ComputeClaimsScope | None = None + ) -> str: """ Generate a JWT for making requests to the endpoint's external HTTP server. """ args = ["endpoint", "generate-jwt", endpoint_id] + if scope: + args += ["--scope", str(scope)] cmd = self.raw_cli(args) cmd.check_returncode() diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index 47d1228c61..133be5c045 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -51,7 +51,7 @@ from fixtures.common_types import ( TimelineId, ) from fixtures.compute_migrations import NUM_COMPUTE_MIGRATIONS -from fixtures.endpoint.http import EndpointHttpClient +from fixtures.endpoint.http import ComputeClaimsScope, EndpointHttpClient from fixtures.log_helper import log from fixtures.metrics import Metrics, MetricsGetter, parse_metrics from fixtures.neon_cli import NeonLocalCli, Pagectl @@ -4218,7 +4218,7 @@ class Endpoint(PgProtocol, LogUtils): self.config(config_lines) - self.__jwt = self.env.neon_cli.endpoint_generate_jwt(self.endpoint_id) + self.__jwt = self.generate_jwt() return self @@ -4265,6 +4265,14 @@ class Endpoint(PgProtocol, LogUtils): return self + def generate_jwt(self, scope: ComputeClaimsScope | None = None) -> str: + """ + Generate a JWT for making requests to the endpoint's external HTTP + server. + """ + assert self.endpoint_id is not None + return self.env.neon_cli.endpoint_generate_jwt(self.endpoint_id, scope) + def endpoint_path(self) -> Path: """Path to endpoint directory""" assert self.endpoint_id diff --git a/test_runner/regress/test_compute_http.py b/test_runner/regress/test_compute_http.py new file mode 100644 index 0000000000..ce31ff0fe6 --- /dev/null +++ b/test_runner/regress/test_compute_http.py @@ -0,0 +1,78 @@ +from __future__ import annotations + +from http.client import FORBIDDEN, UNAUTHORIZED +from typing import TYPE_CHECKING + +import jwt +import pytest +from fixtures.endpoint.http import COMPUTE_AUDIENCE, ComputeClaimsScope, EndpointHttpClient +from fixtures.utils import run_only_on_default_postgres +from requests import RequestException + +if TYPE_CHECKING: + from fixtures.neon_fixtures import NeonEnv + + +@run_only_on_default_postgres("The code path being tested is not dependent on Postgres version") +def test_compute_no_scope_claim(neon_simple_env: NeonEnv): + """ + Test that if the JWT scope is not admin and no compute_id is specified, + the external HTTP server returns a 403 Forbidden error. 
+ """ + env = neon_simple_env + + endpoint = env.endpoints.create_start("main") + + # Encode nothing in the token + token = jwt.encode({}, env.auth_keys.priv, algorithm="EdDSA") + + # Create an admin-scoped HTTP client + client = EndpointHttpClient( + external_port=endpoint.external_http_port, + internal_port=endpoint.internal_http_port, + jwt=token, + ) + + try: + client.status() + pytest.fail("Exception should have been raised") + except RequestException as e: + assert e.response is not None + assert e.response.status_code == FORBIDDEN + + +@pytest.mark.parametrize( + "audience", + (COMPUTE_AUDIENCE, "invalid", None), + ids=["with_audience", "with_invalid_audience", "without_audience"], +) +@run_only_on_default_postgres("The code path being tested is not dependent on Postgres version") +def test_compute_admin_scope_claim(neon_simple_env: NeonEnv, audience: str | None): + """ + Test that an admin-scoped JWT can access the compute's external HTTP server + without the compute_id being specified in the claims. + """ + env = neon_simple_env + + endpoint = env.endpoints.create_start("main") + + data = {"scope": str(ComputeClaimsScope.ADMIN)} + if audience: + data["aud"] = audience + + token = jwt.encode(data, env.auth_keys.priv, algorithm="EdDSA") + + # Create an admin-scoped HTTP client + client = EndpointHttpClient( + external_port=endpoint.external_http_port, + internal_port=endpoint.internal_http_port, + jwt=token, + ) + + try: + client.status() + if audience != COMPUTE_AUDIENCE: + pytest.fail("Exception should have been raised") + except RequestException as e: + assert e.response is not None + assert e.response.status_code == UNAUTHORIZED From 384e3df2ad2c1561cc9d427793b2d54eaa3f137b Mon Sep 17 00:00:00 2001 From: Suhas Thalanki <54014218+thesuhas@users.noreply.github.com> Date: Tue, 6 May 2025 17:52:15 -0400 Subject: [PATCH 35/40] fix: pinned anon extension to v2.1.0 (#11844) ## Problem Currently the setup for `anon` v2 in the compute image downloads the latest version of the extension. This can be problematic as on a compute start/restart it can download a version that is newer than what we have tested and potentially break things, hence not giving us the ability to control when the extension is updated. We were also using `v2.2.0`, which is not ready for production yet and has been clarified by the maintainer. Additional context: https://gitlab.com/dalibo/postgresql_anonymizer/-/issues/530 ## Summary of changes Changed the URL from which we download the `anon` extension to point to `v2.1.0` instead of `latest`. --- compute/compute-node.Dockerfile | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/compute/compute-node.Dockerfile b/compute/compute-node.Dockerfile index 8bdf5cb7d1..6233eaf709 100644 --- a/compute/compute-node.Dockerfile +++ b/compute/compute-node.Dockerfile @@ -1351,7 +1351,8 @@ COPY compute/patches/anon_v2.patch . # This is an experimental extension, never got to real production. # !Do not remove! It can be present in shared_preload_libraries and compute will fail to start if library is not found. 
ENV PATH="/usr/local/pgsql/bin/:$PATH" -RUN wget https://gitlab.com/dalibo/postgresql_anonymizer/-/archive/latest/postgresql_anonymizer-latest.tar.gz -O pg_anon.tar.gz && \ +RUN wget https://gitlab.com/dalibo/postgresql_anonymizer/-/archive/2.1.0/postgresql_anonymizer-latest.tar.gz -O pg_anon.tar.gz && \ + echo "48e7f5ae2f1ca516df3da86c5c739d48dd780a4e885705704ccaad0faa89d6c0 pg_anon.tar.gz" | sha256sum --check && \ mkdir pg_anon-src && cd pg_anon-src && tar xzf ../pg_anon.tar.gz --strip-components=1 -C . && \ find /usr/local/pgsql -type f | sed 's|^/usr/local/pgsql/||' > /before.txt && \ sed -i 's/pgrx = "0.14.1"/pgrx = { version = "=0.14.1", features = [ "unsafe-postgres" ] }/g' Cargo.toml && \ From 5c356c63ebefa448ea521565df35d59f1c4a933b Mon Sep 17 00:00:00 2001 From: Mikhail Date: Tue, 6 May 2025 23:02:12 +0100 Subject: [PATCH 36/40] endpoint_storage compute_ctl integration (#11550) Add `/lfc/(prewarm|offload)` routes to `compute_ctl` which interact with endpoint storage. Add `prewarm_lfc_on_startup` spec option which, if enabled, downloads LFC prewarm data on compute startup. Resolves: https://github.com/neondatabase/cloud/issues/26343 --- Cargo.lock | 2 + Cargo.toml | 1 + compute_tools/Cargo.toml | 1 + compute_tools/src/compute.rs | 49 ++++- compute_tools/src/compute_prewarm.rs | 202 +++++++++++++++++++ compute_tools/src/http/routes/lfc.rs | 39 ++++ compute_tools/src/http/routes/mod.rs | 1 + compute_tools/src/http/server.rs | 4 +- compute_tools/src/lib.rs | 1 + compute_tools/src/metrics.rs | 22 +- compute_tools/tests/pg_helpers_tests.rs | 1 + control_plane/Cargo.toml | 2 +- control_plane/src/bin/neon_local.rs | 19 +- control_plane/src/endpoint.rs | 5 + control_plane/src/endpoint_storage.rs | 10 +- control_plane/src/local_env.rs | 16 +- endpoint_storage/src/app.rs | 12 +- endpoint_storage/src/claims.rs | 52 +++++ endpoint_storage/src/lib.rs | 85 +++----- libs/compute_api/src/responses.rs | 24 +++ libs/compute_api/src/spec.rs | 9 + libs/compute_api/tests/cluster_spec.json | 5 + libs/utils/src/id.rs | 3 + test_runner/fixtures/endpoint/http.py | 30 +++ test_runner/fixtures/neon_fixtures.py | 4 +- test_runner/regress/test_endpoint_storage.py | 3 +- test_runner/regress/test_lfc_prewarm.py | 166 ++++++++++----- 27 files changed, 630 insertions(+), 138 deletions(-) create mode 100644 compute_tools/src/compute_prewarm.rs create mode 100644 compute_tools/src/http/routes/lfc.rs create mode 100644 endpoint_storage/src/claims.rs diff --git a/Cargo.lock b/Cargo.lock index 4c464c62b8..fe4cc35029 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1284,6 +1284,7 @@ name = "compute_tools" version = "0.1.0" dependencies = [ "anyhow", + "async-compression", "aws-config", "aws-sdk-kms", "aws-sdk-s3", @@ -1420,6 +1421,7 @@ dependencies = [ "clap", "comfy-table", "compute_api", + "endpoint_storage", "futures", "http-utils", "humantime", diff --git a/Cargo.toml b/Cargo.toml index 1c203af9e0..8d4cc4a75a 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -243,6 +243,7 @@ azure_storage_blobs = { git = "https://github.com/neondatabase/azure-sdk-for-rus ## Local libraries compute_api = { version = "0.1", path = "./libs/compute_api/" } consumption_metrics = { version = "0.1", path = "./libs/consumption_metrics/" } +endpoint_storage = { version = "0.0.1", path = "./endpoint_storage/" } http-utils = { version = "0.1", path = "./libs/http-utils/" } metrics = { version = "0.1", path = "./libs/metrics/" } pageserver = { path = "./pageserver" } diff --git a/compute_tools/Cargo.toml b/compute_tools/Cargo.toml index 
8c1e7ad149..8ee5dd0665 100644 --- a/compute_tools/Cargo.toml +++ b/compute_tools/Cargo.toml @@ -10,6 +10,7 @@ default = [] testing = ["fail/failpoints"] [dependencies] +async-compression.workspace = true base64.workspace = true aws-config.workspace = true aws-sdk-s3.workspace = true diff --git a/compute_tools/src/compute.rs b/compute_tools/src/compute.rs index 8834f0d63d..08d915b331 100644 --- a/compute_tools/src/compute.rs +++ b/compute_tools/src/compute.rs @@ -1,17 +1,10 @@ -use std::collections::HashMap; -use std::os::unix::fs::{PermissionsExt, symlink}; -use std::path::Path; -use std::process::{Command, Stdio}; -use std::str::FromStr; -use std::sync::atomic::{AtomicU32, Ordering}; -use std::sync::{Arc, Condvar, Mutex, RwLock}; -use std::time::{Duration, Instant}; -use std::{env, fs}; - use anyhow::{Context, Result}; use chrono::{DateTime, Utc}; use compute_api::privilege::Privilege; -use compute_api::responses::{ComputeConfig, ComputeCtlConfig, ComputeMetrics, ComputeStatus}; +use compute_api::responses::{ + ComputeConfig, ComputeCtlConfig, ComputeMetrics, ComputeStatus, LfcOffloadState, + LfcPrewarmState, +}; use compute_api::spec::{ ComputeAudit, ComputeFeature, ComputeMode, ComputeSpec, ExtVersion, PgIdent, }; @@ -25,6 +18,16 @@ use postgres; use postgres::NoTls; use postgres::error::SqlState; use remote_storage::{DownloadError, RemotePath}; +use std::collections::HashMap; +use std::net::SocketAddr; +use std::os::unix::fs::{PermissionsExt, symlink}; +use std::path::Path; +use std::process::{Command, Stdio}; +use std::str::FromStr; +use std::sync::atomic::{AtomicU32, Ordering}; +use std::sync::{Arc, Condvar, Mutex, RwLock}; +use std::time::{Duration, Instant}; +use std::{env, fs}; use tokio::spawn; use tracing::{Instrument, debug, error, info, instrument, warn}; use utils::id::{TenantId, TimelineId}; @@ -150,6 +153,9 @@ pub struct ComputeState { /// set up the span relationship ourselves. pub startup_span: Option, + pub lfc_prewarm_state: LfcPrewarmState, + pub lfc_offload_state: LfcOffloadState, + pub metrics: ComputeMetrics, } @@ -163,6 +169,8 @@ impl ComputeState { pspec: None, startup_span: None, metrics: ComputeMetrics::default(), + lfc_prewarm_state: LfcPrewarmState::default(), + lfc_offload_state: LfcOffloadState::default(), } } @@ -198,6 +206,8 @@ pub struct ParsedSpec { pub pageserver_connstr: String, pub safekeeper_connstrings: Vec, pub storage_auth_token: Option, + pub endpoint_storage_addr: Option, + pub endpoint_storage_token: Option, } impl TryFrom for ParsedSpec { @@ -251,6 +261,18 @@ impl TryFrom for ParsedSpec { .or(Err("invalid timeline id"))? 
}; + let endpoint_storage_addr: Option = spec + .endpoint_storage_addr + .clone() + .or_else(|| spec.cluster.settings.find("neon.endpoint_storage_addr")) + .unwrap_or_default() + .parse() + .ok(); + let endpoint_storage_token = spec + .endpoint_storage_token + .clone() + .or_else(|| spec.cluster.settings.find("neon.endpoint_storage_token")); + Ok(ParsedSpec { spec, pageserver_connstr, @@ -258,6 +280,8 @@ impl TryFrom for ParsedSpec { storage_auth_token, tenant_id, timeline_id, + endpoint_storage_addr, + endpoint_storage_token, }) } } @@ -736,6 +760,9 @@ impl ComputeNode { // Log metrics so that we can search for slow operations in logs info!(?metrics, postmaster_pid = %postmaster_pid, "compute start finished"); + if pspec.spec.prewarm_lfc_on_startup { + self.prewarm_lfc(); + } Ok(()) } diff --git a/compute_tools/src/compute_prewarm.rs b/compute_tools/src/compute_prewarm.rs new file mode 100644 index 0000000000..a6a84b3f1f --- /dev/null +++ b/compute_tools/src/compute_prewarm.rs @@ -0,0 +1,202 @@ +use crate::compute::ComputeNode; +use anyhow::{Context, Result, bail}; +use async_compression::tokio::bufread::{ZstdDecoder, ZstdEncoder}; +use compute_api::responses::LfcOffloadState; +use compute_api::responses::LfcPrewarmState; +use http::StatusCode; +use reqwest::Client; +use std::sync::Arc; +use tokio::{io::AsyncReadExt, spawn}; +use tracing::{error, info}; + +#[derive(serde::Serialize, Default)] +pub struct LfcPrewarmStateWithProgress { + #[serde(flatten)] + base: LfcPrewarmState, + total: i32, + prewarmed: i32, + skipped: i32, +} + +/// A pair of url and a token to query endpoint storage for LFC prewarm-related tasks +struct EndpointStoragePair { + url: String, + token: String, +} + +const KEY: &str = "lfc_state"; +impl TryFrom<&crate::compute::ParsedSpec> for EndpointStoragePair { + type Error = anyhow::Error; + fn try_from(pspec: &crate::compute::ParsedSpec) -> Result { + let Some(ref endpoint_id) = pspec.spec.endpoint_id else { + bail!("pspec.endpoint_id missing") + }; + let Some(ref base_uri) = pspec.endpoint_storage_addr else { + bail!("pspec.endpoint_storage_addr missing") + }; + let tenant_id = pspec.tenant_id; + let timeline_id = pspec.timeline_id; + + let url = format!("http://{base_uri}/{tenant_id}/{timeline_id}/{endpoint_id}/{KEY}"); + let Some(ref token) = pspec.endpoint_storage_token else { + bail!("pspec.endpoint_storage_token missing") + }; + let token = token.clone(); + Ok(EndpointStoragePair { url, token }) + } +} + +impl ComputeNode { + // If prewarm failed, we want to get overall number of segments as well as done ones. + // However, this function should be reliable even if querying postgres failed. 
+ pub async fn lfc_prewarm_state(&self) -> LfcPrewarmStateWithProgress { + info!("requesting LFC prewarm state from postgres"); + let mut state = LfcPrewarmStateWithProgress::default(); + { + state.base = self.state.lock().unwrap().lfc_prewarm_state.clone(); + } + + let client = match ComputeNode::get_maintenance_client(&self.tokio_conn_conf).await { + Ok(client) => client, + Err(err) => { + error!(%err, "connecting to postgres"); + return state; + } + }; + let row = match client + .query_one("select * from get_prewarm_info()", &[]) + .await + { + Ok(row) => row, + Err(err) => { + error!(%err, "querying LFC prewarm status"); + return state; + } + }; + state.total = row.try_get(0).unwrap_or_default(); + state.prewarmed = row.try_get(1).unwrap_or_default(); + state.skipped = row.try_get(2).unwrap_or_default(); + state + } + + pub fn lfc_offload_state(&self) -> LfcOffloadState { + self.state.lock().unwrap().lfc_offload_state.clone() + } + + /// Returns false if there is a prewarm request ongoing, true otherwise + pub fn prewarm_lfc(self: &Arc) -> bool { + crate::metrics::LFC_PREWARM_REQUESTS.inc(); + { + let state = &mut self.state.lock().unwrap().lfc_prewarm_state; + if let LfcPrewarmState::Prewarming = + std::mem::replace(state, LfcPrewarmState::Prewarming) + { + return false; + } + } + + let cloned = self.clone(); + spawn(async move { + let Err(err) = cloned.prewarm_impl().await else { + cloned.state.lock().unwrap().lfc_prewarm_state = LfcPrewarmState::Completed; + return; + }; + error!(%err); + cloned.state.lock().unwrap().lfc_prewarm_state = LfcPrewarmState::Failed { + error: err.to_string(), + }; + }); + true + } + + fn endpoint_storage_pair(&self) -> Result { + let state = self.state.lock().unwrap(); + state.pspec.as_ref().unwrap().try_into() + } + + async fn prewarm_impl(&self) -> Result<()> { + let EndpointStoragePair { url, token } = self.endpoint_storage_pair()?; + info!(%url, "requesting LFC state from endpoint storage"); + + let request = Client::new().get(&url).bearer_auth(token); + let res = request.send().await.context("querying endpoint storage")?; + let status = res.status(); + if status != StatusCode::OK { + bail!("{status} querying endpoint storage") + } + + let mut uncompressed = Vec::new(); + let lfc_state = res + .bytes() + .await + .context("getting request body from endpoint storage")?; + ZstdDecoder::new(lfc_state.iter().as_slice()) + .read_to_end(&mut uncompressed) + .await + .context("decoding LFC state")?; + let uncompressed_len = uncompressed.len(); + info!(%url, "downloaded LFC state, uncompressed size {uncompressed_len}, loading into postgres"); + + ComputeNode::get_maintenance_client(&self.tokio_conn_conf) + .await + .context("connecting to postgres")? 
+ .query_one("select prewarm_local_cache($1)", &[&uncompressed]) + .await + .context("loading LFC state into postgres") + .map(|_| ()) + } + + /// Returns false if there is an offload request ongoing, true otherwise + pub fn offload_lfc(self: &Arc) -> bool { + crate::metrics::LFC_OFFLOAD_REQUESTS.inc(); + { + let state = &mut self.state.lock().unwrap().lfc_offload_state; + if let LfcOffloadState::Offloading = + std::mem::replace(state, LfcOffloadState::Offloading) + { + return false; + } + } + + let cloned = self.clone(); + spawn(async move { + let Err(err) = cloned.offload_lfc_impl().await else { + cloned.state.lock().unwrap().lfc_offload_state = LfcOffloadState::Completed; + return; + }; + error!(%err); + cloned.state.lock().unwrap().lfc_offload_state = LfcOffloadState::Failed { + error: err.to_string(), + }; + }); + true + } + + async fn offload_lfc_impl(&self) -> Result<()> { + let EndpointStoragePair { url, token } = self.endpoint_storage_pair()?; + info!(%url, "requesting LFC state from postgres"); + + let mut compressed = Vec::new(); + ComputeNode::get_maintenance_client(&self.tokio_conn_conf) + .await + .context("connecting to postgres")? + .query_one("select get_local_cache_state()", &[]) + .await + .context("querying LFC state")? + .try_get::(0) + .context("deserializing LFC state") + .map(ZstdEncoder::new)? + .read_to_end(&mut compressed) + .await + .context("compressing LFC state")?; + let compressed_len = compressed.len(); + info!(%url, "downloaded LFC state, compressed size {compressed_len}, writing to endpoint storage"); + + let request = Client::new().put(url).bearer_auth(token).body(compressed); + match request.send().await { + Ok(res) if res.status() == StatusCode::OK => Ok(()), + Ok(res) => bail!("Error writing to endpoint storage: {}", res.status()), + Err(err) => Err(err).context("writing to endpoint storage"), + } + } +} diff --git a/compute_tools/src/http/routes/lfc.rs b/compute_tools/src/http/routes/lfc.rs new file mode 100644 index 0000000000..07bcc6bfb7 --- /dev/null +++ b/compute_tools/src/http/routes/lfc.rs @@ -0,0 +1,39 @@ +use crate::compute_prewarm::LfcPrewarmStateWithProgress; +use crate::http::JsonResponse; +use axum::response::{IntoResponse, Response}; +use axum::{Json, http::StatusCode}; +use compute_api::responses::LfcOffloadState; +type Compute = axum::extract::State>; + +pub(in crate::http) async fn prewarm_state(compute: Compute) -> Json { + Json(compute.lfc_prewarm_state().await) +} + +// Following functions are marked async for axum, as it's more convenient than wrapping these +// in async lambdas at call site + +pub(in crate::http) async fn offload_state(compute: Compute) -> Json { + Json(compute.lfc_offload_state()) +} + +pub(in crate::http) async fn prewarm(compute: Compute) -> Response { + if compute.prewarm_lfc() { + StatusCode::ACCEPTED.into_response() + } else { + JsonResponse::error( + StatusCode::TOO_MANY_REQUESTS, + "Multiple requests for prewarm are not allowed", + ) + } +} + +pub(in crate::http) async fn offload(compute: Compute) -> Response { + if compute.offload_lfc() { + StatusCode::ACCEPTED.into_response() + } else { + JsonResponse::error( + StatusCode::TOO_MANY_REQUESTS, + "Multiple requests for prewarm offload are not allowed", + ) + } +} diff --git a/compute_tools/src/http/routes/mod.rs b/compute_tools/src/http/routes/mod.rs index a67be7fd5a..432e66a830 100644 --- a/compute_tools/src/http/routes/mod.rs +++ b/compute_tools/src/http/routes/mod.rs @@ -11,6 +11,7 @@ pub(in crate::http) mod extensions; pub(in crate::http) mod 
failpoints; pub(in crate::http) mod grants; pub(in crate::http) mod insights; +pub(in crate::http) mod lfc; pub(in crate::http) mod metrics; pub(in crate::http) mod metrics_json; pub(in crate::http) mod status; diff --git a/compute_tools/src/http/server.rs b/compute_tools/src/http/server.rs index 10f767e97c..d5d2427971 100644 --- a/compute_tools/src/http/server.rs +++ b/compute_tools/src/http/server.rs @@ -23,7 +23,7 @@ use super::{ middleware::authorize::Authorize, routes::{ check_writability, configure, database_schema, dbs_and_roles, extension_server, extensions, - grants, insights, metrics, metrics_json, status, terminate, + grants, insights, lfc, metrics, metrics_json, status, terminate, }, }; use crate::compute::ComputeNode; @@ -85,6 +85,8 @@ impl From<&Server> for Router> { Router::>::new().route("/metrics", get(metrics::get_metrics)); let authenticated_router = Router::>::new() + .route("/lfc/prewarm", get(lfc::prewarm_state).post(lfc::prewarm)) + .route("/lfc/offload", get(lfc::offload_state).post(lfc::offload)) .route("/check_writability", post(check_writability::is_writable)) .route("/configure", post(configure::configure)) .route("/database_schema", get(database_schema::get_schema_dump)) diff --git a/compute_tools/src/lib.rs b/compute_tools/src/lib.rs index a681fad0b0..7218067a8a 100644 --- a/compute_tools/src/lib.rs +++ b/compute_tools/src/lib.rs @@ -11,6 +11,7 @@ pub mod http; pub mod logger; pub mod catalog; pub mod compute; +pub mod compute_prewarm; pub mod disk_quota; pub mod extension_server; pub mod installed_extensions; diff --git a/compute_tools/src/metrics.rs b/compute_tools/src/metrics.rs index e37d6120ac..90326b2074 100644 --- a/compute_tools/src/metrics.rs +++ b/compute_tools/src/metrics.rs @@ -1,7 +1,7 @@ use metrics::core::{AtomicF64, AtomicU64, Collector, GenericCounter, GenericGauge}; use metrics::proto::MetricFamily; use metrics::{ - IntCounterVec, IntGaugeVec, UIntGaugeVec, register_gauge, register_int_counter, + IntCounter, IntCounterVec, IntGaugeVec, UIntGaugeVec, register_gauge, register_int_counter, register_int_counter_vec, register_int_gauge_vec, register_uint_gauge_vec, }; use once_cell::sync::Lazy; @@ -97,6 +97,24 @@ pub(crate) static PG_TOTAL_DOWNTIME_MS: Lazy> = Lazy:: .expect("failed to define a metric") }); +/// Needed as neon.file_cache_prewarm_batch == 0 doesn't mean we never tried to prewarm. 
+/// On the other hand, LFC_PREWARMED_PAGES is excessive as we can GET /lfc/prewarm +pub(crate) static LFC_PREWARM_REQUESTS: Lazy = Lazy::new(|| { + register_int_counter!( + "compute_ctl_lfc_prewarm_requests_total", + "Total number of LFC prewarm requests made by compute_ctl", + ) + .expect("failed to define a metric") +}); + +pub(crate) static LFC_OFFLOAD_REQUESTS: Lazy = Lazy::new(|| { + register_int_counter!( + "compute_ctl_lfc_offload_requests_total", + "Total number of LFC offload requests made by compute_ctl", + ) + .expect("failed to define a metric") +}); + pub fn collect() -> Vec { let mut metrics = COMPUTE_CTL_UP.collect(); metrics.extend(INSTALLED_EXTENSIONS.collect()); @@ -106,5 +124,7 @@ pub fn collect() -> Vec { metrics.extend(AUDIT_LOG_DIR_SIZE.collect()); metrics.extend(PG_CURR_DOWNTIME_MS.collect()); metrics.extend(PG_TOTAL_DOWNTIME_MS.collect()); + metrics.extend(LFC_PREWARM_REQUESTS.collect()); + metrics.extend(LFC_OFFLOAD_REQUESTS.collect()); metrics } diff --git a/compute_tools/tests/pg_helpers_tests.rs b/compute_tools/tests/pg_helpers_tests.rs index b72c1293ee..53f2ddad84 100644 --- a/compute_tools/tests/pg_helpers_tests.rs +++ b/compute_tools/tests/pg_helpers_tests.rs @@ -30,6 +30,7 @@ mod pg_helpers_tests { r#"fsync = off wal_level = logical hot_standby = on +prewarm_lfc_on_startup = off neon.safekeepers = '127.0.0.1:6502,127.0.0.1:6503,127.0.0.1:6501' wal_log_hints = on log_connections = on diff --git a/control_plane/Cargo.toml b/control_plane/Cargo.toml index 92f0071bac..62c039047f 100644 --- a/control_plane/Cargo.toml +++ b/control_plane/Cargo.toml @@ -41,7 +41,7 @@ storage_broker.workspace = true http-utils.workspace = true utils.workspace = true whoami.workspace = true - +endpoint_storage.workspace = true compute_api.workspace = true workspace_hack.workspace = true tracing.workspace = true diff --git a/control_plane/src/bin/neon_local.rs b/control_plane/src/bin/neon_local.rs index 44698f7b23..fd625e9ed6 100644 --- a/control_plane/src/bin/neon_local.rs +++ b/control_plane/src/bin/neon_local.rs @@ -20,7 +20,7 @@ use compute_api::requests::ComputeClaimsScope; use compute_api::spec::ComputeMode; use control_plane::broker::StorageBroker; use control_plane::endpoint::ComputeControlPlane; -use control_plane::endpoint_storage::{ENDPOINT_STORAGE_DEFAULT_PORT, EndpointStorage}; +use control_plane::endpoint_storage::{ENDPOINT_STORAGE_DEFAULT_ADDR, EndpointStorage}; use control_plane::local_env; use control_plane::local_env::{ EndpointStorageConf, InitForceMode, LocalEnv, NeonBroker, NeonLocalInitConf, @@ -1022,7 +1022,7 @@ fn handle_init(args: &InitCmdArgs) -> anyhow::Result { }) .collect(), endpoint_storage: EndpointStorageConf { - port: ENDPOINT_STORAGE_DEFAULT_PORT, + listen_addr: ENDPOINT_STORAGE_DEFAULT_ADDR, }, pg_distrib_dir: None, neon_distrib_dir: None, @@ -1488,10 +1488,25 @@ async fn handle_endpoint(subcmd: &EndpointCmd, env: &local_env::LocalEnv) -> Res None }; + let exp = (std::time::SystemTime::now().duration_since(std::time::UNIX_EPOCH)? 
+ + Duration::from_secs(86400)) + .as_secs(); + let claims = endpoint_storage::claims::EndpointStorageClaims { + tenant_id: endpoint.tenant_id, + timeline_id: endpoint.timeline_id, + endpoint_id: endpoint_id.to_string(), + exp, + }; + + let endpoint_storage_token = env.generate_auth_token(&claims)?; + let endpoint_storage_addr = env.endpoint_storage.listen_addr.to_string(); + println!("Starting existing endpoint {endpoint_id}..."); endpoint .start( &auth_token, + endpoint_storage_token, + endpoint_storage_addr, safekeepers_generation, safekeepers, pageservers, diff --git a/control_plane/src/endpoint.rs b/control_plane/src/endpoint.rs index 0b16339a6f..fe6a93eb5e 100644 --- a/control_plane/src/endpoint.rs +++ b/control_plane/src/endpoint.rs @@ -650,6 +650,8 @@ impl Endpoint { pub async fn start( &self, auth_token: &Option, + endpoint_storage_token: String, + endpoint_storage_addr: String, safekeepers_generation: Option, safekeepers: Vec, pageservers: Vec<(Host, u16)>, @@ -743,6 +745,9 @@ impl Endpoint { drop_subscriptions_before_start: self.drop_subscriptions_before_start, audit_log_level: ComputeAudit::Disabled, logs_export_host: None::, + endpoint_storage_addr: Some(endpoint_storage_addr), + endpoint_storage_token: Some(endpoint_storage_token), + prewarm_lfc_on_startup: false, }; // this strange code is needed to support respec() in tests diff --git a/control_plane/src/endpoint_storage.rs b/control_plane/src/endpoint_storage.rs index 102db91a22..171aaeddb4 100644 --- a/control_plane/src/endpoint_storage.rs +++ b/control_plane/src/endpoint_storage.rs @@ -3,17 +3,19 @@ use crate::local_env::LocalEnv; use anyhow::{Context, Result}; use camino::Utf8PathBuf; use std::io::Write; +use std::net::SocketAddr; use std::time::Duration; /// Directory within .neon which will be used by default for LocalFs remote storage. 
pub const ENDPOINT_STORAGE_REMOTE_STORAGE_DIR: &str = "local_fs_remote_storage/endpoint_storage"; -pub const ENDPOINT_STORAGE_DEFAULT_PORT: u16 = 9993; +pub const ENDPOINT_STORAGE_DEFAULT_ADDR: SocketAddr = + SocketAddr::new(std::net::IpAddr::V4(std::net::Ipv4Addr::LOCALHOST), 9993); pub struct EndpointStorage { pub bin: Utf8PathBuf, pub data_dir: Utf8PathBuf, pub pemfile: Utf8PathBuf, - pub port: u16, + pub addr: SocketAddr, } impl EndpointStorage { @@ -22,7 +24,7 @@ impl EndpointStorage { bin: Utf8PathBuf::from_path_buf(env.endpoint_storage_bin()).unwrap(), data_dir: Utf8PathBuf::from_path_buf(env.endpoint_storage_data_dir()).unwrap(), pemfile: Utf8PathBuf::from_path_buf(env.public_key_path.clone()).unwrap(), - port: env.endpoint_storage.port, + addr: env.endpoint_storage.listen_addr, } } @@ -31,7 +33,7 @@ impl EndpointStorage { } fn listen_addr(&self) -> Utf8PathBuf { - format!("127.0.0.1:{}", self.port).into() + format!("{}:{}", self.addr.ip(), self.addr.port()).into() } pub fn init(&self) -> Result<()> { diff --git a/control_plane/src/local_env.rs b/control_plane/src/local_env.rs index a18b34daa4..4a8892c6de 100644 --- a/control_plane/src/local_env.rs +++ b/control_plane/src/local_env.rs @@ -20,7 +20,9 @@ use utils::auth::encode_from_key_file; use utils::id::{NodeId, TenantId, TenantTimelineId, TimelineId}; use crate::broker::StorageBroker; -use crate::endpoint_storage::{ENDPOINT_STORAGE_REMOTE_STORAGE_DIR, EndpointStorage}; +use crate::endpoint_storage::{ + ENDPOINT_STORAGE_DEFAULT_ADDR, ENDPOINT_STORAGE_REMOTE_STORAGE_DIR, EndpointStorage, +}; use crate::pageserver::{PAGESERVER_REMOTE_STORAGE_DIR, PageServerNode}; use crate::safekeeper::SafekeeperNode; @@ -151,10 +153,10 @@ pub struct NeonLocalInitConf { pub generate_local_ssl_certs: bool, } -#[derive(Serialize, Default, Deserialize, PartialEq, Eq, Clone, Debug)] +#[derive(Serialize, Deserialize, PartialEq, Eq, Clone, Debug)] #[serde(default)] pub struct EndpointStorageConf { - pub port: u16, + pub listen_addr: SocketAddr, } /// Broker config for cluster internal communication. 
@@ -241,6 +243,14 @@ impl Default for NeonStorageControllerConf { } } +impl Default for EndpointStorageConf { + fn default() -> Self { + Self { + listen_addr: ENDPOINT_STORAGE_DEFAULT_ADDR, + } + } +} + impl NeonBroker { pub fn client_url(&self) -> Url { let url = if let Some(addr) = self.listen_https_addr { diff --git a/endpoint_storage/src/app.rs b/endpoint_storage/src/app.rs index f07ef06328..0bd7fe5f28 100644 --- a/endpoint_storage/src/app.rs +++ b/endpoint_storage/src/app.rs @@ -343,7 +343,7 @@ MC4CAQAwBQYDK2VwBCIEID/Drmc1AA6U/znNRWpF3zEGegOATQxfkdWxitcOMsIH TimelineId::from_array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 1, 2, 3, 4, 5, 7]); const ENDPOINT_ID: &str = "ep-winter-frost-a662z3vg"; fn token() -> String { - let claims = endpoint_storage::Claims { + let claims = endpoint_storage::claims::EndpointStorageClaims { tenant_id: TENANT_ID, timeline_id: TIMELINE_ID, endpoint_id: ENDPOINT_ID.into(), @@ -489,16 +489,8 @@ MC4CAQAwBQYDK2VwBCIEID/Drmc1AA6U/znNRWpF3zEGegOATQxfkdWxitcOMsIH } fn delete_prefix_token(uri: &str) -> String { - use serde::Serialize; let parts = uri.split("/").collect::>(); - #[derive(Serialize)] - struct PrefixClaims { - tenant_id: TenantId, - timeline_id: Option, - endpoint_id: Option, - exp: u64, - } - let claims = PrefixClaims { + let claims = endpoint_storage::claims::DeletePrefixClaims { tenant_id: parts.get(1).map(|c| c.parse().unwrap()).unwrap(), timeline_id: parts.get(2).map(|c| c.parse().unwrap()), endpoint_id: parts.get(3).map(ToString::to_string), diff --git a/endpoint_storage/src/claims.rs b/endpoint_storage/src/claims.rs new file mode 100644 index 0000000000..ef0f0eb0b4 --- /dev/null +++ b/endpoint_storage/src/claims.rs @@ -0,0 +1,52 @@ +use serde::{Deserialize, Serialize}; +use std::fmt::Display; +use utils::id::{EndpointId, TenantId, TimelineId}; + +/// Claims to add, remove, or retrieve endpoint data. Used by compute_ctl +#[derive(Deserialize, Serialize, PartialEq)] +pub struct EndpointStorageClaims { + pub tenant_id: TenantId, + pub timeline_id: TimelineId, + pub endpoint_id: EndpointId, + pub exp: u64, +} + +/// Claims to remove tenant, timeline, or endpoint data. 
Used by control plane +#[derive(Deserialize, Serialize, PartialEq)] +pub struct DeletePrefixClaims { + pub tenant_id: TenantId, + /// None when tenant is deleted (endpoint_id is also None in this case) + pub timeline_id: Option, + /// None when timeline is deleted + pub endpoint_id: Option, + pub exp: u64, +} + +impl Display for EndpointStorageClaims { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + write!( + f, + "EndpointClaims(tenant_id={} timeline_id={} endpoint_id={} exp={})", + self.tenant_id, self.timeline_id, self.endpoint_id, self.exp + ) + } +} + +impl Display for DeletePrefixClaims { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + write!( + f, + "DeletePrefixClaims(tenant_id={} timeline_id={} endpoint_id={}, exp={})", + self.tenant_id, + self.timeline_id + .as_ref() + .map(ToString::to_string) + .unwrap_or("".to_string()), + self.endpoint_id + .as_ref() + .map(ToString::to_string) + .unwrap_or("".to_string()), + self.exp + ) + } +} diff --git a/endpoint_storage/src/lib.rs b/endpoint_storage/src/lib.rs index eb6b80c487..d1625dc843 100644 --- a/endpoint_storage/src/lib.rs +++ b/endpoint_storage/src/lib.rs @@ -1,3 +1,5 @@ +pub mod claims; +use crate::claims::{DeletePrefixClaims, EndpointStorageClaims}; use anyhow::Result; use axum::extract::{FromRequestParts, Path}; use axum::response::{IntoResponse, Response}; @@ -13,7 +15,7 @@ use std::result::Result as StdResult; use std::sync::Arc; use tokio_util::sync::CancellationToken; use tracing::{debug, error}; -use utils::id::{TenantId, TimelineId}; +use utils::id::{EndpointId, TenantId, TimelineId}; // simplified version of utils::auth::JwtAuth pub struct JwtAuth { @@ -79,26 +81,6 @@ pub struct Storage { pub max_upload_file_limit: usize, } -pub type EndpointId = String; // If needed, reuse small string from proxy/src/types.rc - -#[derive(Deserialize, Serialize, PartialEq)] -pub struct Claims { - pub tenant_id: TenantId, - pub timeline_id: TimelineId, - pub endpoint_id: EndpointId, - pub exp: u64, -} - -impl Display for Claims { - fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - write!( - f, - "Claims(tenant_id {} timeline_id {} endpoint_id {} exp {})", - self.tenant_id, self.timeline_id, self.endpoint_id, self.exp - ) - } -} - #[derive(Deserialize, Serialize)] struct KeyRequest { tenant_id: TenantId, @@ -107,6 +89,13 @@ struct KeyRequest { path: String, } +#[derive(Deserialize, Serialize, PartialEq)] +struct PrefixKeyRequest { + tenant_id: TenantId, + timeline_id: Option, + endpoint_id: Option, +} + #[derive(Debug, PartialEq)] pub struct S3Path { pub path: RemotePath, @@ -165,7 +154,7 @@ impl FromRequestParts> for S3Path { .extract::>>() .await .map_err(|e| bad_request(e, "invalid token"))?; - let claims: Claims = state + let claims: EndpointStorageClaims = state .auth .decode(bearer.token()) .map_err(|e| bad_request(e, "decoding token"))?; @@ -178,7 +167,7 @@ impl FromRequestParts> for S3Path { path.endpoint_id.clone() }; - let route = Claims { + let route = EndpointStorageClaims { tenant_id: path.tenant_id, timeline_id: path.timeline_id, endpoint_id, @@ -193,38 +182,13 @@ impl FromRequestParts> for S3Path { } } -#[derive(Deserialize, Serialize, PartialEq)] -pub struct PrefixKeyPath { - pub tenant_id: TenantId, - pub timeline_id: Option, - pub endpoint_id: Option, -} - -impl Display for PrefixKeyPath { - fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - write!( - f, - "PrefixKeyPath(tenant_id {} timeline_id {} endpoint_id {})", - 
self.tenant_id, - self.timeline_id - .as_ref() - .map(ToString::to_string) - .unwrap_or("".to_string()), - self.endpoint_id - .as_ref() - .map(ToString::to_string) - .unwrap_or("".to_string()) - ) - } -} - #[derive(Debug, PartialEq)] pub struct PrefixS3Path { pub path: RemotePath, } -impl From<&PrefixKeyPath> for PrefixS3Path { - fn from(path: &PrefixKeyPath) -> Self { +impl From<&DeletePrefixClaims> for PrefixS3Path { + fn from(path: &DeletePrefixClaims) -> Self { let timeline_id = path .timeline_id .as_ref() @@ -250,21 +214,27 @@ impl FromRequestParts> for PrefixS3Path { state: &Arc, ) -> Result { let Path(path) = parts - .extract::>() + .extract::>() .await .map_err(|e| bad_request(e, "invalid route"))?; let TypedHeader(Authorization(bearer)) = parts .extract::>>() .await .map_err(|e| bad_request(e, "invalid token"))?; - let claims: PrefixKeyPath = state + let claims: DeletePrefixClaims = state .auth .decode(bearer.token()) .map_err(|e| bad_request(e, "invalid token"))?; - if path != claims { - return Err(unauthorized(path, claims)); + let route = DeletePrefixClaims { + tenant_id: path.tenant_id, + timeline_id: path.timeline_id, + endpoint_id: path.endpoint_id, + exp: claims.exp, + }; + if route != claims { + return Err(unauthorized(route, claims)); } - Ok((&path).into()) + Ok((&route).into()) } } @@ -297,7 +267,7 @@ mod tests { #[test] fn s3_path() { - let auth = Claims { + let auth = EndpointStorageClaims { tenant_id: TENANT_ID, timeline_id: TIMELINE_ID, endpoint_id: ENDPOINT_ID.into(), @@ -327,10 +297,11 @@ mod tests { #[test] fn prefix_s3_path() { - let mut path = PrefixKeyPath { + let mut path = DeletePrefixClaims { tenant_id: TENANT_ID, timeline_id: None, endpoint_id: None, + exp: 0, }; let prefix_path = |s: String| RemotePath::from_string(&s).unwrap(); assert_eq!( diff --git a/libs/compute_api/src/responses.rs b/libs/compute_api/src/responses.rs index b7d6b7ca34..24d371c6eb 100644 --- a/libs/compute_api/src/responses.rs +++ b/libs/compute_api/src/responses.rs @@ -46,6 +46,30 @@ pub struct ExtensionInstallResponse { pub version: ExtVersion, } +#[derive(Serialize, Default, Debug, Clone)] +#[serde(tag = "status", rename_all = "snake_case")] +pub enum LfcPrewarmState { + #[default] + NotPrewarmed, + Prewarming, + Completed, + Failed { + error: String, + }, +} + +#[derive(Serialize, Default, Debug, Clone)] +#[serde(tag = "status", rename_all = "snake_case")] +pub enum LfcOffloadState { + #[default] + NotOffloaded, + Offloading, + Completed, + Failed { + error: String, + }, +} + /// Response of the /status API #[derive(Serialize, Debug, Deserialize)] #[serde(rename_all = "snake_case")] diff --git a/libs/compute_api/src/spec.rs b/libs/compute_api/src/spec.rs index ad246c48ec..09b550b96c 100644 --- a/libs/compute_api/src/spec.rs +++ b/libs/compute_api/src/spec.rs @@ -172,6 +172,15 @@ pub struct ComputeSpec { /// Hostname and the port of the otel collector. Leave empty to disable Postgres logs forwarding. /// Example: config-shy-breeze-123-collector-monitoring.neon-telemetry.svc.cluster.local:10514 pub logs_export_host: Option, + + /// Address of endpoint storage service + pub endpoint_storage_addr: Option, + /// JWT for authorizing requests to endpoint storage service + pub endpoint_storage_token: Option, + + /// If true, download LFC state from endpoint_storage and pass it to Postgres on startup + #[serde(default)] + pub prewarm_lfc_on_startup: bool, } /// Feature flag to signal `compute_ctl` to enable certain experimental functionality. 
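For context on the `LfcPrewarmState`/`LfcOffloadState` enums added in `responses.rs` above: with `#[serde(tag = "status", rename_all = "snake_case")]` each variant serializes to an internally tagged JSON object. A minimal standalone sketch of that shape (illustrative only, not part of this patch; assumes `serde` with the derive feature and `serde_json` as dependencies):

```rust
use serde::Serialize;

/// Mirrors the enum added in `responses.rs` above; illustrative only.
#[allow(dead_code)]
#[derive(Serialize)]
#[serde(tag = "status", rename_all = "snake_case")]
enum LfcPrewarmState {
    NotPrewarmed,
    Prewarming,
    Completed,
    Failed { error: String },
}

fn main() {
    // Unit variants serialize to just the tag field...
    assert_eq!(
        serde_json::to_string(&LfcPrewarmState::NotPrewarmed).unwrap(),
        r#"{"status":"not_prewarmed"}"#
    );
    // ...and struct variants place their fields next to the tag.
    assert_eq!(
        serde_json::to_string(&LfcPrewarmState::Failed { error: "copy failed".into() }).unwrap(),
        r#"{"status":"failed","error":"copy failed"}"#
    );
}
```

The `prewarm_lfc`/`offload_lfc` helpers added to the Python test fixtures below rely on exactly this shape when they wait for `status == "completed"` and read the optional `error` field.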
diff --git a/libs/compute_api/tests/cluster_spec.json b/libs/compute_api/tests/cluster_spec.json index 37de24be5b..30e788a601 100644 --- a/libs/compute_api/tests/cluster_spec.json +++ b/libs/compute_api/tests/cluster_spec.json @@ -84,6 +84,11 @@ "value": "on", "vartype": "bool" }, + { + "name": "prewarm_lfc_on_startup", + "value": "off", + "vartype": "bool" + }, { "name": "neon.safekeepers", "value": "127.0.0.1:6502,127.0.0.1:6503,127.0.0.1:6501", diff --git a/libs/utils/src/id.rs b/libs/utils/src/id.rs index 6016c23a01..68cb1f0209 100644 --- a/libs/utils/src/id.rs +++ b/libs/utils/src/id.rs @@ -295,6 +295,9 @@ pub struct TenantId(Id); id_newtype!(TenantId); +/// If needed, reuse small string from proxy/src/types.rc +pub type EndpointId = String; + // A pair uniquely identifying Neon instance. #[derive(Debug, Clone, Copy, PartialOrd, Ord, PartialEq, Eq, Hash, Serialize, Deserialize)] pub struct TenantTimelineId { diff --git a/test_runner/fixtures/endpoint/http.py b/test_runner/fixtures/endpoint/http.py index beed1dcd93..4b4b98aa6c 100644 --- a/test_runner/fixtures/endpoint/http.py +++ b/test_runner/fixtures/endpoint/http.py @@ -10,6 +10,7 @@ from requests.auth import AuthBase from typing_extensions import override from fixtures.log_helper import log +from fixtures.utils import wait_until if TYPE_CHECKING: from requests import PreparedRequest @@ -62,6 +63,35 @@ class EndpointHttpClient(requests.Session): res.raise_for_status() return res.json() + def prewarm_lfc_status(self) -> dict[str, str]: + res = self.get(f"http://localhost:{self.external_port}/lfc/prewarm") + res.raise_for_status() + json: dict[str, str] = res.json() + return json + + def prewarm_lfc(self): + self.post(f"http://localhost:{self.external_port}/lfc/prewarm").raise_for_status() + + def prewarmed(): + json = self.prewarm_lfc_status() + status, err = json["status"], json.get("error") + assert status == "completed", f"{status}, error {err}" + + wait_until(prewarmed) + + def offload_lfc(self): + url = f"http://localhost:{self.external_port}/lfc/offload" + self.post(url).raise_for_status() + + def offloaded(): + res = self.get(url) + res.raise_for_status() + json = res.json() + status, err = json["status"], json.get("error") + assert status == "completed", f"{status}, error {err}" + + wait_until(offloaded) + def database_schema(self, database: str): res = self.get( f"http://localhost:{self.external_port}/database_schema?database={urllib.parse.quote(database, safe='')}", diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index 133be5c045..d4a750ad3b 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -1185,7 +1185,9 @@ class NeonEnv: "broker": {}, "safekeepers": [], "pageservers": [], - "endpoint_storage": {"port": self.port_distributor.get_port()}, + "endpoint_storage": { + "listen_addr": f"127.0.0.1:{self.port_distributor.get_port()}", + }, "generate_local_ssl_certs": self.generate_local_ssl_certs, } diff --git a/test_runner/regress/test_endpoint_storage.py b/test_runner/regress/test_endpoint_storage.py index 04029114ec..1e27ef4b14 100644 --- a/test_runner/regress/test_endpoint_storage.py +++ b/test_runner/regress/test_endpoint_storage.py @@ -4,10 +4,12 @@ import pytest from aiohttp import ClientSession from fixtures.log_helper import log from fixtures.neon_fixtures import NeonEnv +from fixtures.utils import run_only_on_default_postgres from jwcrypto import jwk, jwt @pytest.mark.asyncio +@run_only_on_default_postgres("test doesn't use 
postgres") async def test_endpoint_storage_insert_retrieve_delete(neon_simple_env: NeonEnv): """ Inserts, retrieves, and deletes test file using a JWT token @@ -35,7 +37,6 @@ async def test_endpoint_storage_insert_retrieve_delete(neon_simple_env: NeonEnv) key = f"http://{base_url}/{tenant_id}/{timeline_id}/{endpoint_id}/key" headers = {"Authorization": f"Bearer {token}"} log.info(f"cache key url {key}") - log.info(f"token {token}") async with ClientSession(headers=headers) as session: async with session.get(key) as res: diff --git a/test_runner/regress/test_lfc_prewarm.py b/test_runner/regress/test_lfc_prewarm.py index dd0ae1921d..82e1e9fcba 100644 --- a/test_runner/regress/test_lfc_prewarm.py +++ b/test_runner/regress/test_lfc_prewarm.py @@ -1,11 +1,24 @@ import random import threading import time +from enum import Enum import pytest +from fixtures.endpoint.http import EndpointHttpClient from fixtures.log_helper import log from fixtures.neon_fixtures import NeonEnv from fixtures.utils import USE_LFC +from prometheus_client.parser import text_string_to_metric_families as prom_parse_impl + + +class LfcQueryMethod(Enum): + COMPUTE_CTL = False + POSTGRES = True + + +PREWARM_LABEL = "compute_ctl_lfc_prewarm_requests_total" +OFFLOAD_LABEL = "compute_ctl_lfc_offload_requests_total" +QUERY_OPTIONS = LfcQueryMethod.POSTGRES, LfcQueryMethod.COMPUTE_CTL def check_pinned_entries(cur): @@ -19,11 +32,20 @@ def check_pinned_entries(cur): assert n_pinned == 0 +def prom_parse(client: EndpointHttpClient) -> dict[str, float]: + return { + sample.name: sample.value + for family in prom_parse_impl(client.metrics()) + for sample in family.samples + if sample.name in (PREWARM_LABEL, OFFLOAD_LABEL) + } + + @pytest.mark.skipif(not USE_LFC, reason="LFC is disabled, skipping") -def test_lfc_prewarm(neon_simple_env: NeonEnv): +@pytest.mark.parametrize("query", QUERY_OPTIONS, ids=["postgres", "compute-ctl"]) +def test_lfc_prewarm(neon_simple_env: NeonEnv, query: LfcQueryMethod): env = neon_simple_env n_records = 1000000 - endpoint = env.endpoints.create_start( branch_name="main", config_lines=[ @@ -34,30 +56,57 @@ def test_lfc_prewarm(neon_simple_env: NeonEnv): "neon.file_cache_prewarm_limit=1000", ], ) - conn = endpoint.connect() - cur = conn.cursor() - cur.execute("create extension neon version '1.6'") - cur.execute("create table t(pk integer primary key, payload text default repeat('?', 128))") - cur.execute(f"insert into t (pk) values (generate_series(1,{n_records}))") - cur.execute("select get_local_cache_state()") - lfc_state = cur.fetchall()[0][0] + + pg_conn = endpoint.connect() + pg_cur = pg_conn.cursor() + pg_cur.execute("create extension neon version '1.6'") + pg_cur.execute("create database lfc") + + lfc_conn = endpoint.connect(dbname="lfc") + lfc_cur = lfc_conn.cursor() + log.info(f"Inserting {n_records} rows") + lfc_cur.execute("create table t(pk integer primary key, payload text default repeat('?', 128))") + lfc_cur.execute(f"insert into t (pk) values (generate_series(1,{n_records}))") + log.info(f"Inserted {n_records} rows") + + http_client = endpoint.http_client() + if query is LfcQueryMethod.COMPUTE_CTL: + status = http_client.prewarm_lfc_status() + assert status["status"] == "not_prewarmed" + assert "error" not in status + http_client.offload_lfc() + assert http_client.prewarm_lfc_status()["status"] == "not_prewarmed" + assert prom_parse(http_client) == {OFFLOAD_LABEL: 1, PREWARM_LABEL: 0} + else: + pg_cur.execute("select get_local_cache_state()") + lfc_state = pg_cur.fetchall()[0][0] 
endpoint.stop() endpoint.start() - conn = endpoint.connect() - cur = conn.cursor() - time.sleep(1) # wait until compute_ctl complete downgrade of extension to default version - cur.execute("alter extension neon update to '1.6'") - cur.execute("select prewarm_local_cache(%s)", (lfc_state,)) + # wait until compute_ctl completes downgrade of extension to default version + time.sleep(1) + pg_conn = endpoint.connect() + pg_cur = pg_conn.cursor() + pg_cur.execute("alter extension neon update to '1.6'") - cur.execute("select lfc_value from neon_lfc_stats where lfc_key='file_cache_used_pages'") - lfc_used_pages = cur.fetchall()[0][0] + lfc_conn = endpoint.connect(dbname="lfc") + lfc_cur = lfc_conn.cursor() + + if query is LfcQueryMethod.COMPUTE_CTL: + http_client.prewarm_lfc() + else: + pg_cur.execute("select prewarm_local_cache(%s)", (lfc_state,)) + + pg_cur.execute("select lfc_value from neon_lfc_stats where lfc_key='file_cache_used_pages'") + lfc_used_pages = pg_cur.fetchall()[0][0] log.info(f"Used LFC size: {lfc_used_pages}") - cur.execute("select * from get_prewarm_info()") - prewarm_info = cur.fetchall()[0] + pg_cur.execute("select * from get_prewarm_info()") + prewarm_info = pg_cur.fetchall()[0] log.info(f"Prewarm info: {prewarm_info}") - log.info(f"Prewarm progress: {(prewarm_info[1] + prewarm_info[2]) * 100 // prewarm_info[0]}%") + total, prewarmed, skipped, _ = prewarm_info + progress = (prewarmed + skipped) * 100 // total + log.info(f"Prewarm progress: {progress}%") assert lfc_used_pages > 10000 assert ( @@ -66,18 +115,23 @@ def test_lfc_prewarm(neon_simple_env: NeonEnv): and prewarm_info[0] == prewarm_info[1] + prewarm_info[2] ) - cur.execute("select sum(pk) from t") - assert cur.fetchall()[0][0] == n_records * (n_records + 1) / 2 + lfc_cur.execute("select sum(pk) from t") + assert lfc_cur.fetchall()[0][0] == n_records * (n_records + 1) / 2 - check_pinned_entries(cur) + check_pinned_entries(pg_cur) + + desired = {"status": "completed", "total": total, "prewarmed": prewarmed, "skipped": skipped} + if query is LfcQueryMethod.COMPUTE_CTL: + assert http_client.prewarm_lfc_status() == desired + assert prom_parse(http_client) == {OFFLOAD_LABEL: 0, PREWARM_LABEL: 1} @pytest.mark.skipif(not USE_LFC, reason="LFC is disabled, skipping") -def test_lfc_prewarm_under_workload(neon_simple_env: NeonEnv): +@pytest.mark.parametrize("query", QUERY_OPTIONS, ids=["postgres", "compute-ctl"]) +def test_lfc_prewarm_under_workload(neon_simple_env: NeonEnv, query: LfcQueryMethod): env = neon_simple_env n_records = 10000 n_threads = 4 - endpoint = env.endpoints.create_start( branch_name="main", config_lines=[ @@ -87,40 +141,58 @@ def test_lfc_prewarm_under_workload(neon_simple_env: NeonEnv): "neon.file_cache_prewarm_limit=1000000", ], ) - conn = endpoint.connect() - cur = conn.cursor() - cur.execute("create extension neon version '1.6'") - cur.execute( + + pg_conn = endpoint.connect() + pg_cur = pg_conn.cursor() + pg_cur.execute("create extension neon version '1.6'") + pg_cur.execute("CREATE DATABASE lfc") + + lfc_conn = endpoint.connect(dbname="lfc") + lfc_cur = lfc_conn.cursor() + lfc_cur.execute( "create table accounts(id integer primary key, balance bigint default 0, payload text default repeat('?', 1000)) with (fillfactor=10)" ) - cur.execute(f"insert into accounts(id) values (generate_series(1,{n_records}))") - cur.execute("select get_local_cache_state()") - lfc_state = cur.fetchall()[0][0] + log.info(f"Inserting {n_records} rows") + lfc_cur.execute(f"insert into accounts(id) values 
(generate_series(1,{n_records}))") + log.info(f"Inserted {n_records} rows") + + http_client = endpoint.http_client() + if query is LfcQueryMethod.COMPUTE_CTL: + http_client.offload_lfc() + else: + pg_cur.execute("select get_local_cache_state()") + lfc_state = pg_cur.fetchall()[0][0] running = True + n_prewarms = 0 def workload(): - conn = endpoint.connect() - cur = conn.cursor() + lfc_conn = endpoint.connect(dbname="lfc") + lfc_cur = lfc_conn.cursor() n_transfers = 0 while running: src = random.randint(1, n_records) dst = random.randint(1, n_records) - cur.execute("update accounts set balance=balance-100 where id=%s", (src,)) - cur.execute("update accounts set balance=balance+100 where id=%s", (dst,)) + lfc_cur.execute("update accounts set balance=balance-100 where id=%s", (src,)) + lfc_cur.execute("update accounts set balance=balance+100 where id=%s", (dst,)) n_transfers += 1 log.info(f"Number of transfers: {n_transfers}") def prewarm(): - conn = endpoint.connect() - cur = conn.cursor() - n_prewarms = 0 + pg_conn = endpoint.connect() + pg_cur = pg_conn.cursor() while running: - cur.execute("alter system set neon.file_cache_size_limit='1MB'") - cur.execute("select pg_reload_conf()") - cur.execute("alter system set neon.file_cache_size_limit='1GB'") - cur.execute("select pg_reload_conf()") - cur.execute("select prewarm_local_cache(%s)", (lfc_state,)) + pg_cur.execute("alter system set neon.file_cache_size_limit='1MB'") + pg_cur.execute("select pg_reload_conf()") + pg_cur.execute("alter system set neon.file_cache_size_limit='1GB'") + pg_cur.execute("select pg_reload_conf()") + + if query is LfcQueryMethod.COMPUTE_CTL: + http_client.prewarm_lfc() + else: + pg_cur.execute("select prewarm_local_cache(%s)", (lfc_state,)) + + nonlocal n_prewarms n_prewarms += 1 log.info(f"Number of prewarms: {n_prewarms}") @@ -140,8 +212,10 @@ def test_lfc_prewarm_under_workload(neon_simple_env: NeonEnv): t.join() prewarm_thread.join() - cur.execute("select sum(balance) from accounts") - total_balance = cur.fetchall()[0][0] + lfc_cur.execute("select sum(balance) from accounts") + total_balance = lfc_cur.fetchall()[0][0] assert total_balance == 0 - check_pinned_entries(cur) + check_pinned_entries(pg_cur) + if query is LfcQueryMethod.COMPUTE_CTL: + assert prom_parse(http_client) == {OFFLOAD_LABEL: 1, PREWARM_LABEL: n_prewarms} From 0ef6851219c58d52263fd96cb87c18e72a928fd2 Mon Sep 17 00:00:00 2001 From: Tristan Partin Date: Tue, 6 May 2025 17:19:15 -0500 Subject: [PATCH 37/40] Make the audience claim in compute JWTs a vector (#11845) According to RFC 7519, `aud` is generally an array of StringOrURI, but in special cases may be a single StringOrURI value. To accomodate future control plane work where a single token may work for multiple services, make the claim a vector. 
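A minimal sketch of the resulting claim shape and check (illustrative only; the struct name, helper, and JSON payload here are stand-ins — the real definitions are `ComputeClaims` and the membership test in `authorize.rs` in the diff below):

```rust
use serde::Deserialize;

/// Stand-in for the `aud` portion of the compute claims; illustrative only.
#[derive(Deserialize)]
struct Claims {
    #[serde(rename = "aud")]
    audience: Option<Vec<String>>,
}

/// With a vector-valued audience, authorization becomes a membership test
/// rather than an equality check against a single string.
fn authorized_for(claims: &Claims, expected: &str) -> bool {
    claims
        .audience
        .as_ref()
        .is_some_and(|aud| aud.iter().any(|a| a == expected))
}

fn main() {
    let claims: Claims = serde_json::from_str(r#"{"aud": ["compute", "proxy"]}"#).unwrap();
    assert!(authorized_for(&claims, "compute"));
    assert!(!authorized_for(&claims, "pageserver"));
}
```

Note that RFC 7519 also permits a bare string for `aud` in the single-audience case; this sketch, like the patch, assumes the array form and leaves the single-string and missing-claim cases to the control-plane TODO noted in the diff.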
Link: https://www.rfc-editor.org/rfc/rfc7519#section-4.1.3 Signed-off-by: Tristan Partin --- compute_tools/src/http/middleware/authorize.rs | 2 +- control_plane/src/endpoint.rs | 4 ++-- libs/compute_api/src/requests.rs | 7 +++++-- test_runner/regress/test_compute_http.py | 4 ++-- 4 files changed, 10 insertions(+), 7 deletions(-) diff --git a/compute_tools/src/http/middleware/authorize.rs b/compute_tools/src/http/middleware/authorize.rs index 2afc57ad9c..a82f46e062 100644 --- a/compute_tools/src/http/middleware/authorize.rs +++ b/compute_tools/src/http/middleware/authorize.rs @@ -79,7 +79,7 @@ impl AsyncAuthorizeRequest for Authorize { )); }; - if audience != COMPUTE_AUDIENCE { + if !audience.iter().any(|a| a == COMPUTE_AUDIENCE) { return Err(JsonResponse::error( StatusCode::UNAUTHORIZED, "invalid audience in authorization token claims", diff --git a/control_plane/src/endpoint.rs b/control_plane/src/endpoint.rs index fe6a93eb5e..be73661a3c 100644 --- a/control_plane/src/endpoint.rs +++ b/control_plane/src/endpoint.rs @@ -635,8 +635,8 @@ impl Endpoint { pub fn generate_jwt(&self, scope: Option) -> Result { self.env.generate_auth_token(&ComputeClaims { audience: match scope { - Some(ComputeClaimsScope::Admin) => Some(COMPUTE_AUDIENCE.to_owned()), - _ => Some(self.endpoint_id.clone()), + Some(ComputeClaimsScope::Admin) => Some(vec![COMPUTE_AUDIENCE.to_owned()]), + _ => None, }, compute_id: match scope { Some(ComputeClaimsScope::Admin) => None, diff --git a/libs/compute_api/src/requests.rs b/libs/compute_api/src/requests.rs index 40d34eccea..bbab271474 100644 --- a/libs/compute_api/src/requests.rs +++ b/libs/compute_api/src/requests.rs @@ -10,9 +10,9 @@ use crate::spec::{ComputeSpec, ExtVersion, PgIdent}; /// The value to place in the [`ComputeClaims::audience`] claim. pub static COMPUTE_AUDIENCE: &str = "compute"; +/// Available scopes for a compute's JWT. #[derive(Copy, Clone, Debug, Deserialize, Eq, PartialEq, Serialize)] #[serde(rename_all = "snake_case")] -/// Available scopes for a compute's JWT. pub enum ComputeClaimsScope { /// An admin-scoped token allows access to all of `compute_ctl`'s authorized /// facilities. @@ -48,8 +48,11 @@ pub struct ComputeClaims { /// /// See [RFC 7519](https://www.rfc-editor.org/rfc/rfc7519#section-4.1.3) for /// more information. + /// + /// TODO: Remove the [`Option`] wrapper when control plane learns to send + /// the claim. #[serde(rename = "aud")] - pub audience: Option, + pub audience: Option>, } /// Request of the /configure API diff --git a/test_runner/regress/test_compute_http.py b/test_runner/regress/test_compute_http.py index ce31ff0fe6..9846d44ce2 100644 --- a/test_runner/regress/test_compute_http.py +++ b/test_runner/regress/test_compute_http.py @@ -56,9 +56,9 @@ def test_compute_admin_scope_claim(neon_simple_env: NeonEnv, audience: str | Non endpoint = env.endpoints.create_start("main") - data = {"scope": str(ComputeClaimsScope.ADMIN)} + data: dict[str, str | list[str]] = {"scope": str(ComputeClaimsScope.ADMIN)} if audience: - data["aud"] = audience + data["aud"] = [audience] token = jwt.encode(data, env.auth_keys.priv, algorithm="EdDSA") From 608afc3055fe1acb32caba2f7b94e01db30f12f2 Mon Sep 17 00:00:00 2001 From: "Alex Chi Z." <4198311+skyzh@users.noreply.github.com> Date: Wed, 7 May 2025 17:21:17 +0800 Subject: [PATCH 38/40] fix(scrubber): log download error (#11833) ## Problem We use `head_object` to determine whether an object exists or not. However, it does not always error due to a missing object. 
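In other words, a failed `HEAD` can just as easily be a permissions or transient error as a genuinely missing layer, so the error value is worth keeping. A minimal sketch of the pattern (the error type and call site here are hypothetical stand-ins; the real change is in `checks.rs` below):

```rust
use tracing::warn;

/// Hypothetical stand-in for the remote-storage error returned by `head_object`.
#[derive(Debug)]
struct DownloadError(String);

fn report_missing_layer(layer: &str, response: Result<(), DownloadError>) {
    // `response.is_err()` would collapse "object missing" and "HEAD failed for
    // some other reason" into the same message; binding the error keeps the cause.
    if let Err(e) = response {
        warn!("layer {} is not present in remote storage (HEAD error: {:?})", layer, e);
    }
}

fn main() {
    tracing_subscriber::fmt::init();
    report_missing_layer("layer-0001", Err(DownloadError("403 AccessDenied".into())));
}
```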
## Summary of changes Log the error so that we can have a better idea what's going on with the scrubber errors in prod. --------- Signed-off-by: Alex Chi Z --- storage_scrubber/src/checks.rs | 5 +++-- storage_scrubber/src/pageserver_physical_gc.rs | 11 +++++++---- storage_scrubber/src/scan_pageserver_metadata.rs | 5 ++++- 3 files changed, 14 insertions(+), 7 deletions(-) diff --git a/storage_scrubber/src/checks.rs b/storage_scrubber/src/checks.rs index f0ba632fd4..b151b612bf 100644 --- a/storage_scrubber/src/checks.rs +++ b/storage_scrubber/src/checks.rs @@ -165,16 +165,17 @@ pub(crate) async fn branch_cleanup_and_check_errors( .head_object(&path, &CancellationToken::new()) .await; - if response.is_err() { + if let Err(e) = response { // Object is not present. let is_l0 = LayerMap::is_l0(layer.key_range(), layer.is_delta()); let msg = format!( - "index_part.json contains a layer {}{} (shard {}) that is not present in remote storage (layer_is_l0: {})", + "index_part.json contains a layer {}{} (shard {}) that is not present in remote storage (layer_is_l0: {}) with error: {}", layer, metadata.generation.get_suffix(), metadata.shard, is_l0, + e, ); if is_l0 || ignore_error { diff --git a/storage_scrubber/src/pageserver_physical_gc.rs b/storage_scrubber/src/pageserver_physical_gc.rs index f14341c7bc..e1a4095a3c 100644 --- a/storage_scrubber/src/pageserver_physical_gc.rs +++ b/storage_scrubber/src/pageserver_physical_gc.rs @@ -137,11 +137,10 @@ struct TenantRefAccumulator { impl TenantRefAccumulator { fn update(&mut self, ttid: TenantShardTimelineId, index_part: &IndexPart) { let this_shard_idx = ttid.tenant_shard_id.to_index(); - (*self - .shards_seen + self.shards_seen .entry(ttid.tenant_shard_id.tenant_id) - .or_default()) - .insert(this_shard_idx); + .or_default() + .insert(this_shard_idx); let mut ancestor_refs = Vec::new(); for (layer_name, layer_metadata) in &index_part.layer_metadata { @@ -767,10 +766,13 @@ pub async fn pageserver_physical_gc( stream_tenant_timelines(remote_client_ref, target_ref, tenant_shard_id).await?, ); Ok(try_stream! { + let mut cnt = 0; while let Some(ttid_res) = timelines.next().await { let ttid = ttid_res?; + cnt += 1; yield (ttid, tenant_manifest_arc.clone()); } + tracing::info!(%tenant_shard_id, "Found {} timelines", cnt); }) } }); @@ -790,6 +792,7 @@ pub async fn pageserver_physical_gc( &accumulator, tenant_manifest_arc, ) + .instrument(info_span!("gc_timeline", %ttid)) }); let timelines = timelines.try_buffered(CONCURRENCY); let mut timelines = std::pin::pin!(timelines); diff --git a/storage_scrubber/src/scan_pageserver_metadata.rs b/storage_scrubber/src/scan_pageserver_metadata.rs index ba75f25984..77c7987aa7 100644 --- a/storage_scrubber/src/scan_pageserver_metadata.rs +++ b/storage_scrubber/src/scan_pageserver_metadata.rs @@ -153,7 +153,10 @@ pub async fn scan_pageserver_metadata( const CONCURRENCY: usize = 32; // Generate a stream of TenantTimelineId - let timelines = tenants.map_ok(|t| stream_tenant_timelines(&remote_client, &target, t)); + let timelines = tenants.map_ok(|t| { + tracing::info!("Found tenant: {}", t); + stream_tenant_timelines(&remote_client, &target, t) + }); let timelines = timelines.try_buffered(CONCURRENCY); let timelines = timelines.try_flatten(); From 3cf5e1386c3071994281b4bc7a1579e4595689f6 Mon Sep 17 00:00:00 2001 From: Vlad Lazar Date: Wed, 7 May 2025 11:13:26 +0100 Subject: [PATCH 39/40] pageserver: fix rough edges of pageserver tracing (#11842) ## Problem There's a few rough edges around PS tracing. 
## Summary of changes * include compute request id in pageserver trace * use the get page specific context for GET_REL_SIZE and GET_BATCH * fix assertion in download layer trace ![image](https://github.com/user-attachments/assets/2ff6779c-7c2d-4102-8013-ada8203aa42f) --- pageserver/src/page_service.rs | 6 +- pageserver/src/pgdatadir_mapping.rs | 64 +++++++++++--------- pageserver/src/tenant/storage_layer/layer.rs | 35 +++++------ 3 files changed, 55 insertions(+), 50 deletions(-) diff --git a/pageserver/src/page_service.rs b/pageserver/src/page_service.rs index 0ce1a99681..bca1cb5b49 100644 --- a/pageserver/src/page_service.rs +++ b/pageserver/src/page_service.rs @@ -1038,21 +1038,23 @@ impl PageServerHandler { tracing::info_span!( parent: &parent_span, "handle_get_page_request", + request_id = %req.hdr.reqid, rel = %req.rel, blkno = %req.blkno, req_lsn = %req.hdr.request_lsn, - not_modified_since_lsn = %req.hdr.not_modified_since + not_modified_since_lsn = %req.hdr.not_modified_since, ) }}; ($shard_id:expr) => {{ tracing::info_span!( parent: &parent_span, "handle_get_page_request", + request_id = %req.hdr.reqid, rel = %req.rel, blkno = %req.blkno, req_lsn = %req.hdr.request_lsn, not_modified_since_lsn = %req.hdr.not_modified_since, - shard_id = %$shard_id + shard_id = %$shard_id, ) }}; } diff --git a/pageserver/src/pgdatadir_mapping.rs b/pageserver/src/pgdatadir_mapping.rs index ccb48d8bc1..d770946580 100644 --- a/pageserver/src/pgdatadir_mapping.rs +++ b/pageserver/src/pgdatadir_mapping.rs @@ -40,7 +40,7 @@ use wal_decoder::serialized_batch::{SerializedValueBatch, ValueMeta}; use super::tenant::{PageReconstructError, Timeline}; use crate::aux_file; -use crate::context::{PerfInstrumentFutureExt, RequestContext}; +use crate::context::{PerfInstrumentFutureExt, RequestContext, RequestContextBuilder}; use crate::keyspace::{KeySpace, KeySpaceAccum}; use crate::metrics::{ RELSIZE_CACHE_ENTRIES, RELSIZE_CACHE_HITS, RELSIZE_CACHE_MISSES, RELSIZE_CACHE_MISSES_OLD, @@ -275,24 +275,30 @@ impl Timeline { continue; } - let nblocks = match self - .get_rel_size(*tag, Version::Lsn(lsn), &ctx) - .maybe_perf_instrument(&ctx, |crnt_perf_span| { - info_span!( - target: PERF_TRACE_TARGET, - parent: crnt_perf_span, - "GET_REL_SIZE", - reltag=%tag, - lsn=%lsn, - ) - }) - .await - { - Ok(nblocks) => nblocks, - Err(err) => { - result_slots[response_slot_idx].write(Err(err)); - slots_filled += 1; - continue; + let nblocks = { + let ctx = RequestContextBuilder::from(&ctx) + .perf_span(|crnt_perf_span| { + info_span!( + target: PERF_TRACE_TARGET, + parent: crnt_perf_span, + "GET_REL_SIZE", + reltag=%tag, + lsn=%lsn, + ) + }) + .attached_child(); + + match self + .get_rel_size(*tag, Version::Lsn(lsn), &ctx) + .maybe_perf_instrument(&ctx, |crnt_perf_span| crnt_perf_span.clone()) + .await + { + Ok(nblocks) => nblocks, + Err(err) => { + result_slots[response_slot_idx].write(Err(err)); + slots_filled += 1; + continue; + } } }; @@ -308,6 +314,17 @@ impl Timeline { let key = rel_block_to_key(*tag, *blknum); + let ctx = RequestContextBuilder::from(&ctx) + .perf_span(|crnt_perf_span| { + info_span!( + target: PERF_TRACE_TARGET, + parent: crnt_perf_span, + "GET_BATCH", + batch_size = %page_count, + ) + }) + .attached_child(); + let key_slots = keys_slots.entry(key).or_default(); key_slots.push((response_slot_idx, ctx)); @@ -323,14 +340,7 @@ impl Timeline { let query = VersionedKeySpaceQuery::scattered(query); let res = self .get_vectored(query, io_concurrency, ctx) - .maybe_perf_instrument(ctx, |current_perf_span| { - 
info_span!( - target: PERF_TRACE_TARGET, - parent: current_perf_span, - "GET_BATCH", - batch_size = %page_count, - ) - }) + .maybe_perf_instrument(ctx, |current_perf_span| current_perf_span.clone()) .await; match res { diff --git a/pageserver/src/tenant/storage_layer/layer.rs b/pageserver/src/tenant/storage_layer/layer.rs index 50810cb154..3d55972017 100644 --- a/pageserver/src/tenant/storage_layer/layer.rs +++ b/pageserver/src/tenant/storage_layer/layer.rs @@ -23,7 +23,7 @@ use super::{ LayerVisibilityHint, PerfInstrumentFutureExt, PersistentLayerDesc, ValuesReconstructState, }; use crate::config::PageServerConf; -use crate::context::{DownloadBehavior, RequestContext, RequestContextBuilder}; +use crate::context::{RequestContext, RequestContextBuilder}; use crate::span::debug_assert_current_span_has_tenant_and_timeline_id; use crate::task_mgr::TaskKind; use crate::tenant::Timeline; @@ -1076,24 +1076,17 @@ impl LayerInner { return Err(DownloadError::DownloadRequired); } - let ctx = if ctx.has_perf_span() { - let dl_ctx = RequestContextBuilder::from(ctx) - .task_kind(TaskKind::LayerDownload) - .download_behavior(DownloadBehavior::Download) - .root_perf_span(|| { - info_span!( - target: PERF_TRACE_TARGET, - "DOWNLOAD_LAYER", - layer = %self, - reason = %reason - ) - }) - .detached_child(); - ctx.perf_follows_from(&dl_ctx); - dl_ctx - } else { - ctx.attached_child() - }; + let ctx = RequestContextBuilder::from(ctx) + .perf_span(|crnt_perf_span| { + info_span!( + target: PERF_TRACE_TARGET, + parent: crnt_perf_span, + "DOWNLOAD_LAYER", + layer = %self, + reason = %reason, + ) + }) + .attached_child(); async move { tracing::info!(%reason, "downloading on-demand"); @@ -1101,7 +1094,7 @@ impl LayerInner { let init_cancelled = scopeguard::guard((), |_| LAYER_IMPL_METRICS.inc_init_cancelled()); let res = self .download_init_and_wait(timeline, permit, ctx.attached_child()) - .maybe_perf_instrument(&ctx, |crnt_perf_span| crnt_perf_span.clone()) + .maybe_perf_instrument(&ctx, |current_perf_span| current_perf_span.clone()) .await?; scopeguard::ScopeGuard::into_inner(init_cancelled); @@ -1709,7 +1702,7 @@ impl DownloadError { } } -#[derive(Debug, PartialEq)] +#[derive(Debug, PartialEq, Copy, Clone)] pub(crate) enum NeedsDownload { NotFound, NotFile(std::fs::FileType), From 0691b73f53580687f0a5aee8b1e3a3192faa7707 Mon Sep 17 00:00:00 2001 From: Alexey Kondratov Date: Wed, 7 May 2025 14:14:24 +0200 Subject: [PATCH 40/40] fix(compute): Enforce cloud_admin role in compute_ctl connections (#11827) ## Problem Users can override some configuration parameters on the DB level with `ALTER DATABASE ... SET ...`. Some of these overrides, like `role` or `default_transaction_read_only`, affect `compute_ctl`'s ability to configure the DB schema properly. ## Summary of changes Enforce `role=cloud_admin`, `statement_timeout=0`, and move `default_transaction_read_only=off` override from control plane [1] to `compute_ctl`. Also, enforce `search_path=public` just in case, although we do not call any functions in user databases. 
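A minimal standalone sketch of the mechanism (connection parameters below are placeholders, not values from this patch): settings passed in the startup packet's `options` parameter are applied with client precedence, so they take priority over the per-database defaults installed by `ALTER DATABASE ... SET ...`, which is what the regression test below relies on.

```rust
use std::str::FromStr;
use tokio_postgres::{Config, NoTls};

#[tokio::main]
async fn main() -> Result<(), tokio_postgres::Error> {
    // Placeholder connection string; the enforced settings travel in the
    // startup packet's `options` parameter.
    let mut config = Config::from_str("host=localhost user=cloud_admin dbname=neondb")?;
    config.options(
        "-c role=cloud_admin -c default_transaction_read_only=off \
         -c search_path=public -c statement_timeout=0",
    );

    let (client, connection) = config.connect(NoTls).await?;
    tokio::spawn(connection);

    // Even if the database carries `ALTER DATABASE ... SET default_transaction_read_only = on`
    // (the assumption in this sketch), the session sees the client-supplied value.
    let row = client.query_one("SHOW default_transaction_read_only", &[]).await?;
    assert_eq!(row.get::<_, String>(0), "off");
    Ok(())
}
```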
[1]: https://github.com/neondatabase/cloud/blob/133dd8c4dbbba40edfbad475bf6a45073ca63faf/goapp/controlplane/internal/pkg/compute/provisioner/provisioner_common.go#L70 Fixes https://github.com/neondatabase/cloud/issues/28532 --- compute_tools/src/compute.rs | 45 +++++++++++--- test_runner/regress/test_compute_catalog.py | 66 +++++++++++++++++++++ 2 files changed, 104 insertions(+), 7 deletions(-) diff --git a/compute_tools/src/compute.rs b/compute_tools/src/compute.rs index 08d915b331..0cda36a6e2 100644 --- a/compute_tools/src/compute.rs +++ b/compute_tools/src/compute.rs @@ -329,11 +329,39 @@ struct StartVmMonitorResult { impl ComputeNode { pub fn new(params: ComputeNodeParams, config: ComputeConfig) -> Result { let connstr = params.connstr.as_str(); - let conn_conf = postgres::config::Config::from_str(connstr) + let mut conn_conf = postgres::config::Config::from_str(connstr) .context("cannot build postgres config from connstr")?; - let tokio_conn_conf = tokio_postgres::config::Config::from_str(connstr) + let mut tokio_conn_conf = tokio_postgres::config::Config::from_str(connstr) .context("cannot build tokio postgres config from connstr")?; + // Users can set some configuration parameters per database with + // ALTER DATABASE ... SET ... + // + // There are at least these parameters: + // + // - role=some_other_role + // - default_transaction_read_only=on + // - statement_timeout=1, i.e., 1ms, which will cause most of the queries to fail + // - search_path=non_public_schema, this should be actually safe because + // we don't call any functions in user databases, but better to always reset + // it to public. + // + // that can affect `compute_ctl` and prevent it from properly configuring the database schema. + // Unset them via connection string options before connecting to the database. + // N.B. keep it in sync with `ZENITH_OPTIONS` in `get_maintenance_client()`. + // + // TODO(ololobus): we currently pass `-c default_transaction_read_only=off` from control plane + // as well. After rolling out this code, we can remove this parameter from control plane. + // In the meantime, double-passing is fine, the last value is applied. + // See: + const EXTRA_OPTIONS: &str = "-c role=cloud_admin -c default_transaction_read_only=off -c search_path=public -c statement_timeout=0"; + let options = match conn_conf.get_options() { + Some(options) => format!("{} {}", options, EXTRA_OPTIONS), + None => EXTRA_OPTIONS.to_string(), + }; + conn_conf.options(&options); + tokio_conn_conf.options(&options); + let mut new_state = ComputeState::new(); if let Some(spec) = config.spec { let pspec = ParsedSpec::try_from(spec).map_err(|msg| anyhow::anyhow!(msg))?; @@ -1449,15 +1477,20 @@ impl ComputeNode { Err(e) => match e.code() { Some(&SqlState::INVALID_PASSWORD) | Some(&SqlState::INVALID_AUTHORIZATION_SPECIFICATION) => { - // Connect with zenith_admin if cloud_admin could not authenticate + // Connect with `zenith_admin` if `cloud_admin` could not authenticate info!( - "cannot connect to postgres: {}, retrying with `zenith_admin` username", + "cannot connect to Postgres: {}, retrying with 'zenith_admin' username", e ); let mut zenith_admin_conf = postgres::config::Config::from(conf.clone()); zenith_admin_conf.application_name("compute_ctl:apply_config"); zenith_admin_conf.user("zenith_admin"); + // It doesn't matter what were the options before, here we just want + // to connect and create a new superuser role. 
+ const ZENITH_OPTIONS: &str = "-c role=zenith_admin -c default_transaction_read_only=off -c search_path=public -c statement_timeout=0"; + zenith_admin_conf.options(ZENITH_OPTIONS); + let mut client = zenith_admin_conf.connect(NoTls) .context("broken cloud_admin credential: tried connecting with cloud_admin but could not authenticate, and zenith_admin does not work either")?; @@ -1623,9 +1656,7 @@ impl ComputeNode { self.pg_reload_conf()?; if spec.mode == ComputeMode::Primary { - let mut conf = - tokio_postgres::Config::from_str(self.params.connstr.as_str()).unwrap(); - conf.application_name("apply_config"); + let conf = self.get_tokio_conn_conf(Some("compute_ctl:reconfigure")); let conf = Arc::new(conf); let spec = Arc::new(spec.clone()); diff --git a/test_runner/regress/test_compute_catalog.py b/test_runner/regress/test_compute_catalog.py index 37208c9fff..b66b326360 100644 --- a/test_runner/regress/test_compute_catalog.py +++ b/test_runner/regress/test_compute_catalog.py @@ -544,3 +544,69 @@ def test_drop_role_with_table_privileges_from_non_neon_superuser(neon_simple_env ) role = cursor.fetchone() assert role is None + + +def test_db_with_custom_settings(neon_simple_env: NeonEnv): + """ + Test that compute_ctl can work with databases that have some custom settings. + For example, role=some_other_role, default_transaction_read_only=on, + search_path=non_public_schema, statement_timeout=1 (1ms). + """ + env = neon_simple_env + + endpoint = env.endpoints.create_start("main") + + TEST_ROLE = "some_other_role" + TEST_DB = "db_with_custom_settings" + TEST_SCHEMA = "non_public_schema" + + endpoint.respec_deep( + **{ + "spec": { + "skip_pg_catalog_updates": False, + "cluster": { + "databases": [ + { + "name": TEST_DB, + "owner": TEST_ROLE, + } + ], + "roles": [ + { + "name": TEST_ROLE, + } + ], + }, + } + } + ) + + endpoint.reconfigure() + + with endpoint.cursor(dbname=TEST_DB) as cursor: + cursor.execute(f"CREATE SCHEMA {TEST_SCHEMA}") + cursor.execute(f"ALTER DATABASE {TEST_DB} SET role = {TEST_ROLE}") + cursor.execute(f"ALTER DATABASE {TEST_DB} SET default_transaction_read_only = on") + cursor.execute(f"ALTER DATABASE {TEST_DB} SET search_path = {TEST_SCHEMA}") + cursor.execute(f"ALTER DATABASE {TEST_DB} SET statement_timeout = 1") + + with endpoint.cursor(dbname=TEST_DB) as cursor: + cursor.execute("SELECT current_role") + role = cursor.fetchone() + assert role is not None + assert role[0] == TEST_ROLE + + cursor.execute("SHOW default_transaction_read_only") + default_transaction_read_only = cursor.fetchone() + assert default_transaction_read_only is not None + assert default_transaction_read_only[0] == "on" + + cursor.execute("SHOW search_path") + search_path = cursor.fetchone() + assert search_path is not None + assert search_path[0] == TEST_SCHEMA + + # Do not check statement_timeout, because we force it to 2min + # in `endpoint.cursor()` fixture. + + endpoint.reconfigure()