From 30b57334efecddc0e0f9fc285416b62a861c196f Mon Sep 17 00:00:00 2001 From: Alexander Bayandin Date: Tue, 29 Jul 2025 10:47:39 +0100 Subject: [PATCH 01/34] test_lsn_lease_storcon: ignore ShardSplit warning in debug builds (#12770) ## Problem `test_lsn_lease_storcon` might fail in debug builds due to slow ShardSplit ## Summary of changes - Make `test_lsn_lease_storcon ` test to ignore `.*Exclusive lock by ShardSplit was held.*` warning in debug builds Ref: https://databricks.slack.com/archives/C09254R641L/p1753777051481029 --- test_runner/regress/test_tenant_size.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/test_runner/regress/test_tenant_size.py b/test_runner/regress/test_tenant_size.py index 8b291b7cbe..5564d9342c 100644 --- a/test_runner/regress/test_tenant_size.py +++ b/test_runner/regress/test_tenant_size.py @@ -1,5 +1,6 @@ from __future__ import annotations +import os from concurrent.futures import ThreadPoolExecutor from typing import TYPE_CHECKING @@ -768,6 +769,14 @@ def test_lsn_lease_storcon(neon_env_builder: NeonEnvBuilder): "compaction_period": "0s", } env = neon_env_builder.init_start(initial_tenant_conf=conf) + # ShardSplit is slow in debug builds, so ignore the warning + if os.getenv("BUILD_TYPE", "debug") == "debug": + env.storage_controller.allowed_errors.extend( + [ + ".*Exclusive lock by ShardSplit was held.*", + ] + ) + with env.endpoints.create_start( "main", ) as ep: From 1ed72529507b9abc7ae2a2b80da025289e06a6bc Mon Sep 17 00:00:00 2001 From: a-masterov <72613290+a-masterov@users.noreply.github.com> Date: Tue, 29 Jul 2025 12:19:10 +0200 Subject: [PATCH 02/34] Add a workaround for the clickhouse 24.9+ problem causing an error (#12767) ## Problem We used ClickHouse v. 24.8, which is outdated, for logical replication testing. We could miss some problems. ## Summary of changes The version was updated to 25.6, with a workaround using the environment variable `PGSSLCERT`. Co-authored-by: Alexey Masterov --- .github/workflows/pg-clients.yml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/.github/workflows/pg-clients.yml b/.github/workflows/pg-clients.yml index b6b4eca2b8..40b2c51624 100644 --- a/.github/workflows/pg-clients.yml +++ b/.github/workflows/pg-clients.yml @@ -72,9 +72,10 @@ jobs: options: --init --user root services: clickhouse: - image: clickhouse/clickhouse-server:24.8 + image: clickhouse/clickhouse-server:25.6 env: CLICKHOUSE_PASSWORD: ${{ needs.generate-ch-tmppw.outputs.tmp_val }} + PGSSLCERT: /tmp/postgresql.crt ports: - 9000:9000 - 8123:8123 From 568927a8a01d1c49b2b19ad309f6219936bbdc13 Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Tue, 29 Jul 2025 14:08:22 +0300 Subject: [PATCH 03/34] Remove unnecessary dependency to 'log' crate (#12763) We use 'tracing' everywhere. 
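The change is purely mechanical; a minimal sketch of the before/after pattern (hypothetical call site, not taken from this diff — the crate already depends on `tracing`, so the only manifest change is dropping `log`):

```rust
// Hypothetical call site, for illustration only.
fn report_segment(segno: u64) {
    // Before: `use log::*;` pulled the macro in from the `log` crate:
    //     trace!("processing segment {}", segno);
    // After: fully qualify the `tracing` macro, so no glob import is needed:
    tracing::trace!("processing segment {}", segno);
}
```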
--- Cargo.lock | 1 - libs/postgres_ffi/Cargo.toml | 1 - libs/postgres_ffi/src/nonrelfile_utils.rs | 3 +-- libs/postgres_ffi/src/waldecoder_handler.rs | 3 +-- libs/postgres_ffi/src/xlog_utils.rs | 10 ++++------ 5 files changed, 6 insertions(+), 12 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 1919f8e99d..9a0cc9076a 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -5077,7 +5077,6 @@ dependencies = [ "crc32c", "criterion", "env_logger", - "log", "once_cell", "postgres", "postgres_ffi_types", diff --git a/libs/postgres_ffi/Cargo.toml b/libs/postgres_ffi/Cargo.toml index fca75b7bc1..5adf38f457 100644 --- a/libs/postgres_ffi/Cargo.toml +++ b/libs/postgres_ffi/Cargo.toml @@ -11,7 +11,6 @@ anyhow.workspace = true crc32c.workspace = true criterion.workspace = true once_cell.workspace = true -log.workspace = true pprof.workspace = true thiserror.workspace = true serde.workspace = true diff --git a/libs/postgres_ffi/src/nonrelfile_utils.rs b/libs/postgres_ffi/src/nonrelfile_utils.rs index e3e7133b94..f6693d4ec1 100644 --- a/libs/postgres_ffi/src/nonrelfile_utils.rs +++ b/libs/postgres_ffi/src/nonrelfile_utils.rs @@ -4,12 +4,11 @@ use crate::pg_constants; use crate::transaction_id_precedes; use bytes::BytesMut; -use log::*; use super::bindings::MultiXactId; pub fn transaction_id_set_status(xid: u32, status: u8, page: &mut BytesMut) { - trace!( + tracing::trace!( "handle_apply_request for RM_XACT_ID-{} (1-commit, 2-abort, 3-sub_commit)", status ); diff --git a/libs/postgres_ffi/src/waldecoder_handler.rs b/libs/postgres_ffi/src/waldecoder_handler.rs index 9cd40645ec..563a3426a0 100644 --- a/libs/postgres_ffi/src/waldecoder_handler.rs +++ b/libs/postgres_ffi/src/waldecoder_handler.rs @@ -14,7 +14,6 @@ use super::xlog_utils::*; use crate::WAL_SEGMENT_SIZE; use bytes::{Buf, BufMut, Bytes, BytesMut}; use crc32c::*; -use log::*; use std::cmp::min; use std::num::NonZeroU32; use utils::lsn::Lsn; @@ -236,7 +235,7 @@ impl WalStreamDecoderHandler for WalStreamDecoder { // XLOG_SWITCH records are special. If we see one, we need to skip // to the next WAL segment. let next_lsn = if xlogrec.is_xlog_switch_record() { - trace!("saw xlog switch record at {}", self.lsn); + tracing::trace!("saw xlog switch record at {}", self.lsn); self.lsn + self.lsn.calc_padding(WAL_SEGMENT_SIZE as u64) } else { // Pad to an 8-byte boundary diff --git a/libs/postgres_ffi/src/xlog_utils.rs b/libs/postgres_ffi/src/xlog_utils.rs index 134baf5ff7..913e6b453f 100644 --- a/libs/postgres_ffi/src/xlog_utils.rs +++ b/libs/postgres_ffi/src/xlog_utils.rs @@ -23,8 +23,6 @@ use crate::{WAL_SEGMENT_SIZE, XLOG_BLCKSZ}; use bytes::BytesMut; use bytes::{Buf, Bytes}; -use log::*; - use serde::Serialize; use std::ffi::{CString, OsStr}; use std::fs::File; @@ -235,7 +233,7 @@ pub fn find_end_of_wal( let mut curr_lsn = start_lsn; let mut buf = [0u8; XLOG_BLCKSZ]; let pg_version = MY_PGVERSION; - debug!("find_end_of_wal PG_VERSION: {}", pg_version); + tracing::debug!("find_end_of_wal PG_VERSION: {}", pg_version); let mut decoder = WalStreamDecoder::new(start_lsn, pg_version); @@ -247,7 +245,7 @@ pub fn find_end_of_wal( match open_wal_segment(&seg_file_path)? 
{ None => { // no more segments - debug!( + tracing::debug!( "find_end_of_wal reached end at {:?}, segment {:?} doesn't exist", result, seg_file_path ); @@ -260,7 +258,7 @@ pub fn find_end_of_wal( while curr_lsn.segment_number(wal_seg_size) == segno { let bytes_read = segment.read(&mut buf)?; if bytes_read == 0 { - debug!( + tracing::debug!( "find_end_of_wal reached end at {:?}, EOF in segment {:?} at offset {}", result, seg_file_path, @@ -276,7 +274,7 @@ pub fn find_end_of_wal( match decoder.poll_decode() { Ok(Some(record)) => result = record.0, Err(e) => { - debug!( + tracing::debug!( "find_end_of_wal reached end at {:?}, decode error: {:?}", result, e ); From 58327cbba8f39610f08413e4f20702b58d072d2a Mon Sep 17 00:00:00 2001 From: Dmitrii Kovalkov <34828390+DimasKovas@users.noreply.github.com> Date: Tue, 29 Jul 2025 15:58:31 +0400 Subject: [PATCH 04/34] storcon: wait for the migration from the drained node in the draining loop (#12754) ## Problem We have seen some errors in staging when the shard migration was triggered by optimizations, and it was ongoing during draining the node it was migrating from. It happens because the node draining loop only waits for the migrations started by the drain loop itself. The ongoing migrations are ignored. Closes: https://databricks.atlassian.net/browse/LKB-1625 ## Summary of changes - Wait for the shard reconciliation during the drain if it is being migrated from the drained node. --- storage_controller/src/operation_utils.rs | 46 ++++++++++++++++++++--- storage_controller/src/service.rs | 39 ++++++++++--------- storage_controller/src/tenant_shard.rs | 2 - 3 files changed, 62 insertions(+), 25 deletions(-) diff --git a/storage_controller/src/operation_utils.rs b/storage_controller/src/operation_utils.rs index af86010ab7..1060c92832 100644 --- a/storage_controller/src/operation_utils.rs +++ b/storage_controller/src/operation_utils.rs @@ -46,11 +46,31 @@ impl TenantShardDrain { &self, tenants: &BTreeMap, scheduler: &Scheduler, - ) -> Option { - let tenant_shard = tenants.get(&self.tenant_shard_id)?; + ) -> TenantShardDrainAction { + let Some(tenant_shard) = tenants.get(&self.tenant_shard_id) else { + return TenantShardDrainAction::Skip; + }; if *tenant_shard.intent.get_attached() != Some(self.drained_node) { - return None; + // If the intent attached node is not the drained node, check the observed state + // of the shard on the drained node. If it is Attached*, it means the shard is + // beeing migrated from the drained node. The drain loop needs to wait for the + // reconciliation to complete for a smooth draining. 
+ + use pageserver_api::models::LocationConfigMode::*; + + let attach_mode = tenant_shard + .observed + .locations + .get(&self.drained_node) + .and_then(|observed| observed.conf.as_ref().map(|conf| conf.mode)); + + return match (attach_mode, tenant_shard.intent.get_attached()) { + (Some(AttachedSingle | AttachedMulti | AttachedStale), Some(intent_node_id)) => { + TenantShardDrainAction::Reconcile(*intent_node_id) + } + _ => TenantShardDrainAction::Skip, + }; } // Only tenants with a normal (Active) scheduling policy are proactively moved @@ -63,19 +83,19 @@ impl TenantShardDrain { } ShardSchedulingPolicy::Pause | ShardSchedulingPolicy::Stop => { // If we have been asked to avoid rescheduling this shard, then do not migrate it during a drain - return None; + return TenantShardDrainAction::Skip; } } match tenant_shard.preferred_secondary(scheduler) { - Some(node) => Some(node), + Some(node) => TenantShardDrainAction::RescheduleToSecondary(node), None => { tracing::warn!( tenant_id=%self.tenant_shard_id.tenant_id, shard_id=%self.tenant_shard_id.shard_slug(), "No eligible secondary while draining {}", self.drained_node ); - None + TenantShardDrainAction::Skip } } } @@ -138,3 +158,17 @@ impl TenantShardDrain { } } } + +/// Action to take when draining a tenant shard. +pub(crate) enum TenantShardDrainAction { + /// The tenant shard is on the draining node. + /// Reschedule the tenant shard to a secondary location. + /// Holds a destination node id to reschedule to. + RescheduleToSecondary(NodeId), + /// The tenant shard is beeing migrated from the draining node. + /// Wait for the reconciliation to complete. + /// Holds the intent attached node id. + Reconcile(NodeId), + /// The tenant shard is not eligible for drainining, skip it. + Skip, +} diff --git a/storage_controller/src/service.rs b/storage_controller/src/service.rs index 4c011033af..37380b8fbe 100644 --- a/storage_controller/src/service.rs +++ b/storage_controller/src/service.rs @@ -79,7 +79,7 @@ use crate::id_lock_map::{ use crate::leadership::Leadership; use crate::metrics; use crate::node::{AvailabilityTransition, Node}; -use crate::operation_utils::{self, TenantShardDrain}; +use crate::operation_utils::{self, TenantShardDrain, TenantShardDrainAction}; use crate::pageserver_client::PageserverClient; use crate::peer_client::GlobalObservedState; use crate::persistence::split_state::SplitState; @@ -1274,7 +1274,7 @@ impl Service { // Always attempt autosplits. Sharding is crucial for bulk ingest performance, so we // must be responsive when new projects begin ingesting and reach the threshold. 
self.autosplit_tenants().await; - } + }, _ = self.reconcilers_cancel.cancelled() => return } } @@ -8876,6 +8876,9 @@ impl Service { for (_tenant_id, schedule_context, shards) in TenantShardExclusiveIterator::new(tenants, ScheduleMode::Speculative) { + if work.len() >= MAX_OPTIMIZATIONS_PLAN_PER_PASS { + break; + } for shard in shards { if work.len() >= MAX_OPTIMIZATIONS_PLAN_PER_PASS { break; @@ -9640,16 +9643,16 @@ impl Service { tenant_shard_id: tid, }; - let dest_node_id = { + let drain_action = { let locked = self.inner.read().unwrap(); + tid_drain.tenant_shard_eligible_for_drain(&locked.tenants, &locked.scheduler) + }; - match tid_drain - .tenant_shard_eligible_for_drain(&locked.tenants, &locked.scheduler) - { - Some(node_id) => node_id, - None => { - continue; - } + let dest_node_id = match drain_action { + TenantShardDrainAction::RescheduleToSecondary(dest_node_id) => dest_node_id, + TenantShardDrainAction::Reconcile(intent_node_id) => intent_node_id, + TenantShardDrainAction::Skip => { + continue; } }; @@ -9684,14 +9687,16 @@ impl Service { { let mut locked = self.inner.write().unwrap(); let (nodes, tenants, scheduler) = locked.parts_mut(); - let rescheduled = tid_drain.reschedule_to_secondary( - dest_node_id, - tenants, - scheduler, - nodes, - )?; - if let Some(tenant_shard) = rescheduled { + let tenant_shard = match drain_action { + TenantShardDrainAction::RescheduleToSecondary(dest_node_id) => tid_drain + .reschedule_to_secondary(dest_node_id, tenants, scheduler, nodes)?, + TenantShardDrainAction::Reconcile(_) => tenants.get_mut(&tid), + // Note: Unreachable, handled above. + TenantShardDrainAction::Skip => None, + }; + + if let Some(tenant_shard) = tenant_shard { let waiter = self.maybe_configured_reconcile_shard( tenant_shard, nodes, diff --git a/storage_controller/src/tenant_shard.rs b/storage_controller/src/tenant_shard.rs index 3eb54d714d..bf16c642af 100644 --- a/storage_controller/src/tenant_shard.rs +++ b/storage_controller/src/tenant_shard.rs @@ -812,8 +812,6 @@ impl TenantShard { /// if the swap is not possible and leaves the intent state in its original state. /// /// Arguments: - /// `attached_to`: the currently attached location matching the intent state (may be None if the - /// shard is not attached) /// `promote_to`: an optional secondary location of this tenant shard. If set to None, we ask /// the scheduler to recommend a node pub(crate) fn reschedule_to_secondary( From e2411818ef8ec65ac14a24587927275c3bef9d7f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?JC=20Gr=C3=BCnhage?= Date: Tue, 29 Jul 2025 14:12:14 +0200 Subject: [PATCH 05/34] Add SBOMs and provenance attestations to container images (#12768) ## Problem Given a container image it is difficult to figure out dependencies and doesn't work automatically. ## Summary of changes - Build all rust binaries with `cargo auditable`, to allow sbom scanners to find it's dependencies. - Adjust `attests` for `docker/build-push-action`, so that buildkit creates sbom and provenance attestations. - Dropping `--locked` for `rustfilt`, because `rustfilt` can't build with locked dependencies[^5] ## Further details Building with `cargo auditable`[^1] embeds a dependency list into Linux, Windows, MacOS and WebAssembly artifacts. A bunch of tools support discovering dependencies from this, among them `syft`[^2], which is used by the BuildKit Syft scanner[^3] plugin. This BuildKit plugin is the default[^4] used in docker for generating sbom attestations, but we're making that default explicit by referencing the container image. 
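For background on what ends up inside the binary: `cargo auditable` stores the Cargo dependency tree as zlib-compressed JSON in a dedicated linker section (`.dep-v0` on ELF targets), which is what `syft` reads back. A rough sketch of extracting it by hand — the section name, the compression format, and the `object`/`flate2`/`anyhow` crates are assumptions used purely for illustration, not something this PR adds:

```rust
use std::io::Read;

use object::{Object, ObjectSection};

/// Illustrative only: pull out the dependency JSON that `cargo auditable`
/// embeds into a compiled binary (assumed to live, zlib-compressed, in a
/// section named ".dep-v0").
fn embedded_dependency_json(binary: &[u8]) -> anyhow::Result<String> {
    let file = object::File::parse(binary)?;
    let section = file
        .section_by_name(".dep-v0")
        .ok_or_else(|| anyhow::anyhow!("no audit-data section found"))?;
    let mut json = String::new();
    flate2::read::ZlibDecoder::new(section.data()?).read_to_string(&mut json)?;
    Ok(json)
}
```

Because the scanner discovers this data directly in the shipped binaries, no per-image configuration is needed to get the richer SBOMs.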
[^1]: https://github.com/rust-secure-code/cargo-auditable [^2]: https://github.com/anchore/syft [^3]: https://github.com/docker/buildkit-syft-scanner [^4]: https://docs.docker.com/build/metadata/attestations/sbom/#sbom-generator [^5]: https://github.com/luser/rustfilt/issues/23 --- .github/workflows/build-build-tools-image.yml | 4 +++- .github/workflows/build_and_test.yml | 12 +++++++++--- Dockerfile | 2 +- build-tools/Dockerfile | 19 +++++++++++-------- 4 files changed, 24 insertions(+), 13 deletions(-) diff --git a/.github/workflows/build-build-tools-image.yml b/.github/workflows/build-build-tools-image.yml index 24e4c8fa3d..5e53d8231f 100644 --- a/.github/workflows/build-build-tools-image.yml +++ b/.github/workflows/build-build-tools-image.yml @@ -146,7 +146,9 @@ jobs: with: file: build-tools/Dockerfile context: . - provenance: false + attests: | + type=provenance,mode=max + type=sbom,generator=docker.io/docker/buildkit-syft-scanner:1 push: true pull: true build-args: | diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index f237a991cc..0dcbd1c6dd 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -634,7 +634,9 @@ jobs: DEBIAN_VERSION=bookworm secrets: | SUBZERO_ACCESS_TOKEN=${{ secrets.CI_ACCESS_TOKEN }} - provenance: false + attests: | + type=provenance,mode=max + type=sbom,generator=docker.io/docker/buildkit-syft-scanner:1 push: true pull: true file: Dockerfile @@ -747,7 +749,9 @@ jobs: PG_VERSION=${{ matrix.version.pg }} BUILD_TAG=${{ needs.meta.outputs.release-tag || needs.meta.outputs.build-tag }} DEBIAN_VERSION=${{ matrix.version.debian }} - provenance: false + attests: | + type=provenance,mode=max + type=sbom,generator=docker.io/docker/buildkit-syft-scanner:1 push: true pull: true file: compute/compute-node.Dockerfile @@ -766,7 +770,9 @@ jobs: PG_VERSION=${{ matrix.version.pg }} BUILD_TAG=${{ needs.meta.outputs.release-tag || needs.meta.outputs.build-tag }} DEBIAN_VERSION=${{ matrix.version.debian }} - provenance: false + attests: | + type=provenance,mode=max + type=sbom,generator=docker.io/docker/buildkit-syft-scanner:1 push: true pull: true file: compute/compute-node.Dockerfile diff --git a/Dockerfile b/Dockerfile index 654ae72e56..63cc954873 100644 --- a/Dockerfile +++ b/Dockerfile @@ -103,7 +103,7 @@ RUN --mount=type=secret,uid=1000,id=SUBZERO_ACCESS_TOKEN \ && if [ -s /run/secrets/SUBZERO_ACCESS_TOKEN ]; then \ export CARGO_FEATURES="rest_broker"; \ fi \ - && RUSTFLAGS="-Clinker=clang -Clink-arg=-fuse-ld=mold -Clink-arg=-Wl,--no-rosegment -Cforce-frame-pointers=yes ${ADDITIONAL_RUSTFLAGS}" cargo build \ + && RUSTFLAGS="-Clinker=clang -Clink-arg=-fuse-ld=mold -Clink-arg=-Wl,--no-rosegment -Cforce-frame-pointers=yes ${ADDITIONAL_RUSTFLAGS}" cargo auditable build \ --features $CARGO_FEATURES \ --bin pg_sni_router \ --bin pageserver \ diff --git a/build-tools/Dockerfile b/build-tools/Dockerfile index 87966591c1..c9760f610b 100644 --- a/build-tools/Dockerfile +++ b/build-tools/Dockerfile @@ -299,6 +299,7 @@ WORKDIR /home/nonroot ENV RUSTC_VERSION=1.88.0 ENV RUSTUP_HOME="/home/nonroot/.rustup" ENV PATH="/home/nonroot/.cargo/bin:${PATH}" +ARG CARGO_AUDITABLE_VERSION=0.7.0 ARG RUSTFILT_VERSION=0.2.1 ARG CARGO_HAKARI_VERSION=0.9.36 ARG CARGO_DENY_VERSION=0.18.2 @@ -314,14 +315,16 @@ RUN curl -sSO https://static.rust-lang.org/rustup/dist/$(uname -m)-unknown-linux . 
"$HOME/.cargo/env" && \ cargo --version && rustup --version && \ rustup component add llvm-tools rustfmt clippy && \ - cargo install rustfilt --locked --version "${RUSTFILT_VERSION}" && \ - cargo install cargo-hakari --locked --version "${CARGO_HAKARI_VERSION}" && \ - cargo install cargo-deny --locked --version "${CARGO_DENY_VERSION}" && \ - cargo install cargo-hack --locked --version "${CARGO_HACK_VERSION}" && \ - cargo install cargo-nextest --locked --version "${CARGO_NEXTEST_VERSION}" && \ - cargo install cargo-chef --locked --version "${CARGO_CHEF_VERSION}" && \ - cargo install diesel_cli --locked --version "${CARGO_DIESEL_CLI_VERSION}" \ - --features postgres-bundled --no-default-features && \ + cargo install cargo-auditable --locked --version "${CARGO_AUDITABLE_VERSION}" && \ + cargo auditable install cargo-auditable --locked --version "${CARGO_AUDITABLE_VERSION}" --force && \ + cargo auditable install rustfilt --version "${RUSTFILT_VERSION}" && \ + cargo auditable install cargo-hakari --locked --version "${CARGO_HAKARI_VERSION}" && \ + cargo auditable install cargo-deny --locked --version "${CARGO_DENY_VERSION}" && \ + cargo auditable install cargo-hack --locked --version "${CARGO_HACK_VERSION}" && \ + cargo auditable install cargo-nextest --locked --version "${CARGO_NEXTEST_VERSION}" && \ + cargo auditable install cargo-chef --locked --version "${CARGO_CHEF_VERSION}" && \ + cargo auditable install diesel_cli --locked --version "${CARGO_DIESEL_CLI_VERSION}" \ + --features postgres-bundled --no-default-features && \ rm -rf /home/nonroot/.cargo/registry && \ rm -rf /home/nonroot/.cargo/git From 61f267d8f9bcb5c86013ecacdd8a588379337352 Mon Sep 17 00:00:00 2001 From: Erik Grinaker Date: Tue, 29 Jul 2025 14:33:02 +0200 Subject: [PATCH 06/34] pageserver: only retry `WaitForActiveTimeout` during shard resolution (#12772) ## Problem In https://github.com/neondatabase/neon/pull/12467, timeouts and retries were added to `Cache::get` tenant shard resolution to paper over an issue with read unavailability during shard splits. However, this retries _all_ errors, including irrecoverable errors like `NotFound`. This causes problems with gRPC child shard routing in #12702, which targets specific shards with `ShardSelector::Known` and relies on prompt `NotFound` errors to reroute requests to child shards. These retries introduce a 1s delay for all reads during child routing. The broader problem of read unavailability during shard splits is left as future work, see https://databricks.atlassian.net/browse/LKB-672. Touches #12702. Touches [LKB-191](https://databricks.atlassian.net/browse/LKB-191). ## Summary of changes * Change `TenantManager` to always return a concrete `GetActiveTimelineError`. * Only retry `WaitForActiveTimeout` errors. * Lots of code unindentation due to the simplified error handling. Out of caution, we do not gate the retries on `ShardSelector`, since this can trigger other races. Improvements here are left as future work. 
--- pageserver/src/page_service.rs | 9 -- pageserver/src/tenant/mgr.rs | 6 + pageserver/src/tenant/timeline/handle.rs | 157 +++++++++++------------ 3 files changed, 82 insertions(+), 90 deletions(-) diff --git a/pageserver/src/page_service.rs b/pageserver/src/page_service.rs index a0998a7598..b3bc42a55e 100644 --- a/pageserver/src/page_service.rs +++ b/pageserver/src/page_service.rs @@ -466,13 +466,6 @@ impl TimelineHandles { self.handles .get(timeline_id, shard_selector, &self.wrapper) .await - .map_err(|e| match e { - timeline::handle::GetError::TenantManager(e) => e, - timeline::handle::GetError::PerTimelineStateShutDown => { - trace!("per-timeline state shut down"); - GetActiveTimelineError::Timeline(GetTimelineError::ShuttingDown) - } - }) } fn tenant_id(&self) -> Option { @@ -488,11 +481,9 @@ pub(crate) struct TenantManagerWrapper { tenant_id: once_cell::sync::OnceCell, } -#[derive(Debug)] pub(crate) struct TenantManagerTypes; impl timeline::handle::Types for TenantManagerTypes { - type TenantManagerError = GetActiveTimelineError; type TenantManager = TenantManagerWrapper; type Timeline = TenantManagerCacheItem; } diff --git a/pageserver/src/tenant/mgr.rs b/pageserver/src/tenant/mgr.rs index b47bab16d8..4432b4bba8 100644 --- a/pageserver/src/tenant/mgr.rs +++ b/pageserver/src/tenant/mgr.rs @@ -1522,6 +1522,12 @@ impl TenantManager { self.resources.deletion_queue_client.flush_advisory(); // Phase 2: Put the parent shard to InProgress and grab a reference to the parent Tenant + // + // TODO: keeping the parent as InProgress while spawning the children causes read + // unavailability, as we can't acquire a timeline handle for it. The parent should be + // available for reads until the children are ready -- potentially until *all* subsplits + // across all parent shards are complete and the compute has been notified. See: + // . drop(tenant); let mut parent_slot_guard = self.tenant_map_acquire_slot(&tenant_shard_id, TenantSlotAcquireMode::Any)?; diff --git a/pageserver/src/tenant/timeline/handle.rs b/pageserver/src/tenant/timeline/handle.rs index 3570cab301..537b9ff373 100644 --- a/pageserver/src/tenant/timeline/handle.rs +++ b/pageserver/src/tenant/timeline/handle.rs @@ -224,11 +224,11 @@ use tracing::{instrument, trace}; use utils::id::TimelineId; use utils::shard::{ShardIndex, ShardNumber}; -use crate::tenant::mgr::ShardSelector; +use crate::page_service::GetActiveTimelineError; +use crate::tenant::GetTimelineError; +use crate::tenant::mgr::{GetActiveTenantError, ShardSelector}; -/// The requirement for Debug is so that #[derive(Debug)] works in some places. -pub(crate) trait Types: Sized + std::fmt::Debug { - type TenantManagerError: Sized + std::fmt::Debug; +pub(crate) trait Types: Sized { type TenantManager: TenantManager + Sized; type Timeline: Timeline + Sized; } @@ -307,12 +307,11 @@ impl Default for PerTimelineState { /// Abstract view of [`crate::tenant::mgr`], for testability. pub(crate) trait TenantManager { /// Invoked by [`Cache::get`] to resolve a [`ShardTimelineId`] to a [`Types::Timeline`]. - /// Errors are returned as [`GetError::TenantManager`]. async fn resolve( &self, timeline_id: TimelineId, shard_selector: ShardSelector, - ) -> Result; + ) -> Result; } /// Abstract view of an [`Arc`], for testability. @@ -322,13 +321,6 @@ pub(crate) trait Timeline { fn per_timeline_state(&self) -> &PerTimelineState; } -/// Errors returned by [`Cache::get`]. 
-#[derive(Debug)] -pub(crate) enum GetError { - TenantManager(T::TenantManagerError), - PerTimelineStateShutDown, -} - /// Internal type used in [`Cache::get`]. enum RoutingResult { FastPath(Handle), @@ -345,7 +337,7 @@ impl Cache { timeline_id: TimelineId, shard_selector: ShardSelector, tenant_manager: &T::TenantManager, - ) -> Result, GetError> { + ) -> Result, GetActiveTimelineError> { const GET_MAX_RETRIES: usize = 10; const RETRY_BACKOFF: Duration = Duration::from_millis(100); let mut attempt = 0; @@ -356,7 +348,11 @@ impl Cache { .await { Ok(handle) => return Ok(handle), - Err(e) => { + Err( + e @ GetActiveTimelineError::Tenant(GetActiveTenantError::WaitForActiveTimeout { + .. + }), + ) => { // Retry on tenant manager error to handle tenant split more gracefully if attempt < GET_MAX_RETRIES { tokio::time::sleep(RETRY_BACKOFF).await; @@ -370,6 +366,7 @@ impl Cache { return Err(e); } } + Err(err) => return Err(err), } } } @@ -388,7 +385,7 @@ impl Cache { timeline_id: TimelineId, shard_selector: ShardSelector, tenant_manager: &T::TenantManager, - ) -> Result, GetError> { + ) -> Result, GetActiveTimelineError> { // terminates because when every iteration we remove an element from the map let miss: ShardSelector = loop { let routing_state = self.shard_routing(timeline_id, shard_selector); @@ -468,60 +465,50 @@ impl Cache { timeline_id: TimelineId, shard_selector: ShardSelector, tenant_manager: &T::TenantManager, - ) -> Result, GetError> { - match tenant_manager.resolve(timeline_id, shard_selector).await { - Ok(timeline) => { - let key = timeline.shard_timeline_id(); - match &shard_selector { - ShardSelector::Zero => assert_eq!(key.shard_index.shard_number, ShardNumber(0)), - ShardSelector::Page(_) => (), // gotta trust tenant_manager - ShardSelector::Known(idx) => assert_eq!(idx, &key.shard_index), - } - - trace!("creating new HandleInner"); - let timeline = Arc::new(timeline); - let handle_inner_arc = - Arc::new(Mutex::new(HandleInner::Open(Arc::clone(&timeline)))); - let handle_weak = WeakHandle { - inner: Arc::downgrade(&handle_inner_arc), - }; - let handle = handle_weak - .upgrade() - .ok() - .expect("we just created it and it's not linked anywhere yet"); - { - let mut lock_guard = timeline - .per_timeline_state() - .handles - .lock() - .expect("mutex poisoned"); - match &mut *lock_guard { - Some(per_timeline_state) => { - let replaced = - per_timeline_state.insert(self.id, Arc::clone(&handle_inner_arc)); - assert!(replaced.is_none(), "some earlier code left a stale handle"); - match self.map.entry(key) { - hash_map::Entry::Occupied(_o) => { - // This cannot not happen because - // 1. we're the _miss_ handle, i.e., `self.map` didn't contain an entry and - // 2. we were holding &mut self during .resolve().await above, so, no other thread can have inserted a handle - // while we were waiting for the tenant manager. 
- unreachable!() - } - hash_map::Entry::Vacant(v) => { - v.insert(handle_weak); - } - } - } - None => { - return Err(GetError::PerTimelineStateShutDown); - } - } - } - Ok(handle) - } - Err(e) => Err(GetError::TenantManager(e)), + ) -> Result, GetActiveTimelineError> { + let timeline = tenant_manager.resolve(timeline_id, shard_selector).await?; + let key = timeline.shard_timeline_id(); + match &shard_selector { + ShardSelector::Zero => assert_eq!(key.shard_index.shard_number, ShardNumber(0)), + ShardSelector::Page(_) => (), // gotta trust tenant_manager + ShardSelector::Known(idx) => assert_eq!(idx, &key.shard_index), } + + trace!("creating new HandleInner"); + let timeline = Arc::new(timeline); + let handle_inner_arc = Arc::new(Mutex::new(HandleInner::Open(Arc::clone(&timeline)))); + let handle_weak = WeakHandle { + inner: Arc::downgrade(&handle_inner_arc), + }; + let handle = handle_weak + .upgrade() + .ok() + .expect("we just created it and it's not linked anywhere yet"); + let mut lock_guard = timeline + .per_timeline_state() + .handles + .lock() + .expect("mutex poisoned"); + let Some(per_timeline_state) = &mut *lock_guard else { + return Err(GetActiveTimelineError::Timeline( + GetTimelineError::ShuttingDown, + )); + }; + let replaced = per_timeline_state.insert(self.id, Arc::clone(&handle_inner_arc)); + assert!(replaced.is_none(), "some earlier code left a stale handle"); + match self.map.entry(key) { + hash_map::Entry::Occupied(_o) => { + // This cannot not happen because + // 1. we're the _miss_ handle, i.e., `self.map` didn't contain an entry and + // 2. we were holding &mut self during .resolve().await above, so, no other thread can have inserted a handle + // while we were waiting for the tenant manager. + unreachable!() + } + hash_map::Entry::Vacant(v) => { + v.insert(handle_weak); + } + } + Ok(handle) } } @@ -655,7 +642,8 @@ mod tests { use pageserver_api::models::ShardParameters; use pageserver_api::reltag::RelTag; use pageserver_api::shard::DEFAULT_STRIPE_SIZE; - use utils::shard::ShardCount; + use utils::id::TenantId; + use utils::shard::{ShardCount, TenantShardId}; use utils::sync::gate::GateGuard; use super::*; @@ -665,7 +653,6 @@ mod tests { #[derive(Debug)] struct TestTypes; impl Types for TestTypes { - type TenantManagerError = anyhow::Error; type TenantManager = StubManager; type Timeline = Entered; } @@ -716,40 +703,48 @@ mod tests { &self, timeline_id: TimelineId, shard_selector: ShardSelector, - ) -> anyhow::Result { + ) -> Result { + fn enter_gate( + timeline: &StubTimeline, + ) -> Result, GetActiveTimelineError> { + Ok(Arc::new(timeline.gate.enter().map_err(|_| { + GetActiveTimelineError::Timeline(GetTimelineError::ShuttingDown) + })?)) + } + for timeline in &self.shards { if timeline.id == timeline_id { - let enter_gate = || { - let gate_guard = timeline.gate.enter()?; - let gate_guard = Arc::new(gate_guard); - anyhow::Ok(gate_guard) - }; match &shard_selector { ShardSelector::Zero if timeline.shard.is_shard_zero() => { return Ok(Entered { timeline: Arc::clone(timeline), - gate_guard: enter_gate()?, + gate_guard: enter_gate(timeline)?, }); } ShardSelector::Zero => continue, ShardSelector::Page(key) if timeline.shard.is_key_local(key) => { return Ok(Entered { timeline: Arc::clone(timeline), - gate_guard: enter_gate()?, + gate_guard: enter_gate(timeline)?, }); } ShardSelector::Page(_) => continue, ShardSelector::Known(idx) if idx == &timeline.shard.shard_index() => { return Ok(Entered { timeline: Arc::clone(timeline), - gate_guard: enter_gate()?, + gate_guard: 
enter_gate(timeline)?, }); } ShardSelector::Known(_) => continue, } } } - anyhow::bail!("not found") + Err(GetActiveTimelineError::Timeline( + GetTimelineError::NotFound { + tenant_id: TenantShardId::unsharded(TenantId::from([0; 16])), + timeline_id, + }, + )) } } From 5e3cb2ab070c77d8f836fa0f0008f8dd08f25875 Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Tue, 29 Jul 2025 16:12:44 +0300 Subject: [PATCH 07/34] Refactor LFC stats functions (#12696) Split the functions into two parts: an internal function in file_cache.c which returns an array of structs representing the result set, and another function in neon.c with the glue code to expose it as a SQL function. This is in preparation for the new communicator, which needs to implement the same SQL functions, but getting the information from a different place. In the glue code, use the more modern Postgres way of building a result set using a tuplestore. --- pgxn/neon/file_cache.c | 386 +++++++++++------------------------------ pgxn/neon/file_cache.h | 20 +++ pgxn/neon/neon.c | 74 ++++++++ 3 files changed, 194 insertions(+), 286 deletions(-) diff --git a/pgxn/neon/file_cache.c b/pgxn/neon/file_cache.c index 88086689c8..3c680eab86 100644 --- a/pgxn/neon/file_cache.c +++ b/pgxn/neon/file_cache.c @@ -1832,125 +1832,46 @@ lfc_writev(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno, LWLockRelease(lfc_lock); } -typedef struct +/* + * Return metrics about the LFC. + * + * The return format is a palloc'd array of LfcStatsEntrys. The size + * of the returned array is returned in *num_entries. + */ +LfcStatsEntry * +lfc_get_stats(size_t *num_entries) { - TupleDesc tupdesc; -} NeonGetStatsCtx; + LfcStatsEntry *entries; + size_t n = 0; -#define NUM_NEON_GET_STATS_COLS 2 +#define MAX_ENTRIES 10 + entries = palloc(sizeof(LfcStatsEntry) * MAX_ENTRIES); -PG_FUNCTION_INFO_V1(neon_get_lfc_stats); -Datum -neon_get_lfc_stats(PG_FUNCTION_ARGS) -{ - FuncCallContext *funcctx; - NeonGetStatsCtx *fctx; - MemoryContext oldcontext; - TupleDesc tupledesc; - Datum result; - HeapTuple tuple; - char const *key; - uint64 value = 0; - Datum values[NUM_NEON_GET_STATS_COLS]; - bool nulls[NUM_NEON_GET_STATS_COLS]; + entries[n++] = (LfcStatsEntry) {"file_cache_chunk_size_pages", lfc_ctl == NULL, + lfc_ctl ? lfc_blocks_per_chunk : 0 }; + entries[n++] = (LfcStatsEntry) {"file_cache_misses", lfc_ctl == NULL, + lfc_ctl ? lfc_ctl->misses : 0}; + entries[n++] = (LfcStatsEntry) {"file_cache_hits", lfc_ctl == NULL, + lfc_ctl ? lfc_ctl->hits : 0 }; + entries[n++] = (LfcStatsEntry) {"file_cache_used", lfc_ctl == NULL, + lfc_ctl ? lfc_ctl->used : 0 }; + entries[n++] = (LfcStatsEntry) {"file_cache_writes", lfc_ctl == NULL, + lfc_ctl ? lfc_ctl->writes : 0 }; + entries[n++] = (LfcStatsEntry) {"file_cache_size", lfc_ctl == NULL, + lfc_ctl ? lfc_ctl->size : 0 }; + entries[n++] = (LfcStatsEntry) {"file_cache_used_pages", lfc_ctl == NULL, + lfc_ctl ? lfc_ctl->used_pages : 0 }; + entries[n++] = (LfcStatsEntry) {"file_cache_evicted_pages", lfc_ctl == NULL, + lfc_ctl ? lfc_ctl->evicted_pages : 0 }; + entries[n++] = (LfcStatsEntry) {"file_cache_limit", lfc_ctl == NULL, + lfc_ctl ? lfc_ctl->limit : 0 }; + entries[n++] = (LfcStatsEntry) {"file_cache_chunks_pinned", lfc_ctl == NULL, + lfc_ctl ? 
lfc_ctl->pinned : 0 }; + Assert(n <= MAX_ENTRIES); +#undef MAX_ENTRIES - if (SRF_IS_FIRSTCALL()) - { - funcctx = SRF_FIRSTCALL_INIT(); - - /* Switch context when allocating stuff to be used in later calls */ - oldcontext = MemoryContextSwitchTo(funcctx->multi_call_memory_ctx); - - /* Create a user function context for cross-call persistence */ - fctx = (NeonGetStatsCtx *) palloc(sizeof(NeonGetStatsCtx)); - - /* Construct a tuple descriptor for the result rows. */ - tupledesc = CreateTemplateTupleDesc(NUM_NEON_GET_STATS_COLS); - - TupleDescInitEntry(tupledesc, (AttrNumber) 1, "lfc_key", - TEXTOID, -1, 0); - TupleDescInitEntry(tupledesc, (AttrNumber) 2, "lfc_value", - INT8OID, -1, 0); - - fctx->tupdesc = BlessTupleDesc(tupledesc); - funcctx->user_fctx = fctx; - - /* Return to original context when allocating transient memory */ - MemoryContextSwitchTo(oldcontext); - } - - funcctx = SRF_PERCALL_SETUP(); - - /* Get the saved state */ - fctx = (NeonGetStatsCtx *) funcctx->user_fctx; - - switch (funcctx->call_cntr) - { - case 0: - key = "file_cache_misses"; - if (lfc_ctl) - value = lfc_ctl->misses; - break; - case 1: - key = "file_cache_hits"; - if (lfc_ctl) - value = lfc_ctl->hits; - break; - case 2: - key = "file_cache_used"; - if (lfc_ctl) - value = lfc_ctl->used; - break; - case 3: - key = "file_cache_writes"; - if (lfc_ctl) - value = lfc_ctl->writes; - break; - case 4: - key = "file_cache_size"; - if (lfc_ctl) - value = lfc_ctl->size; - break; - case 5: - key = "file_cache_used_pages"; - if (lfc_ctl) - value = lfc_ctl->used_pages; - break; - case 6: - key = "file_cache_evicted_pages"; - if (lfc_ctl) - value = lfc_ctl->evicted_pages; - break; - case 7: - key = "file_cache_limit"; - if (lfc_ctl) - value = lfc_ctl->limit; - break; - case 8: - key = "file_cache_chunk_size_pages"; - value = lfc_blocks_per_chunk; - break; - case 9: - key = "file_cache_chunks_pinned"; - if (lfc_ctl) - value = lfc_ctl->pinned; - break; - default: - SRF_RETURN_DONE(funcctx); - } - values[0] = PointerGetDatum(cstring_to_text(key)); - nulls[0] = false; - if (lfc_ctl) - { - nulls[1] = false; - values[1] = Int64GetDatum(value); - } - else - nulls[1] = true; - - tuple = heap_form_tuple(fctx->tupdesc, values, nulls); - result = HeapTupleGetDatum(tuple); - SRF_RETURN_NEXT(funcctx, result); + *num_entries = n; + return entries; } @@ -1958,193 +1879,86 @@ neon_get_lfc_stats(PG_FUNCTION_ARGS) * Function returning data from the local file cache * relation node/tablespace/database/blocknum and access_counter */ -PG_FUNCTION_INFO_V1(local_cache_pages); - -/* - * Record structure holding the to be exposed cache data. - */ -typedef struct +LocalCachePagesRec * +lfc_local_cache_pages(size_t *num_entries) { - uint32 pageoffs; - Oid relfilenode; - Oid reltablespace; - Oid reldatabase; - ForkNumber forknum; - BlockNumber blocknum; - uint16 accesscount; -} LocalCachePagesRec; + HASH_SEQ_STATUS status; + FileCacheEntry *entry; + size_t n_pages; + size_t n; + LocalCachePagesRec *result; -/* - * Function context for data persisting over repeated calls. - */ -typedef struct -{ - TupleDesc tupdesc; - LocalCachePagesRec *record; -} LocalCachePagesContext; - - -#define NUM_LOCALCACHE_PAGES_ELEM 7 - -Datum -local_cache_pages(PG_FUNCTION_ARGS) -{ - FuncCallContext *funcctx; - Datum result; - MemoryContext oldcontext; - LocalCachePagesContext *fctx; /* User function context. 
*/ - TupleDesc tupledesc; - TupleDesc expected_tupledesc; - HeapTuple tuple; - - if (SRF_IS_FIRSTCALL()) + if (!lfc_ctl) { - HASH_SEQ_STATUS status; - FileCacheEntry *entry; - uint32 n_pages = 0; + *num_entries = 0; + return NULL; + } - funcctx = SRF_FIRSTCALL_INIT(); + LWLockAcquire(lfc_lock, LW_SHARED); + if (!LFC_ENABLED()) + { + LWLockRelease(lfc_lock); + *num_entries = 0; + return NULL; + } - /* Switch context when allocating stuff to be used in later calls */ - oldcontext = MemoryContextSwitchTo(funcctx->multi_call_memory_ctx); - - /* Create a user function context for cross-call persistence */ - fctx = (LocalCachePagesContext *) palloc(sizeof(LocalCachePagesContext)); - - /* - * To smoothly support upgrades from version 1.0 of this extension - * transparently handle the (non-)existence of the pinning_backends - * column. We unfortunately have to get the result type for that... - - * we can't use the result type determined by the function definition - * without potentially crashing when somebody uses the old (or even - * wrong) function definition though. - */ - if (get_call_result_type(fcinfo, NULL, &expected_tupledesc) != TYPEFUNC_COMPOSITE) - neon_log(ERROR, "return type must be a row type"); - - if (expected_tupledesc->natts != NUM_LOCALCACHE_PAGES_ELEM) - neon_log(ERROR, "incorrect number of output arguments"); - - /* Construct a tuple descriptor for the result rows. */ - tupledesc = CreateTemplateTupleDesc(expected_tupledesc->natts); - TupleDescInitEntry(tupledesc, (AttrNumber) 1, "pageoffs", - INT8OID, -1, 0); -#if PG_MAJORVERSION_NUM < 16 - TupleDescInitEntry(tupledesc, (AttrNumber) 2, "relfilenode", - OIDOID, -1, 0); -#else - TupleDescInitEntry(tupledesc, (AttrNumber) 2, "relfilenumber", - OIDOID, -1, 0); -#endif - TupleDescInitEntry(tupledesc, (AttrNumber) 3, "reltablespace", - OIDOID, -1, 0); - TupleDescInitEntry(tupledesc, (AttrNumber) 4, "reldatabase", - OIDOID, -1, 0); - TupleDescInitEntry(tupledesc, (AttrNumber) 5, "relforknumber", - INT2OID, -1, 0); - TupleDescInitEntry(tupledesc, (AttrNumber) 6, "relblocknumber", - INT8OID, -1, 0); - TupleDescInitEntry(tupledesc, (AttrNumber) 7, "accesscount", - INT4OID, -1, 0); - - fctx->tupdesc = BlessTupleDesc(tupledesc); - - if (lfc_ctl) + /* Count the pages first */ + n_pages = 0; + hash_seq_init(&status, lfc_hash); + while ((entry = hash_seq_search(&status)) != NULL) + { + /* Skip hole tags */ + if (NInfoGetRelNumber(BufTagGetNRelFileInfo(entry->key)) != 0) { - LWLockAcquire(lfc_lock, LW_SHARED); + for (int i = 0; i < lfc_blocks_per_chunk; i++) + n_pages += GET_STATE(entry, i) == AVAILABLE; + } + } - if (LFC_ENABLED()) + if (n_pages == 0) + { + LWLockRelease(lfc_lock); + *num_entries = 0; + return NULL; + } + + result = (LocalCachePagesRec *) + MemoryContextAllocHuge(CurrentMemoryContext, + sizeof(LocalCachePagesRec) * n_pages); + + /* + * Scan through all the cache entries, saving the relevant fields + * in the result structure. 
+ */ + n = 0; + hash_seq_init(&status, lfc_hash); + while ((entry = hash_seq_search(&status)) != NULL) + { + for (int i = 0; i < lfc_blocks_per_chunk; i++) + { + if (NInfoGetRelNumber(BufTagGetNRelFileInfo(entry->key)) != 0) { - hash_seq_init(&status, lfc_hash); - while ((entry = hash_seq_search(&status)) != NULL) + if (GET_STATE(entry, i) == AVAILABLE) { - /* Skip hole tags */ - if (NInfoGetRelNumber(BufTagGetNRelFileInfo(entry->key)) != 0) - { - for (int i = 0; i < lfc_blocks_per_chunk; i++) - n_pages += GET_STATE(entry, i) == AVAILABLE; - } + result[n].pageoffs = entry->offset * lfc_blocks_per_chunk + i; + result[n].relfilenode = NInfoGetRelNumber(BufTagGetNRelFileInfo(entry->key)); + result[n].reltablespace = NInfoGetSpcOid(BufTagGetNRelFileInfo(entry->key)); + result[n].reldatabase = NInfoGetDbOid(BufTagGetNRelFileInfo(entry->key)); + result[n].forknum = entry->key.forkNum; + result[n].blocknum = entry->key.blockNum + i; + result[n].accesscount = entry->access_count; + n += 1; } } } - fctx->record = (LocalCachePagesRec *) - MemoryContextAllocHuge(CurrentMemoryContext, - sizeof(LocalCachePagesRec) * n_pages); - - /* Set max calls and remember the user function context. */ - funcctx->max_calls = n_pages; - funcctx->user_fctx = fctx; - - /* Return to original context when allocating transient memory */ - MemoryContextSwitchTo(oldcontext); - - if (n_pages != 0) - { - /* - * Scan through all the cache entries, saving the relevant fields - * in the fctx->record structure. - */ - uint32 n = 0; - - hash_seq_init(&status, lfc_hash); - while ((entry = hash_seq_search(&status)) != NULL) - { - for (int i = 0; i < lfc_blocks_per_chunk; i++) - { - if (NInfoGetRelNumber(BufTagGetNRelFileInfo(entry->key)) != 0) - { - if (GET_STATE(entry, i) == AVAILABLE) - { - fctx->record[n].pageoffs = entry->offset * lfc_blocks_per_chunk + i; - fctx->record[n].relfilenode = NInfoGetRelNumber(BufTagGetNRelFileInfo(entry->key)); - fctx->record[n].reltablespace = NInfoGetSpcOid(BufTagGetNRelFileInfo(entry->key)); - fctx->record[n].reldatabase = NInfoGetDbOid(BufTagGetNRelFileInfo(entry->key)); - fctx->record[n].forknum = entry->key.forkNum; - fctx->record[n].blocknum = entry->key.blockNum + i; - fctx->record[n].accesscount = entry->access_count; - n += 1; - } - } - } - } - Assert(n_pages == n); - } - if (lfc_ctl) - LWLockRelease(lfc_lock); } + Assert(n_pages == n); + LWLockRelease(lfc_lock); - funcctx = SRF_PERCALL_SETUP(); - - /* Get the saved state */ - fctx = funcctx->user_fctx; - - if (funcctx->call_cntr < funcctx->max_calls) - { - uint32 i = funcctx->call_cntr; - Datum values[NUM_LOCALCACHE_PAGES_ELEM]; - bool nulls[NUM_LOCALCACHE_PAGES_ELEM] = { - false, false, false, false, false, false, false - }; - - values[0] = Int64GetDatum((int64) fctx->record[i].pageoffs); - values[1] = ObjectIdGetDatum(fctx->record[i].relfilenode); - values[2] = ObjectIdGetDatum(fctx->record[i].reltablespace); - values[3] = ObjectIdGetDatum(fctx->record[i].reldatabase); - values[4] = ObjectIdGetDatum(fctx->record[i].forknum); - values[5] = Int64GetDatum((int64) fctx->record[i].blocknum); - values[6] = Int32GetDatum(fctx->record[i].accesscount); - - /* Build and return the tuple. */ - tuple = heap_form_tuple(fctx->tupdesc, values, nulls); - result = HeapTupleGetDatum(tuple); - - SRF_RETURN_NEXT(funcctx, result); - } - else - SRF_RETURN_DONE(funcctx); + *num_entries = n_pages; + return result; } - /* * Internal implementation of the approximate_working_set_size_seconds() * function. 
diff --git a/pgxn/neon/file_cache.h b/pgxn/neon/file_cache.h index 14e5d4f753..4145327942 100644 --- a/pgxn/neon/file_cache.h +++ b/pgxn/neon/file_cache.h @@ -47,6 +47,26 @@ extern bool lfc_prefetch(NRelFileInfo rinfo, ForkNumber forknum, BlockNumber blk extern FileCacheState* lfc_get_state(size_t max_entries); extern void lfc_prewarm(FileCacheState* fcs, uint32 n_workers); +typedef struct LfcStatsEntry +{ + const char *metric_name; + bool isnull; + uint64 value; +} LfcStatsEntry; +extern LfcStatsEntry *lfc_get_stats(size_t *num_entries); + +typedef struct +{ + uint32 pageoffs; + Oid relfilenode; + Oid reltablespace; + Oid reldatabase; + ForkNumber forknum; + BlockNumber blocknum; + uint16 accesscount; +} LocalCachePagesRec; +extern LocalCachePagesRec *lfc_local_cache_pages(size_t *num_entries); + extern int32 lfc_approximate_working_set_size_seconds(time_t duration, bool reset); diff --git a/pgxn/neon/neon.c b/pgxn/neon/neon.c index 6cd21cce39..07de696d7b 100644 --- a/pgxn/neon/neon.c +++ b/pgxn/neon/neon.c @@ -625,11 +625,15 @@ _PG_init(void) ExecutorEnd_hook = neon_ExecutorEnd; } +/* Various functions exposed at SQL level */ + PG_FUNCTION_INFO_V1(pg_cluster_size); PG_FUNCTION_INFO_V1(backpressure_lsns); PG_FUNCTION_INFO_V1(backpressure_throttling_time); PG_FUNCTION_INFO_V1(approximate_working_set_size_seconds); PG_FUNCTION_INFO_V1(approximate_working_set_size); +PG_FUNCTION_INFO_V1(neon_get_lfc_stats); +PG_FUNCTION_INFO_V1(local_cache_pages); Datum pg_cluster_size(PG_FUNCTION_ARGS) @@ -704,6 +708,76 @@ approximate_working_set_size(PG_FUNCTION_ARGS) PG_RETURN_INT32(dc); } +Datum +neon_get_lfc_stats(PG_FUNCTION_ARGS) +{ +#define NUM_NEON_GET_STATS_COLS 2 + ReturnSetInfo *rsinfo = (ReturnSetInfo *) fcinfo->resultinfo; + LfcStatsEntry *entries; + size_t num_entries; + + InitMaterializedSRF(fcinfo, 0); + + /* lfc_get_stats() does all the heavy lifting */ + entries = lfc_get_stats(&num_entries); + + /* Convert the LfcStatsEntrys to a result set */ + for (size_t i = 0; i < num_entries; i++) + { + LfcStatsEntry *entry = &entries[i]; + Datum values[NUM_NEON_GET_STATS_COLS]; + bool nulls[NUM_NEON_GET_STATS_COLS]; + + values[0] = CStringGetTextDatum(entry->metric_name); + nulls[0] = false; + values[1] = Int64GetDatum(entry->isnull ? 0 : entry->value); + nulls[1] = entry->isnull; + tuplestore_putvalues(rsinfo->setResult, rsinfo->setDesc, values, nulls); + } + PG_RETURN_VOID(); + +#undef NUM_NEON_GET_STATS_COLS +} + +Datum +local_cache_pages(PG_FUNCTION_ARGS) +{ +#define NUM_LOCALCACHE_PAGES_COLS 7 + ReturnSetInfo *rsinfo = (ReturnSetInfo *) fcinfo->resultinfo; + LocalCachePagesRec *entries; + size_t num_entries; + + InitMaterializedSRF(fcinfo, 0); + + /* lfc_local_cache_pages() does all the heavy lifting */ + entries = lfc_local_cache_pages(&num_entries); + + /* Convert the LocalCachePagesRec structs to a result set */ + for (size_t i = 0; i < num_entries; i++) + { + LocalCachePagesRec *entry = &entries[i]; + Datum values[NUM_LOCALCACHE_PAGES_COLS]; + bool nulls[NUM_LOCALCACHE_PAGES_COLS] = { + false, false, false, false, false, false, false + }; + + values[0] = Int64GetDatum((int64) entry->pageoffs); + values[1] = ObjectIdGetDatum(entry->relfilenode); + values[2] = ObjectIdGetDatum(entry->reltablespace); + values[3] = ObjectIdGetDatum(entry->reldatabase); + values[4] = ObjectIdGetDatum(entry->forknum); + values[5] = Int64GetDatum((int64) entry->blocknum); + values[6] = Int32GetDatum(entry->accesscount); + + /* Build and return the tuple. 
*/ + tuplestore_putvalues(rsinfo->setResult, rsinfo->setDesc, values, nulls); + } + + PG_RETURN_VOID(); + +#undef NUM_LOCALCACHE_PAGES_COLS +} + /* * Initialization stage 2: make requests for the amount of shared memory we * will need. From dbde37c53aadf66d58b55c1267001a64f720fdac Mon Sep 17 00:00:00 2001 From: "Alex Chi Z." <4198311+skyzh@users.noreply.github.com> Date: Tue, 29 Jul 2025 10:20:43 -0400 Subject: [PATCH 08/34] fix(safekeeper): retry if open segment fail (#12757) ## Problem Fix LKB-2632. The safekeeper wal read path does not seem to retry at all. This would cause client read errors on the customer side. ## Summary of changes - Retry on `safekeeper::wal_backup::read_object`. - Note that this only retries on S3 HTTP connection errors. Subsequent reads could fail, and that needs more refactors to make the retry mechanism work across the path. --------- Signed-off-by: Alex Chi Z --- safekeeper/src/wal_backup.rs | 28 +++++++++++++++++++++------- 1 file changed, 21 insertions(+), 7 deletions(-) diff --git a/safekeeper/src/wal_backup.rs b/safekeeper/src/wal_backup.rs index 03c8f7e84a..191f8aacf1 100644 --- a/safekeeper/src/wal_backup.rs +++ b/safekeeper/src/wal_backup.rs @@ -12,7 +12,7 @@ use futures::stream::{self, FuturesOrdered}; use postgres_ffi::v14::xlog_utils::XLogSegNoOffsetToRecPtr; use postgres_ffi::{PG_TLI, XLogFileName, XLogSegNo}; use remote_storage::{ - DownloadOpts, GenericRemoteStorage, ListingMode, RemotePath, StorageMetadata, + DownloadError, DownloadOpts, GenericRemoteStorage, ListingMode, RemotePath, StorageMetadata, }; use safekeeper_api::models::PeerInfo; use tokio::fs::File; @@ -607,6 +607,9 @@ pub(crate) async fn copy_partial_segment( storage.copy_object(source, destination, &cancel).await } +const WAL_READ_WARN_THRESHOLD: u32 = 2; +const WAL_READ_MAX_RETRIES: u32 = 3; + pub async fn read_object( storage: &GenericRemoteStorage, file_path: &RemotePath, @@ -620,12 +623,23 @@ pub async fn read_object( byte_start: std::ops::Bound::Included(offset), ..Default::default() }; - let download = storage - .download(file_path, &opts, &cancel) - .await - .with_context(|| { - format!("Failed to open WAL segment download stream for remote path {file_path:?}") - })?; + + // This retry only solves the connect errors: subsequent reads can still fail as this function returns + // a stream. + let download = backoff::retry( + || async { storage.download(file_path, &opts, &cancel).await }, + DownloadError::is_permanent, + WAL_READ_WARN_THRESHOLD, + WAL_READ_MAX_RETRIES, + "download WAL segment", + &cancel, + ) + .await + .ok_or_else(|| DownloadError::Cancelled) + .and_then(|x| x) + .with_context(|| { + format!("Failed to open WAL segment download stream for remote path {file_path:?}") + })?; let reader = tokio_util::io::StreamReader::new(download.download_stream); From 1bb434ab747ae500ef04e9aa671acb4f9eb9d956 Mon Sep 17 00:00:00 2001 From: "Alex Chi Z." <4198311+skyzh@users.noreply.github.com> Date: Tue, 29 Jul 2025 10:23:42 -0400 Subject: [PATCH 09/34] fix(test): test_readonly_node_gc compute needs time to acquire lease (#12747) ## Problem Part of LKB-2368. Compute fails to obtain LSN lease in this test case. There're many assumptions around how compute obtains the leases, and in this particular test case, as the LSN lease length is only 8s (which is shorter than the amount of time where pageserver can restart and compute can reconnect in terms of force stop), it sometimes cause issues. ## Summary of changes Add more sleeps around the test case to ensure it's stable at least. 
We need to find a more reliable way to test this in the future. --------- Signed-off-by: Alex Chi Z --- test_runner/regress/test_readonly_node.py | 28 +++++++++++++++++------ 1 file changed, 21 insertions(+), 7 deletions(-) diff --git a/test_runner/regress/test_readonly_node.py b/test_runner/regress/test_readonly_node.py index 5612236250..e151b0ba13 100644 --- a/test_runner/regress/test_readonly_node.py +++ b/test_runner/regress/test_readonly_node.py @@ -129,7 +129,10 @@ def test_readonly_node_gc(neon_env_builder: NeonEnvBuilder): Test static endpoint is protected from GC by acquiring and renewing lsn leases. """ - LSN_LEASE_LENGTH = 8 + LSN_LEASE_LENGTH = ( + 14 # This value needs to be large enough for compute_ctl to send two lease requests. + ) + neon_env_builder.num_pageservers = 2 # GC is manual triggered. env = neon_env_builder.init_start( @@ -230,6 +233,15 @@ def test_readonly_node_gc(neon_env_builder: NeonEnvBuilder): log.info(f"`SELECT` query succeed after GC, {ctx=}") return offset + # It's not reliable to let the compute renew the lease in this test case as we have a very tight + # lease timeout. Therefore, the test case itself will renew the lease. + # + # This is a workaround to make the test case more deterministic. + def renew_lease(env: NeonEnv, lease_lsn: Lsn): + env.storage_controller.pageserver_api().timeline_lsn_lease( + env.initial_tenant, env.initial_timeline, lease_lsn + ) + # Insert some records on main branch with env.endpoints.create_start("main", config_lines=["shared_buffers=1MB"]) as ep_main: with ep_main.cursor() as cur: @@ -242,6 +254,9 @@ def test_readonly_node_gc(neon_env_builder: NeonEnvBuilder): XLOG_BLCKSZ = 8192 lsn = Lsn((int(lsn) // XLOG_BLCKSZ) * XLOG_BLCKSZ) + # We need to mock the way cplane works: it gets a lease for a branch before starting the compute. + renew_lease(env, lsn) + with env.endpoints.create_start( branch_name="main", endpoint_id="static", @@ -251,9 +266,6 @@ def test_readonly_node_gc(neon_env_builder: NeonEnvBuilder): cur.execute("SELECT count(*) FROM t0") assert cur.fetchone() == (ROW_COUNT,) - # Wait for static compute to renew lease at least once. - time.sleep(LSN_LEASE_LENGTH / 2) - generate_updates_on_main(env, ep_main, 3, end=100) offset = trigger_gc_and_select( @@ -263,10 +275,10 @@ def test_readonly_node_gc(neon_env_builder: NeonEnvBuilder): # Trigger Pageserver restarts for ps in env.pageservers: ps.stop() - # Static compute should have at least one lease request failure due to connection. - time.sleep(LSN_LEASE_LENGTH / 2) ps.start() + renew_lease(env, lsn) + trigger_gc_and_select( env, ep_static, @@ -282,6 +294,9 @@ def test_readonly_node_gc(neon_env_builder: NeonEnvBuilder): ) env.storage_controller.reconcile_until_idle() + # Wait for static compute to renew lease on the new pageserver. + time.sleep(LSN_LEASE_LENGTH + 3) + trigger_gc_and_select( env, ep_static, @@ -292,7 +307,6 @@ def test_readonly_node_gc(neon_env_builder: NeonEnvBuilder): # Do some update so we can increment gc_cutoff generate_updates_on_main(env, ep_main, i, end=100) - # Wait for the existing lease to expire. time.sleep(LSN_LEASE_LENGTH + 1) # Now trigger GC again, layers should be removed. From 62d844e657b99dc5374b42333cb5413160bd7bec Mon Sep 17 00:00:00 2001 From: HaoyuHuang Date: Tue, 29 Jul 2025 08:22:04 -0700 Subject: [PATCH 10/34] Add changes in spec apply (#12759) ## Problem All changes are no-op. 
## Summary of changes --- compute_tools/src/compute.rs | 63 +++++- compute_tools/src/spec_apply.rs | 205 +++++++++++++++++- .../alter_databricks_reader_roles_timeout.sql | 25 +++ .../src/sql/create_databricks_misc.sql | 15 ++ 4 files changed, 296 insertions(+), 12 deletions(-) create mode 100644 compute_tools/src/sql/alter_databricks_reader_roles_timeout.sql create mode 100644 compute_tools/src/sql/create_databricks_misc.sql diff --git a/compute_tools/src/compute.rs b/compute_tools/src/compute.rs index f53adbb1df..8350a4c059 100644 --- a/compute_tools/src/compute.rs +++ b/compute_tools/src/compute.rs @@ -6,7 +6,8 @@ use compute_api::responses::{ LfcPrewarmState, PromoteState, TlsConfig, }; use compute_api::spec::{ - ComputeAudit, ComputeFeature, ComputeMode, ComputeSpec, ExtVersion, PageserverProtocol, PgIdent, + ComputeAudit, ComputeFeature, ComputeMode, ComputeSpec, ExtVersion, GenericOption, + PageserverProtocol, PgIdent, Role, }; use futures::StreamExt; use futures::future::join_all; @@ -413,6 +414,66 @@ struct StartVmMonitorResult { vm_monitor: Option>>, } +// BEGIN_HADRON +/// This function creates roles that are used by Databricks. +/// These roles are not needs to be botostrapped at PG Compute provisioning time. +/// The auth method for these roles are configured in databricks_pg_hba.conf in universe repository. +pub(crate) fn create_databricks_roles() -> Vec { + let roles = vec![ + // Role for prometheus_stats_exporter + Role { + name: "databricks_monitor".to_string(), + // This uses "local" connection and auth method for that is "trust", so no password is needed. + encrypted_password: None, + options: Some(vec![GenericOption { + name: "IN ROLE pg_monitor".to_string(), + value: None, + vartype: "string".to_string(), + }]), + }, + // Role for brickstore control plane + Role { + name: "databricks_control_plane".to_string(), + // Certificate user does not need password. + encrypted_password: None, + options: Some(vec![GenericOption { + name: "SUPERUSER".to_string(), + value: None, + vartype: "string".to_string(), + }]), + }, + // Role for brickstore httpgateway. + Role { + name: "databricks_gateway".to_string(), + // Certificate user does not need password. + encrypted_password: None, + options: None, + }, + ]; + + roles + .into_iter() + .map(|role| { + let query = format!( + r#" + DO $$ + BEGIN + IF NOT EXISTS ( + SELECT FROM pg_catalog.pg_roles WHERE rolname = '{}') + THEN + CREATE ROLE {} {}; + END IF; + END + $$;"#, + role.name, + role.name.pg_quote(), + role.to_pg_options(), + ); + query + }) + .collect() +} + /// Databricks-specific environment variables to be passed to the `postgres` sub-process. pub struct DatabricksEnvVars { /// The Databricks "endpoint ID" of the compute instance. 
Used by `postgres` to check diff --git a/compute_tools/src/spec_apply.rs b/compute_tools/src/spec_apply.rs index 47bf61ae1b..2356078703 100644 --- a/compute_tools/src/spec_apply.rs +++ b/compute_tools/src/spec_apply.rs @@ -13,17 +13,19 @@ use tokio_postgres::Client; use tokio_postgres::error::SqlState; use tracing::{Instrument, debug, error, info, info_span, instrument, warn}; -use crate::compute::{ComputeNode, ComputeNodeParams, ComputeState}; +use crate::compute::{ComputeNode, ComputeNodeParams, ComputeState, create_databricks_roles}; +use crate::hadron_metrics::COMPUTE_CONFIGURE_STATEMENT_TIMEOUT_ERRORS; use crate::pg_helpers::{ DatabaseExt, Escaping, GenericOptionsSearch, RoleExt, get_existing_dbs_async, get_existing_roles_async, }; use crate::spec_apply::ApplySpecPhase::{ - CreateAndAlterDatabases, CreateAndAlterRoles, CreateAvailabilityCheck, CreatePgauditExtension, + AddDatabricksGrants, AlterDatabricksRoles, CreateAndAlterDatabases, CreateAndAlterRoles, + CreateAvailabilityCheck, CreateDatabricksMisc, CreateDatabricksRoles, CreatePgauditExtension, CreatePgauditlogtofileExtension, CreatePrivilegedRole, CreateSchemaNeon, DisablePostgresDBPgAudit, DropInvalidDatabases, DropRoles, FinalizeDropLogicalSubscriptions, - HandleNeonExtension, HandleOtherExtensions, RenameAndDeleteDatabases, RenameRoles, - RunInEachDatabase, + HandleDatabricksAuthExtension, HandleNeonExtension, HandleOtherExtensions, + RenameAndDeleteDatabases, RenameRoles, RunInEachDatabase, }; use crate::spec_apply::PerDatabasePhase::{ ChangeSchemaPerms, DeleteDBRoleReferences, DropLogicalSubscriptions, @@ -166,6 +168,7 @@ impl ComputeNode { concurrency_token.clone(), db, [DropLogicalSubscriptions].to_vec(), + self.params.lakebase_mode, ); Ok(tokio::spawn(fut)) @@ -186,15 +189,33 @@ impl ComputeNode { }; } - for phase in [ - CreatePrivilegedRole, + let phases = if self.params.lakebase_mode { + vec![ + CreatePrivilegedRole, + // BEGIN_HADRON + CreateDatabricksRoles, + AlterDatabricksRoles, + // END_HADRON DropInvalidDatabases, RenameRoles, CreateAndAlterRoles, RenameAndDeleteDatabases, CreateAndAlterDatabases, CreateSchemaNeon, - ] { + ] + } else { + vec![ + CreatePrivilegedRole, + DropInvalidDatabases, + RenameRoles, + CreateAndAlterRoles, + RenameAndDeleteDatabases, + CreateAndAlterDatabases, + CreateSchemaNeon, + ] + }; + + for phase in phases { info!("Applying phase {:?}", &phase); apply_operations( params.clone(), @@ -203,6 +224,7 @@ impl ComputeNode { jwks_roles.clone(), phase, || async { Ok(&client) }, + self.params.lakebase_mode, ) .await?; } @@ -254,6 +276,7 @@ impl ComputeNode { concurrency_token.clone(), db, phases, + self.params.lakebase_mode, ); Ok(tokio::spawn(fut)) @@ -265,12 +288,28 @@ impl ComputeNode { handle.await??; } - let mut phases = vec![ + let mut phases = if self.params.lakebase_mode { + vec![ + HandleOtherExtensions, + HandleNeonExtension, // This step depends on CreateSchemaNeon + // BEGIN_HADRON + HandleDatabricksAuthExtension, + // END_HADRON + CreateAvailabilityCheck, + DropRoles, + // BEGIN_HADRON + AddDatabricksGrants, + CreateDatabricksMisc, + // END_HADRON + ] + } else { + vec![ HandleOtherExtensions, HandleNeonExtension, // This step depends on CreateSchemaNeon CreateAvailabilityCheck, DropRoles, - ]; + ] + }; // This step depends on CreateSchemaNeon if spec.drop_subscriptions_before_start && !drop_subscriptions_done { @@ -303,6 +342,7 @@ impl ComputeNode { jwks_roles.clone(), phase, || async { Ok(&client) }, + self.params.lakebase_mode, ) .await?; } @@ -328,6 +368,7 @@ impl ComputeNode { 
concurrency_token: Arc, db: DB, subphases: Vec, + lakebase_mode: bool, ) -> Result<()> { let _permit = concurrency_token.acquire().await?; @@ -355,6 +396,7 @@ impl ComputeNode { let client = client_conn.as_ref().unwrap(); Ok(client) }, + lakebase_mode, ) .await?; } @@ -477,6 +519,10 @@ pub enum PerDatabasePhase { #[derive(Clone, Debug)] pub enum ApplySpecPhase { CreatePrivilegedRole, + // BEGIN_HADRON + CreateDatabricksRoles, + AlterDatabricksRoles, + // END_HADRON DropInvalidDatabases, RenameRoles, CreateAndAlterRoles, @@ -489,7 +535,14 @@ pub enum ApplySpecPhase { DisablePostgresDBPgAudit, HandleOtherExtensions, HandleNeonExtension, + // BEGIN_HADRON + HandleDatabricksAuthExtension, + // END_HADRON CreateAvailabilityCheck, + // BEGIN_HADRON + AddDatabricksGrants, + CreateDatabricksMisc, + // END_HADRON DropRoles, FinalizeDropLogicalSubscriptions, } @@ -525,6 +578,7 @@ pub async fn apply_operations<'a, Fut, F>( jwks_roles: Arc>, apply_spec_phase: ApplySpecPhase, client: F, + lakebase_mode: bool, ) -> Result<()> where F: FnOnce() -> Fut, @@ -571,6 +625,23 @@ where }, query ); + if !lakebase_mode { + return res; + } + // BEGIN HADRON + if let Err(e) = res.as_ref() { + if let Some(sql_state) = e.code() { + if sql_state.code() == "57014" { + // SQL State 57014 (ERRCODE_QUERY_CANCELED) is used for statement timeouts. + // Increment the counter whenever a statement timeout occurs. Timeouts on + // this configuration path can only occur due to PS connectivity problems that + // Postgres failed to recover from. + COMPUTE_CONFIGURE_STATEMENT_TIMEOUT_ERRORS.inc(); + } + } + } + // END HADRON + res } .instrument(inspan) @@ -612,6 +683,35 @@ async fn get_operations<'a>( ), comment: None, }))), + // BEGIN_HADRON + // New Hadron phase + ApplySpecPhase::CreateDatabricksRoles => { + let queries = create_databricks_roles(); + let operations = queries.into_iter().map(|query| Operation { + query, + comment: None, + }); + Ok(Box::new(operations)) + } + + // Backfill existing databricks_reader_* roles with statement timeout from GUC + ApplySpecPhase::AlterDatabricksRoles => { + let query = String::from(include_str!( + "sql/alter_databricks_reader_roles_timeout.sql" + )); + + let operations = once(Operation { + query, + comment: Some( + "Backfill existing databricks_reader_* roles with statement timeout" + .to_string(), + ), + }); + + Ok(Box::new(operations)) + } + // End of new Hadron Phase + // END_HADRON ApplySpecPhase::DropInvalidDatabases => { let mut ctx = ctx.write().await; let databases = &mut ctx.dbs; @@ -981,7 +1081,10 @@ async fn get_operations<'a>( // N.B. this has to be properly dollar-escaped with `pg_quote_dollar()` role_name = escaped_role, outer_tag = outer_tag, - ), + ) + // HADRON change: + .replace("neon_superuser", ¶ms.privileged_role_name), + // HADRON change end , comment: None, }, // This now will only drop privileges of the role @@ -1017,7 +1120,8 @@ async fn get_operations<'a>( comment: None, }, Operation { - query: String::from(include_str!("sql/default_grants.sql")), + query: String::from(include_str!("sql/default_grants.sql")) + .replace("neon_superuser", ¶ms.privileged_role_name), comment: None, }, ] @@ -1086,6 +1190,28 @@ async fn get_operations<'a>( Ok(Box::new(operations)) } + // BEGIN_HADRON + // Note: we may want to version the extension someday, but for now we just drop it and recreate it. 
+ ApplySpecPhase::HandleDatabricksAuthExtension => { + let operations = vec![ + Operation { + query: String::from("DROP EXTENSION IF EXISTS databricks_auth"), + comment: Some(String::from("dropping existing databricks_auth extension")), + }, + Operation { + query: String::from("CREATE EXTENSION databricks_auth"), + comment: Some(String::from("creating databricks_auth extension")), + }, + Operation { + query: String::from("GRANT SELECT ON databricks_auth_metrics TO pg_monitor"), + comment: Some(String::from("grant select on databricks auth counters")), + }, + ] + .into_iter(); + + Ok(Box::new(operations)) + } + // END_HADRON ApplySpecPhase::CreateAvailabilityCheck => Ok(Box::new(once(Operation { query: String::from(include_str!("sql/add_availabilitycheck_tables.sql")), comment: None, @@ -1103,6 +1229,63 @@ async fn get_operations<'a>( Ok(Box::new(operations)) } + + // BEGIN_HADRON + // New Hadron phases + // + // Grants permissions to roles that are used by Databricks. + ApplySpecPhase::AddDatabricksGrants => { + let operations = vec![ + Operation { + query: String::from("GRANT USAGE ON SCHEMA neon TO databricks_monitor"), + comment: Some(String::from( + "Permissions needed to execute neon.* functions (in the postgres database)", + )), + }, + Operation { + query: String::from( + "GRANT SELECT, INSERT, UPDATE ON health_check TO databricks_monitor", + ), + comment: Some(String::from("Permissions needed for read and write probes")), + }, + Operation { + query: String::from( + "GRANT EXECUTE ON FUNCTION pg_ls_dir(text) TO databricks_monitor", + ), + comment: Some(String::from( + "Permissions needed to monitor .snap file counts", + )), + }, + Operation { + query: String::from( + "GRANT SELECT ON neon.neon_perf_counters TO databricks_monitor", + ), + comment: Some(String::from( + "Permissions needed to access neon performance counters view", + )), + }, + Operation { + query: String::from( + "GRANT EXECUTE ON FUNCTION neon.get_perf_counters() TO databricks_monitor", + ), + comment: Some(String::from( + "Permissions needed to execute the underlying performance counters function", + )), + }, + ] + .into_iter(); + + Ok(Box::new(operations)) + } + // Creates minor objects that are used by Databricks. 
+ ApplySpecPhase::CreateDatabricksMisc => Ok(Box::new(once(Operation { + query: String::from(include_str!("sql/create_databricks_misc.sql")), + comment: Some(String::from( + "The function databricks_monitor uses to convert exception to 0 or 1", + )), + }))), + // End of new Hadron phases + // END_HADRON ApplySpecPhase::FinalizeDropLogicalSubscriptions => Ok(Box::new(once(Operation { query: String::from(include_str!("sql/finalize_drop_subscriptions.sql")), comment: None, diff --git a/compute_tools/src/sql/alter_databricks_reader_roles_timeout.sql b/compute_tools/src/sql/alter_databricks_reader_roles_timeout.sql new file mode 100644 index 0000000000..db16df3817 --- /dev/null +++ b/compute_tools/src/sql/alter_databricks_reader_roles_timeout.sql @@ -0,0 +1,25 @@ +DO $$ +DECLARE + reader_role RECORD; + timeout_value TEXT; +BEGIN + -- Get the current GUC setting for reader statement timeout + SELECT current_setting('databricks.reader_statement_timeout', true) INTO timeout_value; + + -- Only proceed if timeout_value is not null/empty and not '0' (disabled) + IF timeout_value IS NOT NULL AND timeout_value != '' AND timeout_value != '0' THEN + -- Find all databricks_reader_* roles and update their statement_timeout + FOR reader_role IN + SELECT r.rolname + FROM pg_roles r + WHERE r.rolname ~ '^databricks_reader_\d+$' + LOOP + -- Apply the timeout setting to the role (will overwrite existing setting) + EXECUTE format('ALTER ROLE %I SET statement_timeout = %L', + reader_role.rolname, timeout_value); + + RAISE LOG 'Updated statement_timeout = % for role %', timeout_value, reader_role.rolname; + END LOOP; + END IF; +END +$$; diff --git a/compute_tools/src/sql/create_databricks_misc.sql b/compute_tools/src/sql/create_databricks_misc.sql new file mode 100644 index 0000000000..a6dc379078 --- /dev/null +++ b/compute_tools/src/sql/create_databricks_misc.sql @@ -0,0 +1,15 @@ +ALTER ROLE databricks_monitor SET statement_timeout = '60s'; + +CREATE OR REPLACE FUNCTION health_check_write_succeeds() +RETURNS INTEGER AS $$ +BEGIN +INSERT INTO health_check VALUES (1, now()) +ON CONFLICT (id) DO UPDATE + SET updated_at = now(); + +RETURN 1; +EXCEPTION WHEN OTHERS THEN +RAISE EXCEPTION '[DATABRICKS_SMGR] health_check failed: [%] %', SQLSTATE, SQLERRM; +RETURN 0; +END; +$$ LANGUAGE plpgsql; From 0ffdc98e202f6573bdc07d78834e5c5f6423bf69 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Krzysztof=20Szafra=C5=84ski?= Date: Tue, 29 Jul 2025 17:25:22 +0200 Subject: [PATCH 11/34] [proxy] Classify "database not found" errors as user errors (#12603) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ## Problem If a user provides a wrong database name in the connection string, it should be logged as a user error, not postgres error. I found 4 different places where we log such errors: 1. `proxy/src/stream.rs:193`, e.g.: ``` {"timestamp":"2025-07-15T11:33:35.660026Z","level":"INFO","message":"forwarding error to user","fields":{"kind":"postgres","msg":"database \"[redacted]\" does not exist"},"spans":{"connect_request#9":{"protocol":"tcp","session_id":"ce1f2c90-dfb5-44f7-b9e9-8b8535e8b9b8","conn_info":"[redacted]","ep":"[redacted]","role":"[redacted]"}},"thread_id":22,"task_id":"370407867","target":"proxy::stream","src":"proxy/src/stream.rs:193","extract":{"ep":"[redacted]","session_id":"ce1f2c90-dfb5-44f7-b9e9-8b8535e8b9b8"}} ``` 2. 
`proxy/src/pglb/mod.rs:137`, e.g.: ``` {"timestamp":"2025-07-15T11:37:44.340497Z","level":"WARN","message":"per-client task finished with an error: Couldn't connect to compute node: db error: FATAL: database \"[redacted]\" does not exist","spans":{"connect_request#8":{"protocol":"tcp","session_id":"763baaac-d039-4f4d-9446-c149e32660eb","conn_info":"[redacted]","ep":"[redacted]","role":"[redacted]"}},"thread_id":14,"task_id":"866658139","target":"proxy::pglb","src":"proxy/src/pglb/mod.rs:137","extract":{"ep":"[redacted]","session_id":"763baaac-d039-4f4d-9446-c149e32660eb"}} ``` 3. `proxy/src/serverless/mod.rs:451`, e.g. (note that the error is repeated 4 times — retries?): ``` {"timestamp":"2025-07-15T11:37:54.515891Z","level":"WARN","message":"error in websocket connection: Couldn't connect to compute node: db error: FATAL: database \"[redacted]\" does not exist: Couldn't connect to compute node: db error: FATAL: database \"[redacted]\" does not exist: db error: FATAL: database \"[redacted]\" does not exist: FATAL: database \"[redacted]\" does not exist","spans":{"http_conn#8":{"conn_id":"ec7780db-a145-4f0e-90df-0ba35f41b828"},"connect_request#9":{"protocol":"ws","session_id":"1eaaeeec-b671-4153-b1f4-247839e4b1c7","conn_info":"[redacted]","ep":"[redacted]","role":"[redacted]"}},"thread_id":10,"task_id":"366331699","target":"proxy::serverless","src":"proxy/src/serverless/mod.rs:451","extract":{"conn_id":"ec7780db-a145-4f0e-90df-0ba35f41b828","ep":"[redacted]","session_id":"1eaaeeec-b671-4153-b1f4-247839e4b1c7"}} ``` 4. `proxy/src/serverless/sql_over_http.rs:219`, e.g. ``` {"timestamp":"2025-07-15T10:32:34.866603Z","level":"INFO","message":"forwarding error to user","fields":{"kind":"postgres","error":"could not connect to postgres in compute","msg":"database \"[redacted]\" does not exist"},"spans":{"http_conn#19":{"conn_id":"7da08203-5dab-45e8-809f-503c9019ec6b"},"connect_request#5":{"protocol":"http","session_id":"68387f1c-cbc8-45b3-a7db-8bb1c55ca809","conn_info":"[redacted]","ep":"[redacted]","role":"[redacted]"}},"thread_id":17,"task_id":"16432250","target":"proxy::serverless::sql_over_http","src":"proxy/src/serverless/sql_over_http.rs:219","extract":{"conn_id":"7da08203-5dab-45e8-809f-503c9019ec6b","ep":"[redacted]","session_id":"68387f1c-cbc8-45b3-a7db-8bb1c55ca809"}} ``` This PR directly addresses 1 and 4. I _think_ it _should_ also help with 2 and 3, although in those places we don't seem to log `kind`, so I'm not quite sure. I'm also confused why in 3 the error is repeated multiple times. ## Summary of changes Resolves https://github.com/neondatabase/neon/issues/9440 --- libs/proxy/tokio-postgres2/src/error/mod.rs | 2 +- proxy/src/compute/mod.rs | 18 +++++----- proxy/src/serverless/backend.rs | 18 +++++----- proxy/src/serverless/sql_over_http.rs | 39 +++++++++------------ 4 files changed, 37 insertions(+), 40 deletions(-) diff --git a/libs/proxy/tokio-postgres2/src/error/mod.rs b/libs/proxy/tokio-postgres2/src/error/mod.rs index 6e68b1e595..3fbb97f9bb 100644 --- a/libs/proxy/tokio-postgres2/src/error/mod.rs +++ b/libs/proxy/tokio-postgres2/src/error/mod.rs @@ -9,7 +9,7 @@ use postgres_protocol2::message::backend::{ErrorFields, ErrorResponseBody}; pub use self::sqlstate::*; #[allow(clippy::unreadable_literal)] -mod sqlstate; +pub mod sqlstate; /// The severity of a Postgres error or notice. 
#[derive(Debug, Copy, Clone, PartialEq, Eq)] diff --git a/proxy/src/compute/mod.rs b/proxy/src/compute/mod.rs index ca784423ee..43cfe70206 100644 --- a/proxy/src/compute/mod.rs +++ b/proxy/src/compute/mod.rs @@ -8,6 +8,7 @@ use futures::{FutureExt, TryFutureExt}; use itertools::Itertools; use postgres_client::config::{AuthKeys, ChannelBinding, SslMode}; use postgres_client::connect_raw::StartupStream; +use postgres_client::error::SqlState; use postgres_client::maybe_tls_stream::MaybeTlsStream; use postgres_client::tls::MakeTlsConnect; use thiserror::Error; @@ -22,7 +23,7 @@ use crate::context::RequestContext; use crate::control_plane::client::ApiLockError; use crate::control_plane::errors::WakeComputeError; use crate::control_plane::messages::MetricsAuxInfo; -use crate::error::{ReportableError, UserFacingError}; +use crate::error::{ErrorKind, ReportableError, UserFacingError}; use crate::metrics::{Metrics, NumDbConnectionsGuard}; use crate::pqproto::StartupMessageParams; use crate::proxy::connect_compute::TlsNegotiation; @@ -65,12 +66,13 @@ impl UserFacingError for PostgresError { } impl ReportableError for PostgresError { - fn get_error_kind(&self) -> crate::error::ErrorKind { + fn get_error_kind(&self) -> ErrorKind { match self { - PostgresError::Postgres(e) if e.as_db_error().is_some() => { - crate::error::ErrorKind::Postgres - } - PostgresError::Postgres(_) => crate::error::ErrorKind::Compute, + PostgresError::Postgres(err) => match err.as_db_error() { + Some(err) if err.code() == &SqlState::INVALID_CATALOG_NAME => ErrorKind::User, + Some(_) => ErrorKind::Postgres, + None => ErrorKind::Compute, + }, } } } @@ -110,9 +112,9 @@ impl UserFacingError for ConnectionError { } impl ReportableError for ConnectionError { - fn get_error_kind(&self) -> crate::error::ErrorKind { + fn get_error_kind(&self) -> ErrorKind { match self { - ConnectionError::TlsError(_) => crate::error::ErrorKind::Compute, + ConnectionError::TlsError(_) => ErrorKind::Compute, ConnectionError::WakeComputeError(e) => e.get_error_kind(), ConnectionError::TooManyConnectionAttempts(e) => e.get_error_kind(), #[cfg(test)] diff --git a/proxy/src/serverless/backend.rs b/proxy/src/serverless/backend.rs index eb879f98e7..511bdc4e42 100644 --- a/proxy/src/serverless/backend.rs +++ b/proxy/src/serverless/backend.rs @@ -4,6 +4,7 @@ use std::time::Duration; use ed25519_dalek::SigningKey; use hyper_util::rt::{TokioExecutor, TokioIo, TokioTimer}; use jose_jwk::jose_b64; +use postgres_client::error::SqlState; use postgres_client::maybe_tls_stream::MaybeTlsStream; use rand_core::OsRng; use tracing::field::display; @@ -459,15 +460,14 @@ impl ReportableError for HttpConnError { match self { HttpConnError::ConnectError(_) => ErrorKind::Compute, HttpConnError::ConnectionClosedAbruptly(_) => ErrorKind::Compute, - HttpConnError::PostgresConnectionError(p) => { - if p.as_db_error().is_some() { - // postgres rejected the connection - ErrorKind::Postgres - } else { - // couldn't even reach postgres - ErrorKind::Compute - } - } + HttpConnError::PostgresConnectionError(p) => match p.as_db_error() { + // user provided a wrong database name + Some(err) if err.code() == &SqlState::INVALID_CATALOG_NAME => ErrorKind::User, + // postgres rejected the connection + Some(_) => ErrorKind::Postgres, + // couldn't even reach postgres + None => ErrorKind::Compute, + }, HttpConnError::LocalProxyConnectionError(_) => ErrorKind::Compute, HttpConnError::ComputeCtl(_) => ErrorKind::Service, HttpConnError::JwtPayloadError(_) => ErrorKind::User, diff --git 
a/proxy/src/serverless/sql_over_http.rs b/proxy/src/serverless/sql_over_http.rs index 26f65379e7..c334e820d7 100644 --- a/proxy/src/serverless/sql_over_http.rs +++ b/proxy/src/serverless/sql_over_http.rs @@ -192,34 +192,29 @@ pub(crate) async fn handle( let line = get(db_error, |db| db.line().map(|l| l.to_string())); let routine = get(db_error, |db| db.routine()); - match &e { - SqlOverHttpError::Postgres(e) - if e.as_db_error().is_some() && error_kind == ErrorKind::User => - { - // this error contains too much info, and it's not an error we care about. - if tracing::enabled!(Level::DEBUG) { - tracing::debug!( - kind=error_kind.to_metric_label(), - error=%e, - msg=message, - "forwarding error to user" - ); - } else { - tracing::info!( - kind = error_kind.to_metric_label(), - error = "bad query", - "forwarding error to user" - ); - } - } - _ => { - tracing::info!( + if db_error.is_some() && error_kind == ErrorKind::User { + // this error contains too much info, and it's not an error we care about. + if tracing::enabled!(Level::DEBUG) { + debug!( kind=error_kind.to_metric_label(), error=%e, msg=message, "forwarding error to user" ); + } else { + info!( + kind = error_kind.to_metric_label(), + error = "bad query", + "forwarding error to user" + ); } + } else { + info!( + kind=error_kind.to_metric_label(), + error=%e, + msg=message, + "forwarding error to user" + ); } json_response( From 5585c32ceebb498b0d0085fc7d1306199e08264e Mon Sep 17 00:00:00 2001 From: a-masterov <72613290+a-masterov@users.noreply.github.com> Date: Tue, 29 Jul 2025 17:34:02 +0200 Subject: [PATCH 12/34] Disable autovacuum while running pg_repack test (#12755) ## Problem Sometimes, the regression test of `pg_repack` fails due to an extra line in the output. The most probable cause of this is autovacuum. https://databricks.atlassian.net/browse/LKB-2637 ## Summary of changes Autovacuum is disabled during the test. 
Co-authored-by: Alexey Masterov --- compute/patches/pg_repack.patch | 68 ++++++++++++++++++++++++--------- 1 file changed, 50 insertions(+), 18 deletions(-) diff --git a/compute/patches/pg_repack.patch b/compute/patches/pg_repack.patch index 10ed1054ff..b8a057e222 100644 --- a/compute/patches/pg_repack.patch +++ b/compute/patches/pg_repack.patch @@ -1,5 +1,11 @@ +commit 5eb393810cf7c7bafa4e394dad2e349e2a8cb2cb +Author: Alexey Masterov +Date: Mon Jul 28 18:11:02 2025 +0200 + + Patch for pg_repack + diff --git a/regress/Makefile b/regress/Makefile -index bf6edcb..89b4c7f 100644 +index bf6edcb..110e734 100644 --- a/regress/Makefile +++ b/regress/Makefile @@ -17,7 +17,7 @@ INTVERSION := $(shell echo $$(($$(echo $(VERSION).0 | sed 's/\([[:digit:]]\{1,\} @@ -7,18 +13,36 @@ index bf6edcb..89b4c7f 100644 # -REGRESS := init-extension repack-setup repack-run error-on-invalid-idx no-error-on-invalid-idx after-schema repack-check nosuper tablespace get_order_by trigger -+REGRESS := init-extension repack-setup repack-run error-on-invalid-idx no-error-on-invalid-idx after-schema repack-check nosuper get_order_by trigger ++REGRESS := init-extension noautovacuum repack-setup repack-run error-on-invalid-idx no-error-on-invalid-idx after-schema repack-check nosuper get_order_by trigger autovacuum USE_PGXS = 1 # use pgxs if not in contrib directory PGXS := $(shell $(PG_CONFIG) --pgxs) -diff --git a/regress/expected/init-extension.out b/regress/expected/init-extension.out -index 9f2e171..f6e4f8d 100644 ---- a/regress/expected/init-extension.out -+++ b/regress/expected/init-extension.out -@@ -1,3 +1,2 @@ - SET client_min_messages = warning; - CREATE EXTENSION pg_repack; --RESET client_min_messages; +diff --git a/regress/expected/autovacuum.out b/regress/expected/autovacuum.out +new file mode 100644 +index 0000000..e7f2363 +--- /dev/null ++++ b/regress/expected/autovacuum.out +@@ -0,0 +1,7 @@ ++ALTER SYSTEM SET autovacuum='on'; ++SELECT pg_reload_conf(); ++ pg_reload_conf ++---------------- ++ t ++(1 row) ++ +diff --git a/regress/expected/noautovacuum.out b/regress/expected/noautovacuum.out +new file mode 100644 +index 0000000..fc7978e +--- /dev/null ++++ b/regress/expected/noautovacuum.out +@@ -0,0 +1,7 @@ ++ALTER SYSTEM SET autovacuum='off'; ++SELECT pg_reload_conf(); ++ pg_reload_conf ++---------------- ++ t ++(1 row) ++ diff --git a/regress/expected/nosuper.out b/regress/expected/nosuper.out index 8d0a94e..63b68bf 100644 --- a/regress/expected/nosuper.out @@ -50,14 +74,22 @@ index 8d0a94e..63b68bf 100644 INFO: repacking table "public.tbl_cluster" ERROR: query failed: ERROR: current transaction is aborted, commands ignored until end of transaction block DETAIL: query was: RESET lock_timeout -diff --git a/regress/sql/init-extension.sql b/regress/sql/init-extension.sql -index 9f2e171..f6e4f8d 100644 ---- a/regress/sql/init-extension.sql -+++ b/regress/sql/init-extension.sql -@@ -1,3 +1,2 @@ - SET client_min_messages = warning; - CREATE EXTENSION pg_repack; --RESET client_min_messages; +diff --git a/regress/sql/autovacuum.sql b/regress/sql/autovacuum.sql +new file mode 100644 +index 0000000..a8eda63 +--- /dev/null ++++ b/regress/sql/autovacuum.sql +@@ -0,0 +1,2 @@ ++ALTER SYSTEM SET autovacuum='on'; ++SELECT pg_reload_conf(); +diff --git a/regress/sql/noautovacuum.sql b/regress/sql/noautovacuum.sql +new file mode 100644 +index 0000000..13d4836 +--- /dev/null ++++ b/regress/sql/noautovacuum.sql +@@ -0,0 +1,2 @@ ++ALTER SYSTEM SET autovacuum='off'; ++SELECT pg_reload_conf(); diff --git a/regress/sql/nosuper.sql 
b/regress/sql/nosuper.sql index 072f0fa..dbe60f8 100644 --- a/regress/sql/nosuper.sql From bb32f1b3d05fbac6ada499f1c5ec788e6adbb9c1 Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Tue, 29 Jul 2025 18:35:00 +0300 Subject: [PATCH 13/34] Move 'criterion' to a dev-dependency (#12762) It is only used in micro-benchmarks. --- libs/postgres_ffi/Cargo.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/libs/postgres_ffi/Cargo.toml b/libs/postgres_ffi/Cargo.toml index 5adf38f457..23fabeccd2 100644 --- a/libs/postgres_ffi/Cargo.toml +++ b/libs/postgres_ffi/Cargo.toml @@ -9,7 +9,6 @@ regex.workspace = true bytes.workspace = true anyhow.workspace = true crc32c.workspace = true -criterion.workspace = true once_cell.workspace = true pprof.workspace = true thiserror.workspace = true @@ -20,6 +19,7 @@ tracing.workspace = true postgres_versioninfo.workspace = true [dev-dependencies] +criterion.workspace = true env_logger.workspace = true postgres.workspace = true From 16eb8dda3de9e2b6bc35cc150046cce936aa1746 Mon Sep 17 00:00:00 2001 From: Suhas Thalanki <54014218+thesuhas@users.noreply.github.com> Date: Tue, 29 Jul 2025 12:01:56 -0400 Subject: [PATCH 14/34] some compute ctl changes from hadron (#12760) Some compute ctl changes from hadron --- compute_tools/src/bin/compute_ctl.rs | 42 ++++++++++++-- compute_tools/src/compute.rs | 84 +++++++++++++++++++++++----- 2 files changed, 109 insertions(+), 17 deletions(-) diff --git a/compute_tools/src/bin/compute_ctl.rs b/compute_tools/src/bin/compute_ctl.rs index 9c86aba531..2b4802f309 100644 --- a/compute_tools/src/bin/compute_ctl.rs +++ b/compute_tools/src/bin/compute_ctl.rs @@ -82,6 +82,15 @@ struct Cli { #[arg(long, default_value_t = 3081)] pub internal_http_port: u16, + /// Backwards-compatible --http-port for Hadron deployments. Functionally the + /// same as --external-http-port. + #[arg( + long, + conflicts_with = "external_http_port", + conflicts_with = "internal_http_port" + )] + pub http_port: Option, + #[arg(short = 'D', long, value_name = "DATADIR")] pub pgdata: String, @@ -181,6 +190,26 @@ impl Cli { } } +// Hadron helpers to get compatible compute_ctl http ports from Cli. The old `--http-port` +// arg is used and acts the same as `--external-http-port`. The internal http port is defined +// to be http_port + 1. Hadron runs in the dblet environment which uses the host network, so +// we need to be careful with the ports to choose. 
+fn get_external_http_port(cli: &Cli) -> u16 { + if cli.lakebase_mode { + return cli.http_port.unwrap_or(cli.external_http_port); + } + cli.external_http_port +} +fn get_internal_http_port(cli: &Cli) -> u16 { + if cli.lakebase_mode { + return cli + .http_port + .map(|p| p + 1) + .unwrap_or(cli.internal_http_port); + } + cli.internal_http_port +} + fn main() -> Result<()> { let cli = Cli::parse(); @@ -205,13 +234,18 @@ fn main() -> Result<()> { // enable core dumping for all child processes setrlimit(Resource::CORE, rlimit::INFINITY, rlimit::INFINITY)?; - installed_extensions::initialize_metrics(); - hadron_metrics::initialize_metrics(); + if cli.lakebase_mode { + installed_extensions::initialize_metrics(); + hadron_metrics::initialize_metrics(); + } let connstr = Url::parse(&cli.connstr).context("cannot parse connstr as a URL")?; let config = get_config(&cli)?; + let external_http_port = get_external_http_port(&cli); + let internal_http_port = get_internal_http_port(&cli); + let compute_node = ComputeNode::new( ComputeNodeParams { compute_id: cli.compute_id, @@ -220,8 +254,8 @@ fn main() -> Result<()> { pgdata: cli.pgdata.clone(), pgbin: cli.pgbin.clone(), pgversion: get_pg_version_string(&cli.pgbin), - external_http_port: cli.external_http_port, - internal_http_port: cli.internal_http_port, + external_http_port, + internal_http_port, remote_ext_base_url: cli.remote_ext_base_url.clone(), resize_swap_on_bind: cli.resize_swap_on_bind, set_disk_quota_for_fs: cli.set_disk_quota_for_fs, diff --git a/compute_tools/src/compute.rs b/compute_tools/src/compute.rs index 8350a4c059..27d33d8cd8 100644 --- a/compute_tools/src/compute.rs +++ b/compute_tools/src/compute.rs @@ -482,14 +482,27 @@ pub struct DatabricksEnvVars { /// Hostname of the Databricks workspace URL this compute instance belongs to. /// Used by postgres to verify Databricks PAT tokens. pub workspace_host: String, + + pub lakebase_mode: bool, } impl DatabricksEnvVars { - pub fn new(compute_spec: &ComputeSpec, compute_id: Option<&String>) -> Self { - // compute_id is a string format of "{endpoint_id}/{compute_idx}" - // endpoint_id is a uuid. We only need to pass down endpoint_id to postgres. - // Panics if compute_id is not set or not in the expected format. - let endpoint_id = compute_id.unwrap().split('/').next().unwrap().to_string(); + pub fn new( + compute_spec: &ComputeSpec, + compute_id: Option<&String>, + instance_id: Option, + lakebase_mode: bool, + ) -> Self { + let endpoint_id = if let Some(instance_id) = instance_id { + // Use instance_id as endpoint_id if it is set. This code path is for PuPr model. + instance_id + } else { + // Use compute_id as endpoint_id if instance_id is not set. The code path is for PrPr model. + // compute_id is a string format of "{endpoint_id}/{compute_idx}" + // endpoint_id is a uuid. We only need to pass down endpoint_id to postgres. + // Panics if compute_id is not set or not in the expected format. + compute_id.unwrap().split('/').next().unwrap().to_string() + }; let workspace_host = compute_spec .databricks_settings .as_ref() @@ -498,6 +511,7 @@ impl DatabricksEnvVars { Self { endpoint_id, workspace_host, + lakebase_mode, } } @@ -507,6 +521,10 @@ impl DatabricksEnvVars { /// Convert DatabricksEnvVars to a list of string pairs that can be passed as env vars. Consumes `self`. pub fn to_env_var_list(self) -> Vec<(String, String)> { + if !self.lakebase_mode { + // In neon env, we don't need to pass down the env vars to postgres. 
+ return vec![]; + } vec![ ( Self::DATABRICKS_ENDPOINT_ID_ENVVAR.to_string(), @@ -556,7 +574,11 @@ impl ComputeNode { let mut new_state = ComputeState::new(); if let Some(spec) = config.spec { let pspec = ParsedSpec::try_from(spec).map_err(|msg| anyhow::anyhow!(msg))?; - new_state.pspec = Some(pspec); + if params.lakebase_mode { + ComputeNode::set_spec(¶ms, &mut new_state, pspec); + } else { + new_state.pspec = Some(pspec); + } } Ok(ComputeNode { @@ -1154,7 +1176,14 @@ impl ComputeNode { // If it is something different then create_dir() will error out anyway. let pgdata = &self.params.pgdata; let _ok = fs::remove_dir_all(pgdata); - fs::create_dir(pgdata)?; + if self.params.lakebase_mode { + // Ignore creation errors if the directory already exists (e.g. mounting it ahead of time). + // If it is something different then PG startup will error out anyway. + let _ok = fs::create_dir(pgdata); + } else { + fs::create_dir(pgdata)?; + } + fs::set_permissions(pgdata, fs::Permissions::from_mode(0o700))?; Ok(()) @@ -1633,7 +1662,7 @@ impl ComputeNode { // symlink doesn't affect anything. // // See https://github.com/neondatabase/autoscaling/issues/800 - std::fs::remove_dir(pgdata_path.join("pg_dynshmem"))?; + std::fs::remove_dir_all(pgdata_path.join("pg_dynshmem"))?; symlink("/dev/shm/", pgdata_path.join("pg_dynshmem"))?; match spec.mode { @@ -1648,6 +1677,12 @@ impl ComputeNode { /// Start and stop a postgres process to warm up the VM for startup. pub fn prewarm_postgres_vm_memory(&self) -> Result<()> { + if self.params.lakebase_mode { + // We are running in Hadron mode. Disabling this prewarming step for now as it could run + // into dblet port conflicts and also doesn't add much value with our current infra. + info!("Skipping postgres prewarming in Hadron mode"); + return Ok(()); + } info!("prewarming VM memory"); // Create pgdata @@ -1709,7 +1744,12 @@ impl ComputeNode { let databricks_env_vars = { let state = self.state.lock().unwrap(); let spec = &state.pspec.as_ref().unwrap().spec; - DatabricksEnvVars::new(spec, Some(&self.params.compute_id)) + DatabricksEnvVars::new( + spec, + Some(&self.params.compute_id), + self.params.instance_id.clone(), + self.params.lakebase_mode, + ) }; info!( @@ -1881,7 +1921,15 @@ impl ComputeNode { /// Do initial configuration of the already started Postgres. #[instrument(skip_all)] pub fn apply_config(&self, compute_state: &ComputeState) -> Result<()> { - let conf = self.get_tokio_conn_conf(Some("compute_ctl:apply_config")); + let mut conf = self.get_tokio_conn_conf(Some("compute_ctl:apply_config")); + + if self.params.lakebase_mode { + // Set a 2-minute statement_timeout for the session applying config. The individual SQL statements + // used in apply_spec_sql() should not take long (they are just creating users and installing + // extensions). If any of them are stuck for an extended period of time it usually indicates a + // pageserver connectivity problem and we should bail out. + conf.options("-c statement_timeout=2min"); + } let conf = Arc::new(conf); let spec = Arc::new( @@ -2199,7 +2247,17 @@ impl ComputeNode { pub fn check_for_core_dumps(&self) -> Result<()> { let core_dump_dir = match std::env::consts::OS { "macos" => Path::new("/cores/"), - _ => Path::new(&self.params.pgdata), + // BEGIN HADRON + // NB: Read core dump files from a fixed location outside of + // the data directory since `compute_ctl` wipes the data directory + // across container restarts. 
+ _ => { + if self.params.lakebase_mode { + Path::new("/databricks/logs/brickstore") + } else { + Path::new(&self.params.pgdata) + } + } // END HADRON }; // Collect core dump paths if any @@ -2512,7 +2570,7 @@ LIMIT 100", if let Some(libs) = spec.cluster.settings.find("shared_preload_libraries") { libs_vec = libs .split(&[',', '\'', ' ']) - .filter(|s| *s != "neon" && !s.is_empty()) + .filter(|s| *s != "neon" && *s != "databricks_auth" && !s.is_empty()) .map(str::to_string) .collect(); } @@ -2531,7 +2589,7 @@ LIMIT 100", if let Some(libs) = shared_preload_libraries_line.split("='").nth(1) { preload_libs_vec = libs .split(&[',', '\'', ' ']) - .filter(|s| *s != "neon" && !s.is_empty()) + .filter(|s| *s != "neon" && *s != "databricks_auth" && !s.is_empty()) .map(str::to_string) .collect(); } From 65d1be6e901a1a33ce31a06c5a5d63b481f24d14 Mon Sep 17 00:00:00 2001 From: Erik Grinaker Date: Tue, 29 Jul 2025 18:28:57 +0200 Subject: [PATCH 15/34] pageserver: route gRPC requests to child shards (#12702) ## Problem During shard splits, each parent shard is split and removed incrementally. Only when all parent shards have split is the split committed and the compute notified. This can take several minutes for large tenants. In the meanwhile, the compute will be sending requests to the (now-removed) parent shards. This was (mostly) not a problem for the libpq protocol, because it does shard routing on the server-side. The compute just sends requests to some Pageserver, and the server will figure out which local shard should serve it. It is a problem for the gRPC protocol, where the client explicitly says which shard it's talking to. Touches [LKB-191](https://databricks.atlassian.net/browse/LKB-191). Requires #12772. ## Summary of changes * Add server-side routing of gRPC requests to any local child shards if the parent does not exist. * Add server-side splitting of GetPage batch requests straddling multiple child shards. * Move the `GetPageSplitter` into `pageserver_page_api`. I really don't like this approach, but it avoids making changes to the split protocol. I could be convinced we should change the split protocol instead, e.g. to keep the parent shard alive until the split commits and the compute has been notified, but we can also do that as a later change without blocking the communicator on it. 
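For illustration, a minimal, self-contained sketch of the batch-splitting idea follows. The types and the block-to-shard mapping here (`Shard`, `STRIPE_SIZE`, `owning_shard`, `split_batch`) are simplified stand-ins invented for this sketch, not the pageserver's real `ShardIdentity` or key-hashing logic; the actual implementation is `GetPageSplitter` and `maybe_split_get_page` in the diff below.

```rust
// Sketch of splitting a batched GetPage request across child shards after a
// shard split. All types are simplified stand-ins; the real logic lives in
// GetPageSplitter and maybe_split_get_page.
use std::collections::HashMap;

/// Simplified shard identity: shard `number` out of `count` shards.
#[derive(Clone, Copy, PartialEq, Eq, Hash, Debug)]
struct Shard {
    number: u32,
    count: u32,
}

/// Blocks per stripe (stand-in value, not the pageserver's default).
const STRIPE_SIZE: u32 = 8;

/// Maps a block number to the owning shard under `count` shards.
fn owning_shard(block: u32, count: u32) -> Shard {
    Shard {
        number: (block / STRIPE_SIZE) % count,
        count,
    }
}

/// Splits one batched request (a list of block numbers) into per-shard batches,
/// remembering each block's original position so the per-shard responses can be
/// reassembled into a single response in request order.
fn split_batch(blocks: &[u32], count: u32) -> HashMap<Shard, Vec<(usize, u32)>> {
    let mut per_shard: HashMap<Shard, Vec<(usize, u32)>> = HashMap::new();
    for (pos, &block) in blocks.iter().enumerate() {
        per_shard
            .entry(owning_shard(block, count))
            .or_default()
            .push((pos, block));
    }
    per_shard
}

fn main() {
    // The compute still addresses the (removed) parent shard; the server
    // re-splits the batch against the child shard count and serves each part
    // from the local child shard that owns it.
    let batch = [0, 9, 17, 25];
    for (shard, blocks) in split_batch(&batch, 4) {
        println!("shard {}/{} serves {:?}", shard.number, shard.count, blocks);
    }
}
```

The point of doing this on the server is that the split protocol and the compute's shard map stay unchanged while the split is in flight; only once the split commits does the compute learn about the child shards.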
--- pageserver/client_grpc/src/client.rs | 2 +- pageserver/client_grpc/src/lib.rs | 1 - pageserver/page_api/src/lib.rs | 4 +- .../{client_grpc => page_api}/src/split.rs | 28 +- pageserver/src/page_service.rs | 292 +++++++++++++----- pageserver/src/tenant/mgr.rs | 19 +- 6 files changed, 256 insertions(+), 90 deletions(-) rename pageserver/{client_grpc => page_api}/src/split.rs (91%) diff --git a/pageserver/client_grpc/src/client.rs b/pageserver/client_grpc/src/client.rs index e6a90fb582..b8ee57bf9f 100644 --- a/pageserver/client_grpc/src/client.rs +++ b/pageserver/client_grpc/src/client.rs @@ -14,9 +14,9 @@ use utils::logging::warn_slow; use crate::pool::{ChannelPool, ClientGuard, ClientPool, StreamGuard, StreamPool}; use crate::retry::Retry; -use crate::split::GetPageSplitter; use compute_api::spec::PageserverProtocol; use pageserver_page_api as page_api; +use pageserver_page_api::GetPageSplitter; use utils::id::{TenantId, TimelineId}; use utils::shard::{ShardCount, ShardIndex, ShardNumber, ShardStripeSize}; diff --git a/pageserver/client_grpc/src/lib.rs b/pageserver/client_grpc/src/lib.rs index 14fb3fbd5a..4999fd3d0a 100644 --- a/pageserver/client_grpc/src/lib.rs +++ b/pageserver/client_grpc/src/lib.rs @@ -1,6 +1,5 @@ mod client; mod pool; mod retry; -mod split; pub use client::{PageserverClient, ShardSpec}; diff --git a/pageserver/page_api/src/lib.rs b/pageserver/page_api/src/lib.rs index e78f6ce206..b44df6337f 100644 --- a/pageserver/page_api/src/lib.rs +++ b/pageserver/page_api/src/lib.rs @@ -19,7 +19,9 @@ pub mod proto { } mod client; -pub use client::Client; mod model; +mod split; +pub use client::Client; pub use model::*; +pub use split::GetPageSplitter; diff --git a/pageserver/client_grpc/src/split.rs b/pageserver/page_api/src/split.rs similarity index 91% rename from pageserver/client_grpc/src/split.rs rename to pageserver/page_api/src/split.rs index 8631638686..5ecc90a166 100644 --- a/pageserver/client_grpc/src/split.rs +++ b/pageserver/page_api/src/split.rs @@ -3,18 +3,18 @@ use std::collections::HashMap; use anyhow::anyhow; use bytes::Bytes; +use crate::model::*; use pageserver_api::key::rel_block_to_key; use pageserver_api::shard::key_to_shard_number; -use pageserver_page_api as page_api; use utils::shard::{ShardCount, ShardIndex, ShardStripeSize}; /// Splits GetPageRequests that straddle shard boundaries and assembles the responses. /// TODO: add tests for this. pub struct GetPageSplitter { /// Split requests by shard index. - requests: HashMap, + requests: HashMap, /// The response being assembled. Preallocated with empty pages, to be filled in. - response: page_api::GetPageResponse, + response: GetPageResponse, /// Maps the offset in `request.block_numbers` and `response.pages` to the owning shard. Used /// to assemble the response pages in the same order as the original request. block_shards: Vec, @@ -24,7 +24,7 @@ impl GetPageSplitter { /// Checks if the given request only touches a single shard, and returns the shard ID. This is /// the common case, so we check first in order to avoid unnecessary allocations and overhead. pub fn for_single_shard( - req: &page_api::GetPageRequest, + req: &GetPageRequest, count: ShardCount, stripe_size: Option, ) -> anyhow::Result> { @@ -57,7 +57,7 @@ impl GetPageSplitter { /// Splits the given request. 
pub fn split( - req: page_api::GetPageRequest, + req: GetPageRequest, count: ShardCount, stripe_size: Option, ) -> anyhow::Result { @@ -84,7 +84,7 @@ impl GetPageSplitter { requests .entry(shard_id) - .or_insert_with(|| page_api::GetPageRequest { + .or_insert_with(|| GetPageRequest { request_id: req.request_id, request_class: req.request_class, rel: req.rel, @@ -98,16 +98,16 @@ impl GetPageSplitter { // Construct a response to be populated by shard responses. Preallocate empty page slots // with the expected block numbers. - let response = page_api::GetPageResponse { + let response = GetPageResponse { request_id: req.request_id, - status_code: page_api::GetPageStatusCode::Ok, + status_code: GetPageStatusCode::Ok, reason: None, rel: req.rel, pages: req .block_numbers .into_iter() .map(|block_number| { - page_api::Page { + Page { block_number, image: Bytes::new(), // empty page slot to be filled in } @@ -123,9 +123,7 @@ impl GetPageSplitter { } /// Drains the per-shard requests, moving them out of the splitter to avoid extra allocations. - pub fn drain_requests( - &mut self, - ) -> impl Iterator { + pub fn drain_requests(&mut self) -> impl Iterator { self.requests.drain() } @@ -135,10 +133,10 @@ impl GetPageSplitter { pub fn add_response( &mut self, shard_id: ShardIndex, - response: page_api::GetPageResponse, + response: GetPageResponse, ) -> anyhow::Result<()> { // The caller should already have converted status codes into tonic::Status. - if response.status_code != page_api::GetPageStatusCode::Ok { + if response.status_code != GetPageStatusCode::Ok { return Err(anyhow!( "unexpected non-OK response for shard {shard_id}: {} {}", response.status_code, @@ -209,7 +207,7 @@ impl GetPageSplitter { /// Fetches the final, assembled response. #[allow(clippy::result_large_err)] - pub fn get_response(self) -> anyhow::Result { + pub fn get_response(self) -> anyhow::Result { // Check that the response is complete. for (i, page) in self.response.pages.iter().enumerate() { if page.image.is_empty() { diff --git a/pageserver/src/page_service.rs b/pageserver/src/page_service.rs index b3bc42a55e..f16046657a 100644 --- a/pageserver/src/page_service.rs +++ b/pageserver/src/page_service.rs @@ -16,7 +16,8 @@ use anyhow::{Context as _, bail}; use bytes::{Buf as _, BufMut as _, BytesMut}; use chrono::Utc; use futures::future::BoxFuture; -use futures::{FutureExt, Stream}; +use futures::stream::FuturesUnordered; +use futures::{FutureExt, Stream, StreamExt as _}; use itertools::Itertools; use jsonwebtoken::TokenData; use once_cell::sync::OnceCell; @@ -35,8 +36,8 @@ use pageserver_api::pagestream_api::{ }; use pageserver_api::reltag::SlruKind; use pageserver_api::shard::TenantShardId; -use pageserver_page_api as page_api; use pageserver_page_api::proto; +use pageserver_page_api::{self as page_api, GetPageSplitter}; use postgres_backend::{ AuthType, PostgresBackend, PostgresBackendReader, QueryError, is_expected_io_error, }; @@ -3423,18 +3424,6 @@ impl GrpcPageServiceHandler { Ok(CancellableTask { task, cancel }) } - /// Errors if the request is executed on a non-zero shard. Only shard 0 has a complete view of - /// relations and their sizes, as well as SLRU segments and similar data. 
- #[allow(clippy::result_large_err)] - fn ensure_shard_zero(timeline: &Handle) -> Result<(), tonic::Status> { - match timeline.get_shard_index().shard_number.0 { - 0 => Ok(()), - shard => Err(tonic::Status::invalid_argument(format!( - "request must execute on shard zero (is shard {shard})", - ))), - } - } - /// Generates a PagestreamRequest header from a ReadLsn and request ID. fn make_hdr( read_lsn: page_api::ReadLsn, @@ -3449,30 +3438,72 @@ impl GrpcPageServiceHandler { } } - /// Acquires a timeline handle for the given request. + /// Acquires a timeline handle for the given request. The shard index must match a local shard. /// - /// TODO: during shard splits, the compute may still be sending requests to the parent shard - /// until the entire split is committed and the compute is notified. Consider installing a - /// temporary shard router from the parent to the children while the split is in progress. - /// - /// TODO: consider moving this to a middleware layer; all requests need it. Needs to manage - /// the TimelineHandles lifecycle. - /// - /// TODO: untangle acquisition from TenantManagerWrapper::resolve() and Cache::get(), to avoid - /// the unnecessary overhead. + /// NB: this will fail during shard splits, see comment on [`Self::maybe_split_get_page`]. async fn get_request_timeline( &self, req: &tonic::Request, ) -> Result, GetActiveTimelineError> { - let ttid = *extract::(req); + let TenantTimelineId { + tenant_id, + timeline_id, + } = *extract::(req); let shard_index = *extract::(req); - let shard_selector = ShardSelector::Known(shard_index); + // TODO: untangle acquisition from TenantManagerWrapper::resolve() and Cache::get(), to + // avoid the unnecessary overhead. TimelineHandles::new(self.tenant_manager.clone()) - .get(ttid.tenant_id, ttid.timeline_id, shard_selector) + .get(tenant_id, timeline_id, ShardSelector::Known(shard_index)) .await } + /// Acquires a timeline handle for the given request, which must be for shard zero. Most + /// metadata requests are only valid on shard zero. + /// + /// NB: during an ongoing shard split, the compute will keep talking to the parent shard until + /// the split is committed, but the parent shard may have been removed in the meanwhile. In that + /// case, we reroute the request to the new child shard. See [`Self::maybe_split_get_page`]. + /// + /// TODO: revamp the split protocol to avoid this child routing. + async fn get_request_timeline_shard_zero( + &self, + req: &tonic::Request, + ) -> Result, tonic::Status> { + let TenantTimelineId { + tenant_id, + timeline_id, + } = *extract::(req); + let shard_index = *extract::(req); + + if shard_index.shard_number.0 != 0 { + return Err(tonic::Status::invalid_argument(format!( + "request only valid on shard zero (requested shard {shard_index})", + ))); + } + + // TODO: untangle acquisition from TenantManagerWrapper::resolve() and Cache::get(), to + // avoid the unnecessary overhead. + let mut handles = TimelineHandles::new(self.tenant_manager.clone()); + match handles + .get(tenant_id, timeline_id, ShardSelector::Known(shard_index)) + .await + { + Ok(timeline) => Ok(timeline), + Err(err) => { + // We may be in the middle of a shard split. Try to find a child shard 0. + if let Ok(timeline) = handles + .get(tenant_id, timeline_id, ShardSelector::Zero) + .await + && timeline.get_shard_index().shard_count > shard_index.shard_count + { + return Ok(timeline); + } + Err(err.into()) + } + } + } + /// Starts a SmgrOpTimer at received_at, throttles the request, and records execution start. 
/// Only errors if the timeline is shutting down. /// @@ -3502,28 +3533,22 @@ impl GrpcPageServiceHandler { /// TODO: get_vectored() currently enforces a batch limit of 32. Postgres will typically send /// batches up to effective_io_concurrency = 100. Either we have to accept large batches, or /// split them up in the client or server. - #[instrument(skip_all, fields(req_id, rel, blkno, blks, req_lsn, mod_lsn))] + #[instrument(skip_all, fields( + req_id = %req.request_id, + rel = %req.rel, + blkno = %req.block_numbers[0], + blks = %req.block_numbers.len(), + lsn = %req.read_lsn, + ))] async fn get_page( ctx: &RequestContext, - timeline: &WeakHandle, - req: proto::GetPageRequest, + timeline: Handle, + req: page_api::GetPageRequest, io_concurrency: IoConcurrency, - ) -> Result { - let received_at = Instant::now(); - let timeline = timeline.upgrade()?; + received_at: Instant, + ) -> Result { let ctx = ctx.with_scope_page_service_pagestream(&timeline); - // Validate the request, decorate the span, and convert it to a Pagestream request. - let req = page_api::GetPageRequest::try_from(req)?; - - span_record!( - req_id = %req.request_id, - rel = %req.rel, - blkno = %req.block_numbers[0], - blks = %req.block_numbers.len(), - lsn = %req.read_lsn, - ); - for &blkno in &req.block_numbers { let shard = timeline.get_shard_identity(); let key = rel_block_to_key(req.rel, blkno); @@ -3611,7 +3636,95 @@ impl GrpcPageServiceHandler { }; } - Ok(resp.into()) + Ok(resp) + } + + /// Processes a GetPage request when there is a potential shard split in progress. We have to + /// reroute the request to any local child shards, and split batch requests that straddle + /// multiple child shards. + /// + /// Parent shards are split and removed incrementally (there may be many parent shards when + /// splitting an already-sharded tenant), but the compute is only notified once the overall + /// split commits, which can take several minutes. In the meanwhile, the compute will be sending + /// requests to the parent shards. + /// + /// TODO: add test infrastructure to provoke this situation frequently and for long periods of + /// time, to properly exercise it. + /// + /// TODO: revamp the split protocol to avoid this, e.g.: + /// * Keep the parent shard until the split commits and the compute is notified. + /// * Notify the compute about each subsplit. + /// * Return an error that updates the compute's shard map. + #[instrument(skip_all)] + #[allow(clippy::too_many_arguments)] + async fn maybe_split_get_page( + ctx: &RequestContext, + handles: &mut TimelineHandles, + tenant_id: TenantId, + timeline_id: TimelineId, + parent: ShardIndex, + req: page_api::GetPageRequest, + io_concurrency: IoConcurrency, + received_at: Instant, + ) -> Result { + // Check the first page to see if we have any child shards at all. Otherwise, the compute is + // just talking to the wrong Pageserver. If the parent has been split, the shard now owning + // the page must have a higher shard count. + let timeline = handles + .get( + tenant_id, + timeline_id, + ShardSelector::Page(rel_block_to_key(req.rel, req.block_numbers[0])), + ) + .await?; + + let shard_id = timeline.get_shard_identity(); + if shard_id.count <= parent.shard_count { + return Err(HandleUpgradeError::ShutDown.into()); // emulate original error + } + + // Fast path: the request fits in a single shard. + if let Some(shard_index) = + GetPageSplitter::for_single_shard(&req, shard_id.count, Some(shard_id.stripe_size)) + .map_err(|err| tonic::Status::internal(err.to_string()))? 
+ { + // We got the shard ID from the first page, so these must be equal. + assert_eq!(shard_index.shard_number, shard_id.number); + assert_eq!(shard_index.shard_count, shard_id.count); + return Self::get_page(ctx, timeline, req, io_concurrency, received_at).await; + } + + // The request spans multiple shards; split it and dispatch parallel requests. All pages + // were originally in the parent shard, and during a split all children are local, so we + // expect to find local shards for all pages. + let mut splitter = GetPageSplitter::split(req, shard_id.count, Some(shard_id.stripe_size)) + .map_err(|err| tonic::Status::internal(err.to_string()))?; + + let mut shard_requests = FuturesUnordered::new(); + for (shard_index, shard_req) in splitter.drain_requests() { + let timeline = handles + .get(tenant_id, timeline_id, ShardSelector::Known(shard_index)) + .await?; + let future = Self::get_page( + ctx, + timeline, + shard_req, + io_concurrency.clone(), + received_at, + ) + .map(move |result| result.map(|resp| (shard_index, resp))); + shard_requests.push(future); + } + + while let Some((shard_index, shard_response)) = shard_requests.next().await.transpose()? { + splitter + .add_response(shard_index, shard_response) + .map_err(|err| tonic::Status::internal(err.to_string()))?; + } + + splitter + .get_response() + .map_err(|err| tonic::Status::internal(err.to_string())) } } @@ -3640,11 +3753,10 @@ impl proto::PageService for GrpcPageServiceHandler { // to be the sweet spot where throughput is saturated. const CHUNK_SIZE: usize = 256 * 1024; - let timeline = self.get_request_timeline(&req).await?; + let timeline = self.get_request_timeline_shard_zero(&req).await?; let ctx = self.ctx.with_scope_timeline(&timeline); // Validate the request and decorate the span. - Self::ensure_shard_zero(&timeline)?; if timeline.is_archived() == Some(true) { return Err(tonic::Status::failed_precondition("timeline is archived")); } @@ -3760,11 +3872,10 @@ impl proto::PageService for GrpcPageServiceHandler { req: tonic::Request, ) -> Result, tonic::Status> { let received_at = extract::(&req).0; - let timeline = self.get_request_timeline(&req).await?; + let timeline = self.get_request_timeline_shard_zero(&req).await?; let ctx = self.ctx.with_scope_page_service_pagestream(&timeline); // Validate the request, decorate the span, and convert it to a Pagestream request. - Self::ensure_shard_zero(&timeline)?; let req: page_api::GetDbSizeRequest = req.into_inner().try_into()?; span_record!(db_oid=%req.db_oid, lsn=%req.read_lsn); @@ -3793,14 +3904,29 @@ impl proto::PageService for GrpcPageServiceHandler { req: tonic::Request>, ) -> Result, tonic::Status> { // Extract the timeline from the request and check that it exists. - let ttid = *extract::(&req); + // + // NB: during shard splits, the compute may still send requests to the parent shard. We'll + // reroute requests to the child shards below, but we also detect the common cases here + // where either the shard exists or no shards exist at all. If we have a child shard, we + // can't acquire a weak handle because we don't know which child shard to use yet. 
+ let TenantTimelineId { + tenant_id, + timeline_id, + } = *extract::(&req); let shard_index = *extract::(&req); - let shard_selector = ShardSelector::Known(shard_index); let mut handles = TimelineHandles::new(self.tenant_manager.clone()); - handles - .get(ttid.tenant_id, ttid.timeline_id, shard_selector) - .await?; + let timeline = match handles + .get(tenant_id, timeline_id, ShardSelector::Known(shard_index)) + .await + { + // The timeline shard exists. Keep a weak handle to reuse for each request. + Ok(timeline) => Some(timeline.downgrade()), + // The shard doesn't exist, but a child shard does. We'll reroute requests later. + Err(_) if self.tenant_manager.has_child_shard(tenant_id, shard_index) => None, + // Failed to fetch the timeline, and no child shard exists. Error out. + Err(err) => return Err(err.into()), + }; // Spawn an IoConcurrency sidecar, if enabled. let gate_guard = self @@ -3817,11 +3943,9 @@ impl proto::PageService for GrpcPageServiceHandler { let mut reqs = req.into_inner(); let resps = async_stream::try_stream! { - let timeline = handles - .get(ttid.tenant_id, ttid.timeline_id, shard_selector) - .await? - .downgrade(); loop { + // Wait for the next client request. + // // NB: Tonic considers the entire stream to be an in-flight request and will wait // for it to complete before shutting down. React to cancellation between requests. let req = tokio::select! { @@ -3834,16 +3958,44 @@ impl proto::PageService for GrpcPageServiceHandler { Err(err) => Err(err), }, }?; + + let received_at = Instant::now(); let req_id = req.request_id.map(page_api::RequestID::from).unwrap_or_default(); - let result = Self::get_page(&ctx, &timeline, req, io_concurrency.clone()) + + // Process the request, using a closure to capture errors. + let process_request = async || { + let req = page_api::GetPageRequest::try_from(req)?; + + // Fast path: use the pre-acquired timeline handle. + if let Some(Ok(timeline)) = timeline.as_ref().map(|t| t.upgrade()) { + return Self::get_page(&ctx, timeline, req, io_concurrency.clone(), received_at) + .instrument(span.clone()) // propagate request span + .await + } + + // The timeline handle is stale. During shard splits, the compute may still be + // sending requests to the parent shard. Try to re-route requests to the child + // shards, and split any batch requests that straddle multiple child shards. + Self::maybe_split_get_page( + &ctx, + &mut handles, + tenant_id, + timeline_id, + shard_index, + req, + io_concurrency.clone(), + received_at, + ) .instrument(span.clone()) // propagate request span - .await; - yield match result { - Ok(resp) => resp, - // Convert per-request errors to GetPageResponses as appropriate, or terminate - // the stream with a tonic::Status. Log the error regardless, since - // ObservabilityLayer can't automatically log stream errors. + .await + }; + + // Return the response. Convert per-request errors to GetPageResponses if + // appropriate, or terminate the stream with a tonic::Status. + yield match process_request().await { + Ok(resp) => resp.into(), Err(status) => { + // Log the error, since ObservabilityLayer won't see stream errors. // TODO: it would be nice if we could propagate the get_page() fields here. 
span.in_scope(|| { warn!("request failed with {:?}: {}", status.code(), status.message()); @@ -3863,11 +4015,10 @@ impl proto::PageService for GrpcPageServiceHandler { req: tonic::Request, ) -> Result, tonic::Status> { let received_at = extract::(&req).0; - let timeline = self.get_request_timeline(&req).await?; + let timeline = self.get_request_timeline_shard_zero(&req).await?; let ctx = self.ctx.with_scope_page_service_pagestream(&timeline); // Validate the request, decorate the span, and convert it to a Pagestream request. - Self::ensure_shard_zero(&timeline)?; let req: page_api::GetRelSizeRequest = req.into_inner().try_into()?; let allow_missing = req.allow_missing; @@ -3900,11 +4051,10 @@ impl proto::PageService for GrpcPageServiceHandler { req: tonic::Request, ) -> Result, tonic::Status> { let received_at = extract::(&req).0; - let timeline = self.get_request_timeline(&req).await?; + let timeline = self.get_request_timeline_shard_zero(&req).await?; let ctx = self.ctx.with_scope_page_service_pagestream(&timeline); // Validate the request, decorate the span, and convert it to a Pagestream request. - Self::ensure_shard_zero(&timeline)?; let req: page_api::GetSlruSegmentRequest = req.into_inner().try_into()?; span_record!(kind=%req.kind, segno=%req.segno, lsn=%req.read_lsn); @@ -3934,6 +4084,10 @@ impl proto::PageService for GrpcPageServiceHandler { &self, req: tonic::Request, ) -> Result, tonic::Status> { + // TODO: this won't work during shard splits, as the request is directed at a specific shard + // but the parent shard is removed before the split commits and the compute is notified + // (which can take several minutes for large tenants). That's also the case for the libpq + // implementation, so we keep the behavior for now. let timeline = self.get_request_timeline(&req).await?; let ctx = self.ctx.with_scope_timeline(&timeline); diff --git a/pageserver/src/tenant/mgr.rs b/pageserver/src/tenant/mgr.rs index 4432b4bba8..0feba5e9c8 100644 --- a/pageserver/src/tenant/mgr.rs +++ b/pageserver/src/tenant/mgr.rs @@ -826,6 +826,18 @@ impl TenantManager { peek_slot.is_some() } + /// Returns whether a local shard exists that's a child of the given tenant shard. Note that + /// this just checks for any shard with a larger shard count, and it may not be a direct child + /// of the given shard (their keyspace may not overlap). + pub(crate) fn has_child_shard(&self, tenant_id: TenantId, shard_index: ShardIndex) -> bool { + match &*self.tenants.read().unwrap() { + TenantsMap::Initializing => false, + TenantsMap::Open(slots) | TenantsMap::ShuttingDown(slots) => slots + .range(TenantShardId::tenant_range(tenant_id)) + .any(|(tsid, _)| tsid.shard_count > shard_index.shard_count), + } + } + #[instrument(skip_all, fields(tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug()))] pub(crate) async fn upsert_location( &self, @@ -1524,9 +1536,10 @@ impl TenantManager { // Phase 2: Put the parent shard to InProgress and grab a reference to the parent Tenant // // TODO: keeping the parent as InProgress while spawning the children causes read - // unavailability, as we can't acquire a timeline handle for it. The parent should be - // available for reads until the children are ready -- potentially until *all* subsplits - // across all parent shards are complete and the compute has been notified. See: + // unavailability, as we can't acquire a new timeline handle for it (existing handles appear + // to still work though, even downgraded ones). 
The parent should be available for reads + // until the children are ready -- potentially until *all* subsplits across all parent + // shards are complete and the compute has been notified. See: // . drop(tenant); let mut parent_slot_guard = From bf3a1529bf9555ece4447452978124218915105e Mon Sep 17 00:00:00 2001 From: Suhas Thalanki <54014218+thesuhas@users.noreply.github.com> Date: Tue, 29 Jul 2025 14:08:24 -0400 Subject: [PATCH 16/34] Report metrics on data/index corruption (#12729) ## Problem We don't have visibility into data/index corruption. ## Summary of changes Add data/index corruptions metrics. PG calls elog ERROR errcode to emit these corruption errors. PG Changes: https://github.com/neondatabase/postgres/pull/698 --- pgxn/neon/libpagestore.c | 1 - pgxn/neon/neon.c | 38 +++++++++++++++++++++ pgxn/neon/neon_perf_counters.c | 62 ++++++++++++++++++++++++++++++++-- pgxn/neon/neon_perf_counters.h | 18 ++++++++++ 4 files changed, 115 insertions(+), 4 deletions(-) diff --git a/pgxn/neon/libpagestore.c b/pgxn/neon/libpagestore.c index 1031f185a6..158118f860 100644 --- a/pgxn/neon/libpagestore.c +++ b/pgxn/neon/libpagestore.c @@ -1440,7 +1440,6 @@ check_neon_id(char **newval, void **extra, GucSource source) return **newval == '\0' || HexDecodeString(id, *newval, 16); } - void PagestoreShmemInit(void) { diff --git a/pgxn/neon/neon.c b/pgxn/neon/neon.c index 07de696d7b..e831fca7f8 100644 --- a/pgxn/neon/neon.c +++ b/pgxn/neon/neon.c @@ -51,6 +51,7 @@ void _PG_init(void); bool lakebase_mode = false; static int running_xacts_overflow_policy; +static emit_log_hook_type prev_emit_log_hook; static bool monitor_query_exec_time = false; static ExecutorStart_hook_type prev_ExecutorStart = NULL; @@ -81,6 +82,8 @@ uint32 WAIT_EVENT_NEON_PS_READ; uint32 WAIT_EVENT_NEON_WAL_DL; #endif +int databricks_test_hook = 0; + enum RunningXactsOverflowPolicies { OP_IGNORE, OP_SKIP, @@ -445,6 +448,20 @@ ReportSearchPath(void) static int neon_pgstat_file_size_limit; #endif +static void DatabricksSqlErrorHookImpl(ErrorData *edata) { + if (prev_emit_log_hook != NULL) { + prev_emit_log_hook(edata); + } + + if (edata->sqlerrcode == ERRCODE_DATA_CORRUPTED) { + pg_atomic_fetch_add_u32(&databricks_metrics_shared->data_corruption_count, 1); + } else if (edata->sqlerrcode == ERRCODE_INDEX_CORRUPTED) { + pg_atomic_fetch_add_u32(&databricks_metrics_shared->index_corruption_count, 1); + } else if (edata->sqlerrcode == ERRCODE_INTERNAL_ERROR) { + pg_atomic_fetch_add_u32(&databricks_metrics_shared->internal_error_count, 1); + } +} + void _PG_init(void) { @@ -456,6 +473,11 @@ _PG_init(void) load_file("$libdir/neon_rmgr", false); #endif + if (lakebase_mode) { + prev_emit_log_hook = emit_log_hook; + emit_log_hook = DatabricksSqlErrorHookImpl; + } + /* * Initializing a pre-loaded Postgres extension happens in three stages: * @@ -594,6 +616,19 @@ _PG_init(void) 0, NULL, NULL, NULL); + // A test hook used in sql regress to trigger specific behaviors + // to test features easily. 
+ DefineCustomIntVariable( + "databricks.test_hook", + "The test hook used in sql regress tests only", + NULL, + &databricks_test_hook, + 0, + 0, INT32_MAX, + PGC_SUSET, + 0, + NULL, NULL, NULL); + /* * Important: This must happen after other parts of the extension are * loaded, otherwise any settings to GUCs that were set before the @@ -816,6 +851,9 @@ neon_shmem_startup_hook(void) LfcShmemInit(); NeonPerfCountersShmemInit(); + if (lakebase_mode) { + DatabricksMetricsShmemInit(); + } PagestoreShmemInit(); RelsizeCacheShmemInit(); WalproposerShmemInit(); diff --git a/pgxn/neon/neon_perf_counters.c b/pgxn/neon/neon_perf_counters.c index dd576e4e73..a38f876a0c 100644 --- a/pgxn/neon/neon_perf_counters.c +++ b/pgxn/neon/neon_perf_counters.c @@ -19,7 +19,35 @@ #include "neon.h" #include "neon_perf_counters.h" -#include "neon_pgversioncompat.h" +#include "walproposer.h" + +/* BEGIN_HADRON */ +databricks_metrics *databricks_metrics_shared; + +Size +DatabricksMetricsShmemSize(void) +{ + return sizeof(databricks_metrics); +} + +void +DatabricksMetricsShmemInit(void) +{ + bool found; + + databricks_metrics_shared = + ShmemInitStruct("Databricks counters", + DatabricksMetricsShmemSize(), + &found); + Assert(found == IsUnderPostmaster); + if (!found) + { + pg_atomic_init_u32(&databricks_metrics_shared->index_corruption_count, 0); + pg_atomic_init_u32(&databricks_metrics_shared->data_corruption_count, 0); + pg_atomic_init_u32(&databricks_metrics_shared->internal_error_count, 0); + } +} +/* END_HADRON */ neon_per_backend_counters *neon_per_backend_counters_shared; @@ -38,11 +66,12 @@ NeonPerfCountersShmemRequest(void) #else size = mul_size(NUM_NEON_PERF_COUNTER_SLOTS, sizeof(neon_per_backend_counters)); #endif + if (lakebase_mode) { + size = add_size(size, DatabricksMetricsShmemSize()); + } RequestAddinShmemSpace(size); } - - void NeonPerfCountersShmemInit(void) { @@ -395,6 +424,33 @@ neon_get_perf_counters(PG_FUNCTION_ARGS) metric_to_datums(&metrics[i], &values[0], &nulls[0]); tuplestore_putvalues(rsinfo->setResult, rsinfo->setDesc, values, nulls); } + + if (lakebase_mode) { + + if (databricks_test_hook == TestHookCorruption) { + ereport(ERROR, + (errcode(ERRCODE_DATA_CORRUPTED), + errmsg("test corruption"))); + } + + // Not ideal but piggyback our databricks counters into the neon perf counters view + // so that we don't need to introduce neon--1.x+1.sql to add a new view. 
+ { + metric_t databricks_metrics[] = { + {"sql_index_corruption_count", false, 0, (double) pg_atomic_read_u32(&databricks_metrics_shared->index_corruption_count)}, + {"sql_data_corruption_count", false, 0, (double) pg_atomic_read_u32(&databricks_metrics_shared->data_corruption_count)}, + {"sql_internal_error_count", false, 0, (double) pg_atomic_read_u32(&databricks_metrics_shared->internal_error_count)}, + {NULL, false, 0, 0}, + }; + for (int i = 0; databricks_metrics[i].name != NULL; i++) + { + metric_to_datums(&databricks_metrics[i], &values[0], &nulls[0]); + tuplestore_putvalues(rsinfo->setResult, rsinfo->setDesc, values, nulls); + } + } + /* END_HADRON */ + } + pfree(metrics); return (Datum) 0; diff --git a/pgxn/neon/neon_perf_counters.h b/pgxn/neon/neon_perf_counters.h index bc4efddee5..0196559806 100644 --- a/pgxn/neon/neon_perf_counters.h +++ b/pgxn/neon/neon_perf_counters.h @@ -177,5 +177,23 @@ extern void inc_query_time(uint64 elapsed); extern Size NeonPerfCountersShmemSize(void); extern void NeonPerfCountersShmemInit(void); +/* BEGIN_HADRON */ +typedef struct +{ + pg_atomic_uint32 index_corruption_count; + pg_atomic_uint32 data_corruption_count; + pg_atomic_uint32 internal_error_count; +} databricks_metrics; + +extern databricks_metrics *databricks_metrics_shared; + +extern Size DatabricksMetricsShmemSize(void); +extern void DatabricksMetricsShmemInit(void); + +extern int databricks_test_hook; + +static const int TestHookCorruption = 1; +/* END_HADRON */ + #endif /* NEON_PERF_COUNTERS_H */ From 7cd0066212706d585b1f966f7bd68032458f7c89 Mon Sep 17 00:00:00 2001 From: Erik Grinaker Date: Tue, 29 Jul 2025 20:26:20 +0200 Subject: [PATCH 17/34] page_api: add `SplitError` for `GetPageSplitter` (#12709) Add a `SplitError` for `GetPageSplitter`, with an `Into` implementation. This avoids a bunch of boilerplate to convert `GetPageSplitter` errors into `tonic::Status`. Requires #12702. Touches [LKB-191](https://databricks.atlassian.net/browse/LKB-191). --- pageserver/client_grpc/src/client.rs | 14 ++--- pageserver/page_api/src/lib.rs | 2 +- pageserver/page_api/src/split.rs | 89 +++++++++++++++++----------- pageserver/src/page_service.rs | 14 ++--- 4 files changed, 63 insertions(+), 56 deletions(-) diff --git a/pageserver/client_grpc/src/client.rs b/pageserver/client_grpc/src/client.rs index b8ee57bf9f..dad37ebe74 100644 --- a/pageserver/client_grpc/src/client.rs +++ b/pageserver/client_grpc/src/client.rs @@ -230,16 +230,14 @@ impl PageserverClient { ) -> tonic::Result { // Fast path: request is for a single shard. if let Some(shard_id) = - GetPageSplitter::for_single_shard(&req, shards.count, shards.stripe_size) - .map_err(|err| tonic::Status::internal(err.to_string()))? + GetPageSplitter::for_single_shard(&req, shards.count, shards.stripe_size)? { return Self::get_page_with_shard(req, shards.get(shard_id)?).await; } // Request spans multiple shards. Split it, dispatch concurrent per-shard requests, and // reassemble the responses. - let mut splitter = GetPageSplitter::split(req, shards.count, shards.stripe_size) - .map_err(|err| tonic::Status::internal(err.to_string()))?; + let mut splitter = GetPageSplitter::split(req, shards.count, shards.stripe_size)?; let mut shard_requests = FuturesUnordered::new(); for (shard_id, shard_req) in splitter.drain_requests() { @@ -249,14 +247,10 @@ impl PageserverClient { } while let Some((shard_id, shard_response)) = shard_requests.next().await.transpose()? 
{ - splitter - .add_response(shard_id, shard_response) - .map_err(|err| tonic::Status::internal(err.to_string()))?; + splitter.add_response(shard_id, shard_response)?; } - splitter - .get_response() - .map_err(|err| tonic::Status::internal(err.to_string())) + Ok(splitter.collect_response()?) } /// Fetches pages on the given shard. Does not retry internally. diff --git a/pageserver/page_api/src/lib.rs b/pageserver/page_api/src/lib.rs index b44df6337f..b9be6b8b91 100644 --- a/pageserver/page_api/src/lib.rs +++ b/pageserver/page_api/src/lib.rs @@ -24,4 +24,4 @@ mod split; pub use client::Client; pub use model::*; -pub use split::GetPageSplitter; +pub use split::{GetPageSplitter, SplitError}; diff --git a/pageserver/page_api/src/split.rs b/pageserver/page_api/src/split.rs index 5ecc90a166..27c1c995e0 100644 --- a/pageserver/page_api/src/split.rs +++ b/pageserver/page_api/src/split.rs @@ -1,6 +1,5 @@ use std::collections::HashMap; -use anyhow::anyhow; use bytes::Bytes; use crate::model::*; @@ -27,19 +26,19 @@ impl GetPageSplitter { req: &GetPageRequest, count: ShardCount, stripe_size: Option, - ) -> anyhow::Result> { + ) -> Result, SplitError> { // Fast path: unsharded tenant. if count.is_unsharded() { return Ok(Some(ShardIndex::unsharded())); } let Some(stripe_size) = stripe_size else { - return Err(anyhow!("stripe size must be given for sharded tenants")); + return Err("stripe size must be given for sharded tenants".into()); }; // Find the first page's shard, for comparison. let Some(&first_page) = req.block_numbers.first() else { - return Err(anyhow!("no block numbers in request")); + return Err("no block numbers in request".into()); }; let key = rel_block_to_key(req.rel, first_page); let shard_number = key_to_shard_number(count, stripe_size, &key); @@ -60,7 +59,7 @@ impl GetPageSplitter { req: GetPageRequest, count: ShardCount, stripe_size: Option, - ) -> anyhow::Result { + ) -> Result { // The caller should make sure we don't split requests unnecessarily. debug_assert!( Self::for_single_shard(&req, count, stripe_size)?.is_none(), @@ -68,10 +67,10 @@ impl GetPageSplitter { ); if count.is_unsharded() { - return Err(anyhow!("unsharded tenant, no point in splitting request")); + return Err("unsharded tenant, no point in splitting request".into()); } let Some(stripe_size) = stripe_size else { - return Err(anyhow!("stripe size must be given for sharded tenants")); + return Err("stripe size must be given for sharded tenants".into()); }; // Split the requests by shard index. @@ -129,35 +128,32 @@ impl GetPageSplitter { /// Adds a response from the given shard. The response must match the request ID and have an OK /// status code. A response must not already exist for the given shard ID. - #[allow(clippy::result_large_err)] pub fn add_response( &mut self, shard_id: ShardIndex, response: GetPageResponse, - ) -> anyhow::Result<()> { + ) -> Result<(), SplitError> { // The caller should already have converted status codes into tonic::Status. 
if response.status_code != GetPageStatusCode::Ok { - return Err(anyhow!( + return Err(SplitError(format!( "unexpected non-OK response for shard {shard_id}: {} {}", response.status_code, response.reason.unwrap_or_default() - )); + ))); } if response.request_id != self.response.request_id { - return Err(anyhow!( + return Err(SplitError(format!( "response ID mismatch for shard {shard_id}: expected {}, got {}", - self.response.request_id, - response.request_id - )); + self.response.request_id, response.request_id + ))); } if response.request_id != self.response.request_id { - return Err(anyhow!( + return Err(SplitError(format!( "response ID mismatch for shard {shard_id}: expected {}, got {}", - self.response.request_id, - response.request_id - )); + self.response.request_id, response.request_id + ))); } // Place the shard response pages into the assembled response, in request order. @@ -169,26 +165,27 @@ impl GetPageSplitter { } let Some(slot) = self.response.pages.get_mut(i) else { - return Err(anyhow!("no block_shards slot {i} for shard {shard_id}")); + return Err(SplitError(format!( + "no block_shards slot {i} for shard {shard_id}" + ))); }; let Some(page) = pages.next() else { - return Err(anyhow!( + return Err(SplitError(format!( "missing page {} in shard {shard_id} response", slot.block_number - )); + ))); }; if page.block_number != slot.block_number { - return Err(anyhow!( + return Err(SplitError(format!( "shard {shard_id} returned wrong page at index {i}, expected {} got {}", - slot.block_number, - page.block_number - )); + slot.block_number, page.block_number + ))); } if !slot.image.is_empty() { - return Err(anyhow!( + return Err(SplitError(format!( "shard {shard_id} returned duplicate page {} at index {i}", slot.block_number - )); + ))); } *slot = page; @@ -196,32 +193,54 @@ impl GetPageSplitter { // Make sure we've consumed all pages from the shard response. if let Some(extra_page) = pages.next() { - return Err(anyhow!( + return Err(SplitError(format!( "shard {shard_id} returned extra page: {}", extra_page.block_number - )); + ))); } Ok(()) } - /// Fetches the final, assembled response. - #[allow(clippy::result_large_err)] - pub fn get_response(self) -> anyhow::Result { + /// Collects the final, assembled response. + pub fn collect_response(self) -> Result { // Check that the response is complete. for (i, page) in self.response.pages.iter().enumerate() { if page.image.is_empty() { - return Err(anyhow!( + return Err(SplitError(format!( "missing page {} for shard {}", page.block_number, self.block_shards .get(i) .map(|s| s.to_string()) .unwrap_or_else(|| "?".to_string()) - )); + ))); } } Ok(self.response) } } + +/// A GetPageSplitter error. +#[derive(Debug, thiserror::Error)] +#[error("{0}")] +pub struct SplitError(String); + +impl From<&str> for SplitError { + fn from(err: &str) -> Self { + SplitError(err.to_string()) + } +} + +impl From for SplitError { + fn from(err: String) -> Self { + SplitError(err) + } +} + +impl From for tonic::Status { + fn from(err: SplitError) -> Self { + tonic::Status::internal(err.0) + } +} diff --git a/pageserver/src/page_service.rs b/pageserver/src/page_service.rs index f16046657a..116e289e99 100644 --- a/pageserver/src/page_service.rs +++ b/pageserver/src/page_service.rs @@ -3685,8 +3685,7 @@ impl GrpcPageServiceHandler { // Fast path: the request fits in a single shard. if let Some(shard_index) = - GetPageSplitter::for_single_shard(&req, shard_id.count, Some(shard_id.stripe_size)) - .map_err(|err| tonic::Status::internal(err.to_string()))? 
+ GetPageSplitter::for_single_shard(&req, shard_id.count, Some(shard_id.stripe_size))? { // We got the shard ID from the first page, so these must be equal. assert_eq!(shard_index.shard_number, shard_id.number); @@ -3697,8 +3696,7 @@ impl GrpcPageServiceHandler { // The request spans multiple shards; split it and dispatch parallel requests. All pages // were originally in the parent shard, and during a split all children are local, so we // expect to find local shards for all pages. - let mut splitter = GetPageSplitter::split(req, shard_id.count, Some(shard_id.stripe_size)) - .map_err(|err| tonic::Status::internal(err.to_string()))?; + let mut splitter = GetPageSplitter::split(req, shard_id.count, Some(shard_id.stripe_size))?; let mut shard_requests = FuturesUnordered::new(); for (shard_index, shard_req) in splitter.drain_requests() { @@ -3717,14 +3715,10 @@ impl GrpcPageServiceHandler { } while let Some((shard_index, shard_response)) = shard_requests.next().await.transpose()? { - splitter - .add_response(shard_index, shard_response) - .map_err(|err| tonic::Status::internal(err.to_string()))?; + splitter.add_response(shard_index, shard_response)?; } - splitter - .get_response() - .map_err(|err| tonic::Status::internal(err.to_string())) + Ok(splitter.collect_response()?) } } From 07c3cfd2a0935d91eb9da859b65ce6a28a4d5c04 Mon Sep 17 00:00:00 2001 From: Suhas Thalanki <54014218+thesuhas@users.noreply.github.com> Date: Tue, 29 Jul 2025 16:40:07 -0400 Subject: [PATCH 18/34] =?UTF-8?q?[BRC-2905]=20Feed=20back=20PS-detected=20?= =?UTF-8?q?data=20corruption=20signals=20to=20SK=20and=20PG=E2=80=A6=20(#1?= =?UTF-8?q?2748)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit … walproposer (#895) Data corruptions are typically detected on the pageserver side when it replays WAL records. However, since PS doesn't synchronously replay WAL records as they are being ingested through safekeepers, we need some extra plumbing to feed information about pageserver-detected corruptions during compaction (and/or WAL redo in general) back to SK and PG for proper action. We don't yet know what actions PG/SK should take upon receiving the signal, but we should have the detection and feedback in place. Add an extra `corruption_detected` field to the `PageserverFeedback` message that is sent from PS -> SK -> PG. It's a boolean value that is set to true when PS detects a "critical error" that signals data corruption, and it's sent in all `PageserverFeedback` messages. Upon receiving this signal, the safekeeper raises a `safekeeper_ps_corruption_detected` gauge metric (value set to 1). The safekeeper then forwards this signal to PG where a `ps_corruption_detected` gauge metric (value also set to 1) is raised in the `neon_perf_counters` view. Added an integration test in `test_compaction.py::test_ps_corruption_detection_feedback` that confirms that the safekeeper and PG can receive the data corruption signal in the `PageserverFeedback` message in a simulated data corruption. 
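For reference, the Postgres-side signal can be read back through the `neon_perf_counters` view touched by this change; a minimal usage sketch (not part of this patch, the metric name comes from the changes below):

```sql
-- Returns 1 once a pageserver has reported corruption via PageserverFeedback, 0 otherwise.
SELECT value FROM neon_perf_counters WHERE metric = 'ps_corruption_detected';
```

The safekeeper-side counterpart is the `safekeeper_ps_corruption_detected` gauge exposed on its Prometheus metrics endpoint.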
## Problem ## Summary of changes --------- Co-authored-by: William Huang --- libs/utils/src/logging.rs | 5 +- libs/utils/src/pageserver_feedback.rs | 36 ++++++++ libs/walproposer/src/api_bindings.rs | 1 + pageserver/src/tenant/timeline.rs | 18 ++++ pageserver/src/tenant/timeline/compaction.rs | 2 + .../walreceiver/walreceiver_connection.rs | 6 ++ pageserver/src/walingest.rs | 7 ++ pgxn/neon/neon_perf_counters.c | 2 + pgxn/neon/neon_perf_counters.h | 1 + pgxn/neon/walproposer.c | 6 ++ pgxn/neon/walproposer.h | 2 + pgxn/neon/walproposer_pg.c | 6 ++ safekeeper/src/hadron.rs | 1 + safekeeper/src/metrics.rs | 16 ++++ safekeeper/src/send_interpreted_wal.rs | 7 ++ safekeeper/src/send_wal.rs | 9 ++ safekeeper/src/timeline.rs | 2 + test_runner/regress/test_compaction.py | 82 +++++++++++++++++++ 18 files changed, 208 insertions(+), 1 deletion(-) diff --git a/libs/utils/src/logging.rs b/libs/utils/src/logging.rs index d67c0f123b..9f118048f3 100644 --- a/libs/utils/src/logging.rs +++ b/libs/utils/src/logging.rs @@ -34,13 +34,16 @@ macro_rules! critical { #[macro_export] macro_rules! critical_timeline { - ($tenant_shard_id:expr, $timeline_id:expr, $($arg:tt)*) => {{ + ($tenant_shard_id:expr, $timeline_id:expr, $corruption_detected:expr, $($arg:tt)*) => {{ if cfg!(debug_assertions) { panic!($($arg)*); } // Increment both metrics $crate::logging::TRACING_EVENT_COUNT_METRIC.inc_critical(); $crate::logging::HADRON_CRITICAL_STORAGE_EVENT_COUNT_METRIC.inc(&$tenant_shard_id.to_string(), &$timeline_id.to_string()); + if let Some(c) = $corruption_detected.as_ref() { + c.store(true, std::sync::atomic::Ordering::Relaxed); + } let backtrace = std::backtrace::Backtrace::capture(); tracing::error!("CRITICAL: [tenant_shard_id: {}, timeline_id: {}] {}\n{backtrace}", $tenant_shard_id, $timeline_id, format!($($arg)*)); diff --git a/libs/utils/src/pageserver_feedback.rs b/libs/utils/src/pageserver_feedback.rs index cffbc0b4d6..da5b53306a 100644 --- a/libs/utils/src/pageserver_feedback.rs +++ b/libs/utils/src/pageserver_feedback.rs @@ -32,6 +32,9 @@ pub struct PageserverFeedback { pub replytime: SystemTime, /// Used to track feedbacks from different shards. Always zero for unsharded tenants. pub shard_number: u32, + /// If true, the pageserver has detected corruption and the safekeeper and postgres + /// should stop sending WAL. + pub corruption_detected: bool, } impl PageserverFeedback { @@ -43,6 +46,7 @@ impl PageserverFeedback { disk_consistent_lsn: Lsn::INVALID, replytime: *PG_EPOCH, shard_number: 0, + corruption_detected: false, } } @@ -101,6 +105,13 @@ impl PageserverFeedback { buf.put_u32(self.shard_number); } + if self.corruption_detected { + nkeys += 1; + buf.put_slice(b"corruption_detected\0"); + buf.put_i32(1); + buf.put_u8(1); + } + buf[buf_ptr] = nkeys; } @@ -147,6 +158,11 @@ impl PageserverFeedback { assert_eq!(len, 4); rf.shard_number = buf.get_u32(); } + b"corruption_detected" => { + let len = buf.get_i32(); + assert_eq!(len, 1); + rf.corruption_detected = buf.get_u8() != 0; + } _ => { let len = buf.get_i32(); warn!( @@ -206,6 +222,26 @@ mod tests { assert_eq!(rf, rf_parsed); } + // Test that databricks-specific fields added to the PageserverFeedback message are serialized + // and deserialized correctly, in addition to the existing fields from upstream. 
+ #[test] + fn test_replication_feedback_databricks_fields() { + let mut rf = PageserverFeedback::empty(); + rf.current_timeline_size = 12345678; + rf.last_received_lsn = Lsn(23456789); + rf.disk_consistent_lsn = Lsn(34567890); + rf.remote_consistent_lsn = Lsn(45678901); + rf.replytime = *PG_EPOCH + Duration::from_secs(100_000_000); + rf.shard_number = 1; + rf.corruption_detected = true; + + let mut data = BytesMut::new(); + rf.serialize(&mut data); + + let rf_parsed = PageserverFeedback::parse(data.freeze()); + assert_eq!(rf, rf_parsed); + } + #[test] fn test_replication_feedback_unknown_key() { let mut rf = PageserverFeedback::empty(); diff --git a/libs/walproposer/src/api_bindings.rs b/libs/walproposer/src/api_bindings.rs index c3be1e1dae..9f88ea6b11 100644 --- a/libs/walproposer/src/api_bindings.rs +++ b/libs/walproposer/src/api_bindings.rs @@ -426,6 +426,7 @@ pub fn empty_shmem() -> crate::bindings::WalproposerShmemState { remote_consistent_lsn: 0, replytime: 0, shard_number: 0, + corruption_detected: false, }; let empty_wal_rate_limiter = crate::bindings::WalRateLimiter { diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index 2c70c5cfa5..ff66b0ecc8 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -397,6 +397,11 @@ pub struct Timeline { /// If true, the last compaction failed. compaction_failed: AtomicBool, + /// Begin Hadron: If true, the pageserver has likely detected data corruption in the timeline. + /// We need to feed this information back to the Safekeeper and postgres for them to take the + /// appropriate action. + corruption_detected: AtomicBool, + /// Notifies the tenant compaction loop that there is pending L0 compaction work. l0_compaction_trigger: Arc, @@ -3310,6 +3315,7 @@ impl Timeline { compaction_lock: tokio::sync::Mutex::default(), compaction_failed: AtomicBool::default(), + corruption_detected: AtomicBool::default(), l0_compaction_trigger: resources.l0_compaction_trigger, gc_lock: tokio::sync::Mutex::default(), @@ -6004,6 +6010,17 @@ impl Timeline { ))) }); + // Begin Hadron + // + fail_point!("create-image-layer-fail-simulated-corruption", |_| { + self.corruption_detected + .store(true, std::sync::atomic::Ordering::Relaxed); + Err(CreateImageLayersError::Other(anyhow::anyhow!( + "failpoint create-image-layer-fail-simulated-corruption" + ))) + }); + // End Hadron + let io_concurrency = IoConcurrency::spawn_from_conf( self.conf.get_vectored_concurrent_io, self.gate @@ -7149,6 +7166,7 @@ impl Timeline { critical_timeline!( self.tenant_shard_id, self.timeline_id, + Some(&self.corruption_detected), "walredo failure during page reconstruction: {err:?}" ); } diff --git a/pageserver/src/tenant/timeline/compaction.rs b/pageserver/src/tenant/timeline/compaction.rs index 9bca952a46..c5363d84b7 100644 --- a/pageserver/src/tenant/timeline/compaction.rs +++ b/pageserver/src/tenant/timeline/compaction.rs @@ -1397,6 +1397,7 @@ impl Timeline { critical_timeline!( self.tenant_shard_id, self.timeline_id, + Some(&self.corruption_detected), "missing key during compaction: {err:?}" ); } @@ -1441,6 +1442,7 @@ impl Timeline { critical_timeline!( self.tenant_shard_id, self.timeline_id, + Some(&self.corruption_detected), "could not compact, repartitioning keyspace failed: {e:?}" ); } diff --git a/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs b/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs index f619c69599..7ec5aa3b77 100644 --- 
a/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs +++ b/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs @@ -365,6 +365,7 @@ pub(super) async fn handle_walreceiver_connection( critical_timeline!( timeline.tenant_shard_id, timeline.timeline_id, + Some(&timeline.corruption_detected), "{msg}" ); return Err(WalReceiverError::Other(anyhow!(msg))); @@ -382,6 +383,7 @@ pub(super) async fn handle_walreceiver_connection( critical_timeline!( timeline.tenant_shard_id, timeline.timeline_id, + Some(&timeline.corruption_detected), "{msg}" ); return Err(WalReceiverError::Other(anyhow!(msg))); @@ -455,6 +457,7 @@ pub(super) async fn handle_walreceiver_connection( critical_timeline!( timeline.tenant_shard_id, timeline.timeline_id, + Some(&timeline.corruption_detected), "{err:?}" ); } @@ -586,6 +589,9 @@ pub(super) async fn handle_walreceiver_connection( remote_consistent_lsn, replytime: ts, shard_number: timeline.tenant_shard_id.shard_number.0 as u32, + corruption_detected: timeline + .corruption_detected + .load(std::sync::atomic::Ordering::Relaxed), }; debug!("neon_status_update {status_update:?}"); diff --git a/pageserver/src/walingest.rs b/pageserver/src/walingest.rs index 3acf98b020..c364334dab 100644 --- a/pageserver/src/walingest.rs +++ b/pageserver/src/walingest.rs @@ -23,6 +23,7 @@ use std::backtrace::Backtrace; use std::collections::HashMap; +use std::sync::atomic::AtomicBool; use std::sync::{Arc, OnceLock}; use std::time::{Duration, Instant, SystemTime}; @@ -422,6 +423,8 @@ impl WalIngest { critical_timeline!( modification.tline.tenant_shard_id, modification.tline.timeline_id, + // Hadron: No need to raise the corruption flag here; the caller of `ingest_record()` will do it. + None::<&AtomicBool>, "clear_vm_bits for unknown VM relation {vm_rel}" ); return Ok(()); @@ -431,6 +434,8 @@ impl WalIngest { critical_timeline!( modification.tline.tenant_shard_id, modification.tline.timeline_id, + // Hadron: No need to raise the corruption flag here; the caller of `ingest_record()` will do it. + None::<&AtomicBool>, "new_vm_blk {blknum} not in {vm_rel} of size {vm_size}" ); new_vm_blk = None; @@ -441,6 +446,8 @@ impl WalIngest { critical_timeline!( modification.tline.tenant_shard_id, modification.tline.timeline_id, + // Hadron: No need to raise the corruption flag here; the caller of `ingest_record()` will do it. 
+ None::<&AtomicBool>, "old_vm_blk {blknum} not in {vm_rel} of size {vm_size}" ); old_vm_blk = None; diff --git a/pgxn/neon/neon_perf_counters.c b/pgxn/neon/neon_perf_counters.c index a38f876a0c..fada4cba1e 100644 --- a/pgxn/neon/neon_perf_counters.c +++ b/pgxn/neon/neon_perf_counters.c @@ -45,6 +45,7 @@ DatabricksMetricsShmemInit(void) pg_atomic_init_u32(&databricks_metrics_shared->index_corruption_count, 0); pg_atomic_init_u32(&databricks_metrics_shared->data_corruption_count, 0); pg_atomic_init_u32(&databricks_metrics_shared->internal_error_count, 0); + pg_atomic_init_u32(&databricks_metrics_shared->ps_corruption_detected, 0); } } /* END_HADRON */ @@ -440,6 +441,7 @@ neon_get_perf_counters(PG_FUNCTION_ARGS) {"sql_index_corruption_count", false, 0, (double) pg_atomic_read_u32(&databricks_metrics_shared->index_corruption_count)}, {"sql_data_corruption_count", false, 0, (double) pg_atomic_read_u32(&databricks_metrics_shared->data_corruption_count)}, {"sql_internal_error_count", false, 0, (double) pg_atomic_read_u32(&databricks_metrics_shared->internal_error_count)}, + {"ps_corruption_detected", false, 0, (double) pg_atomic_read_u32(&databricks_metrics_shared->ps_corruption_detected)}, {NULL, false, 0, 0}, }; for (int i = 0; databricks_metrics[i].name != NULL; i++) diff --git a/pgxn/neon/neon_perf_counters.h b/pgxn/neon/neon_perf_counters.h index 0196559806..5c0b7ded7a 100644 --- a/pgxn/neon/neon_perf_counters.h +++ b/pgxn/neon/neon_perf_counters.h @@ -183,6 +183,7 @@ typedef struct pg_atomic_uint32 index_corruption_count; pg_atomic_uint32 data_corruption_count; pg_atomic_uint32 internal_error_count; + pg_atomic_uint32 ps_corruption_detected; } databricks_metrics; extern databricks_metrics *databricks_metrics_shared; diff --git a/pgxn/neon/walproposer.c b/pgxn/neon/walproposer.c index ba6e4a54ff..c85a6f4b6f 100644 --- a/pgxn/neon/walproposer.c +++ b/pgxn/neon/walproposer.c @@ -1887,6 +1887,12 @@ ParsePageserverFeedbackMessage(WalProposer *wp, StringInfo reply_message, Pagese ps_feedback->shard_number = pq_getmsgint(reply_message, sizeof(uint32)); psfeedback_log("%u", key, ps_feedback->shard_number); } + else if (strcmp(key, "corruption_detected") == 0) + { + Assert(value_len == 1); + ps_feedback->corruption_detected = pq_getmsgbyte(reply_message) != 0; + psfeedback_log("%s", key, ps_feedback->corruption_detected ? "true" : "false"); + } else { /* diff --git a/pgxn/neon/walproposer.h b/pgxn/neon/walproposer.h index 5507294c3b..d6cd532bec 100644 --- a/pgxn/neon/walproposer.h +++ b/pgxn/neon/walproposer.h @@ -374,6 +374,8 @@ typedef struct PageserverFeedback XLogRecPtr remote_consistent_lsn; TimestampTz replytime; uint32 shard_number; + /* true if the pageserver has detected data corruption in the timeline */ + bool corruption_detected; } PageserverFeedback; /* BEGIN_HADRON */ diff --git a/pgxn/neon/walproposer_pg.c b/pgxn/neon/walproposer_pg.c index b0f5828d39..da86c5d498 100644 --- a/pgxn/neon/walproposer_pg.c +++ b/pgxn/neon/walproposer_pg.c @@ -49,6 +49,7 @@ #include "libpqwalproposer.h" #include "neon.h" +#include "neon_perf_counters.h" #include "neon_walreader.h" #include "walproposer.h" @@ -741,6 +742,11 @@ record_pageserver_feedback(PageserverFeedback *ps_feedback, shardno_t num_shards Assert(ps_feedback->shard_number < MAX_SHARDS); Assert(ps_feedback->shard_number < num_shards); + // Begin Hadron: Record any corruption signal from the pageserver first. 
+ if (ps_feedback->corruption_detected) { + pg_atomic_write_u32(&databricks_metrics_shared->ps_corruption_detected, 1); + } + SpinLockAcquire(&walprop_shared->mutex); // Hadron: Update the num_shards from the source-of-truth (shard map) lazily when we receive diff --git a/safekeeper/src/hadron.rs b/safekeeper/src/hadron.rs index f41fe2512d..72b377fcc4 100644 --- a/safekeeper/src/hadron.rs +++ b/safekeeper/src/hadron.rs @@ -387,6 +387,7 @@ pub fn get_filesystem_usage(path: &std::path::Path) -> u64 { critical_timeline!( placeholder_ttid.tenant_id, placeholder_ttid.timeline_id, + None::<&AtomicBool>, "Global disk usage watcher failed to read filesystem usage: {:?}", e ); diff --git a/safekeeper/src/metrics.rs b/safekeeper/src/metrics.rs index b07852aaee..08d96a7aa6 100644 --- a/safekeeper/src/metrics.rs +++ b/safekeeper/src/metrics.rs @@ -518,6 +518,7 @@ pub async fn time_io_closure>( pub struct FullTimelineInfo { pub ttid: TenantTimelineId, pub ps_feedback_count: u64, + pub ps_corruption_detected: bool, pub last_ps_feedback: PageserverFeedback, pub wal_backup_active: bool, pub timeline_is_active: bool, @@ -547,6 +548,7 @@ pub struct TimelineCollector { ps_last_received_lsn: GenericGaugeVec, feedback_last_time_seconds: GenericGaugeVec, ps_feedback_count: GenericGaugeVec, + ps_corruption_detected: IntGaugeVec, timeline_active: GenericGaugeVec, wal_backup_active: GenericGaugeVec, connected_computes: IntGaugeVec, @@ -654,6 +656,15 @@ impl TimelineCollector { ) .unwrap(); + let ps_corruption_detected = IntGaugeVec::new( + Opts::new( + "safekeeper_ps_corruption_detected", + "1 if corruption was detected in the timeline according to feedback from the pageserver, 0 otherwise", + ), + &["tenant_id", "timeline_id"], + ) + .unwrap(); + let timeline_active = GenericGaugeVec::new( Opts::new( "safekeeper_timeline_active", @@ -774,6 +785,7 @@ impl TimelineCollector { ps_last_received_lsn, feedback_last_time_seconds, ps_feedback_count, + ps_corruption_detected, timeline_active, wal_backup_active, connected_computes, @@ -892,6 +904,9 @@ impl Collector for TimelineCollector { self.ps_feedback_count .with_label_values(labels) .set(tli.ps_feedback_count); + self.ps_corruption_detected + .with_label_values(labels) + .set(tli.ps_corruption_detected as i64); if let Ok(unix_time) = tli .last_ps_feedback .replytime @@ -925,6 +940,7 @@ impl Collector for TimelineCollector { mfs.extend(self.ps_last_received_lsn.collect()); mfs.extend(self.feedback_last_time_seconds.collect()); mfs.extend(self.ps_feedback_count.collect()); + mfs.extend(self.ps_corruption_detected.collect()); mfs.extend(self.timeline_active.collect()); mfs.extend(self.wal_backup_active.collect()); mfs.extend(self.connected_computes.collect()); diff --git a/safekeeper/src/send_interpreted_wal.rs b/safekeeper/src/send_interpreted_wal.rs index 671798298b..bfc4008c52 100644 --- a/safekeeper/src/send_interpreted_wal.rs +++ b/safekeeper/src/send_interpreted_wal.rs @@ -1,6 +1,7 @@ use std::collections::HashMap; use std::fmt::Display; use std::sync::Arc; +use std::sync::atomic::AtomicBool; use std::time::Duration; use anyhow::{Context, anyhow}; @@ -305,6 +306,9 @@ impl InterpretedWalReader { critical_timeline!( ttid.tenant_id, ttid.timeline_id, + // Hadron: The corruption flag is only used in PS so that it can feed this information back to SKs. + // We do not use these flags in SKs. 
+                    None::<&AtomicBool>,
                     "failed to read WAL record: {err:?}"
                 );
             }
@@ -375,6 +379,9 @@ impl InterpretedWalReader {
                 critical_timeline!(
                     ttid.tenant_id,
                     ttid.timeline_id,
+                    // Hadron: The corruption flag is only used in PS so that it can feed this information back to SKs.
+                    // We do not use these flags in SKs.
+                    None::<&AtomicBool>,
                     "failed to decode WAL record: {err:?}"
                 );
             }
diff --git a/safekeeper/src/send_wal.rs b/safekeeper/src/send_wal.rs
index 5891fa88a4..2d6f7486a9 100644
--- a/safekeeper/src/send_wal.rs
+++ b/safekeeper/src/send_wal.rs
@@ -55,6 +55,7 @@ pub struct WalSenders {
 
 pub struct WalSendersTimelineMetricValues {
     pub ps_feedback_counter: u64,
+    pub ps_corruption_detected: bool,
     pub last_ps_feedback: PageserverFeedback,
     pub interpreted_wal_reader_tasks: usize,
 }
@@ -193,6 +194,7 @@ impl WalSenders {
 
         WalSendersTimelineMetricValues {
             ps_feedback_counter: shared.ps_feedback_counter,
+            ps_corruption_detected: shared.ps_corruption_detected,
             last_ps_feedback: shared.last_ps_feedback,
             interpreted_wal_reader_tasks,
         }
@@ -209,6 +211,9 @@ impl WalSenders {
         *shared.get_slot_mut(id).get_mut_feedback() = ReplicationFeedback::Pageserver(*feedback);
         shared.last_ps_feedback = *feedback;
         shared.ps_feedback_counter += 1;
+        if feedback.corruption_detected {
+            shared.ps_corruption_detected = true;
+        }
         drop(shared);
 
         RECEIVED_PS_FEEDBACKS.inc();
@@ -278,6 +283,9 @@ struct WalSendersShared {
     last_ps_feedback: PageserverFeedback,
     // total counter of pageserver feedbacks received
     ps_feedback_counter: u64,
+    // Hadron: true iff we received a pageserver feedback that indicated
+    // data corruption in the timeline
+    ps_corruption_detected: bool,
     slots: Vec>,
 }
@@ -328,6 +336,7 @@ impl WalSendersShared {
             agg_standby_feedback: StandbyFeedback::empty(),
             last_ps_feedback: PageserverFeedback::empty(),
             ps_feedback_counter: 0,
+            ps_corruption_detected: false,
             slots: Vec::new(),
         }
     }
diff --git a/safekeeper/src/timeline.rs b/safekeeper/src/timeline.rs
index 43b5b3a8d3..e083a49428 100644
--- a/safekeeper/src/timeline.rs
+++ b/safekeeper/src/timeline.rs
@@ -839,6 +839,7 @@ impl Timeline {
 
         let WalSendersTimelineMetricValues {
             ps_feedback_counter,
+            ps_corruption_detected,
             last_ps_feedback,
             interpreted_wal_reader_tasks,
         } = self.walsenders.info_for_metrics();
@@ -847,6 +848,7 @@ impl Timeline {
         Some(FullTimelineInfo {
             ttid: self.ttid,
             ps_feedback_count: ps_feedback_counter,
+            ps_corruption_detected,
             last_ps_feedback,
             wal_backup_active: self.wal_backup_active.load(Ordering::Relaxed),
             timeline_is_active: self.broker_active.load(Ordering::Relaxed),
diff --git a/test_runner/regress/test_compaction.py b/test_runner/regress/test_compaction.py
index be82ee806f..a780aa2e52 100644
--- a/test_runner/regress/test_compaction.py
+++ b/test_runner/regress/test_compaction.py
@@ -863,6 +863,88 @@ def test_pageserver_compaction_circuit_breaker(neon_env_builder: NeonEnvBuilder)
 
     assert not env.pageserver.log_contains(".*Circuit breaker failure ended.*")
 
+@pytest.mark.skip(reason="Lakebase mode")
+def test_ps_corruption_detection_feedback(neon_env_builder: NeonEnvBuilder):
+    """
+    Test that when the pageserver detects corruption during image layer creation,
+    it sends corruption feedback to the safekeeper which gets recorded in its
+    safekeeper_ps_corruption_detected metric.
+ """ + # Configure tenant with aggressive compaction settings to easily trigger compaction + TENANT_CONF = { + # Small checkpoint distance to create many layers + "checkpoint_distance": 1024 * 128, + # Compact small layers + "compaction_target_size": 1024 * 128, + # Create image layers eagerly + "image_creation_threshold": 1, + "image_layer_creation_check_threshold": 0, + # Force frequent compaction + "compaction_period": "1s", + } + + env = neon_env_builder.init_start(initial_tenant_conf=TENANT_CONF) + # We are simulating compaction failures so we should allow these error messages. + env.pageserver.allowed_errors.append(".*Compaction failed.*") + tenant_id = env.initial_tenant + timeline_id = env.initial_timeline + + pageserver_http = env.pageserver.http_client() + workload = Workload(env, tenant_id, timeline_id) + workload.init() + + # Enable the failpoint that will cause image layer creation to fail due to a (simulated) detected + # corruption. + pageserver_http.configure_failpoints(("create-image-layer-fail-simulated-corruption", "return")) + + # Write some data to trigger compaction and image layer creation + log.info("Writing data to trigger compaction...") + workload.write_rows(1024 * 64, upload=False) + workload.write_rows(1024 * 64, upload=False) + + # Returns True if the corruption signal from PS is propagated to the SK according to the "safekeeper_ps_corruption_detected" metric. + # Raises an exception otherwise. + def check_corruption_signal_propagated_to_sk(): + # Get metrics from all safekeepers + for sk in env.safekeepers: + sk_metrics = sk.http_client().get_metrics() + # Look for our corruption detected metric with the right tenant and timeline + corruption_metrics = sk_metrics.query_all("safekeeper_ps_corruption_detected") + + for metric in corruption_metrics: + # Check if there's a metric for our tenant and timeline that has value 1 + if ( + metric.labels.get("tenant_id") == str(tenant_id) + and metric.labels.get("timeline_id") == str(timeline_id) + and metric.value == 1 + ): + log.info(f"Corruption detected by safekeeper {sk.id}: {metric}") + return True + raise Exception("Corruption detection feedback not found in any safekeeper metrics") + + # Returns True if the corruption signal from PS is propagated to the PG according to the "ps_corruption_detected" metric + # in "neon_perf_counters". + # Raises an exception otherwise. 
+    def check_corruption_signal_propagated_to_pg():
+        endpoint = workload.endpoint()
+        results = endpoint.safe_psql("CREATE EXTENSION IF NOT EXISTS neon")
+        results = endpoint.safe_psql(
+            "SELECT value FROM neon_perf_counters WHERE metric = 'ps_corruption_detected'"
+        )
+        log.info("Query corruption detection metric, results: %s", results)
+        if results[0][0] == 1:
+            log.info("Corruption detection signal is raised on Postgres")
+            return True
+        raise Exception("Corruption detection signal is not raised on Postgres")
+
+    # Confirm that the corruption signal propagates to both the safekeeper and Postgres
+    wait_until(check_corruption_signal_propagated_to_sk, timeout=10, interval=0.1)
+    wait_until(check_corruption_signal_propagated_to_pg, timeout=10, interval=0.1)
+
+    # Cleanup the failpoint
+    pageserver_http.configure_failpoints(("create-image-layer-fail-simulated-corruption", "off"))
+
+
 @pytest.mark.parametrize("enabled", [True, False])
 def test_image_layer_compression(neon_env_builder: NeonEnvBuilder, enabled: bool):
     tenant_conf = {

From ca88521653fcd574ecd1eae49a5d861139d3e4d2 Mon Sep 17 00:00:00 2001
From: HaoyuHuang
Date: Tue, 29 Jul 2025 14:30:34 -0700
Subject: [PATCH 19/34] Set neon_superuser privilege under lakebase mode (#12775)

## Problem

## Summary of changes

---
 compute_tools/src/spec_apply.rs                  | 7 ++++++-
 compute_tools/src/sql/create_privileged_role.sql | 2 +-
 2 files changed, 7 insertions(+), 2 deletions(-)

diff --git a/compute_tools/src/spec_apply.rs b/compute_tools/src/spec_apply.rs
index 2356078703..00f34c1f0e 100644
--- a/compute_tools/src/spec_apply.rs
+++ b/compute_tools/src/spec_apply.rs
@@ -679,7 +679,12 @@ async fn get_operations<'a>(
         ApplySpecPhase::CreatePrivilegedRole => Ok(Box::new(once(Operation {
             query: format!(
                 include_str!("sql/create_privileged_role.sql"),
-                privileged_role_name = params.privileged_role_name
+                privileged_role_name = params.privileged_role_name,
+                privileges = if params.lakebase_mode {
+                    "CREATEDB CREATEROLE NOLOGIN BYPASSRLS"
+                } else {
+                    "CREATEDB CREATEROLE NOLOGIN REPLICATION BYPASSRLS"
+                }
             ),
             comment: None,
         }))),
diff --git a/compute_tools/src/sql/create_privileged_role.sql b/compute_tools/src/sql/create_privileged_role.sql
index df27ac32fc..ac2521445f 100644
--- a/compute_tools/src/sql/create_privileged_role.sql
+++ b/compute_tools/src/sql/create_privileged_role.sql
@@ -2,7 +2,7 @@ DO $$
 BEGIN
     IF NOT EXISTS (SELECT FROM pg_catalog.pg_roles WHERE rolname = '{privileged_role_name}') THEN
-        CREATE ROLE {privileged_role_name} CREATEDB CREATEROLE NOLOGIN REPLICATION BYPASSRLS IN ROLE pg_read_all_data, pg_write_all_data;
+        CREATE ROLE {privileged_role_name} {privileges} IN ROLE pg_read_all_data, pg_write_all_data;
     END IF;
 END
 $$;

From 1dce2a9e746edf7b93ce1048ebf63bf5c1395c18 Mon Sep 17 00:00:00 2001
From: Heikki Linnakangas
Date: Wed, 30 Jul 2025 01:20:05 +0300
Subject: [PATCH 20/34] Change how pageserver connection info is passed in compute spec (#12604)

Add a new 'pageserver_connection_info' field in the compute spec. It replaces the old 'pageserver_connstring' field with a more complicated struct that includes both libpq and grpc URLs, for each shard (or only one of the URLs, depending on the configuration). It also includes a flag suggesting which one to use; compute_ctl now uses it to decide which protocol to use for the basebackup.

This is backwards-compatible with everything that's in production. If the control plane fills in `pageserver_connection_info`, compute_ctl uses that.
If it fills in the `pageserver_connstring`/`shard_stripe_size` fields, it uses those. As last resort, it uses the 'neon.pageserver_connstring' GUC from the list of Postgres settings. The 'grpc' flag in the endpoint config is now more of a suggestion, and it's used to populate the 'prefer_protocol' flag in the compute spec. Regardless of the flag, compute_ctl gets both URLs, so it can choose to use libpq or grpc as it wishes. It currently always obeys the flag to choose which method to use for getting the basebackup, but Postgres itself will always use the libpq protocol. (That will be changed with the new rust-based communicator project, which implements the gRPC client in the compute). After that, the `pageserver_connection_info.prefer_protocol` flag in the spec file can be used to control whether compute_ctl uses grpc or libpq. The actual compute's grpc usage will be controlled by the `neon.enable_new_communicator` GUC (not yet; that will be introduced in the future, with the new rust-base communicator project). It can be set separately from 'prefer_protocol'. Later: - Once all old computes are gone, remove the code to pass `neon.pageserver_connstring` --- compute_tools/src/compute.rs | 133 ++++++++++-------- compute_tools/src/config.rs | 72 +++++++++- compute_tools/src/configurator.rs | 7 +- compute_tools/src/lsn_lease.rs | 61 ++++---- control_plane/src/bin/neon_local.rs | 170 ++++++++-------------- control_plane/src/endpoint.rs | 187 +++++++++++++++++++------ libs/compute_api/src/spec.rs | 172 ++++++++++++++++++++++- libs/utils/src/shard.rs | 4 + pgxn/neon/libpagestore.c | 8 +- pgxn/neon/pagestore_client.h | 2 +- storage_controller/src/compute_hook.rs | 82 ++++++++--- test_runner/regress/test_basebackup.py | 6 +- 12 files changed, 628 insertions(+), 276 deletions(-) diff --git a/compute_tools/src/compute.rs b/compute_tools/src/compute.rs index 27d33d8cd8..1033670e2b 100644 --- a/compute_tools/src/compute.rs +++ b/compute_tools/src/compute.rs @@ -7,7 +7,7 @@ use compute_api::responses::{ }; use compute_api::spec::{ ComputeAudit, ComputeFeature, ComputeMode, ComputeSpec, ExtVersion, GenericOption, - PageserverProtocol, PgIdent, Role, + PageserverConnectionInfo, PageserverProtocol, PgIdent, Role, }; use futures::StreamExt; use futures::future::join_all; @@ -38,7 +38,7 @@ use utils::id::{TenantId, TimelineId}; use utils::lsn::Lsn; use utils::measured_stream::MeasuredReader; use utils::pid_file; -use utils::shard::{ShardCount, ShardIndex, ShardNumber}; +use utils::shard::{ShardIndex, ShardNumber, ShardStripeSize}; use crate::configurator::launch_configurator; use crate::disk_quota::set_disk_quota; @@ -249,7 +249,7 @@ pub struct ParsedSpec { pub spec: ComputeSpec, pub tenant_id: TenantId, pub timeline_id: TimelineId, - pub pageserver_connstr: String, + pub pageserver_conninfo: PageserverConnectionInfo, pub safekeeper_connstrings: Vec, pub storage_auth_token: Option, /// k8s dns name and port @@ -297,25 +297,47 @@ impl ParsedSpec { } impl TryFrom for ParsedSpec { - type Error = String; - fn try_from(spec: ComputeSpec) -> Result { + type Error = anyhow::Error; + fn try_from(spec: ComputeSpec) -> Result { // Extract the options from the spec file that are needed to connect to // the storage system. // - // For backwards-compatibility, the top-level fields in the spec file - // may be empty. In that case, we need to dig them from the GUCs in the - // cluster.settings field. 
- let pageserver_connstr = spec - .pageserver_connstring - .clone() - .or_else(|| spec.cluster.settings.find("neon.pageserver_connstring")) - .ok_or("pageserver connstr should be provided")?; + // In compute specs generated by old control plane versions, the spec file might + // be missing the `pageserver_connection_info` field. In that case, we need to dig + // the pageserver connection info from the `pageserver_connstr` field instead, or + // if that's missing too, from the GUC in the cluster.settings field. + let mut pageserver_conninfo = spec.pageserver_connection_info.clone(); + if pageserver_conninfo.is_none() { + if let Some(pageserver_connstr_field) = &spec.pageserver_connstring { + pageserver_conninfo = Some(PageserverConnectionInfo::from_connstr( + pageserver_connstr_field, + spec.shard_stripe_size, + )?); + } + } + if pageserver_conninfo.is_none() { + if let Some(guc) = spec.cluster.settings.find("neon.pageserver_connstring") { + let stripe_size = if let Some(guc) = spec.cluster.settings.find("neon.stripe_size") + { + Some(ShardStripeSize(u32::from_str(&guc)?)) + } else { + None + }; + pageserver_conninfo = + Some(PageserverConnectionInfo::from_connstr(&guc, stripe_size)?); + } + } + let pageserver_conninfo = pageserver_conninfo.ok_or(anyhow::anyhow!( + "pageserver connection information should be provided" + ))?; + + // Similarly for safekeeper connection strings let safekeeper_connstrings = if spec.safekeeper_connstrings.is_empty() { if matches!(spec.mode, ComputeMode::Primary) { spec.cluster .settings .find("neon.safekeepers") - .ok_or("safekeeper connstrings should be provided")? + .ok_or(anyhow::anyhow!("safekeeper connstrings should be provided"))? .split(',') .map(|str| str.to_string()) .collect() @@ -330,22 +352,22 @@ impl TryFrom for ParsedSpec { let tenant_id: TenantId = if let Some(tenant_id) = spec.tenant_id { tenant_id } else { - spec.cluster + let guc = spec + .cluster .settings .find("neon.tenant_id") - .ok_or("tenant id should be provided") - .map(|s| TenantId::from_str(&s))? - .or(Err("invalid tenant id"))? + .ok_or(anyhow::anyhow!("tenant id should be provided"))?; + TenantId::from_str(&guc).context("invalid tenant id")? }; let timeline_id: TimelineId = if let Some(timeline_id) = spec.timeline_id { timeline_id } else { - spec.cluster + let guc = spec + .cluster .settings .find("neon.timeline_id") - .ok_or("timeline id should be provided") - .map(|s| TimelineId::from_str(&s))? - .or(Err("invalid timeline id"))? + .ok_or(anyhow::anyhow!("timeline id should be provided"))?; + TimelineId::from_str(&guc).context(anyhow::anyhow!("invalid timeline id"))? }; let endpoint_storage_addr: Option = spec @@ -359,7 +381,7 @@ impl TryFrom for ParsedSpec { let res = ParsedSpec { spec, - pageserver_connstr, + pageserver_conninfo, safekeeper_connstrings, storage_auth_token, tenant_id, @@ -369,7 +391,7 @@ impl TryFrom for ParsedSpec { }; // Now check validity of the parsed specification - res.validate()?; + res.validate().map_err(anyhow::Error::msg)?; Ok(res) } } @@ -1195,12 +1217,10 @@ impl ComputeNode { fn try_get_basebackup(&self, compute_state: &ComputeState, lsn: Lsn) -> Result<()> { let spec = compute_state.pspec.as_ref().expect("spec must be set"); - let shard0_connstr = spec.pageserver_connstr.split(',').next().unwrap(); let started = Instant::now(); - - let (connected, size) = match PageserverProtocol::from_connstring(shard0_connstr)? 
{ - PageserverProtocol::Libpq => self.try_get_basebackup_libpq(spec, lsn)?, + let (connected, size) = match spec.pageserver_conninfo.prefer_protocol { PageserverProtocol::Grpc => self.try_get_basebackup_grpc(spec, lsn)?, + PageserverProtocol::Libpq => self.try_get_basebackup_libpq(spec, lsn)?, }; self.fix_zenith_signal_neon_signal()?; @@ -1238,23 +1258,20 @@ impl ComputeNode { /// Fetches a basebackup via gRPC. The connstring must use grpc://. Returns the timestamp when /// the connection was established, and the (compressed) size of the basebackup. fn try_get_basebackup_grpc(&self, spec: &ParsedSpec, lsn: Lsn) -> Result<(Instant, usize)> { - let shard0_connstr = spec - .pageserver_connstr - .split(',') - .next() - .unwrap() - .to_string(); - let shard_index = match spec.pageserver_connstr.split(',').count() as u8 { - 0 | 1 => ShardIndex::unsharded(), - count => ShardIndex::new(ShardNumber(0), ShardCount(count)), + let shard0_index = ShardIndex { + shard_number: ShardNumber(0), + shard_count: spec.pageserver_conninfo.shard_count, }; - + let shard0_url = spec + .pageserver_conninfo + .shard_url(ShardNumber(0), PageserverProtocol::Grpc)? + .to_owned(); let (reader, connected) = tokio::runtime::Handle::current().block_on(async move { let mut client = page_api::Client::connect( - shard0_connstr, + shard0_url, spec.tenant_id, spec.timeline_id, - shard_index, + shard0_index, spec.storage_auth_token.clone(), None, // NB: base backups use payload compression ) @@ -1286,7 +1303,9 @@ impl ComputeNode { /// Fetches a basebackup via libpq. The connstring must use postgresql://. Returns the timestamp /// when the connection was established, and the (compressed) size of the basebackup. fn try_get_basebackup_libpq(&self, spec: &ParsedSpec, lsn: Lsn) -> Result<(Instant, usize)> { - let shard0_connstr = spec.pageserver_connstr.split(',').next().unwrap(); + let shard0_connstr = spec + .pageserver_conninfo + .shard_url(ShardNumber(0), PageserverProtocol::Libpq)?; let mut config = postgres::Config::from_str(shard0_connstr)?; // Use the storage auth token from the config file, if given. @@ -1373,10 +1392,7 @@ impl ComputeNode { return result; } Err(ref e) if attempts < max_attempts => { - warn!( - "Failed to get basebackup: {} (attempt {}/{})", - e, attempts, max_attempts - ); + warn!("Failed to get basebackup: {e:?} (attempt {attempts}/{max_attempts})"); std::thread::sleep(std::time::Duration::from_millis(retry_period_ms as u64)); retry_period_ms *= 1.5; } @@ -1589,16 +1605,8 @@ impl ComputeNode { } }; - info!( - "getting basebackup@{} from pageserver {}", - lsn, &pspec.pageserver_connstr - ); - self.get_basebackup(compute_state, lsn).with_context(|| { - format!( - "failed to get basebackup@{} from pageserver {}", - lsn, &pspec.pageserver_connstr - ) - })?; + self.get_basebackup(compute_state, lsn) + .with_context(|| format!("failed to get basebackup@{lsn}"))?; if let Some(settings) = databricks_settings { copy_tls_certificates( @@ -2642,22 +2650,22 @@ LIMIT 100", /// The operation will time out after a specified duration. 
pub fn wait_timeout_while_pageserver_connstr_unchanged(&self, duration: Duration) { let state = self.state.lock().unwrap(); - let old_pageserver_connstr = state + let old_pageserver_conninfo = state .pspec .as_ref() .expect("spec must be set") - .pageserver_connstr + .pageserver_conninfo .clone(); let mut unchanged = true; let _ = self .state_changed .wait_timeout_while(state, duration, |s| { - let pageserver_connstr = &s + let pageserver_conninfo = &s .pspec .as_ref() .expect("spec must be set") - .pageserver_connstr; - unchanged = pageserver_connstr == &old_pageserver_connstr; + .pageserver_conninfo; + unchanged = pageserver_conninfo == &old_pageserver_conninfo; unchanged }) .unwrap(); @@ -2915,7 +2923,10 @@ mod tests { match ParsedSpec::try_from(spec.clone()) { Ok(_p) => panic!("Failed to detect duplicate entry"), - Err(e) => assert!(e.starts_with("duplicate entry in safekeeper_connstrings:")), + Err(e) => assert!( + e.to_string() + .starts_with("duplicate entry in safekeeper_connstrings:") + ), }; } } diff --git a/compute_tools/src/config.rs b/compute_tools/src/config.rs index 55a1eda0b7..e7dde5c5f5 100644 --- a/compute_tools/src/config.rs +++ b/compute_tools/src/config.rs @@ -18,6 +18,8 @@ use crate::pg_helpers::{ }; use crate::tls::{self, SERVER_CRT, SERVER_KEY}; +use utils::shard::{ShardIndex, ShardNumber}; + /// Check that `line` is inside a text file and put it there if it is not. /// Create file if it doesn't exist. pub fn line_in_file(path: &Path, line: &str) -> Result { @@ -69,9 +71,75 @@ pub fn write_postgres_conf( } // Add options for connecting to storage writeln!(file, "# Neon storage settings")?; - if let Some(s) = &spec.pageserver_connstring { - writeln!(file, "neon.pageserver_connstring={}", escape_conf_value(s))?; + writeln!(file)?; + if let Some(conninfo) = &spec.pageserver_connection_info { + let mut libpq_urls: Option> = Some(Vec::new()); + let num_shards = if conninfo.shard_count.0 == 0 { + 1 // unsharded, treat it as a single shard + } else { + conninfo.shard_count.0 + }; + + for shard_number in 0..num_shards { + let shard_index = ShardIndex { + shard_number: ShardNumber(shard_number), + shard_count: conninfo.shard_count, + }; + let info = conninfo.shards.get(&shard_index).ok_or_else(|| { + anyhow::anyhow!( + "shard {shard_index} missing from pageserver_connection_info shard map" + ) + })?; + + let first_pageserver = info + .pageservers + .first() + .expect("must have at least one pageserver"); + + // Add the libpq URL to the array, or if the URL is missing, reset the array + // forgetting any previous entries. All servers must have a libpq URL, or none + // at all. 
+ if let Some(url) = &first_pageserver.libpq_url { + if let Some(ref mut urls) = libpq_urls { + urls.push(url.clone()); + } + } else { + libpq_urls = None + } + } + if let Some(libpq_urls) = libpq_urls { + writeln!( + file, + "# derived from compute spec's pageserver_conninfo field" + )?; + writeln!( + file, + "neon.pageserver_connstring={}", + escape_conf_value(&libpq_urls.join(",")) + )?; + } else { + writeln!(file, "# no neon.pageserver_connstring")?; + } + + if let Some(stripe_size) = conninfo.stripe_size { + writeln!( + file, + "# from compute spec's pageserver_conninfo.stripe_size field" + )?; + writeln!(file, "neon.stripe_size={stripe_size}")?; + } + } else { + if let Some(s) = &spec.pageserver_connstring { + writeln!(file, "# from compute spec's pageserver_connstring field")?; + writeln!(file, "neon.pageserver_connstring={}", escape_conf_value(s))?; + } + + if let Some(stripe_size) = spec.shard_stripe_size { + writeln!(file, "# from compute spec's shard_stripe_size field")?; + writeln!(file, "neon.stripe_size={stripe_size}")?; + } } + if !spec.safekeeper_connstrings.is_empty() { let mut neon_safekeepers_value = String::new(); tracing::info!( diff --git a/compute_tools/src/configurator.rs b/compute_tools/src/configurator.rs index feca8337b2..79eb80c4a0 100644 --- a/compute_tools/src/configurator.rs +++ b/compute_tools/src/configurator.rs @@ -122,8 +122,11 @@ fn configurator_main_loop(compute: &Arc) { // into the type system. assert_eq!(state.status, ComputeStatus::RefreshConfiguration); - if state.pspec.as_ref().map(|ps| ps.pageserver_connstr.clone()) - == Some(pspec.pageserver_connstr.clone()) + if state + .pspec + .as_ref() + .map(|ps| ps.pageserver_conninfo.clone()) + == Some(pspec.pageserver_conninfo.clone()) { info!( "Refresh configuration: Retrieved spec is the same as the current spec. Waiting for control plane to update the spec before attempting reconfiguration." diff --git a/compute_tools/src/lsn_lease.rs b/compute_tools/src/lsn_lease.rs index bb0828429d..6abfea82e0 100644 --- a/compute_tools/src/lsn_lease.rs +++ b/compute_tools/src/lsn_lease.rs @@ -4,14 +4,13 @@ use std::thread; use std::time::{Duration, SystemTime}; use anyhow::{Result, bail}; -use compute_api::spec::{ComputeMode, PageserverProtocol}; -use itertools::Itertools as _; +use compute_api::spec::{ComputeMode, PageserverConnectionInfo, PageserverProtocol}; use pageserver_page_api as page_api; use postgres::{NoTls, SimpleQueryMessage}; use tracing::{info, warn}; use utils::id::{TenantId, TimelineId}; use utils::lsn::Lsn; -use utils::shard::{ShardCount, ShardNumber, TenantShardId}; +use utils::shard::TenantShardId; use crate::compute::ComputeNode; @@ -78,17 +77,16 @@ fn acquire_lsn_lease_with_retry( loop { // Note: List of pageservers is dynamic, need to re-read configs before each attempt. - let (connstrings, auth) = { + let (conninfo, auth) = { let state = compute.state.lock().unwrap(); let spec = state.pspec.as_ref().expect("spec must be set"); ( - spec.pageserver_connstr.clone(), + spec.pageserver_conninfo.clone(), spec.storage_auth_token.clone(), ) }; - let result = - try_acquire_lsn_lease(&connstrings, auth.as_deref(), tenant_id, timeline_id, lsn); + let result = try_acquire_lsn_lease(conninfo, auth.as_deref(), tenant_id, timeline_id, lsn); match result { Ok(Some(res)) => { return Ok(res); @@ -112,35 +110,44 @@ fn acquire_lsn_lease_with_retry( /// Tries to acquire LSN leases on all Pageserver shards. 
fn try_acquire_lsn_lease( - connstrings: &str, + conninfo: PageserverConnectionInfo, auth: Option<&str>, tenant_id: TenantId, timeline_id: TimelineId, lsn: Lsn, ) -> Result> { - let connstrings = connstrings.split(',').collect_vec(); - let shard_count = connstrings.len(); let mut leases = Vec::new(); - for (shard_number, &connstring) in connstrings.iter().enumerate() { - let tenant_shard_id = match shard_count { - 0 | 1 => TenantShardId::unsharded(tenant_id), - shard_count => TenantShardId { - tenant_id, - shard_number: ShardNumber(shard_number as u8), - shard_count: ShardCount::new(shard_count as u8), - }, + for (shard_index, shard) in conninfo.shards.into_iter() { + let tenant_shard_id = TenantShardId { + tenant_id, + shard_number: shard_index.shard_number, + shard_count: shard_index.shard_count, }; - let lease = match PageserverProtocol::from_connstring(connstring)? { - PageserverProtocol::Libpq => { - acquire_lsn_lease_libpq(connstring, auth, tenant_shard_id, timeline_id, lsn)? - } - PageserverProtocol::Grpc => { - acquire_lsn_lease_grpc(connstring, auth, tenant_shard_id, timeline_id, lsn)? - } - }; - leases.push(lease); + // XXX: If there are more than pageserver for the one shard, do we need to get a + // leas on all of them? Currently, that's what we assume, but this is hypothetical + // as of this writing, as we never pass the info for more than one pageserver per + // shard. + for pageserver in shard.pageservers { + let lease = match conninfo.prefer_protocol { + PageserverProtocol::Grpc => acquire_lsn_lease_grpc( + &pageserver.grpc_url.unwrap(), + auth, + tenant_shard_id, + timeline_id, + lsn, + )?, + PageserverProtocol::Libpq => acquire_lsn_lease_libpq( + &pageserver.libpq_url.unwrap(), + auth, + tenant_shard_id, + timeline_id, + lsn, + )?, + }; + leases.push(lease); + } } Ok(leases.into_iter().min().flatten()) diff --git a/control_plane/src/bin/neon_local.rs b/control_plane/src/bin/neon_local.rs index c126835066..95500b0b18 100644 --- a/control_plane/src/bin/neon_local.rs +++ b/control_plane/src/bin/neon_local.rs @@ -19,6 +19,9 @@ use compute_api::requests::ComputeClaimsScope; use compute_api::spec::{ComputeMode, PageserverProtocol}; use control_plane::broker::StorageBroker; use control_plane::endpoint::{ComputeControlPlane, EndpointTerminateMode}; +use control_plane::endpoint::{ + local_pageserver_conf_to_conn_info, tenant_locate_response_to_conn_info, +}; use control_plane::endpoint_storage::{ENDPOINT_STORAGE_DEFAULT_ADDR, EndpointStorage}; use control_plane::local_env; use control_plane::local_env::{ @@ -44,7 +47,6 @@ use pageserver_api::models::{ }; use pageserver_api::shard::{DEFAULT_STRIPE_SIZE, ShardCount, ShardStripeSize, TenantShardId}; use postgres_backend::AuthType; -use postgres_connection::parse_host_port; use safekeeper_api::membership::{SafekeeperGeneration, SafekeeperId}; use safekeeper_api::{ DEFAULT_HTTP_LISTEN_PORT as DEFAULT_SAFEKEEPER_HTTP_PORT, @@ -52,7 +54,6 @@ use safekeeper_api::{ }; use storage_broker::DEFAULT_LISTEN_ADDR as DEFAULT_BROKER_ADDR; use tokio::task::JoinSet; -use url::Host; use utils::auth::{Claims, Scope}; use utils::id::{NodeId, TenantId, TenantTimelineId, TimelineId}; use utils::lsn::Lsn; @@ -1546,62 +1547,41 @@ async fn handle_endpoint(subcmd: &EndpointCmd, env: &local_env::LocalEnv) -> Res )?; } - let (pageservers, stripe_size) = if let Some(pageserver_id) = pageserver_id { - let conf = env.get_pageserver_conf(pageserver_id).unwrap(); - // Use gRPC if requested. 
- let pageserver = if endpoint.grpc { - let grpc_addr = conf.listen_grpc_addr.as_ref().expect("bad config"); - let (host, port) = parse_host_port(grpc_addr)?; - let port = port.unwrap_or(DEFAULT_PAGESERVER_GRPC_PORT); - (PageserverProtocol::Grpc, host, port) - } else { - let (host, port) = parse_host_port(&conf.listen_pg_addr)?; - let port = port.unwrap_or(5432); - (PageserverProtocol::Libpq, host, port) - }; - // If caller is telling us what pageserver to use, this is not a tenant which is - // fully managed by storage controller, therefore not sharded. - (vec![pageserver], DEFAULT_STRIPE_SIZE) + let prefer_protocol = if endpoint.grpc { + PageserverProtocol::Grpc + } else { + PageserverProtocol::Libpq + }; + + let mut pageserver_conninfo = if let Some(ps_id) = pageserver_id { + let conf = env.get_pageserver_conf(ps_id).unwrap(); + local_pageserver_conf_to_conn_info(conf)? } else { // Look up the currently attached location of the tenant, and its striping metadata, // to pass these on to postgres. let storage_controller = StorageController::from_env(env); let locate_result = storage_controller.tenant_locate(endpoint.tenant_id).await?; - let pageservers = futures::future::try_join_all( - locate_result.shards.into_iter().map(|shard| async move { - if let ComputeMode::Static(lsn) = endpoint.mode { - // Initialize LSN leases for static computes. + assert!(!locate_result.shards.is_empty()); + + // Initialize LSN leases for static computes. + if let ComputeMode::Static(lsn) = endpoint.mode { + futures::future::try_join_all(locate_result.shards.iter().map( + |shard| async move { let conf = env.get_pageserver_conf(shard.node_id).unwrap(); let pageserver = PageServerNode::from_env(env, conf); pageserver .http_client .timeline_init_lsn_lease(shard.shard_id, endpoint.timeline_id, lsn) - .await?; - } + .await + }, + )) + .await?; + } - let pageserver = if endpoint.grpc { - ( - PageserverProtocol::Grpc, - Host::parse(&shard.listen_grpc_addr.expect("no gRPC address"))?, - shard.listen_grpc_port.expect("no gRPC port"), - ) - } else { - ( - PageserverProtocol::Libpq, - Host::parse(&shard.listen_pg_addr)?, - shard.listen_pg_port, - ) - }; - anyhow::Ok(pageserver) - }), - ) - .await?; - let stripe_size = locate_result.shard_params.stripe_size; - - (pageservers, stripe_size) + tenant_locate_response_to_conn_info(&locate_result)? 
}; - assert!(!pageservers.is_empty()); + pageserver_conninfo.prefer_protocol = prefer_protocol; let ps_conf = env.get_pageserver_conf(DEFAULT_PAGESERVER_ID)?; let auth_token = if matches!(ps_conf.pg_auth_type, AuthType::NeonJWT) { @@ -1631,9 +1611,8 @@ async fn handle_endpoint(subcmd: &EndpointCmd, env: &local_env::LocalEnv) -> Res endpoint_storage_addr, safekeepers_generation, safekeepers, - pageservers, + pageserver_conninfo, remote_ext_base_url: remote_ext_base_url.clone(), - shard_stripe_size: stripe_size.0 as usize, create_test_user: args.create_test_user, start_timeout: args.start_timeout, autoprewarm: args.autoprewarm, @@ -1650,37 +1629,29 @@ async fn handle_endpoint(subcmd: &EndpointCmd, env: &local_env::LocalEnv) -> Res .endpoints .get(endpoint_id.as_str()) .with_context(|| format!("postgres endpoint {endpoint_id} is not found"))?; - let pageservers = match args.pageserver_id { + let prefer_protocol = if endpoint.grpc { + PageserverProtocol::Grpc + } else { + PageserverProtocol::Libpq + }; + let mut pageserver_conninfo = match args.pageserver_id { Some(pageserver_id) => { - let pageserver = - PageServerNode::from_env(env, env.get_pageserver_conf(pageserver_id)?); - - vec![( - PageserverProtocol::Libpq, - pageserver.pg_connection_config.host().clone(), - pageserver.pg_connection_config.port(), - )] + let conf = env.get_pageserver_conf(pageserver_id)?; + local_pageserver_conf_to_conn_info(conf)? } None => { let storage_controller = StorageController::from_env(env); - storage_controller - .tenant_locate(endpoint.tenant_id) - .await? - .shards - .into_iter() - .map(|shard| { - ( - PageserverProtocol::Libpq, - Host::parse(&shard.listen_pg_addr) - .expect("Storage controller reported malformed host"), - shard.listen_pg_port, - ) - }) - .collect::>() + let locate_result = + storage_controller.tenant_locate(endpoint.tenant_id).await?; + + tenant_locate_response_to_conn_info(&locate_result)? } }; + pageserver_conninfo.prefer_protocol = prefer_protocol; - endpoint.update_pageservers_in_config(pageservers).await?; + endpoint + .update_pageservers_in_config(&pageserver_conninfo) + .await?; } EndpointCmd::Reconfigure(args) => { let endpoint_id = &args.endpoint_id; @@ -1688,51 +1659,30 @@ async fn handle_endpoint(subcmd: &EndpointCmd, env: &local_env::LocalEnv) -> Res .endpoints .get(endpoint_id.as_str()) .with_context(|| format!("postgres endpoint {endpoint_id} is not found"))?; - let pageservers = if let Some(ps_id) = args.endpoint_pageserver_id { - let conf = env.get_pageserver_conf(ps_id)?; - // Use gRPC if requested. - let pageserver = if endpoint.grpc { - let grpc_addr = conf.listen_grpc_addr.as_ref().expect("bad config"); - let (host, port) = parse_host_port(grpc_addr)?; - let port = port.unwrap_or(DEFAULT_PAGESERVER_GRPC_PORT); - (PageserverProtocol::Grpc, host, port) - } else { - let (host, port) = parse_host_port(&conf.listen_pg_addr)?; - let port = port.unwrap_or(5432); - (PageserverProtocol::Libpq, host, port) - }; - vec![pageserver] + + let prefer_protocol = if endpoint.grpc { + PageserverProtocol::Grpc } else { - let storage_controller = StorageController::from_env(env); - storage_controller - .tenant_locate(endpoint.tenant_id) - .await? - .shards - .into_iter() - .map(|shard| { - // Use gRPC if requested. 
- if endpoint.grpc { - ( - PageserverProtocol::Grpc, - Host::parse(&shard.listen_grpc_addr.expect("no gRPC address")) - .expect("bad hostname"), - shard.listen_grpc_port.expect("no gRPC port"), - ) - } else { - ( - PageserverProtocol::Libpq, - Host::parse(&shard.listen_pg_addr).expect("bad hostname"), - shard.listen_pg_port, - ) - } - }) - .collect::>() + PageserverProtocol::Libpq }; + let mut pageserver_conninfo = if let Some(ps_id) = args.endpoint_pageserver_id { + let conf = env.get_pageserver_conf(ps_id)?; + local_pageserver_conf_to_conn_info(conf)? + } else { + // Look up the currently attached location of the tenant, and its striping metadata, + // to pass these on to postgres. + let storage_controller = StorageController::from_env(env); + let locate_result = storage_controller.tenant_locate(endpoint.tenant_id).await?; + + tenant_locate_response_to_conn_info(&locate_result)? + }; + pageserver_conninfo.prefer_protocol = prefer_protocol; + // If --safekeepers argument is given, use only the listed // safekeeper nodes; otherwise all from the env. let safekeepers = parse_safekeepers(&args.safekeepers)?; endpoint - .reconfigure(Some(pageservers), None, safekeepers, None) + .reconfigure(Some(&pageserver_conninfo), safekeepers, None) .await?; } EndpointCmd::RefreshConfiguration(args) => { diff --git a/control_plane/src/endpoint.rs b/control_plane/src/endpoint.rs index 1c7f489d68..814ee2a52f 100644 --- a/control_plane/src/endpoint.rs +++ b/control_plane/src/endpoint.rs @@ -37,7 +37,7 @@ //! //! ``` //! -use std::collections::BTreeMap; +use std::collections::{BTreeMap, HashMap}; use std::fmt::Display; use std::net::{IpAddr, Ipv4Addr, SocketAddr, TcpStream}; use std::path::PathBuf; @@ -58,8 +58,12 @@ use compute_api::responses::{ }; use compute_api::spec::{ Cluster, ComputeAudit, ComputeFeature, ComputeMode, ComputeSpec, Database, PageserverProtocol, - PgIdent, RemoteExtSpec, Role, + PageserverShardInfo, PgIdent, RemoteExtSpec, Role, }; + +// re-export these, because they're used in the reconfigure() function +pub use compute_api::spec::{PageserverConnectionInfo, PageserverShardConnectionInfo}; + use jsonwebtoken::jwk::{ AlgorithmParameters, CommonParameters, EllipticCurve, Jwk, JwkSet, KeyAlgorithm, KeyOperations, OctetKeyPairParameters, OctetKeyPairType, PublicKeyUse, @@ -74,9 +78,11 @@ use sha2::{Digest, Sha256}; use spki::der::Decode; use spki::{SubjectPublicKeyInfo, SubjectPublicKeyInfoRef}; use tracing::debug; -use url::Host; use utils::id::{NodeId, TenantId, TimelineId}; -use utils::shard::ShardStripeSize; +use utils::shard::{ShardCount, ShardIndex, ShardNumber}; + +use pageserver_api::config::DEFAULT_GRPC_LISTEN_PORT as DEFAULT_PAGESERVER_GRPC_PORT; +use postgres_connection::parse_host_port; use crate::local_env::LocalEnv; use crate::postgresql_conf::PostgresConf; @@ -387,9 +393,8 @@ pub struct EndpointStartArgs { pub endpoint_storage_addr: String, pub safekeepers_generation: Option, pub safekeepers: Vec, - pub pageservers: Vec<(PageserverProtocol, Host, u16)>, + pub pageserver_conninfo: PageserverConnectionInfo, pub remote_ext_base_url: Option, - pub shard_stripe_size: usize, pub create_test_user: bool, pub start_timeout: Duration, pub autoprewarm: bool, @@ -662,14 +667,6 @@ impl Endpoint { } } - fn build_pageserver_connstr(pageservers: &[(PageserverProtocol, Host, u16)]) -> String { - pageservers - .iter() - .map(|(scheme, host, port)| format!("{scheme}://no_user@{host}:{port}")) - .collect::>() - .join(",") - } - /// Map safekeepers ids to the actual connection strings. 
fn build_safekeepers_connstrs(&self, sk_ids: Vec) -> Result> { let mut safekeeper_connstrings = Vec::new(); @@ -715,9 +712,6 @@ impl Endpoint { std::fs::remove_dir_all(self.pgdata())?; } - let pageserver_connstring = Self::build_pageserver_connstr(&args.pageservers); - assert!(!pageserver_connstring.is_empty()); - let safekeeper_connstrings = self.build_safekeepers_connstrs(args.safekeepers)?; // check for file remote_extensions_spec.json @@ -732,6 +726,44 @@ impl Endpoint { remote_extensions = None; }; + // For the sake of backwards-compatibility, also fill in 'pageserver_connstring' + // + // XXX: I believe this is not really needed, except to make + // test_forward_compatibility happy. + // + // Use a closure so that we can conviniently return None in the middle of the + // loop. + let pageserver_connstring: Option = (|| { + let num_shards = args.pageserver_conninfo.shard_count.count(); + let mut connstrings = Vec::new(); + for shard_no in 0..num_shards { + let shard_index = ShardIndex { + shard_count: args.pageserver_conninfo.shard_count, + shard_number: ShardNumber(shard_no), + }; + let shard = args + .pageserver_conninfo + .shards + .get(&shard_index) + .ok_or_else(|| { + anyhow!( + "shard {} not found in pageserver_connection_info", + shard_index + ) + })?; + let pageserver = shard + .pageservers + .first() + .ok_or(anyhow!("must have at least one pageserver"))?; + if let Some(libpq_url) = &pageserver.libpq_url { + connstrings.push(libpq_url.clone()); + } else { + return Ok::<_, anyhow::Error>(None); + } + } + Ok(Some(connstrings.join(","))) + })()?; + // Create config file let config = { let mut spec = ComputeSpec { @@ -776,13 +808,14 @@ impl Endpoint { branch_id: None, endpoint_id: Some(self.endpoint_id.clone()), mode: self.mode, - pageserver_connstring: Some(pageserver_connstring), + pageserver_connection_info: Some(args.pageserver_conninfo.clone()), + pageserver_connstring, safekeepers_generation: args.safekeepers_generation.map(|g| g.into_inner()), safekeeper_connstrings, storage_auth_token: args.auth_token.clone(), remote_extensions, pgbouncer_settings: None, - shard_stripe_size: Some(args.shard_stripe_size), + shard_stripe_size: args.pageserver_conninfo.stripe_size, // redundant with pageserver_connection_info.stripe_size local_proxy_config: None, reconfigure_concurrency: self.reconfigure_concurrency, drop_subscriptions_before_start: self.drop_subscriptions_before_start, @@ -966,7 +999,7 @@ impl Endpoint { // Update the pageservers in the spec file of the endpoint. This is useful to test the spec refresh scenario. pub async fn update_pageservers_in_config( &self, - pageservers: Vec<(PageserverProtocol, Host, u16)>, + pageserver_conninfo: &PageserverConnectionInfo, ) -> Result<()> { let config_path = self.endpoint_path().join("config.json"); let mut config: ComputeConfig = { @@ -974,10 +1007,8 @@ impl Endpoint { serde_json::from_reader(file)? 
}; - let pageserver_connstring = Self::build_pageserver_connstr(&pageservers); - assert!(!pageserver_connstring.is_empty()); let mut spec = config.spec.unwrap(); - spec.pageserver_connstring = Some(pageserver_connstring); + spec.pageserver_connection_info = Some(pageserver_conninfo.clone()); config.spec = Some(spec); let file = std::fs::File::create(&config_path)?; @@ -1020,8 +1051,7 @@ impl Endpoint { pub async fn reconfigure( &self, - pageservers: Option>, - stripe_size: Option, + pageserver_conninfo: Option<&PageserverConnectionInfo>, safekeepers: Option>, safekeeper_generation: Option, ) -> Result<()> { @@ -1036,15 +1066,15 @@ impl Endpoint { let postgresql_conf = self.read_postgresql_conf()?; spec.cluster.postgresql_conf = Some(postgresql_conf); - // If pageservers are not specified, don't change them. - if let Some(pageservers) = pageservers { - anyhow::ensure!(!pageservers.is_empty(), "no pageservers provided"); - - let pageserver_connstr = Self::build_pageserver_connstr(&pageservers); - spec.pageserver_connstring = Some(pageserver_connstr); - if stripe_size.is_some() { - spec.shard_stripe_size = stripe_size.map(|s| s.0 as usize); - } + if let Some(pageserver_conninfo) = pageserver_conninfo { + // If pageservers are provided, we need to ensure that they are not empty. + // This is a requirement for the compute_ctl configuration. + anyhow::ensure!( + !pageserver_conninfo.shards.is_empty(), + "no pageservers provided" + ); + spec.pageserver_connection_info = Some(pageserver_conninfo.clone()); + spec.shard_stripe_size = pageserver_conninfo.stripe_size; } // If safekeepers are not specified, don't change them. @@ -1093,11 +1123,9 @@ impl Endpoint { pub async fn reconfigure_pageservers( &self, - pageservers: Vec<(PageserverProtocol, Host, u16)>, - stripe_size: Option, + pageservers: &PageserverConnectionInfo, ) -> Result<()> { - self.reconfigure(Some(pageservers), stripe_size, None, None) - .await + self.reconfigure(Some(pageservers), None, None).await } pub async fn reconfigure_safekeepers( @@ -1105,7 +1133,7 @@ impl Endpoint { safekeepers: Vec, generation: SafekeeperGeneration, ) -> Result<()> { - self.reconfigure(None, None, Some(safekeepers), Some(generation)) + self.reconfigure(None, Some(safekeepers), Some(generation)) .await } @@ -1188,3 +1216,84 @@ impl Endpoint { ) } } + +/// If caller is telling us what pageserver to use, this is not a tenant which is +/// fully managed by storage controller, therefore not sharded. 
+pub fn local_pageserver_conf_to_conn_info( + conf: &crate::local_env::PageServerConf, +) -> Result { + let libpq_url = { + let (host, port) = parse_host_port(&conf.listen_pg_addr)?; + let port = port.unwrap_or(5432); + Some(format!("postgres://no_user@{host}:{port}")) + }; + let grpc_url = if let Some(grpc_addr) = &conf.listen_grpc_addr { + let (host, port) = parse_host_port(grpc_addr)?; + let port = port.unwrap_or(DEFAULT_PAGESERVER_GRPC_PORT); + Some(format!("grpc://no_user@{host}:{port}")) + } else { + None + }; + let ps_conninfo = PageserverShardConnectionInfo { + id: Some(conf.id), + libpq_url, + grpc_url, + }; + + let shard_info = PageserverShardInfo { + pageservers: vec![ps_conninfo], + }; + + let shards: HashMap<_, _> = vec![(ShardIndex::unsharded(), shard_info)] + .into_iter() + .collect(); + Ok(PageserverConnectionInfo { + shard_count: ShardCount::unsharded(), + stripe_size: None, + shards, + prefer_protocol: PageserverProtocol::default(), + }) +} + +pub fn tenant_locate_response_to_conn_info( + response: &pageserver_api::controller_api::TenantLocateResponse, +) -> Result { + let mut shards = HashMap::new(); + for shard in response.shards.iter() { + tracing::info!("parsing {}", shard.listen_pg_addr); + let libpq_url = { + let host = &shard.listen_pg_addr; + let port = shard.listen_pg_port; + Some(format!("postgres://no_user@{host}:{port}")) + }; + let grpc_url = if let Some(grpc_addr) = &shard.listen_grpc_addr { + let host = grpc_addr; + let port = shard.listen_grpc_port.expect("no gRPC port"); + Some(format!("grpc://no_user@{host}:{port}")) + } else { + None + }; + + let shard_info = PageserverShardInfo { + pageservers: vec![PageserverShardConnectionInfo { + id: Some(shard.node_id), + libpq_url, + grpc_url, + }], + }; + + shards.insert(shard.shard_id.to_index(), shard_info); + } + + let stripe_size = if response.shard_params.count.is_unsharded() { + None + } else { + Some(response.shard_params.stripe_size) + }; + Ok(PageserverConnectionInfo { + shard_count: response.shard_params.count, + stripe_size, + shards, + prefer_protocol: PageserverProtocol::default(), + }) +} diff --git a/libs/compute_api/src/spec.rs b/libs/compute_api/src/spec.rs index 6709c06fc6..12d825e1bf 100644 --- a/libs/compute_api/src/spec.rs +++ b/libs/compute_api/src/spec.rs @@ -12,8 +12,9 @@ use regex::Regex; use remote_storage::RemotePath; use serde::{Deserialize, Serialize}; use url::Url; -use utils::id::{TenantId, TimelineId}; +use utils::id::{NodeId, TenantId, TimelineId}; use utils::lsn::Lsn; +use utils::shard::{ShardCount, ShardIndex, ShardNumber, ShardStripeSize}; use crate::responses::TlsConfig; @@ -105,8 +106,27 @@ pub struct ComputeSpec { // updated to fill these fields, we can make these non optional. pub tenant_id: Option, pub timeline_id: Option, + + /// Pageserver information can be passed in three different ways: + /// 1. Here in `pageserver_connection_info` + /// 2. In the `pageserver_connstring` field. + /// 3. in `cluster.settings`. + /// + /// The goal is to use method 1. everywhere. But for backwards-compatibility with old + /// versions of the control plane, `compute_ctl` will check 2. and 3. if the + /// `pageserver_connection_info` field is missing. + /// + /// If both `pageserver_connection_info` and `pageserver_connstring`+`shard_stripe_size` are + /// given, they must contain the same information. + pub pageserver_connection_info: Option, + pub pageserver_connstring: Option, + /// Stripe size for pageserver sharding, in pages. 
This is set together with the legacy + /// `pageserver_connstring` field. When the modern `pageserver_connection_info` field is used, + /// the stripe size is stored in `pageserver_connection_info.stripe_size` instead. + pub shard_stripe_size: Option, + // More neon ids that we expose to the compute_ctl // and to postgres as neon extension GUCs. pub project_id: Option, @@ -139,10 +159,6 @@ pub struct ComputeSpec { pub pgbouncer_settings: Option>, - // Stripe size for pageserver sharding, in pages - #[serde(default)] - pub shard_stripe_size: Option, - /// Local Proxy configuration used for JWT authentication #[serde(default)] pub local_proxy_config: Option, @@ -217,6 +233,140 @@ pub enum ComputeFeature { UnknownFeature, } +#[derive(Clone, Debug, Deserialize, Serialize, Eq, PartialEq)] +pub struct PageserverConnectionInfo { + /// NB: 0 for unsharded tenants, 1 for sharded tenants with 1 shard, following storage + pub shard_count: ShardCount, + + /// INVARIANT: null if shard_count is 0, otherwise non-null and immutable + pub stripe_size: Option, + + pub shards: HashMap, + + /// If the compute supports both protocols, this indicates which one it should use. The compute + /// may use other available protocols too, if it doesn't support the preferred one. The URL's + /// for the protocol specified here must be present for all shards, i.e. do not mark a protocol + /// as preferred if it cannot actually be used with all the pageservers. + #[serde(default)] + pub prefer_protocol: PageserverProtocol, +} + +/// Extract PageserverConnectionInfo from a comma-separated list of libpq connection strings. +/// +/// This is used for backwards-compatibility, to parse the legacy +/// [ComputeSpec::pageserver_connstring] field, or the 'neon.pageserver_connstring' GUC. Nowadays, +/// the 'pageserver_connection_info' field should be used instead. +impl PageserverConnectionInfo { + pub fn from_connstr( + connstr: &str, + stripe_size: Option, + ) -> Result { + let shard_infos: Vec<_> = connstr + .split(',') + .map(|connstr| PageserverShardInfo { + pageservers: vec![PageserverShardConnectionInfo { + id: None, + libpq_url: Some(connstr.to_string()), + grpc_url: None, + }], + }) + .collect(); + + match shard_infos.len() { + 0 => anyhow::bail!("empty connection string"), + 1 => { + // We assume that if there's only connection string, it means "unsharded", + // rather than a sharded system with just a single shard. The latter is + // possible in principle, but we never do it. + let shard_count = ShardCount::unsharded(); + let only_shard = shard_infos.first().unwrap().clone(); + let shards = vec![(ShardIndex::unsharded(), only_shard)]; + Ok(PageserverConnectionInfo { + shard_count, + stripe_size: None, + shards: shards.into_iter().collect(), + prefer_protocol: PageserverProtocol::Libpq, + }) + } + n => { + if stripe_size.is_none() { + anyhow::bail!("{n} shards but no stripe_size"); + } + let shard_count = ShardCount(n.try_into()?); + let shards = shard_infos + .into_iter() + .enumerate() + .map(|(idx, shard_info)| { + ( + ShardIndex { + shard_count, + shard_number: ShardNumber( + idx.try_into().expect("shard number fits in u8"), + ), + }, + shard_info, + ) + }) + .collect(); + Ok(PageserverConnectionInfo { + shard_count, + stripe_size, + shards, + prefer_protocol: PageserverProtocol::Libpq, + }) + } + } + } + + /// Convenience routine to get the connection string for a shard. 
+ pub fn shard_url( + &self, + shard_number: ShardNumber, + protocol: PageserverProtocol, + ) -> anyhow::Result<&str> { + let shard_index = ShardIndex { + shard_number, + shard_count: self.shard_count, + }; + let shard = self.shards.get(&shard_index).ok_or_else(|| { + anyhow::anyhow!("shard connection info missing for shard {}", shard_index) + })?; + + // Just use the first pageserver in the list. That's good enough for this + // convenience routine; if you need more control, like round robin policy or + // failover support, roll your own. (As of this writing, we never have more than + // one pageserver per shard anyway, but that will change in the future.) + let pageserver = shard + .pageservers + .first() + .ok_or(anyhow::anyhow!("must have at least one pageserver"))?; + + let result = match protocol { + PageserverProtocol::Grpc => pageserver + .grpc_url + .as_ref() + .ok_or(anyhow::anyhow!("no grpc_url for shard {shard_index}"))?, + PageserverProtocol::Libpq => pageserver + .libpq_url + .as_ref() + .ok_or(anyhow::anyhow!("no libpq_url for shard {shard_index}"))?, + }; + Ok(result) + } +} + +#[derive(Clone, Debug, Deserialize, Serialize, Eq, PartialEq)] +pub struct PageserverShardInfo { + pub pageservers: Vec, +} + +#[derive(Clone, Debug, Deserialize, Serialize, Eq, PartialEq)] +pub struct PageserverShardConnectionInfo { + pub id: Option, + pub libpq_url: Option, + pub grpc_url: Option, +} + #[derive(Clone, Debug, Default, Deserialize, Serialize)] pub struct RemoteExtSpec { pub public_extensions: Option>, @@ -334,6 +484,12 @@ impl ComputeMode { } } +impl Display for ComputeMode { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + f.write_str(self.to_type_str()) + } +} + /// Log level for audit logging #[derive(Clone, Debug, Default, Eq, PartialEq, Deserialize, Serialize)] pub enum ComputeAudit { @@ -470,13 +626,15 @@ pub struct JwksSettings { pub jwt_audience: Option, } -/// Protocol used to connect to a Pageserver. Parsed from the connstring scheme. -#[derive(Clone, Copy, Debug, Default, PartialEq, Eq)] +/// Protocol used to connect to a Pageserver. +#[derive(Clone, Copy, Debug, Default, Deserialize, Serialize, PartialEq, Eq)] pub enum PageserverProtocol { /// The original protocol based on libpq and COPY. Uses postgresql:// or postgres:// scheme. #[default] + #[serde(rename = "libpq")] Libpq, /// A newer, gRPC-based protocol. Uses grpc:// scheme. + #[serde(rename = "grpc")] Grpc, } diff --git a/libs/utils/src/shard.rs b/libs/utils/src/shard.rs index 6ad6cab3a8..90323f7762 100644 --- a/libs/utils/src/shard.rs +++ b/libs/utils/src/shard.rs @@ -59,6 +59,10 @@ impl ShardCount { pub const MAX: Self = Self(u8::MAX); pub const MIN: Self = Self(0); + pub fn unsharded() -> Self { + ShardCount(0) + } + /// The internal value of a ShardCount may be zero, which means "1 shard, but use /// legacy format for TenantShardId that excludes the shard suffix", also known /// as [`TenantShardId::unsharded`]. 
diff --git a/pgxn/neon/libpagestore.c b/pgxn/neon/libpagestore.c index 158118f860..87bdbf376c 100644 --- a/pgxn/neon/libpagestore.c +++ b/pgxn/neon/libpagestore.c @@ -71,7 +71,7 @@ char *neon_project_id; char *neon_branch_id; char *neon_endpoint_id; int32 max_cluster_size; -char *page_server_connstring; +char *pageserver_connstring; char *neon_auth_token; int readahead_buffer_size = 128; @@ -1453,7 +1453,7 @@ PagestoreShmemInit(void) pg_atomic_init_u64(&pagestore_shared->begin_update_counter, 0); pg_atomic_init_u64(&pagestore_shared->end_update_counter, 0); memset(&pagestore_shared->shard_map, 0, sizeof(ShardMap)); - AssignPageserverConnstring(page_server_connstring, NULL); + AssignPageserverConnstring(pageserver_connstring, NULL); } } @@ -1472,7 +1472,7 @@ pg_init_libpagestore(void) DefineCustomStringVariable("neon.pageserver_connstring", "connection string to the page server", NULL, - &page_server_connstring, + &pageserver_connstring, "", PGC_SIGHUP, 0, /* no flags required */ @@ -1643,7 +1643,7 @@ pg_init_libpagestore(void) if (neon_auth_token) neon_log(LOG, "using storage auth token from NEON_AUTH_TOKEN environment variable"); - if (page_server_connstring && page_server_connstring[0]) + if (pageserver_connstring[0]) { neon_log(PageStoreTrace, "set neon_smgr hook"); smgr_hook = smgr_neon; diff --git a/pgxn/neon/pagestore_client.h b/pgxn/neon/pagestore_client.h index 4470d3a94d..bfe00c9285 100644 --- a/pgxn/neon/pagestore_client.h +++ b/pgxn/neon/pagestore_client.h @@ -236,7 +236,7 @@ extern void prefetch_on_ps_disconnect(void); extern page_server_api *page_server; -extern char *page_server_connstring; +extern char *pageserver_connstring; extern int flush_every_n_requests; extern int readahead_buffer_size; extern char *neon_timeline; diff --git a/storage_controller/src/compute_hook.rs b/storage_controller/src/compute_hook.rs index fb03412f3c..efeb6005d5 100644 --- a/storage_controller/src/compute_hook.rs +++ b/storage_controller/src/compute_hook.rs @@ -6,13 +6,16 @@ use std::time::Duration; use anyhow::Context; use compute_api::spec::PageserverProtocol; -use control_plane::endpoint::{ComputeControlPlane, EndpointStatus}; +use compute_api::spec::PageserverShardInfo; +use control_plane::endpoint::{ + ComputeControlPlane, EndpointStatus, PageserverConnectionInfo, PageserverShardConnectionInfo, +}; use control_plane::local_env::LocalEnv; use futures::StreamExt; use hyper::StatusCode; use pageserver_api::config::DEFAULT_GRPC_LISTEN_PORT; use pageserver_api::controller_api::AvailabilityZone; -use pageserver_api::shard::{ShardCount, ShardNumber, ShardStripeSize, TenantShardId}; +use pageserver_api::shard::{ShardCount, ShardIndex, ShardNumber, ShardStripeSize, TenantShardId}; use postgres_connection::parse_host_port; use safekeeper_api::membership::SafekeeperGeneration; use serde::{Deserialize, Serialize}; @@ -506,27 +509,64 @@ impl ApiMethod for ComputeHookTenant { if endpoint.tenant_id == *tenant_id && endpoint.status() == EndpointStatus::Running { tracing::info!("Reconfiguring pageservers for endpoint {endpoint_name}"); - let pageservers = shards - .iter() - .map(|shard| { - let ps_conf = env - .get_pageserver_conf(shard.node_id) - .expect("Unknown pageserver"); - if endpoint.grpc { - let addr = ps_conf.listen_grpc_addr.as_ref().expect("no gRPC address"); - let (host, port) = parse_host_port(addr).expect("invalid gRPC address"); - let port = port.unwrap_or(DEFAULT_GRPC_LISTEN_PORT); - (PageserverProtocol::Grpc, host, port) - } else { - let (host, port) = 
parse_host_port(&ps_conf.listen_pg_addr) - .expect("Unable to parse listen_pg_addr"); - (PageserverProtocol::Libpq, host, port.unwrap_or(5432)) - } - }) - .collect::>(); + let shard_count = match shards.len() { + 1 => ShardCount::unsharded(), + n => ShardCount(n.try_into().expect("too many shards")), + }; + + let mut shard_infos: HashMap = HashMap::new(); + + let prefer_protocol = if endpoint.grpc { + PageserverProtocol::Grpc + } else { + PageserverProtocol::Libpq + }; + + for shard in shards.iter() { + let ps_conf = env + .get_pageserver_conf(shard.node_id) + .expect("Unknown pageserver"); + + let libpq_url = Some({ + let (host, port) = parse_host_port(&ps_conf.listen_pg_addr) + .expect("Unable to parse listen_pg_addr"); + let port = port.unwrap_or(5432); + format!("postgres://no_user@{host}:{port}") + }); + let grpc_url = if let Some(grpc_addr) = &ps_conf.listen_grpc_addr { + let (host, port) = + parse_host_port(grpc_addr).expect("invalid gRPC address"); + let port = port.unwrap_or(DEFAULT_GRPC_LISTEN_PORT); + Some(format!("grpc://no_user@{host}:{port}")) + } else { + None + }; + let pageserver = PageserverShardConnectionInfo { + id: Some(shard.node_id), + libpq_url, + grpc_url, + }; + let shard_info = PageserverShardInfo { + pageservers: vec![pageserver], + }; + shard_infos.insert( + ShardIndex { + shard_number: shard.shard_number, + shard_count, + }, + shard_info, + ); + } + + let pageserver_conninfo = PageserverConnectionInfo { + shard_count, + stripe_size: stripe_size.map(|val| ShardStripeSize(val.0)), + shards: shard_infos, + prefer_protocol, + }; endpoint - .reconfigure_pageservers(pageservers, *stripe_size) + .reconfigure_pageservers(&pageserver_conninfo) .await .map_err(NotifyError::NeonLocal)?; } diff --git a/test_runner/regress/test_basebackup.py b/test_runner/regress/test_basebackup.py index d1b10ec85d..23b9105617 100644 --- a/test_runner/regress/test_basebackup.py +++ b/test_runner/regress/test_basebackup.py @@ -2,13 +2,15 @@ from __future__ import annotations from typing import TYPE_CHECKING +import pytest from fixtures.utils import wait_until if TYPE_CHECKING: from fixtures.neon_fixtures import NeonEnvBuilder -def test_basebackup_cache(neon_env_builder: NeonEnvBuilder): +@pytest.mark.parametrize("grpc", [True, False]) +def test_basebackup_cache(neon_env_builder: NeonEnvBuilder, grpc: bool): """ Simple test for basebackup cache. 1. Check that we always hit the cache after compute restart. @@ -22,7 +24,7 @@ def test_basebackup_cache(neon_env_builder: NeonEnvBuilder): """ env = neon_env_builder.init_start() - ep = env.endpoints.create("main") + ep = env.endpoints.create("main", grpc=grpc) ps = env.pageserver ps_http = ps.http_client() From b3c1aecd115891753955773257e6fe95efb35242 Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Wed, 30 Jul 2025 15:19:00 +0300 Subject: [PATCH 21/34] tests: Stop endpoints in parallel (#12769) Shaves off a few seconds from tests involving multiple endpoints. --- test_runner/fixtures/neon_fixtures.py | 28 +++++++++++++++++++++------ 1 file changed, 22 insertions(+), 6 deletions(-) diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index 7f59547c73..b5148cbfdc 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -5280,16 +5280,32 @@ class EndpointFactory: ) def stop_all(self, fail_on_error=True) -> Self: - exception = None - for ep in self.endpoints: + """ + Stop all the endpoints in parallel. 
+ """ + + # Note: raising an exception from a task in a task group cancels + # all the other tasks. We don't want that, hence the 'stop_one' + # function catches exceptions and puts them on the 'exceptions' + # list for later processing. + exceptions = [] + + async def stop_one(ep): try: - ep.stop() + await asyncio.to_thread(ep.stop) except Exception as e: log.error(f"Failed to stop endpoint {ep.endpoint_id}: {e}") - exception = e + exceptions.append(e) - if fail_on_error and exception is not None: - raise exception + async def async_stop_all(): + async with asyncio.TaskGroup() as tg: + for ep in self.endpoints: + tg.create_task(stop_one(ep)) + + asyncio.run(async_stop_all()) + + if fail_on_error and exceptions: + raise ExceptionGroup("stopping an endpoint failed", exceptions) return self From e989e0da78baf77d16644099a0f648675160d58d Mon Sep 17 00:00:00 2001 From: Ruslan Talpa Date: Wed, 30 Jul 2025 17:17:51 +0300 Subject: [PATCH 22/34] [proxy] accept jwts when configured as rest_broker (#12777) ## Problem when compiled with rest_broker feature and is_rest_broker=true (but is_auth_broker=false) accept_jwts is set to false ## Summary of changes set the config with ``` accept_jwts: args.is_auth_broker || args.is_rest_broker ``` Co-authored-by: Ruslan Talpa --- proxy/src/binary/proxy.rs | 3 +++ 1 file changed, 3 insertions(+) diff --git a/proxy/src/binary/proxy.rs b/proxy/src/binary/proxy.rs index 29b0ad53f2..583cdc95bf 100644 --- a/proxy/src/binary/proxy.rs +++ b/proxy/src/binary/proxy.rs @@ -700,7 +700,10 @@ fn build_config(args: &ProxyCliArgs) -> anyhow::Result<&'static ProxyConfig> { ip_allowlist_check_enabled: !args.is_private_access_proxy, is_vpc_acccess_proxy: args.is_private_access_proxy, is_auth_broker: args.is_auth_broker, + #[cfg(not(feature = "rest_broker"))] accept_jwts: args.is_auth_broker, + #[cfg(feature = "rest_broker")] + accept_jwts: args.is_auth_broker || args.is_rest_broker, console_redirect_confirmation_timeout: args.webauth_confirmation_timeout, }; From 056056bef0d6ec6858d59e4f872bd51091daa93b Mon Sep 17 00:00:00 2001 From: Suhas Thalanki <54014218+thesuhas@users.noreply.github.com> Date: Wed, 30 Jul 2025 10:33:19 -0400 Subject: [PATCH 23/34] fix(compute): validate `prewarm_local_cache()` input (#12648) ## Problem ``` postgres=> select neon.prewarm_local_cache('\xfcfcfcfc01000000ffffffff070000000000000000000000000000000000000000000000000000000000000000000000000000ff', 1); WARNING: terminating connection because of crash of another server process DETAIL: The postmaster has commanded this server process to roll back the current transaction and exit, because another server process exited abnormally and possibly corrupted shared memory. HINT: In a moment you should be able to reconnect to the database and repeat your command. FATAL: server conn crashed? ``` The function takes a bytea argument and casts it to a C struct, without validating the contents. ## Summary of changes Added validation for number of pages to be prefetched and for the chunks as well. 
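For illustration only (not part of the diff): with this validation in place, re-running the repro from the problem statement should fail with an ordinary ERROR instead of taking down the backend, along these lines (error text taken from the new elog calls; the exact message depends on which check fires first):

```
postgres=> select neon.prewarm_local_cache('\xfcfcfcfc01...ff', 1);  -- same malformed payload as above, abridged here
ERROR:  LFC: Number of pages in file cache state (...) is more than the limit (...)
-- or: ERROR:  LFC: Invalid buffer tag: ...
```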
--- pgxn/neon/file_cache.c | 29 ++++++++++++++++++++++++++--- pgxn/neon/neon_utils.c | 19 +++++++++++++++++++ pgxn/neon/neon_utils.h | 4 ++++ 3 files changed, 49 insertions(+), 3 deletions(-) diff --git a/pgxn/neon/file_cache.c b/pgxn/neon/file_cache.c index 3c680eab86..bb810ada68 100644 --- a/pgxn/neon/file_cache.c +++ b/pgxn/neon/file_cache.c @@ -49,6 +49,7 @@ #include "neon.h" #include "neon_lwlsncache.h" #include "neon_perf_counters.h" +#include "neon_utils.h" #include "pagestore_client.h" #include "communicator.h" @@ -673,8 +674,19 @@ lfc_get_state(size_t max_entries) { if (GET_STATE(entry, j) != UNAVAILABLE) { - BITMAP_SET(bitmap, i*lfc_blocks_per_chunk + j); - n_pages += 1; + /* Validate the buffer tag before including it */ + BufferTag test_tag = entry->key; + test_tag.blockNum += j; + + if (BufferTagIsValid(&test_tag)) + { + BITMAP_SET(bitmap, i*lfc_blocks_per_chunk + j); + n_pages += 1; + } + else + { + elog(ERROR, "LFC: Skipping invalid buffer tag during cache state capture: blockNum=%u", test_tag.blockNum); + } } } if (++i == n_entries) @@ -683,7 +695,7 @@ lfc_get_state(size_t max_entries) Assert(i == n_entries); fcs->n_pages = n_pages; Assert(pg_popcount((char*)bitmap, ((n_entries << lfc_chunk_size_log) + 7)/8) == n_pages); - elog(LOG, "LFC: save state of %d chunks %d pages", (int)n_entries, (int)n_pages); + elog(LOG, "LFC: save state of %d chunks %d pages (validated)", (int)n_entries, (int)n_pages); } LWLockRelease(lfc_lock); @@ -702,6 +714,7 @@ lfc_prewarm(FileCacheState* fcs, uint32 n_workers) size_t n_entries; size_t prewarm_batch = Min(lfc_prewarm_batch, readahead_buffer_size); size_t fcs_size; + uint32_t max_prefetch_pages; dsm_segment *seg; BackgroundWorkerHandle* bgw_handle[MAX_PREWARM_WORKERS]; @@ -746,6 +759,11 @@ lfc_prewarm(FileCacheState* fcs, uint32 n_workers) n_entries = Min(fcs->n_chunks, lfc_prewarm_limit); Assert(n_entries != 0); + max_prefetch_pages = n_entries << fcs_chunk_size_log; + if (fcs->n_pages > max_prefetch_pages) { + elog(ERROR, "LFC: Number of pages in file cache state (%d) is more than the limit (%d)", fcs->n_pages, max_prefetch_pages); + } + LWLockAcquire(lfc_lock, LW_EXCLUSIVE); /* Do not prewarm more entries than LFC limit */ @@ -898,6 +916,11 @@ lfc_prewarm_main(Datum main_arg) { tag = fcs->chunks[snd_idx >> fcs_chunk_size_log]; tag.blockNum += snd_idx & ((1 << fcs_chunk_size_log) - 1); + + if (!BufferTagIsValid(&tag)) { + elog(ERROR, "LFC: Invalid buffer tag: %u", tag.blockNum); + } + if (!lfc_cache_contains(BufTagGetNRelFileInfo(tag), tag.forkNum, tag.blockNum)) { (void)communicator_prefetch_register_bufferv(tag, NULL, 1, NULL); diff --git a/pgxn/neon/neon_utils.c b/pgxn/neon/neon_utils.c index 1fad44bd58..847d380eb3 100644 --- a/pgxn/neon/neon_utils.c +++ b/pgxn/neon/neon_utils.c @@ -183,3 +183,22 @@ alloc_curl_handle(void) } #endif + +/* + * Check if a BufferTag is valid by verifying all its fields are not invalid. 
+ */ +bool +BufferTagIsValid(const BufferTag *tag) +{ + #if PG_MAJORVERSION_NUM >= 16 + return (tag->spcOid != InvalidOid) && + (tag->relNumber != InvalidRelFileNumber) && + (tag->forkNum != InvalidForkNumber) && + (tag->blockNum != InvalidBlockNumber); + #else + return (tag->rnode.spcNode != InvalidOid) && + (tag->rnode.relNode != InvalidOid) && + (tag->forkNum != InvalidForkNumber) && + (tag->blockNum != InvalidBlockNumber); + #endif +} diff --git a/pgxn/neon/neon_utils.h b/pgxn/neon/neon_utils.h index 7480ac28cc..65d280788d 100644 --- a/pgxn/neon/neon_utils.h +++ b/pgxn/neon/neon_utils.h @@ -2,6 +2,7 @@ #define __NEON_UTILS_H__ #include "lib/stringinfo.h" +#include "storage/buf_internals.h" #ifndef WALPROPOSER_LIB #include @@ -16,6 +17,9 @@ void pq_sendint32_le(StringInfo buf, uint32 i); void pq_sendint64_le(StringInfo buf, uint64 i); void disable_core_dump(void); +/* Buffer tag validation function */ +bool BufferTagIsValid(const BufferTag *tag); + #ifndef WALPROPOSER_LIB CURL * alloc_curl_handle(void); From 842a5091d5db4c23aeb29aea070c37ad06b12d63 Mon Sep 17 00:00:00 2001 From: Suhas Thalanki <54014218+thesuhas@users.noreply.github.com> Date: Wed, 30 Jul 2025 11:14:59 -0400 Subject: [PATCH 24/34] [BRC-3051] Walproposer: Safekeeper quorum health metrics (#930) (#12750) Today we don't have any indications (other than spammy logs in PG that nobody monitors) if the Walproposer in PG cannot connect to/get votes from all Safekeepers. This means we don't have signals indicating that the Safekeepers are operating at degraded redundancy. We need these signals. Added plumbing in PG extension so that the `neon_perf_counters` view exports the following gauge metrics on safekeeper health: - `num_configured_safekeepers`: The total number of safekeepers configured in PG. - `num_active_safekeepers`: The number of safekeepers that PG is actively streaming WAL to. An alert should be raised whenever `num_active_safekeepers` < `num_configured_safekeepers`. The metrics are implemented by adding additional state to the Walproposer shared memory keeping track of the active statuses of safekeepers using a simple array. The status of the safekeeper is set to active (1) after the Walproposer acquires a quorum and starts streaming data to the safekeeper, and is set to inactive (0) when the connection with a safekeeper is shut down. We scan the safekeeper status array in Walproposer shared memory when collecting the metrics to produce results for the gauges. Added coverage for the metrics to integration test `test_wal_acceptor.py::test_timeline_disk_usage_limit`. 
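For reference, the new gauges can be read straight from the `neon_perf_counters` view (sketch; assumes the `neon` extension is installed in the database, as the test does):

```
SELECT metric, value
  FROM neon_perf_counters
 WHERE metric IN ('num_active_safekeepers', 'num_configured_safekeepers');
-- Redundancy is degraded whenever num_active_safekeepers < num_configured_safekeepers.
```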
## Problem ## Summary of changes --------- Co-authored-by: William Huang --- libs/walproposer/src/api_bindings.rs | 34 ++++++++++++++++++++++++ libs/walproposer/src/walproposer.rs | 15 +++++++++++ pgxn/neon/neon_perf_counters.c | 27 +++++++++++++++++++ pgxn/neon/walproposer.c | 16 ++++++++++- pgxn/neon/walproposer.h | 26 ++++++++++++++++++ pgxn/neon/walproposer_pg.c | 23 ++++++++++++++++ test_runner/regress/test_wal_acceptor.py | 33 ++++++++++++++++++++--- 7 files changed, 170 insertions(+), 4 deletions(-) diff --git a/libs/walproposer/src/api_bindings.rs b/libs/walproposer/src/api_bindings.rs index 9f88ea6b11..9c90beb379 100644 --- a/libs/walproposer/src/api_bindings.rs +++ b/libs/walproposer/src/api_bindings.rs @@ -341,6 +341,34 @@ extern "C-unwind" fn log_internal( } } +/* BEGIN_HADRON */ +extern "C" fn reset_safekeeper_statuses_for_metrics(wp: *mut WalProposer, num_safekeepers: u32) { + unsafe { + let callback_data = (*(*wp).config).callback_data; + let api = callback_data as *mut Box; + if api.is_null() { + return; + } + (*api).reset_safekeeper_statuses_for_metrics(&mut (*wp), num_safekeepers); + } +} + +extern "C" fn update_safekeeper_status_for_metrics( + wp: *mut WalProposer, + sk_index: u32, + status: u8, +) { + unsafe { + let callback_data = (*(*wp).config).callback_data; + let api = callback_data as *mut Box; + if api.is_null() { + return; + } + (*api).update_safekeeper_status_for_metrics(&mut (*wp), sk_index, status); + } +} +/* END_HADRON */ + #[derive(Debug, PartialEq)] pub enum Level { Debug5, @@ -414,6 +442,10 @@ pub(crate) fn create_api() -> walproposer_api { finish_sync_safekeepers: Some(finish_sync_safekeepers), process_safekeeper_feedback: Some(process_safekeeper_feedback), log_internal: Some(log_internal), + /* BEGIN_HADRON */ + reset_safekeeper_statuses_for_metrics: Some(reset_safekeeper_statuses_for_metrics), + update_safekeeper_status_for_metrics: Some(update_safekeeper_status_for_metrics), + /* END_HADRON */ } } @@ -451,6 +483,8 @@ pub fn empty_shmem() -> crate::bindings::WalproposerShmemState { replica_promote: false, min_ps_feedback: empty_feedback, wal_rate_limiter: empty_wal_rate_limiter, + num_safekeepers: 0, + safekeeper_status: [0; 32], } } diff --git a/libs/walproposer/src/walproposer.rs b/libs/walproposer/src/walproposer.rs index 93bb0d5eb0..8453279c5c 100644 --- a/libs/walproposer/src/walproposer.rs +++ b/libs/walproposer/src/walproposer.rs @@ -159,6 +159,21 @@ pub trait ApiImpl { fn after_election(&self, _wp: &mut WalProposer) { todo!() } + + /* BEGIN_HADRON */ + fn reset_safekeeper_statuses_for_metrics(&self, _wp: &mut WalProposer, _num_safekeepers: u32) { + // Do nothing for testing purposes. + } + + fn update_safekeeper_status_for_metrics( + &self, + _wp: &mut WalProposer, + _sk_index: u32, + _status: u8, + ) { + // Do nothing for testing purposes. + } + /* END_HADRON */ } #[derive(Debug)] diff --git a/pgxn/neon/neon_perf_counters.c b/pgxn/neon/neon_perf_counters.c index fada4cba1e..4527084514 100644 --- a/pgxn/neon/neon_perf_counters.c +++ b/pgxn/neon/neon_perf_counters.c @@ -391,6 +391,12 @@ neon_get_perf_counters(PG_FUNCTION_ARGS) neon_per_backend_counters totals = {0}; metric_t *metrics; + /* BEGIN_HADRON */ + WalproposerShmemState *wp_shmem; + uint32 num_safekeepers; + uint32 num_active_safekeepers; + /* END_HADRON */ + /* We put all the tuples into a tuplestore in one go. 
*/ InitMaterializedSRF(fcinfo, 0); @@ -437,11 +443,32 @@ neon_get_perf_counters(PG_FUNCTION_ARGS) // Not ideal but piggyback our databricks counters into the neon perf counters view // so that we don't need to introduce neon--1.x+1.sql to add a new view. { + // Keeping this code in its own block to work around the C90 "don't mix declarations and code" rule when we define + // the `databricks_metrics` array in the next block. Yes, we are seriously dealing with C90 rules in 2025. + + // Read safekeeper status from wal proposer shared memory first. + // Note that we are taking a mutex when reading from walproposer shared memory so that the total safekeeper count is + // consistent with the active wal acceptors count. Assuming that we don't query this view too often the mutex should + // not be a huge deal. + wp_shmem = GetWalpropShmemState(); + SpinLockAcquire(&wp_shmem->mutex); + num_safekeepers = wp_shmem->num_safekeepers; + num_active_safekeepers = 0; + for (int i = 0; i < num_safekeepers; i++) { + if (wp_shmem->safekeeper_status[i] == 1) { + num_active_safekeepers++; + } + } + SpinLockRelease(&wp_shmem->mutex); + } + { metric_t databricks_metrics[] = { {"sql_index_corruption_count", false, 0, (double) pg_atomic_read_u32(&databricks_metrics_shared->index_corruption_count)}, {"sql_data_corruption_count", false, 0, (double) pg_atomic_read_u32(&databricks_metrics_shared->data_corruption_count)}, {"sql_internal_error_count", false, 0, (double) pg_atomic_read_u32(&databricks_metrics_shared->internal_error_count)}, {"ps_corruption_detected", false, 0, (double) pg_atomic_read_u32(&databricks_metrics_shared->ps_corruption_detected)}, + {"num_active_safekeepers", false, 0.0, (double) num_active_safekeepers}, + {"num_configured_safekeepers", false, 0.0, (double) num_safekeepers}, {NULL, false, 0, 0}, }; for (int i = 0; databricks_metrics[i].name != NULL; i++) diff --git a/pgxn/neon/walproposer.c b/pgxn/neon/walproposer.c index c85a6f4b6f..dd42eaf18e 100644 --- a/pgxn/neon/walproposer.c +++ b/pgxn/neon/walproposer.c @@ -154,7 +154,9 @@ WalProposerCreate(WalProposerConfig *config, walproposer_api api) wp->safekeeper[wp->n_safekeepers].state = SS_OFFLINE; wp->safekeeper[wp->n_safekeepers].active_state = SS_ACTIVE_SEND; wp->safekeeper[wp->n_safekeepers].wp = wp; - + /* BEGIN_HADRON */ + wp->safekeeper[wp->n_safekeepers].index = wp->n_safekeepers; + /* END_HADRON */ { Safekeeper *sk = &wp->safekeeper[wp->n_safekeepers]; int written = 0; @@ -183,6 +185,10 @@ WalProposerCreate(WalProposerConfig *config, walproposer_api api) if (wp->safekeepers_generation > INVALID_GENERATION && wp->config->proto_version < 3) wp_log(FATAL, "enabling generations requires protocol version 3"); wp_log(LOG, "using safekeeper protocol version %d", wp->config->proto_version); + + /* BEGIN_HADRON */ + wp->api.reset_safekeeper_statuses_for_metrics(wp, wp->n_safekeepers); + /* END_HADRON */ /* Fill the greeting package */ wp->greetRequest.pam.tag = 'g'; @@ -355,6 +361,10 @@ ShutdownConnection(Safekeeper *sk) sk->state = SS_OFFLINE; sk->streamingAt = InvalidXLogRecPtr; + /* BEGIN_HADRON */ + sk->wp->api.update_safekeeper_status_for_metrics(sk->wp, sk->index, 0); + /* END_HADRON */ + MembershipConfigurationFree(&sk->greetResponse.mconf); if (sk->voteResponse.termHistory.entries) pfree(sk->voteResponse.termHistory.entries); @@ -1530,6 +1540,10 @@ StartStreaming(Safekeeper *sk) sk->active_state = SS_ACTIVE_SEND; sk->streamingAt = sk->startStreamingAt; + /* BEGIN_HADRON */ + sk->wp->api.update_safekeeper_status_for_metrics(sk->wp, 
sk->index, 1); + /* END_HADRON */ + /* * Donors can only be in SS_ACTIVE state, so we potentially update the * donor when we switch one to SS_ACTIVE. diff --git a/pgxn/neon/walproposer.h b/pgxn/neon/walproposer.h index d6cd532bec..ac42c2925d 100644 --- a/pgxn/neon/walproposer.h +++ b/pgxn/neon/walproposer.h @@ -432,6 +432,10 @@ typedef struct WalproposerShmemState /* BEGIN_HADRON */ /* The WAL rate limiter */ WalRateLimiter wal_rate_limiter; + /* Number of safekeepers in the config */ + uint32 num_safekeepers; + /* Per-safekeeper status flags: 0=inactive, 1=active */ + uint8 safekeeper_status[MAX_SAFEKEEPERS]; /* END_HADRON */ } WalproposerShmemState; @@ -483,6 +487,11 @@ typedef struct Safekeeper char const *host; char const *port; + /* BEGIN_HADRON */ + /* index of this safekeeper in the WalProposer array */ + uint32 index; + /* END_HADRON */ + /* * connection string for connecting/reconnecting. * @@ -731,6 +740,23 @@ typedef struct walproposer_api * handled by elog(). */ void (*log_internal) (WalProposer *wp, int level, const char *line); + + /* + * BEGIN_HADRON + * APIs manipulating shared memory state used for Safekeeper quorum health metrics. + */ + + /* + * Reset the safekeeper statuses in shared memory for metric purposes. + */ + void (*reset_safekeeper_statuses_for_metrics) (WalProposer *wp, uint32 num_safekeepers); + + /* + * Update the safekeeper status in shared memory for metric purposes. + */ + void (*update_safekeeper_status_for_metrics) (WalProposer *wp, uint32 sk_index, uint8 status); + + /* END_HADRON */ } walproposer_api; /* diff --git a/pgxn/neon/walproposer_pg.c b/pgxn/neon/walproposer_pg.c index da86c5d498..47b5ec523f 100644 --- a/pgxn/neon/walproposer_pg.c +++ b/pgxn/neon/walproposer_pg.c @@ -2261,6 +2261,27 @@ GetNeonCurrentClusterSize(void) } uint64 GetNeonCurrentClusterSize(void); +/* BEGIN_HADRON */ +static void +walprop_pg_reset_safekeeper_statuses_for_metrics(WalProposer *wp, uint32 num_safekeepers) +{ + WalproposerShmemState* shmem = wp->api.get_shmem_state(wp); + SpinLockAcquire(&shmem->mutex); + shmem->num_safekeepers = num_safekeepers; + memset(shmem->safekeeper_status, 0, sizeof(shmem->safekeeper_status)); + SpinLockRelease(&shmem->mutex); +} + +static void +walprop_pg_update_safekeeper_status_for_metrics(WalProposer *wp, uint32 sk_index, uint8 status) +{ + WalproposerShmemState* shmem = wp->api.get_shmem_state(wp); + Assert(sk_index < MAX_SAFEKEEPERS); + SpinLockAcquire(&shmem->mutex); + shmem->safekeeper_status[sk_index] = status; + SpinLockRelease(&shmem->mutex); +} +/* END_HADRON */ static const walproposer_api walprop_pg = { .get_shmem_state = walprop_pg_get_shmem_state, @@ -2294,4 +2315,6 @@ static const walproposer_api walprop_pg = { .finish_sync_safekeepers = walprop_pg_finish_sync_safekeepers, .process_safekeeper_feedback = walprop_pg_process_safekeeper_feedback, .log_internal = walprop_pg_log_internal, + .reset_safekeeper_statuses_for_metrics = walprop_pg_reset_safekeeper_statuses_for_metrics, + .update_safekeeper_status_for_metrics = walprop_pg_update_safekeeper_status_for_metrics, }; diff --git a/test_runner/regress/test_wal_acceptor.py b/test_runner/regress/test_wal_acceptor.py index c691087259..33d308fb5a 100644 --- a/test_runner/regress/test_wal_acceptor.py +++ b/test_runner/regress/test_wal_acceptor.py @@ -2742,6 +2742,7 @@ def test_pull_timeline_partial_segment_integrity(neon_env_builder: NeonEnvBuilde wait_until(unevicted) +@pytest.mark.skip(reason="Lakebase mode") def test_timeline_disk_usage_limit(neon_env_builder: NeonEnvBuilder): """ 
Test that the timeline disk usage circuit breaker works as expected. We test that: @@ -2757,18 +2758,32 @@ def test_timeline_disk_usage_limit(neon_env_builder: NeonEnvBuilder): remote_storage_kind = s3_storage() neon_env_builder.enable_safekeeper_remote_storage(remote_storage_kind) - # Set a very small disk usage limit (1KB) - neon_env_builder.safekeeper_extra_opts = ["--max-timeline-disk-usage-bytes=1024"] - env = neon_env_builder.init_start() # Create a timeline and endpoint env.create_branch("test_timeline_disk_usage_limit") endpoint = env.endpoints.create_start("test_timeline_disk_usage_limit") + # Install the neon extension in the test database. We need it to query perf counter metrics. + with closing(endpoint.connect()) as conn: + with conn.cursor() as cur: + cur.execute("CREATE EXTENSION IF NOT EXISTS neon") + # Sanity-check safekeeper connection status in neon_perf_counters in the happy case. + cur.execute( + "SELECT value FROM neon_perf_counters WHERE metric = 'num_active_safekeepers'" + ) + assert cur.fetchone() == (1,), "Expected 1 active safekeeper" + cur.execute( + "SELECT value FROM neon_perf_counters WHERE metric = 'num_configured_safekeepers'" + ) + assert cur.fetchone() == (1,), "Expected 1 configured safekeeper" + # Get the safekeeper sk = env.safekeepers[0] + # Restart the safekeeper with a very small disk usage limit (1KB) + sk.stop().start(["--max-timeline-disk-usage-bytes=1024"]) + # Inject a failpoint to stop WAL backup with sk.http_client() as http_cli: http_cli.configure_failpoints([("backup-lsn-range-pausable", "pause")]) @@ -2794,6 +2809,18 @@ def test_timeline_disk_usage_limit(neon_env_builder: NeonEnvBuilder): wait_until(error_logged) log.info("Found expected error message in compute log, resuming.") + with closing(endpoint.connect()) as conn: + with conn.cursor() as cur: + # Confirm that neon_perf_counters also indicates that there are no active safekeepers + cur.execute( + "SELECT value FROM neon_perf_counters WHERE metric = 'num_active_safekeepers'" + ) + assert cur.fetchone() == (0,), "Expected 0 active safekeepers" + cur.execute( + "SELECT value FROM neon_perf_counters WHERE metric = 'num_configured_safekeepers'" + ) + assert cur.fetchone() == (1,), "Expected 1 configured safekeeper" + # Sanity check that the hanging insert is indeed still hanging. Otherwise means the circuit breaker we # implemented didn't work as expected. time.sleep(2) From f3a0e4f255631b3c2039265abc465282d9057c4e Mon Sep 17 00:00:00 2001 From: Matthias van de Meent Date: Wed, 30 Jul 2025 17:29:16 +0200 Subject: [PATCH 25/34] Improve specificity with which we apply compute specs (#12773) This makes sure we don't confuse user-controlled functions with PG's builtin functions. 
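Concretely, the exporter queries and migration SQL now schema-qualify the catalog objects they touch, so a user-created relation or function earlier in the search_path can no longer shadow them. One representative hunk:

```
-- before: resolved via search_path
SELECT checkpoints_req FROM pg_stat_bgwriter;

-- after: pinned to the built-in catalog view
SELECT checkpoints_req FROM pg_catalog.pg_stat_bgwriter;
```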
## Problem See https://github.com/neondatabase/cloud/issues/31628 --- .../etc/sql_exporter/checkpoints_req.17.sql | 2 +- compute/etc/sql_exporter/checkpoints_req.sql | 2 +- .../etc/sql_exporter/checkpoints_timed.sql | 2 +- ..._backpressure_throttling_seconds_total.sql | 2 +- .../etc/sql_exporter/compute_current_lsn.sql | 4 +-- .../compute_logical_snapshot_files.sql | 4 +-- .../compute_logical_snapshots_bytes.15.sql | 4 +-- .../compute_logical_snapshots_bytes.sql | 6 ++--- .../sql_exporter/compute_max_connections.sql | 2 +- .../compute_pg_oldest_frozen_xid_age.sql | 4 +-- .../compute_pg_oldest_mxid_age.sql | 4 +-- .../etc/sql_exporter/compute_receive_lsn.sql | 2 +- .../compute_subscriptions_count.sql | 2 +- .../etc/sql_exporter/connection_counts.sql | 2 +- compute/etc/sql_exporter/db_total_size.sql | 4 +-- ...e_working_set_size_windows.autoscaling.sql | 2 +- ...c_approximate_working_set_size_windows.sql | 2 +- .../etc/sql_exporter/lfc_cache_size_limit.sql | 2 +- .../sql_exporter/logical_slot_restart_lsn.sql | 4 +-- compute/etc/sql_exporter/max_cluster_size.sql | 2 +- compute/etc/sql_exporter/pg_stats_userdb.sql | 4 +-- .../sql_exporter/replication_delay_bytes.sql | 2 +- .../replication_delay_seconds.sql | 4 +-- compute/etc/sql_exporter/retained_wal.sql | 10 +++---- compute/etc/sql_exporter/wal_is_lost.sql | 2 +- compute_tools/src/bin/compute_ctl.rs | 2 +- compute_tools/src/checker.rs | 4 +-- compute_tools/src/compute.rs | 27 ++++++++++--------- compute_tools/src/compute_promote.rs | 6 ++--- compute_tools/src/installed_extensions.rs | 4 +-- compute_tools/src/migration.rs | 2 +- .../src/migrations/0002-alter_roles.sql | 14 +++++----- ...create_subscription_to_privileged_role.sql | 2 +- ...plication_for_previously_allowed_roles.sql | 6 ++--- ...nchronization_funcs_to_privileged_role.sql | 2 +- ...0001-add_bypass_rls_to_privileged_role.sql | 2 +- .../src/migrations/tests/0002-alter_roles.sql | 12 ++++----- ...create_subscription_to_privileged_role.sql | 4 +-- ...04-grant_pg_monitor_to_privileged_role.sql | 8 +++--- ...nchronization_funcs_to_privileged_role.sql | 6 ++--- ...ation_origin_status_to_privileged_role.sql | 4 +-- ...t_pg_signal_backend_to_privileged_role.sql | 4 +-- compute_tools/src/monitor.rs | 14 +++++----- compute_tools/src/pg_helpers.rs | 4 +-- compute_tools/src/spec_apply.rs | 14 ++++++---- .../src/sql/add_availabilitycheck_tables.sql | 13 ++++----- .../src/sql/anon_ext_fn_reassign.sql | 9 ++++--- .../src/sql/create_privileged_role.sql | 2 +- compute_tools/src/sql/default_grants.sql | 8 +++--- compute_tools/src/sql/drop_subscriptions.sql | 16 +++++++---- .../src/sql/finalize_drop_subscriptions.sql | 10 +++---- .../sql/pre_drop_role_revoke_privileges.sql | 8 +++--- .../src/sql/set_public_schema_owner.sql | 8 +++--- .../src/sql/unset_template_for_drop_dbs.sql | 4 +-- 54 files changed, 156 insertions(+), 143 deletions(-) diff --git a/compute/etc/sql_exporter/checkpoints_req.17.sql b/compute/etc/sql_exporter/checkpoints_req.17.sql index a4b946e8e2..28c868ff72 100644 --- a/compute/etc/sql_exporter/checkpoints_req.17.sql +++ b/compute/etc/sql_exporter/checkpoints_req.17.sql @@ -1 +1 @@ -SELECT num_requested AS checkpoints_req FROM pg_stat_checkpointer; +SELECT num_requested AS checkpoints_req FROM pg_catalog.pg_stat_checkpointer; diff --git a/compute/etc/sql_exporter/checkpoints_req.sql b/compute/etc/sql_exporter/checkpoints_req.sql index eb8427c883..421448c0de 100644 --- a/compute/etc/sql_exporter/checkpoints_req.sql +++ b/compute/etc/sql_exporter/checkpoints_req.sql @@ -1 +1 @@ 
-SELECT checkpoints_req FROM pg_stat_bgwriter; +SELECT checkpoints_req FROM pg_catalog.pg_stat_bgwriter; diff --git a/compute/etc/sql_exporter/checkpoints_timed.sql b/compute/etc/sql_exporter/checkpoints_timed.sql index c50853134c..bfa9b1b3d6 100644 --- a/compute/etc/sql_exporter/checkpoints_timed.sql +++ b/compute/etc/sql_exporter/checkpoints_timed.sql @@ -1 +1 @@ -SELECT checkpoints_timed FROM pg_stat_bgwriter; +SELECT checkpoints_timed FROM pg_catalog.pg_stat_bgwriter; diff --git a/compute/etc/sql_exporter/compute_backpressure_throttling_seconds_total.sql b/compute/etc/sql_exporter/compute_backpressure_throttling_seconds_total.sql index d97d625d4c..3fe638e489 100644 --- a/compute/etc/sql_exporter/compute_backpressure_throttling_seconds_total.sql +++ b/compute/etc/sql_exporter/compute_backpressure_throttling_seconds_total.sql @@ -1 +1 @@ -SELECT (neon.backpressure_throttling_time()::float8 / 1000000) AS throttled; +SELECT (neon.backpressure_throttling_time()::pg_catalog.float8 / 1000000) AS throttled; diff --git a/compute/etc/sql_exporter/compute_current_lsn.sql b/compute/etc/sql_exporter/compute_current_lsn.sql index be02b8a094..9a042547f0 100644 --- a/compute/etc/sql_exporter/compute_current_lsn.sql +++ b/compute/etc/sql_exporter/compute_current_lsn.sql @@ -1,4 +1,4 @@ SELECT CASE - WHEN pg_catalog.pg_is_in_recovery() THEN (pg_last_wal_replay_lsn() - '0/0')::FLOAT8 - ELSE (pg_current_wal_lsn() - '0/0')::FLOAT8 + WHEN pg_catalog.pg_is_in_recovery() THEN (pg_catalog.pg_last_wal_replay_lsn() - '0/0')::pg_catalog.FLOAT8 + ELSE (pg_catalog.pg_current_wal_lsn() - '0/0')::pg_catalog.FLOAT8 END AS lsn; diff --git a/compute/etc/sql_exporter/compute_logical_snapshot_files.sql b/compute/etc/sql_exporter/compute_logical_snapshot_files.sql index f2454235b7..2224c02d8d 100644 --- a/compute/etc/sql_exporter/compute_logical_snapshot_files.sql +++ b/compute/etc/sql_exporter/compute_logical_snapshot_files.sql @@ -1,7 +1,7 @@ SELECT - (SELECT setting FROM pg_settings WHERE name = 'neon.timeline_id') AS timeline_id, + (SELECT setting FROM pg_catalog.pg_settings WHERE name = 'neon.timeline_id') AS timeline_id, -- Postgres creates temporary snapshot files of the form %X-%X.snap.%d.tmp. -- These temporary snapshot files are renamed to the actual snapshot files -- after they are completely built. We only WAL-log the completely built -- snapshot files - (SELECT COUNT(*) FROM pg_ls_dir('pg_logical/snapshots') AS name WHERE name LIKE '%.snap') AS num_logical_snapshot_files; + (SELECT COUNT(*) FROM pg_catalog.pg_ls_dir('pg_logical/snapshots') AS name WHERE name LIKE '%.snap') AS num_logical_snapshot_files; diff --git a/compute/etc/sql_exporter/compute_logical_snapshots_bytes.15.sql b/compute/etc/sql_exporter/compute_logical_snapshots_bytes.15.sql index 73a9c11405..17cf1228d3 100644 --- a/compute/etc/sql_exporter/compute_logical_snapshots_bytes.15.sql +++ b/compute/etc/sql_exporter/compute_logical_snapshots_bytes.15.sql @@ -1,7 +1,7 @@ SELECT - (SELECT current_setting('neon.timeline_id')) AS timeline_id, + (SELECT pg_catalog.current_setting('neon.timeline_id')) AS timeline_id, -- Postgres creates temporary snapshot files of the form %X-%X.snap.%d.tmp. -- These temporary snapshot files are renamed to the actual snapshot files -- after they are completely built. 
We only WAL-log the completely built -- snapshot files - (SELECT COALESCE(sum(size), 0) FROM pg_ls_logicalsnapdir() WHERE name LIKE '%.snap') AS logical_snapshots_bytes; + (SELECT COALESCE(pg_catalog.sum(size), 0) FROM pg_catalog.pg_ls_logicalsnapdir() WHERE name LIKE '%.snap') AS logical_snapshots_bytes; diff --git a/compute/etc/sql_exporter/compute_logical_snapshots_bytes.sql b/compute/etc/sql_exporter/compute_logical_snapshots_bytes.sql index 16da899de2..33ca1137dc 100644 --- a/compute/etc/sql_exporter/compute_logical_snapshots_bytes.sql +++ b/compute/etc/sql_exporter/compute_logical_snapshots_bytes.sql @@ -1,9 +1,9 @@ SELECT - (SELECT setting FROM pg_settings WHERE name = 'neon.timeline_id') AS timeline_id, + (SELECT setting FROM pg_catalog.pg_settings WHERE name = 'neon.timeline_id') AS timeline_id, -- Postgres creates temporary snapshot files of the form %X-%X.snap.%d.tmp. -- These temporary snapshot files are renamed to the actual snapshot files -- after they are completely built. We only WAL-log the completely built -- snapshot files - (SELECT COALESCE(sum((pg_stat_file('pg_logical/snapshots/' || name, missing_ok => true)).size), 0) - FROM (SELECT * FROM pg_ls_dir('pg_logical/snapshots') WHERE pg_ls_dir LIKE '%.snap') AS name + (SELECT COALESCE(pg_catalog.sum((pg_catalog.pg_stat_file('pg_logical/snapshots/' || name, missing_ok => true)).size), 0) + FROM (SELECT * FROM pg_catalog.pg_ls_dir('pg_logical/snapshots') WHERE pg_ls_dir LIKE '%.snap') AS name ) AS logical_snapshots_bytes; diff --git a/compute/etc/sql_exporter/compute_max_connections.sql b/compute/etc/sql_exporter/compute_max_connections.sql index 99a49483a6..1613c962a2 100644 --- a/compute/etc/sql_exporter/compute_max_connections.sql +++ b/compute/etc/sql_exporter/compute_max_connections.sql @@ -1 +1 @@ -SELECT current_setting('max_connections') as max_connections; +SELECT pg_catalog.current_setting('max_connections') AS max_connections; diff --git a/compute/etc/sql_exporter/compute_pg_oldest_frozen_xid_age.sql b/compute/etc/sql_exporter/compute_pg_oldest_frozen_xid_age.sql index d2281fdd42..e613939e71 100644 --- a/compute/etc/sql_exporter/compute_pg_oldest_frozen_xid_age.sql +++ b/compute/etc/sql_exporter/compute_pg_oldest_frozen_xid_age.sql @@ -1,4 +1,4 @@ SELECT datname database_name, - age(datfrozenxid) frozen_xid_age -FROM pg_database + pg_catalog.age(datfrozenxid) frozen_xid_age +FROM pg_catalog.pg_database ORDER BY frozen_xid_age DESC LIMIT 10; diff --git a/compute/etc/sql_exporter/compute_pg_oldest_mxid_age.sql b/compute/etc/sql_exporter/compute_pg_oldest_mxid_age.sql index ed57894b3a..7949bacfff 100644 --- a/compute/etc/sql_exporter/compute_pg_oldest_mxid_age.sql +++ b/compute/etc/sql_exporter/compute_pg_oldest_mxid_age.sql @@ -1,4 +1,4 @@ SELECT datname database_name, - mxid_age(datminmxid) min_mxid_age -FROM pg_database + pg_catalog.mxid_age(datminmxid) min_mxid_age +FROM pg_catalog.pg_database ORDER BY min_mxid_age DESC LIMIT 10; diff --git a/compute/etc/sql_exporter/compute_receive_lsn.sql b/compute/etc/sql_exporter/compute_receive_lsn.sql index 318b31ab41..fb96056881 100644 --- a/compute/etc/sql_exporter/compute_receive_lsn.sql +++ b/compute/etc/sql_exporter/compute_receive_lsn.sql @@ -1,4 +1,4 @@ SELECT CASE - WHEN pg_catalog.pg_is_in_recovery() THEN (pg_last_wal_receive_lsn() - '0/0')::FLOAT8 + WHEN pg_catalog.pg_is_in_recovery() THEN (pg_catalog.pg_last_wal_receive_lsn() - '0/0')::pg_catalog.FLOAT8 ELSE 0 END AS lsn; diff --git a/compute/etc/sql_exporter/compute_subscriptions_count.sql 
b/compute/etc/sql_exporter/compute_subscriptions_count.sql index 50740cb5df..e380a7acc7 100644 --- a/compute/etc/sql_exporter/compute_subscriptions_count.sql +++ b/compute/etc/sql_exporter/compute_subscriptions_count.sql @@ -1 +1 @@ -SELECT subenabled::text AS enabled, count(*) AS subscriptions_count FROM pg_subscription GROUP BY subenabled; +SELECT subenabled::pg_catalog.text AS enabled, pg_catalog.count(*) AS subscriptions_count FROM pg_catalog.pg_subscription GROUP BY subenabled; diff --git a/compute/etc/sql_exporter/connection_counts.sql b/compute/etc/sql_exporter/connection_counts.sql index 6824480fdb..480c4fb439 100644 --- a/compute/etc/sql_exporter/connection_counts.sql +++ b/compute/etc/sql_exporter/connection_counts.sql @@ -1 +1 @@ -SELECT datname, state, count(*) AS count FROM pg_stat_activity WHERE state <> '' GROUP BY datname, state; +SELECT datname, state, pg_catalog.count(*) AS count FROM pg_catalog.pg_stat_activity WHERE state <> '' GROUP BY datname, state; diff --git a/compute/etc/sql_exporter/db_total_size.sql b/compute/etc/sql_exporter/db_total_size.sql index fe0360ab5c..59205e6ed3 100644 --- a/compute/etc/sql_exporter/db_total_size.sql +++ b/compute/etc/sql_exporter/db_total_size.sql @@ -1,5 +1,5 @@ -SELECT sum(pg_database_size(datname)) AS total -FROM pg_database +SELECT pg_catalog.sum(pg_catalog.pg_database_size(datname)) AS total +FROM pg_catalog.pg_database -- Ignore invalid databases, as we will likely have problems with -- getting their size from the Pageserver. WHERE datconnlimit != -2; diff --git a/compute/etc/sql_exporter/lfc_approximate_working_set_size_windows.autoscaling.sql b/compute/etc/sql_exporter/lfc_approximate_working_set_size_windows.autoscaling.sql index 35fa42c34c..02cb2b4649 100644 --- a/compute/etc/sql_exporter/lfc_approximate_working_set_size_windows.autoscaling.sql +++ b/compute/etc/sql_exporter/lfc_approximate_working_set_size_windows.autoscaling.sql @@ -3,6 +3,6 @@ -- minutes. 
SELECT - x::text as duration_seconds, + x::pg_catalog.text AS duration_seconds, neon.approximate_working_set_size_seconds(x) AS size FROM (SELECT generate_series * 60 AS x FROM generate_series(1, 60)) AS t (x); diff --git a/compute/etc/sql_exporter/lfc_approximate_working_set_size_windows.sql b/compute/etc/sql_exporter/lfc_approximate_working_set_size_windows.sql index 46c7d1610c..aab93d433a 100644 --- a/compute/etc/sql_exporter/lfc_approximate_working_set_size_windows.sql +++ b/compute/etc/sql_exporter/lfc_approximate_working_set_size_windows.sql @@ -3,6 +3,6 @@ SELECT x AS duration, - neon.approximate_working_set_size_seconds(extract('epoch' FROM x::interval)::int) AS size FROM ( + neon.approximate_working_set_size_seconds(extract('epoch' FROM x::pg_catalog.interval)::pg_catalog.int4) AS size FROM ( VALUES ('5m'), ('15m'), ('1h') ) AS t (x); diff --git a/compute/etc/sql_exporter/lfc_cache_size_limit.sql b/compute/etc/sql_exporter/lfc_cache_size_limit.sql index 378904c1fe..41c11e0adc 100644 --- a/compute/etc/sql_exporter/lfc_cache_size_limit.sql +++ b/compute/etc/sql_exporter/lfc_cache_size_limit.sql @@ -1 +1 @@ -SELECT pg_size_bytes(current_setting('neon.file_cache_size_limit')) AS lfc_cache_size_limit; +SELECT pg_catalog.pg_size_bytes(pg_catalog.current_setting('neon.file_cache_size_limit')) AS lfc_cache_size_limit; diff --git a/compute/etc/sql_exporter/logical_slot_restart_lsn.sql b/compute/etc/sql_exporter/logical_slot_restart_lsn.sql index 1b1c038501..8964d0d8ff 100644 --- a/compute/etc/sql_exporter/logical_slot_restart_lsn.sql +++ b/compute/etc/sql_exporter/logical_slot_restart_lsn.sql @@ -1,3 +1,3 @@ -SELECT slot_name, (restart_lsn - '0/0')::FLOAT8 as restart_lsn -FROM pg_replication_slots +SELECT slot_name, (restart_lsn - '0/0')::pg_catalog.FLOAT8 AS restart_lsn +FROM pg_catalog.pg_replication_slots WHERE slot_type = 'logical'; diff --git a/compute/etc/sql_exporter/max_cluster_size.sql b/compute/etc/sql_exporter/max_cluster_size.sql index 2d2355a9a7..d44fdebe38 100644 --- a/compute/etc/sql_exporter/max_cluster_size.sql +++ b/compute/etc/sql_exporter/max_cluster_size.sql @@ -1 +1 @@ -SELECT setting::int AS max_cluster_size FROM pg_settings WHERE name = 'neon.max_cluster_size'; +SELECT setting::pg_catalog.int4 AS max_cluster_size FROM pg_catalog.pg_settings WHERE name = 'neon.max_cluster_size'; diff --git a/compute/etc/sql_exporter/pg_stats_userdb.sql b/compute/etc/sql_exporter/pg_stats_userdb.sql index 12e6c4ae59..1a1e54a7c6 100644 --- a/compute/etc/sql_exporter/pg_stats_userdb.sql +++ b/compute/etc/sql_exporter/pg_stats_userdb.sql @@ -1,13 +1,13 @@ -- We export stats for 10 non-system databases. Without this limit it is too -- easy to abuse the system by creating lots of databases. -SELECT pg_database_size(datname) AS db_size, +SELECT pg_catalog.pg_database_size(datname) AS db_size, deadlocks, tup_inserted AS inserted, tup_updated AS updated, tup_deleted AS deleted, datname -FROM pg_stat_database +FROM pg_catalog.pg_stat_database WHERE datname IN ( SELECT datname FROM pg_database -- Ignore invalid databases, as we will likely have problems with diff --git a/compute/etc/sql_exporter/replication_delay_bytes.sql b/compute/etc/sql_exporter/replication_delay_bytes.sql index 60a6981acd..d3b7aa724b 100644 --- a/compute/etc/sql_exporter/replication_delay_bytes.sql +++ b/compute/etc/sql_exporter/replication_delay_bytes.sql @@ -3,4 +3,4 @@ -- replay LSN may have advanced past the receive LSN we are using for the -- calculation. 
-SELECT GREATEST(0, pg_wal_lsn_diff(pg_last_wal_receive_lsn(), pg_last_wal_replay_lsn())) AS replication_delay_bytes; +SELECT GREATEST(0, pg_catalog.pg_wal_lsn_diff(pg_catalog.pg_last_wal_receive_lsn(), pg_catalog.pg_last_wal_replay_lsn())) AS replication_delay_bytes; diff --git a/compute/etc/sql_exporter/replication_delay_seconds.sql b/compute/etc/sql_exporter/replication_delay_seconds.sql index a76809ad74..af4dd3fd90 100644 --- a/compute/etc/sql_exporter/replication_delay_seconds.sql +++ b/compute/etc/sql_exporter/replication_delay_seconds.sql @@ -1,5 +1,5 @@ SELECT CASE - WHEN pg_last_wal_receive_lsn() = pg_last_wal_replay_lsn() THEN 0 - ELSE GREATEST(0, EXTRACT (EPOCH FROM now() - pg_last_xact_replay_timestamp())) + WHEN pg_catalog.pg_last_wal_receive_lsn() = pg_catalog.pg_last_wal_replay_lsn() THEN 0 + ELSE GREATEST(0, EXTRACT (EPOCH FROM pg_catalog.now() - pg_catalog.pg_last_xact_replay_timestamp())) END AS replication_delay_seconds; diff --git a/compute/etc/sql_exporter/retained_wal.sql b/compute/etc/sql_exporter/retained_wal.sql index 3e2aadfc28..ccb3504d58 100644 --- a/compute/etc/sql_exporter/retained_wal.sql +++ b/compute/etc/sql_exporter/retained_wal.sql @@ -1,10 +1,10 @@ SELECT slot_name, - pg_wal_lsn_diff( + pg_catalog.pg_wal_lsn_diff( CASE - WHEN pg_is_in_recovery() THEN pg_last_wal_replay_lsn() - ELSE pg_current_wal_lsn() + WHEN pg_catalog.pg_is_in_recovery() THEN pg_catalog.pg_last_wal_replay_lsn() + ELSE pg_catalog.pg_current_wal_lsn() END, - restart_lsn)::FLOAT8 AS retained_wal -FROM pg_replication_slots + restart_lsn)::pg_catalog.FLOAT8 AS retained_wal +FROM pg_catalog.pg_replication_slots WHERE active = false; diff --git a/compute/etc/sql_exporter/wal_is_lost.sql b/compute/etc/sql_exporter/wal_is_lost.sql index 5521270851..5a94cc3373 100644 --- a/compute/etc/sql_exporter/wal_is_lost.sql +++ b/compute/etc/sql_exporter/wal_is_lost.sql @@ -4,4 +4,4 @@ SELECT WHEN wal_status = 'lost' THEN 1 ELSE 0 END AS wal_is_lost -FROM pg_replication_slots; +FROM pg_catalog.pg_replication_slots; diff --git a/compute_tools/src/bin/compute_ctl.rs b/compute_tools/src/bin/compute_ctl.rs index 2b4802f309..f383683ef8 100644 --- a/compute_tools/src/bin/compute_ctl.rs +++ b/compute_tools/src/bin/compute_ctl.rs @@ -279,7 +279,7 @@ fn main() -> Result<()> { config, )?; - let exit_code = compute_node.run()?; + let exit_code = compute_node.run().context("running compute node")?; scenario.teardown(); diff --git a/compute_tools/src/checker.rs b/compute_tools/src/checker.rs index e4207876ac..2458fe3c11 100644 --- a/compute_tools/src/checker.rs +++ b/compute_tools/src/checker.rs @@ -24,9 +24,9 @@ pub async fn check_writability(compute: &ComputeNode) -> Result<()> { }); let query = " - INSERT INTO health_check VALUES (1, now()) + INSERT INTO public.health_check VALUES (1, pg_catalog.now()) ON CONFLICT (id) DO UPDATE - SET updated_at = now();"; + SET updated_at = pg_catalog.now();"; match client.simple_query(query).await { Result::Ok(result) => { diff --git a/compute_tools/src/compute.rs b/compute_tools/src/compute.rs index 1033670e2b..f553bf3c1e 100644 --- a/compute_tools/src/compute.rs +++ b/compute_tools/src/compute.rs @@ -583,7 +583,7 @@ impl ComputeNode { // that can affect `compute_ctl` and prevent it from properly configuring the database schema. // Unset them via connection string options before connecting to the database. // N.B. keep it in sync with `ZENITH_OPTIONS` in `get_maintenance_client()`. 
- const EXTRA_OPTIONS: &str = "-c role=cloud_admin -c default_transaction_read_only=off -c search_path=public -c statement_timeout=0 -c pgaudit.log=none"; + const EXTRA_OPTIONS: &str = "-c role=cloud_admin -c default_transaction_read_only=off -c search_path='' -c statement_timeout=0 -c pgaudit.log=none"; let options = match conn_conf.get_options() { // Allow the control plane to override any options set by the // compute @@ -1884,7 +1884,7 @@ impl ComputeNode { // It doesn't matter what were the options before, here we just want // to connect and create a new superuser role. - const ZENITH_OPTIONS: &str = "-c role=zenith_admin -c default_transaction_read_only=off -c search_path=public -c statement_timeout=0"; + const ZENITH_OPTIONS: &str = "-c role=zenith_admin -c default_transaction_read_only=off -c search_path='' -c statement_timeout=0"; zenith_admin_conf.options(ZENITH_OPTIONS); let mut client = @@ -2339,13 +2339,13 @@ impl ComputeNode { let result = client .simple_query( "SELECT - row_to_json(pg_stat_statements) + pg_catalog.row_to_json(pss) FROM - pg_stat_statements + public.pg_stat_statements pss WHERE - userid != 'cloud_admin'::regrole::oid + pss.userid != 'cloud_admin'::pg_catalog.regrole::pg_catalog.oid ORDER BY - (mean_exec_time + mean_plan_time) DESC + (pss.mean_exec_time + pss.mean_plan_time) DESC LIMIT 100", ) .await; @@ -2473,11 +2473,11 @@ LIMIT 100", // check the role grants first - to gracefully handle read-replicas. let select = "SELECT privilege_type - FROM pg_namespace - JOIN LATERAL (SELECT * FROM aclexplode(nspacl) AS x) acl ON true - JOIN pg_user users ON acl.grantee = users.usesysid - WHERE users.usename = $1 - AND nspname = $2"; + FROM pg_catalog.pg_namespace + JOIN LATERAL (SELECT * FROM aclexplode(nspacl) AS x) AS acl ON true + JOIN pg_catalog.pg_user users ON acl.grantee = users.usesysid + WHERE users.usename OPERATOR(pg_catalog.=) $1::pg_catalog.name + AND nspname OPERATOR(pg_catalog.=) $2::pg_catalog.name"; let rows = db_client .query(select, &[role_name, schema_name]) .await @@ -2546,8 +2546,9 @@ LIMIT 100", .await .with_context(|| format!("Failed to execute query: {query}"))?; } else { - let query = - format!("CREATE EXTENSION IF NOT EXISTS {ext_name} WITH VERSION {quoted_version}"); + let query = format!( + "CREATE EXTENSION IF NOT EXISTS {ext_name} WITH SCHEMA public VERSION {quoted_version}" + ); db_client .simple_query(&query) .await diff --git a/compute_tools/src/compute_promote.rs b/compute_tools/src/compute_promote.rs index a34368c531..29195b60e9 100644 --- a/compute_tools/src/compute_promote.rs +++ b/compute_tools/src/compute_promote.rs @@ -78,7 +78,7 @@ impl ComputeNode { const RETRIES: i32 = 20; for i in 0..=RETRIES { let row = client - .query_one("SELECT pg_last_wal_replay_lsn()", &[]) + .query_one("SELECT pg_catalog.pg_last_wal_replay_lsn()", &[]) .await .context("getting last replay lsn")?; let lsn: u64 = row.get::(0).into(); @@ -103,7 +103,7 @@ impl ComputeNode { .await .context("setting safekeepers")?; client - .query("SELECT pg_reload_conf()", &[]) + .query("SELECT pg_catalog.pg_reload_conf()", &[]) .await .context("reloading postgres config")?; @@ -113,7 +113,7 @@ impl ComputeNode { }); let row = client - .query_one("SELECT * FROM pg_promote()", &[]) + .query_one("SELECT * FROM pg_catalog.pg_promote()", &[]) .await .context("pg_promote")?; if !row.get::(0) { diff --git a/compute_tools/src/installed_extensions.rs b/compute_tools/src/installed_extensions.rs index 5f60b711c8..a9ddef58e5 100644 --- a/compute_tools/src/installed_extensions.rs +++ 
b/compute_tools/src/installed_extensions.rs @@ -19,7 +19,7 @@ async fn list_dbs(client: &mut Client) -> Result, PostgresError> { .query( "SELECT datname FROM pg_catalog.pg_database WHERE datallowconn - AND datconnlimit <> - 2 + AND datconnlimit OPERATOR(pg_catalog.<>) (OPERATOR(pg_catalog.-) 2::pg_catalog.int4) LIMIT 500", &[], ) @@ -67,7 +67,7 @@ pub async fn get_installed_extensions( let extensions: Vec<(String, String, i32)> = client .query( - "SELECT extname, extversion, extowner::integer FROM pg_catalog.pg_extension", + "SELECT extname, extversion, extowner::pg_catalog.int4 FROM pg_catalog.pg_extension", &[], ) .await? diff --git a/compute_tools/src/migration.rs b/compute_tools/src/migration.rs index 88d870df97..c8f911b5a6 100644 --- a/compute_tools/src/migration.rs +++ b/compute_tools/src/migration.rs @@ -76,7 +76,7 @@ impl<'m> MigrationRunner<'m> { self.client .simple_query("CREATE SCHEMA IF NOT EXISTS neon_migration") .await?; - self.client.simple_query("CREATE TABLE IF NOT EXISTS neon_migration.migration_id (key INT NOT NULL PRIMARY KEY, id bigint NOT NULL DEFAULT 0)").await?; + self.client.simple_query("CREATE TABLE IF NOT EXISTS neon_migration.migration_id (key pg_catalog.int4 NOT NULL PRIMARY KEY, id pg_catalog.int8 NOT NULL DEFAULT 0)").await?; self.client .simple_query( "INSERT INTO neon_migration.migration_id VALUES (0, 0) ON CONFLICT DO NOTHING", diff --git a/compute_tools/src/migrations/0002-alter_roles.sql b/compute_tools/src/migrations/0002-alter_roles.sql index 367356e6eb..6e28e8d32c 100644 --- a/compute_tools/src/migrations/0002-alter_roles.sql +++ b/compute_tools/src/migrations/0002-alter_roles.sql @@ -15,17 +15,17 @@ DO $$ DECLARE role_name text; BEGIN - FOR role_name IN SELECT rolname FROM pg_roles WHERE pg_has_role(rolname, '{privileged_role_name}', 'member') + FOR role_name IN SELECT rolname FROM pg_catalog.pg_roles WHERE pg_catalog.pg_has_role(rolname, '{privileged_role_name}', 'member') LOOP - RAISE NOTICE 'EXECUTING ALTER ROLE % INHERIT', quote_ident(role_name); - EXECUTE 'ALTER ROLE ' || quote_ident(role_name) || ' INHERIT'; + RAISE NOTICE 'EXECUTING ALTER ROLE % INHERIT', pg_catalog.quote_ident(role_name); + EXECUTE pg_catalog.format('ALTER ROLE %I INHERIT;', role_name); END LOOP; - FOR role_name IN SELECT rolname FROM pg_roles + FOR role_name IN SELECT rolname FROM pg_catalog.pg_roles WHERE - NOT pg_has_role(rolname, '{privileged_role_name}', 'member') AND NOT starts_with(rolname, 'pg_') + NOT pg_catalog.pg_has_role(rolname, '{privileged_role_name}', 'member') AND NOT pg_catalog.starts_with(rolname, 'pg_') LOOP - RAISE NOTICE 'EXECUTING ALTER ROLE % NOBYPASSRLS', quote_ident(role_name); - EXECUTE 'ALTER ROLE ' || quote_ident(role_name) || ' NOBYPASSRLS'; + RAISE NOTICE 'EXECUTING ALTER ROLE % NOBYPASSRLS', pg_catalog.quote_ident(role_name); + EXECUTE pg_catalog.format('ALTER ROLE %I NOBYPASSRLS;', role_name); END LOOP; END $$; diff --git a/compute_tools/src/migrations/0003-grant_pg_create_subscription_to_privileged_role.sql b/compute_tools/src/migrations/0003-grant_pg_create_subscription_to_privileged_role.sql index adf159dc06..d67d6457c6 100644 --- a/compute_tools/src/migrations/0003-grant_pg_create_subscription_to_privileged_role.sql +++ b/compute_tools/src/migrations/0003-grant_pg_create_subscription_to_privileged_role.sql @@ -1,6 +1,6 @@ DO $$ BEGIN - IF (SELECT setting::numeric >= 160000 FROM pg_settings WHERE name = 'server_version_num') THEN + IF (SELECT setting::pg_catalog.numeric >= 160000 FROM pg_catalog.pg_settings WHERE name = 
'server_version_num') THEN EXECUTE 'GRANT pg_create_subscription TO {privileged_role_name}'; END IF; END $$; diff --git a/compute_tools/src/migrations/0009-revoke_replication_for_previously_allowed_roles.sql b/compute_tools/src/migrations/0009-revoke_replication_for_previously_allowed_roles.sql index 47129d65b8..7f74d4ee28 100644 --- a/compute_tools/src/migrations/0009-revoke_replication_for_previously_allowed_roles.sql +++ b/compute_tools/src/migrations/0009-revoke_replication_for_previously_allowed_roles.sql @@ -5,9 +5,9 @@ DO $$ DECLARE role_name TEXT; BEGIN - FOR role_name IN SELECT rolname FROM pg_roles WHERE rolreplication IS TRUE + FOR role_name IN SELECT rolname FROM pg_catalog.pg_roles WHERE rolreplication IS TRUE LOOP - RAISE NOTICE 'EXECUTING ALTER ROLE % NOREPLICATION', quote_ident(role_name); - EXECUTE 'ALTER ROLE ' || quote_ident(role_name) || ' NOREPLICATION'; + RAISE NOTICE 'EXECUTING ALTER ROLE % NOREPLICATION', pg_catalog.quote_ident(role_name); + EXECUTE pg_catalog.format('ALTER ROLE %I NOREPLICATION;', role_name); END LOOP; END $$; diff --git a/compute_tools/src/migrations/0010-grant_snapshot_synchronization_funcs_to_privileged_role.sql b/compute_tools/src/migrations/0010-grant_snapshot_synchronization_funcs_to_privileged_role.sql index 84fcb36391..714bdc735a 100644 --- a/compute_tools/src/migrations/0010-grant_snapshot_synchronization_funcs_to_privileged_role.sql +++ b/compute_tools/src/migrations/0010-grant_snapshot_synchronization_funcs_to_privileged_role.sql @@ -1,6 +1,6 @@ DO $$ BEGIN - IF (SELECT setting::numeric >= 160000 FROM pg_settings WHERE name = 'server_version_num') THEN + IF (SELECT setting::pg_catalog.numeric >= 160000 FROM pg_catalog.pg_settings WHERE name OPERATOR(pg_catalog.=) 'server_version_num'::pg_catalog.text) THEN EXECUTE 'GRANT EXECUTE ON FUNCTION pg_export_snapshot TO {privileged_role_name}'; EXECUTE 'GRANT EXECUTE ON FUNCTION pg_log_standby_snapshot TO {privileged_role_name}'; END IF; diff --git a/compute_tools/src/migrations/tests/0001-add_bypass_rls_to_privileged_role.sql b/compute_tools/src/migrations/tests/0001-add_bypass_rls_to_privileged_role.sql index 0c81cef1c4..b5b209ef5e 100644 --- a/compute_tools/src/migrations/tests/0001-add_bypass_rls_to_privileged_role.sql +++ b/compute_tools/src/migrations/tests/0001-add_bypass_rls_to_privileged_role.sql @@ -2,7 +2,7 @@ DO $$ DECLARE bypassrls boolean; BEGIN - SELECT rolbypassrls INTO bypassrls FROM pg_roles WHERE rolname = 'neon_superuser'; + SELECT rolbypassrls INTO bypassrls FROM pg_catalog.pg_roles WHERE rolname = 'neon_superuser'; IF NOT bypassrls THEN RAISE EXCEPTION 'neon_superuser cannot bypass RLS'; END IF; diff --git a/compute_tools/src/migrations/tests/0002-alter_roles.sql b/compute_tools/src/migrations/tests/0002-alter_roles.sql index 433f7b34f7..1755c9088c 100644 --- a/compute_tools/src/migrations/tests/0002-alter_roles.sql +++ b/compute_tools/src/migrations/tests/0002-alter_roles.sql @@ -4,8 +4,8 @@ DECLARE BEGIN FOR role IN SELECT rolname AS name, rolinherit AS inherit - FROM pg_roles - WHERE pg_has_role(rolname, 'neon_superuser', 'member') + FROM pg_catalog.pg_roles + WHERE pg_catalog.pg_has_role(rolname, 'neon_superuser', 'member') LOOP IF NOT role.inherit THEN RAISE EXCEPTION '% cannot inherit', quote_ident(role.name); @@ -14,12 +14,12 @@ BEGIN FOR role IN SELECT rolname AS name, rolbypassrls AS bypassrls - FROM pg_roles - WHERE NOT pg_has_role(rolname, 'neon_superuser', 'member') - AND NOT starts_with(rolname, 'pg_') + FROM pg_catalog.pg_roles + WHERE NOT 
pg_catalog.pg_has_role(rolname, 'neon_superuser', 'member') + AND NOT pg_catalog.starts_with(rolname, 'pg_') LOOP IF role.bypassrls THEN - RAISE EXCEPTION '% can bypass RLS', quote_ident(role.name); + RAISE EXCEPTION '% can bypass RLS', pg_catalog.quote_ident(role.name); END IF; END LOOP; END $$; diff --git a/compute_tools/src/migrations/tests/0003-grant_pg_create_subscription_to_privileged_role.sql b/compute_tools/src/migrations/tests/0003-grant_pg_create_subscription_to_privileged_role.sql index b164d61295..498770f4fa 100644 --- a/compute_tools/src/migrations/tests/0003-grant_pg_create_subscription_to_privileged_role.sql +++ b/compute_tools/src/migrations/tests/0003-grant_pg_create_subscription_to_privileged_role.sql @@ -1,10 +1,10 @@ DO $$ BEGIN - IF (SELECT current_setting('server_version_num')::numeric < 160000) THEN + IF (SELECT pg_catalog.current_setting('server_version_num')::pg_catalog.numeric < 160000) THEN RETURN; END IF; - IF NOT (SELECT pg_has_role('neon_superuser', 'pg_create_subscription', 'member')) THEN + IF NOT (SELECT pg_catalog.pg_has_role('neon_superuser', 'pg_create_subscription', 'member')) THEN RAISE EXCEPTION 'neon_superuser cannot execute pg_create_subscription'; END IF; END $$; diff --git a/compute_tools/src/migrations/tests/0004-grant_pg_monitor_to_privileged_role.sql b/compute_tools/src/migrations/tests/0004-grant_pg_monitor_to_privileged_role.sql index 3464a2b1cf..ec04cfe199 100644 --- a/compute_tools/src/migrations/tests/0004-grant_pg_monitor_to_privileged_role.sql +++ b/compute_tools/src/migrations/tests/0004-grant_pg_monitor_to_privileged_role.sql @@ -2,12 +2,12 @@ DO $$ DECLARE monitor record; BEGIN - SELECT pg_has_role('neon_superuser', 'pg_monitor', 'member') AS member, + SELECT pg_catalog.pg_has_role('neon_superuser', 'pg_monitor', 'member') AS member, admin_option AS admin INTO monitor - FROM pg_auth_members - WHERE roleid = 'pg_monitor'::regrole - AND member = 'neon_superuser'::regrole; + FROM pg_catalog.pg_auth_members + WHERE roleid = 'pg_monitor'::pg_catalog.regrole + AND member = 'neon_superuser'::pg_catalog.regrole; IF monitor IS NULL THEN RAISE EXCEPTION 'no entry in pg_auth_members for neon_superuser and pg_monitor'; diff --git a/compute_tools/src/migrations/tests/0010-grant_snapshot_synchronization_funcs_to_privileged_role.sql b/compute_tools/src/migrations/tests/0010-grant_snapshot_synchronization_funcs_to_privileged_role.sql index af7f50e95d..f3b28d76c9 100644 --- a/compute_tools/src/migrations/tests/0010-grant_snapshot_synchronization_funcs_to_privileged_role.sql +++ b/compute_tools/src/migrations/tests/0010-grant_snapshot_synchronization_funcs_to_privileged_role.sql @@ -2,11 +2,11 @@ DO $$ DECLARE can_execute boolean; BEGIN - SELECT bool_and(has_function_privilege('neon_superuser', oid, 'execute')) + SELECT pg_catalog.bool_and(pg_catalog.has_function_privilege('neon_superuser', oid, 'execute')) INTO can_execute - FROM pg_proc + FROM pg_catalog.pg_proc WHERE proname IN ('pg_export_snapshot', 'pg_log_standby_snapshot') - AND pronamespace = 'pg_catalog'::regnamespace; + AND pronamespace = 'pg_catalog'::pg_catalog.regnamespace; IF NOT can_execute THEN RAISE EXCEPTION 'neon_superuser cannot execute both pg_export_snapshot and pg_log_standby_snapshot'; END IF; diff --git a/compute_tools/src/migrations/tests/0011-grant_pg_show_replication_origin_status_to_privileged_role.sql b/compute_tools/src/migrations/tests/0011-grant_pg_show_replication_origin_status_to_privileged_role.sql index e55dcdc3b6..197211300b 100644 --- 
a/compute_tools/src/migrations/tests/0011-grant_pg_show_replication_origin_status_to_privileged_role.sql +++ b/compute_tools/src/migrations/tests/0011-grant_pg_show_replication_origin_status_to_privileged_role.sql @@ -2,9 +2,9 @@ DO $$ DECLARE can_execute boolean; BEGIN - SELECT has_function_privilege('neon_superuser', oid, 'execute') + SELECT pg_catalog.has_function_privilege('neon_superuser', oid, 'execute') INTO can_execute - FROM pg_proc + FROM pg_catalog.pg_proc WHERE proname = 'pg_show_replication_origin_status' AND pronamespace = 'pg_catalog'::regnamespace; IF NOT can_execute THEN diff --git a/compute_tools/src/migrations/tests/0012-grant_pg_signal_backend_to_privileged_role.sql b/compute_tools/src/migrations/tests/0012-grant_pg_signal_backend_to_privileged_role.sql index e62b742d30..0f772d67bd 100644 --- a/compute_tools/src/migrations/tests/0012-grant_pg_signal_backend_to_privileged_role.sql +++ b/compute_tools/src/migrations/tests/0012-grant_pg_signal_backend_to_privileged_role.sql @@ -2,10 +2,10 @@ DO $$ DECLARE signal_backend record; BEGIN - SELECT pg_has_role('neon_superuser', 'pg_signal_backend', 'member') AS member, + SELECT pg_catalog.pg_has_role('neon_superuser', 'pg_signal_backend', 'member') AS member, admin_option AS admin INTO signal_backend - FROM pg_auth_members + FROM pg_catalog.pg_auth_members WHERE roleid = 'pg_signal_backend'::regrole AND member = 'neon_superuser'::regrole; diff --git a/compute_tools/src/monitor.rs b/compute_tools/src/monitor.rs index e164f15dba..78ac423a9b 100644 --- a/compute_tools/src/monitor.rs +++ b/compute_tools/src/monitor.rs @@ -407,9 +407,9 @@ fn get_database_stats(cli: &mut Client) -> anyhow::Result<(f64, i64)> { // like `postgres_exporter` use it to query Postgres statistics. // Use explicit 8 bytes type casts to match Rust types. let stats = cli.query_one( - "SELECT coalesce(sum(active_time), 0.0)::float8 AS total_active_time, - coalesce(sum(sessions), 0)::bigint AS total_sessions - FROM pg_stat_database + "SELECT pg_catalog.coalesce(pg_catalog.sum(active_time), 0.0)::pg_catalog.float8 AS total_active_time, + pg_catalog.coalesce(pg_catalog.sum(sessions), 0)::pg_catalog.bigint AS total_sessions + FROM pg_catalog.pg_stat_database WHERE datname NOT IN ( 'postgres', 'template0', @@ -445,11 +445,11 @@ fn get_backends_state_change(cli: &mut Client) -> anyhow::Result> = None; // Get all running client backends except ourself, use RFC3339 DateTime format. let backends = cli.query( - "SELECT state, to_char(state_change, 'YYYY-MM-DD\"T\"HH24:MI:SS.US\"Z\"') AS state_change + "SELECT state, pg_catalog.to_char(state_change, 'YYYY-MM-DD\"T\"HH24:MI:SS.US\"Z\"'::pg_catalog.text) AS state_change FROM pg_stat_activity - WHERE backend_type = 'client backend' - AND pid != pg_backend_pid() - AND usename != 'cloud_admin';", // XXX: find a better way to filter other monitors? + WHERE backend_type OPERATOR(pg_catalog.=) 'client backend'::pg_catalog.text + AND pid OPERATOR(pg_catalog.!=) pg_catalog.pg_backend_pid() + AND usename OPERATOR(pg_catalog.!=) 'cloud_admin'::pg_catalog.name;", // XXX: find a better way to filter other monitors? 
&[], ); diff --git a/compute_tools/src/pg_helpers.rs b/compute_tools/src/pg_helpers.rs index 09bbe89b41..4e16a75181 100644 --- a/compute_tools/src/pg_helpers.rs +++ b/compute_tools/src/pg_helpers.rs @@ -299,9 +299,9 @@ pub async fn get_existing_dbs_async( .query_raw::( "SELECT datname AS name, - (SELECT rolname FROM pg_roles WHERE oid = datdba) AS owner, + (SELECT rolname FROM pg_catalog.pg_roles WHERE oid OPERATOR(pg_catalog.=) datdba) AS owner, NOT datallowconn AS restrict_conn, - datconnlimit = - 2 AS invalid + datconnlimit OPERATOR(pg_catalog.=) (OPERATOR(pg_catalog.-) 2) AS invalid FROM pg_catalog.pg_database;", &[], diff --git a/compute_tools/src/spec_apply.rs b/compute_tools/src/spec_apply.rs index 00f34c1f0e..90c0e234d5 100644 --- a/compute_tools/src/spec_apply.rs +++ b/compute_tools/src/spec_apply.rs @@ -82,7 +82,7 @@ impl ComputeNode { info!("Checking if drop subscription operation was already performed for timeline_id: {}", timeline_id); drop_subscriptions_done = match - client.query("select 1 from neon.drop_subscriptions_done where timeline_id = $1", &[&timeline_id.to_string()]).await { + client.query("select 1 from neon.drop_subscriptions_done where timeline_id OPERATOR(pg_catalog.=) $1", &[&timeline_id.to_string()]).await { Ok(result) => !result.is_empty(), Err(e) => { @@ -1142,7 +1142,9 @@ async fn get_operations<'a>( if let Some(libs) = spec.cluster.settings.find("shared_preload_libraries") { if libs.contains("pg_stat_statements") { return Ok(Box::new(once(Operation { - query: String::from("CREATE EXTENSION IF NOT EXISTS pg_stat_statements"), + query: String::from( + "CREATE EXTENSION IF NOT EXISTS pg_stat_statements WITH SCHEMA public", + ), comment: Some(String::from("create system extensions")), }))); } @@ -1150,11 +1152,13 @@ async fn get_operations<'a>( Ok(Box::new(empty())) } ApplySpecPhase::CreatePgauditExtension => Ok(Box::new(once(Operation { - query: String::from("CREATE EXTENSION IF NOT EXISTS pgaudit"), + query: String::from("CREATE EXTENSION IF NOT EXISTS pgaudit WITH SCHEMA public"), comment: Some(String::from("create pgaudit extensions")), }))), ApplySpecPhase::CreatePgauditlogtofileExtension => Ok(Box::new(once(Operation { - query: String::from("CREATE EXTENSION IF NOT EXISTS pgauditlogtofile"), + query: String::from( + "CREATE EXTENSION IF NOT EXISTS pgauditlogtofile WITH SCHEMA public", + ), comment: Some(String::from("create pgauditlogtofile extensions")), }))), // Disable pgaudit logging for postgres database. 
@@ -1178,7 +1182,7 @@ async fn get_operations<'a>( }, Operation { query: String::from( - "UPDATE pg_extension SET extrelocatable = true WHERE extname = 'neon'", + "UPDATE pg_catalog.pg_extension SET extrelocatable = true WHERE extname OPERATOR(pg_catalog.=) 'neon'::pg_catalog.name AND extrelocatable OPERATOR(pg_catalog.=) false", ), comment: Some(String::from("compat/fix: make neon relocatable")), }, diff --git a/compute_tools/src/sql/add_availabilitycheck_tables.sql b/compute_tools/src/sql/add_availabilitycheck_tables.sql index 7c60690c78..dd27105e16 100644 --- a/compute_tools/src/sql/add_availabilitycheck_tables.sql +++ b/compute_tools/src/sql/add_availabilitycheck_tables.sql @@ -3,16 +3,17 @@ BEGIN IF NOT EXISTS( SELECT 1 FROM pg_catalog.pg_tables - WHERE tablename = 'health_check' + WHERE tablename::pg_catalog.name OPERATOR(pg_catalog.=) 'health_check'::pg_catalog.name + AND schemaname::pg_catalog.name OPERATOR(pg_catalog.=) 'public'::pg_catalog.name ) THEN - CREATE TABLE health_check ( - id serial primary key, - updated_at timestamptz default now() + CREATE TABLE public.health_check ( + id pg_catalog.int4 primary key generated by default as identity, + updated_at pg_catalog.timestamptz default pg_catalog.now() ); - INSERT INTO health_check VALUES (1, now()) + INSERT INTO public.health_check VALUES (1, pg_catalog.now()) ON CONFLICT (id) DO UPDATE - SET updated_at = now(); + SET updated_at = pg_catalog.now(); END IF; END $$ \ No newline at end of file diff --git a/compute_tools/src/sql/anon_ext_fn_reassign.sql b/compute_tools/src/sql/anon_ext_fn_reassign.sql index 3d7b15c590..714a6db1e9 100644 --- a/compute_tools/src/sql/anon_ext_fn_reassign.sql +++ b/compute_tools/src/sql/anon_ext_fn_reassign.sql @@ -2,10 +2,11 @@ DO $$ DECLARE query varchar; BEGIN - FOR query IN SELECT 'ALTER FUNCTION '||nsp.nspname||'.'||p.proname||'('||pg_get_function_identity_arguments(p.oid)||') OWNER TO {db_owner};' - FROM pg_proc p - JOIN pg_namespace nsp ON p.pronamespace = nsp.oid - WHERE nsp.nspname = 'anon' LOOP + FOR query IN + SELECT pg_catalog.format('ALTER FUNCTION %I(%s) OWNER TO {db_owner};', p.oid::regproc, pg_catalog.pg_get_function_identity_arguments(p.oid)) + FROM pg_catalog.pg_proc p + WHERE p.pronamespace OPERATOR(pg_catalog.=) 'anon'::regnamespace::oid + LOOP EXECUTE query; END LOOP; END diff --git a/compute_tools/src/sql/create_privileged_role.sql b/compute_tools/src/sql/create_privileged_role.sql index ac2521445f..a682089cce 100644 --- a/compute_tools/src/sql/create_privileged_role.sql +++ b/compute_tools/src/sql/create_privileged_role.sql @@ -1,6 +1,6 @@ DO $$ BEGIN - IF NOT EXISTS (SELECT FROM pg_catalog.pg_roles WHERE rolname = '{privileged_role_name}') + IF NOT EXISTS (SELECT FROM pg_catalog.pg_roles WHERE rolname OPERATOR(pg_catalog.=) '{privileged_role_name}'::pg_catalog.name) THEN CREATE ROLE {privileged_role_name} {privileges} IN ROLE pg_read_all_data, pg_write_all_data; END IF; diff --git a/compute_tools/src/sql/default_grants.sql b/compute_tools/src/sql/default_grants.sql index 58ebb0690b..d572332270 100644 --- a/compute_tools/src/sql/default_grants.sql +++ b/compute_tools/src/sql/default_grants.sql @@ -4,14 +4,14 @@ $$ IF EXISTS( SELECT nspname FROM pg_catalog.pg_namespace - WHERE nspname = 'public' + WHERE nspname OPERATOR(pg_catalog.=) 'public' ) AND - current_setting('server_version_num')::int / 10000 >= 15 + pg_catalog.current_setting('server_version_num')::int OPERATOR(pg_catalog./) 10000 OPERATOR(pg_catalog.>=) 15 THEN IF EXISTS( SELECT rolname FROM pg_catalog.pg_roles - WHERE 
rolname = 'web_access' + WHERE rolname OPERATOR(pg_catalog.=) 'web_access' ) THEN GRANT CREATE ON SCHEMA public TO web_access; @@ -20,7 +20,7 @@ $$ IF EXISTS( SELECT nspname FROM pg_catalog.pg_namespace - WHERE nspname = 'public' + WHERE nspname OPERATOR(pg_catalog.=) 'public' ) THEN ALTER DEFAULT PRIVILEGES IN SCHEMA public GRANT ALL ON TABLES TO neon_superuser WITH GRANT OPTION; diff --git a/compute_tools/src/sql/drop_subscriptions.sql b/compute_tools/src/sql/drop_subscriptions.sql index f5d9420130..68b3f8b729 100644 --- a/compute_tools/src/sql/drop_subscriptions.sql +++ b/compute_tools/src/sql/drop_subscriptions.sql @@ -2,11 +2,17 @@ DO ${outer_tag}$ DECLARE subname TEXT; BEGIN - LOCK TABLE pg_subscription IN ACCESS EXCLUSIVE MODE; - FOR subname IN SELECT pg_subscription.subname FROM pg_subscription WHERE subdbid = (SELECT oid FROM pg_database WHERE datname = {datname_str}) LOOP - EXECUTE format('ALTER SUBSCRIPTION %I DISABLE;', subname); - EXECUTE format('ALTER SUBSCRIPTION %I SET (slot_name = NONE);', subname); - EXECUTE format('DROP SUBSCRIPTION %I;', subname); + LOCK TABLE pg_catalog.pg_subscription IN ACCESS EXCLUSIVE MODE; + FOR subname IN + SELECT pg_subscription.subname + FROM pg_catalog.pg_subscription + WHERE subdbid OPERATOR(pg_catalog.=) ( + SELECT oid FROM pg_database WHERE datname OPERATOR(pg_catalog.=) {datname_str}::pg_catalog.name + ) + LOOP + EXECUTE pg_catalog.format('ALTER SUBSCRIPTION %I DISABLE;', subname); + EXECUTE pg_catalog.format('ALTER SUBSCRIPTION %I SET (slot_name = NONE);', subname); + EXECUTE pg_catalog.format('DROP SUBSCRIPTION %I;', subname); END LOOP; END; ${outer_tag}$; diff --git a/compute_tools/src/sql/finalize_drop_subscriptions.sql b/compute_tools/src/sql/finalize_drop_subscriptions.sql index 4bb291876f..1a8876ad61 100644 --- a/compute_tools/src/sql/finalize_drop_subscriptions.sql +++ b/compute_tools/src/sql/finalize_drop_subscriptions.sql @@ -3,19 +3,19 @@ BEGIN IF NOT EXISTS( SELECT 1 FROM pg_catalog.pg_tables - WHERE tablename = 'drop_subscriptions_done' - AND schemaname = 'neon' + WHERE tablename OPERATOR(pg_catalog.=) 'drop_subscriptions_done'::pg_catalog.name + AND schemaname OPERATOR(pg_catalog.=) 'neon'::pg_catalog.name ) THEN CREATE TABLE neon.drop_subscriptions_done - (id serial primary key, timeline_id text); + (id pg_catalog.int4 primary key generated by default as identity, timeline_id pg_catalog.text); END IF; -- preserve the timeline_id of the last drop_subscriptions run -- to ensure that the cleanup of a timeline is executed only once. 
-- use upsert to avoid the table bloat in case of cascade branching (branch of a branch) - INSERT INTO neon.drop_subscriptions_done VALUES (1, current_setting('neon.timeline_id')) + INSERT INTO neon.drop_subscriptions_done VALUES (1, pg_catalog.current_setting('neon.timeline_id')) ON CONFLICT (id) DO UPDATE - SET timeline_id = current_setting('neon.timeline_id'); + SET timeline_id = pg_catalog.current_setting('neon.timeline_id')::pg_catalog.text; END $$ diff --git a/compute_tools/src/sql/pre_drop_role_revoke_privileges.sql b/compute_tools/src/sql/pre_drop_role_revoke_privileges.sql index 734607be02..2ed0f94bad 100644 --- a/compute_tools/src/sql/pre_drop_role_revoke_privileges.sql +++ b/compute_tools/src/sql/pre_drop_role_revoke_privileges.sql @@ -15,15 +15,15 @@ BEGIN WHERE schema_name IN ('public') LOOP FOR grantor IN EXECUTE - format( - 'SELECT DISTINCT rtg.grantor FROM information_schema.role_table_grants AS rtg WHERE grantee = %s', + pg_catalog.format( + 'SELECT DISTINCT rtg.grantor FROM information_schema.role_table_grants AS rtg WHERE grantee OPERATOR(pg_catalog.=) %s', -- N.B. this has to be properly dollar-escaped with `pg_quote_dollar()` quote_literal({role_name}) ) LOOP - EXECUTE format('SET LOCAL ROLE %I', grantor); + EXECUTE pg_catalog.format('SET LOCAL ROLE %I', grantor); - revoke_query := format( + revoke_query := pg_catalog.format( 'REVOKE ALL PRIVILEGES ON ALL TABLES IN SCHEMA %I FROM %I GRANTED BY %I', schema, -- N.B. this has to be properly dollar-escaped with `pg_quote_dollar()` diff --git a/compute_tools/src/sql/set_public_schema_owner.sql b/compute_tools/src/sql/set_public_schema_owner.sql index dc502c6d2d..41bd0d4689 100644 --- a/compute_tools/src/sql/set_public_schema_owner.sql +++ b/compute_tools/src/sql/set_public_schema_owner.sql @@ -5,17 +5,17 @@ DO ${outer_tag}$ IF EXISTS( SELECT nspname FROM pg_catalog.pg_namespace - WHERE nspname = 'public' + WHERE nspname OPERATOR(pg_catalog.=) 'public'::pg_catalog.name ) THEN SELECT nspowner::regrole::text FROM pg_catalog.pg_namespace - WHERE nspname = 'public' + WHERE nspname OPERATOR(pg_catalog.=) 'public'::pg_catalog.text INTO schema_owner; - IF schema_owner = 'cloud_admin' OR schema_owner = 'zenith_admin' + IF schema_owner OPERATOR(pg_catalog.=) 'cloud_admin'::pg_catalog.text OR schema_owner OPERATOR(pg_catalog.=) 'zenith_admin'::pg_catalog.text THEN - EXECUTE format('ALTER SCHEMA public OWNER TO %I', {db_owner}); + EXECUTE pg_catalog.format('ALTER SCHEMA public OWNER TO %I', {db_owner}); END IF; END IF; END diff --git a/compute_tools/src/sql/unset_template_for_drop_dbs.sql b/compute_tools/src/sql/unset_template_for_drop_dbs.sql index 36dc648beb..03225d5e64 100644 --- a/compute_tools/src/sql/unset_template_for_drop_dbs.sql +++ b/compute_tools/src/sql/unset_template_for_drop_dbs.sql @@ -3,10 +3,10 @@ DO ${outer_tag}$ IF EXISTS( SELECT 1 FROM pg_catalog.pg_database - WHERE datname = {datname} + WHERE datname OPERATOR(pg_catalog.=) {datname}::pg_catalog.name ) THEN - EXECUTE format('ALTER DATABASE %I is_template false', {datname}); + EXECUTE pg_catalog.format('ALTER DATABASE %I is_template false', {datname}); END IF; END ${outer_tag}$; From eb2741758bbf97616746b16831a72c4531791ee9 Mon Sep 17 00:00:00 2001 From: Erik Grinaker Date: Wed, 30 Jul 2025 18:18:35 +0200 Subject: [PATCH 26/34] storcon: actually update gRPC address on reattach (#12784) ## Problem In #12268, we added Pageserver gRPC addresses to the storage controller. However, we didn't actually persist these in the database. 
## Summary of changes Update the database with the new gRPC address on reattach. --- storage_controller/src/persistence.rs | 8 +++++++- storage_controller/src/service.rs | 4 +++- 2 files changed, 10 insertions(+), 2 deletions(-) diff --git a/storage_controller/src/persistence.rs b/storage_controller/src/persistence.rs index 619b5f69b8..c61ef9ff5d 100644 --- a/storage_controller/src/persistence.rs +++ b/storage_controller/src/persistence.rs @@ -471,11 +471,17 @@ impl Persistence { &self, input_node_id: NodeId, input_https_port: Option, + input_grpc_addr: Option, + input_grpc_port: Option, ) -> DatabaseResult<()> { use crate::schema::nodes::dsl::*; self.update_node( input_node_id, - listen_https_port.eq(input_https_port.map(|x| x as i32)), + ( + listen_https_port.eq(input_https_port.map(|x| x as i32)), + listen_grpc_addr.eq(input_grpc_addr), + listen_grpc_port.eq(input_grpc_port.map(|x| x as i32)), + ), ) .await } diff --git a/storage_controller/src/service.rs b/storage_controller/src/service.rs index 37380b8fbe..2c7e4ee2de 100644 --- a/storage_controller/src/service.rs +++ b/storage_controller/src/service.rs @@ -7813,7 +7813,7 @@ impl Service { register_req.listen_https_port, register_req.listen_pg_addr, register_req.listen_pg_port, - register_req.listen_grpc_addr, + register_req.listen_grpc_addr.clone(), register_req.listen_grpc_port, register_req.availability_zone_id.clone(), self.config.use_https_pageserver_api, @@ -7848,6 +7848,8 @@ impl Service { .update_node_on_registration( register_req.node_id, register_req.listen_https_port, + register_req.listen_grpc_addr, + register_req.listen_grpc_port, ) .await? } From e470997627b25ae6494ffe5ae4abbd35efd7acf6 Mon Sep 17 00:00:00 2001 From: Suhas Thalanki <54014218+thesuhas@users.noreply.github.com> Date: Wed, 30 Jul 2025 15:10:33 -0400 Subject: [PATCH 27/34] enable tests introduced in hadron commits (#12790) Enables skipped tests introduced in hadron integration commits --- test_runner/regress/test_compaction.py | 5 +++-- test_runner/regress/test_wal_acceptor.py | 8 ++++++-- 2 files changed, 9 insertions(+), 4 deletions(-) diff --git a/test_runner/regress/test_compaction.py b/test_runner/regress/test_compaction.py index a780aa2e52..94c18ac548 100644 --- a/test_runner/regress/test_compaction.py +++ b/test_runner/regress/test_compaction.py @@ -863,7 +863,6 @@ def test_pageserver_compaction_circuit_breaker(neon_env_builder: NeonEnvBuilder) assert not env.pageserver.log_contains(".*Circuit breaker failure ended.*") -@pytest.mark.skip(reason="Lakebase mode") def test_ps_corruption_detection_feedback(neon_env_builder: NeonEnvBuilder): """ Test that when the pageserver detects corruption during image layer creation, @@ -890,7 +889,9 @@ def test_ps_corruption_detection_feedback(neon_env_builder: NeonEnvBuilder): timeline_id = env.initial_timeline pageserver_http = env.pageserver.http_client() - workload = Workload(env, tenant_id, timeline_id) + workload = Workload( + env, tenant_id, timeline_id, endpoint_opts={"config_lines": ["neon.lakebase_mode=true"]} + ) workload.init() # Enable the failpoint that will cause image layer creation to fail due to a (simulated) detected diff --git a/test_runner/regress/test_wal_acceptor.py b/test_runner/regress/test_wal_acceptor.py index 33d308fb5a..1e8ca216d0 100644 --- a/test_runner/regress/test_wal_acceptor.py +++ b/test_runner/regress/test_wal_acceptor.py @@ -2742,7 +2742,6 @@ def test_pull_timeline_partial_segment_integrity(neon_env_builder: NeonEnvBuilde wait_until(unevicted) -@pytest.mark.skip(reason="Lakebase 
mode") def test_timeline_disk_usage_limit(neon_env_builder: NeonEnvBuilder): """ Test that the timeline disk usage circuit breaker works as expected. We test that: @@ -2762,7 +2761,12 @@ def test_timeline_disk_usage_limit(neon_env_builder: NeonEnvBuilder): # Create a timeline and endpoint env.create_branch("test_timeline_disk_usage_limit") - endpoint = env.endpoints.create_start("test_timeline_disk_usage_limit") + endpoint = env.endpoints.create_start( + "test_timeline_disk_usage_limit", + config_lines=[ + "neon.lakebase_mode=true", + ], + ) # Install the neon extension in the test database. We need it to query perf counter metrics. with closing(endpoint.connect()) as conn: From 81ddd10be6d18da39332bf51c48a99017ce6f9df Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Wed, 30 Jul 2025 22:56:22 +0300 Subject: [PATCH 28/34] tests: Don't print Hostname on every test connection (#12782) These lines are a significant fraction of the total log size of the regression tests. And it seems very uninteresting, it's always 'localhost' in local tests. --- test_runner/fixtures/neon_fixtures.py | 1 - 1 file changed, 1 deletion(-) diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index b5148cbfdc..11d54f7831 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -262,7 +262,6 @@ class PgProtocol: # pooler does not support statement_timeout # Check if the hostname contains the string 'pooler' hostname = result.get("host", "") - log.info(f"Hostname: {hostname}") options = result.get("options", "") if "statement_timeout" not in options and "pooler" not in hostname: options = f"-cstatement_timeout=120s {options}" From 4d3b28bd2eecaf38b8700281e17d027b7920fdc3 Mon Sep 17 00:00:00 2001 From: Dimitri Fontaine Date: Wed, 30 Jul 2025 23:34:30 +0200 Subject: [PATCH 29/34] [Hadron] Always run databricks auth hook. 
(#12683) --- vendor/postgres-v14 | 2 +- vendor/postgres-v15 | 2 +- vendor/postgres-v16 | 2 +- vendor/postgres-v17 | 2 +- vendor/revisions.json | 8 ++++---- 5 files changed, 8 insertions(+), 8 deletions(-) diff --git a/vendor/postgres-v14 b/vendor/postgres-v14 index c9f9fdd011..2155cb165d 160000 --- a/vendor/postgres-v14 +++ b/vendor/postgres-v14 @@ -1 +1 @@ -Subproject commit c9f9fdd0113b52c0bd535afdb09d3a543aeee25f +Subproject commit 2155cb165d05f617eb2c8ad7e43367189b627703 diff --git a/vendor/postgres-v15 b/vendor/postgres-v15 index aaaeff2550..2aaab3bb4a 160000 --- a/vendor/postgres-v15 +++ b/vendor/postgres-v15 @@ -1 +1 @@ -Subproject commit aaaeff2550d5deba58847f112af9b98fa3a58b00 +Subproject commit 2aaab3bb4a13557aae05bb2ae0ef0a132d0c4f85 diff --git a/vendor/postgres-v16 b/vendor/postgres-v16 index 9b9cb4b3e3..a42351fcd4 160000 --- a/vendor/postgres-v16 +++ b/vendor/postgres-v16 @@ -1 +1 @@ -Subproject commit 9b9cb4b3e33347aea8f61e606bb6569979516de5 +Subproject commit a42351fcd41ea01edede1daed65f651e838988fc diff --git a/vendor/postgres-v17 b/vendor/postgres-v17 index fa1788475e..1e01fcea2a 160000 --- a/vendor/postgres-v17 +++ b/vendor/postgres-v17 @@ -1 +1 @@ -Subproject commit fa1788475e3146cc9c7c6a1b74f48fd296898fcd +Subproject commit 1e01fcea2a6b38180021aa83e0051d95286d9096 diff --git a/vendor/revisions.json b/vendor/revisions.json index 7212c9f7c7..c02c748a72 100644 --- a/vendor/revisions.json +++ b/vendor/revisions.json @@ -1,18 +1,18 @@ { "v17": [ "17.5", - "fa1788475e3146cc9c7c6a1b74f48fd296898fcd" + "1e01fcea2a6b38180021aa83e0051d95286d9096" ], "v16": [ "16.9", - "9b9cb4b3e33347aea8f61e606bb6569979516de5" + "a42351fcd41ea01edede1daed65f651e838988fc" ], "v15": [ "15.13", - "aaaeff2550d5deba58847f112af9b98fa3a58b00" + "2aaab3bb4a13557aae05bb2ae0ef0a132d0c4f85" ], "v14": [ "14.18", - "c9f9fdd0113b52c0bd535afdb09d3a543aeee25f" + "2155cb165d05f617eb2c8ad7e43367189b627703" ] } From 01c39f378e0a81037ead9108ea83feb146310de8 Mon Sep 17 00:00:00 2001 From: Mikhail Date: Wed, 30 Jul 2025 23:05:51 +0100 Subject: [PATCH 30/34] prewarm cancellation (#12785) Add DELETE /lfc/prewarm route which handles ongoing prewarm cancellation, update API spec, add prewarm Cancelled state Add offload Cancelled state when LFC is not initialized --- compute_tools/src/compute.rs | 3 + compute_tools/src/compute_prewarm.rs | 141 ++++++++++++++--------- compute_tools/src/http/openapi_spec.yaml | 11 +- compute_tools/src/http/routes/lfc.rs | 5 + compute_tools/src/http/server.rs | 7 +- libs/compute_api/src/responses.rs | 6 + test_runner/fixtures/endpoint/http.py | 27 +++-- test_runner/regress/test_lfc_prewarm.py | 94 ++++++++++++--- 8 files changed, 208 insertions(+), 86 deletions(-) diff --git a/compute_tools/src/compute.rs b/compute_tools/src/compute.rs index f553bf3c1e..5b1cdb9805 100644 --- a/compute_tools/src/compute.rs +++ b/compute_tools/src/compute.rs @@ -32,6 +32,7 @@ use std::sync::{Arc, Condvar, Mutex, RwLock}; use std::time::{Duration, Instant}; use std::{env, fs}; use tokio::{spawn, sync::watch, task::JoinHandle, time}; +use tokio_util::sync::CancellationToken; use tracing::{Instrument, debug, error, info, instrument, warn}; use url::Url; use utils::id::{TenantId, TimelineId}; @@ -192,6 +193,7 @@ pub struct ComputeState { pub startup_span: Option, pub lfc_prewarm_state: LfcPrewarmState, + pub lfc_prewarm_token: CancellationToken, pub lfc_offload_state: LfcOffloadState, /// WAL flush LSN that is set after terminating Postgres and syncing safekeepers if @@ -217,6 +219,7 @@ impl ComputeState { 
lfc_offload_state: LfcOffloadState::default(), terminate_flush_lsn: None, promote_state: None, + lfc_prewarm_token: CancellationToken::new(), } } diff --git a/compute_tools/src/compute_prewarm.rs b/compute_tools/src/compute_prewarm.rs index 97e62c1c80..82cb28f1ac 100644 --- a/compute_tools/src/compute_prewarm.rs +++ b/compute_tools/src/compute_prewarm.rs @@ -7,7 +7,8 @@ use http::StatusCode; use reqwest::Client; use std::mem::replace; use std::sync::Arc; -use tokio::{io::AsyncReadExt, spawn}; +use tokio::{io::AsyncReadExt, select, spawn}; +use tokio_util::sync::CancellationToken; use tracing::{error, info}; #[derive(serde::Serialize, Default)] @@ -92,34 +93,35 @@ impl ComputeNode { /// If there is a prewarm request ongoing, return `false`, `true` otherwise. /// Has a failpoint "compute-prewarm" pub fn prewarm_lfc(self: &Arc, from_endpoint: Option) -> bool { + let token: CancellationToken; { - let state = &mut self.state.lock().unwrap().lfc_prewarm_state; - if let LfcPrewarmState::Prewarming = replace(state, LfcPrewarmState::Prewarming) { + let state = &mut self.state.lock().unwrap(); + token = state.lfc_prewarm_token.clone(); + if let LfcPrewarmState::Prewarming = + replace(&mut state.lfc_prewarm_state, LfcPrewarmState::Prewarming) + { return false; } } crate::metrics::LFC_PREWARMS.inc(); - let cloned = self.clone(); + let this = self.clone(); spawn(async move { - let state = match cloned.prewarm_impl(from_endpoint).await { - Ok(true) => LfcPrewarmState::Completed, - Ok(false) => { - info!( - "skipping LFC prewarm because LFC state is not found in endpoint storage" - ); - LfcPrewarmState::Skipped - } + let prewarm_state = match this.prewarm_impl(from_endpoint, token).await { + Ok(state) => state, Err(err) => { crate::metrics::LFC_PREWARM_ERRORS.inc(); error!(%err, "could not prewarm LFC"); - LfcPrewarmState::Failed { - error: format!("{err:#}"), - } + let error = format!("{err:#}"); + LfcPrewarmState::Failed { error } } }; - cloned.state.lock().unwrap().lfc_prewarm_state = state; + let state = &mut this.state.lock().unwrap(); + if let LfcPrewarmState::Cancelled = prewarm_state { + state.lfc_prewarm_token = CancellationToken::new(); + } + state.lfc_prewarm_state = prewarm_state; }); true } @@ -132,47 +134,70 @@ impl ComputeNode { /// Request LFC state from endpoint storage and load corresponding pages into Postgres. /// Returns a result with `false` if the LFC state is not found in endpoint storage. - async fn prewarm_impl(&self, from_endpoint: Option) -> Result { - let EndpointStoragePair { url, token } = self.endpoint_storage_pair(from_endpoint)?; + async fn prewarm_impl( + &self, + from_endpoint: Option, + token: CancellationToken, + ) -> Result { + let EndpointStoragePair { + url, + token: storage_token, + } = self.endpoint_storage_pair(from_endpoint)?; #[cfg(feature = "testing")] - fail::fail_point!("compute-prewarm", |_| { - bail!("prewarm configured to fail because of a failpoint") - }); + fail::fail_point!("compute-prewarm", |_| bail!("compute-prewarm failpoint")); info!(%url, "requesting LFC state from endpoint storage"); - let request = Client::new().get(&url).bearer_auth(token); - let res = request.send().await.context("querying endpoint storage")?; - match res.status() { + let request = Client::new().get(&url).bearer_auth(storage_token); + let response = select! 
{ + _ = token.cancelled() => return Ok(LfcPrewarmState::Cancelled), + response = request.send() => response + } + .context("querying endpoint storage")?; + + match response.status() { StatusCode::OK => (), - StatusCode::NOT_FOUND => { - return Ok(false); - } + StatusCode::NOT_FOUND => return Ok(LfcPrewarmState::Skipped), status => bail!("{status} querying endpoint storage"), } let mut uncompressed = Vec::new(); - let lfc_state = res - .bytes() - .await - .context("getting request body from endpoint storage")?; - ZstdDecoder::new(lfc_state.iter().as_slice()) - .read_to_end(&mut uncompressed) - .await - .context("decoding LFC state")?; + let lfc_state = select! { + _ = token.cancelled() => return Ok(LfcPrewarmState::Cancelled), + lfc_state = response.bytes() => lfc_state + } + .context("getting request body from endpoint storage")?; + + let mut decoder = ZstdDecoder::new(lfc_state.iter().as_slice()); + select! { + _ = token.cancelled() => return Ok(LfcPrewarmState::Cancelled), + read = decoder.read_to_end(&mut uncompressed) => read + } + .context("decoding LFC state")?; + let uncompressed_len = uncompressed.len(); + info!(%url, "downloaded LFC state, uncompressed size {uncompressed_len}"); - info!(%url, "downloaded LFC state, uncompressed size {uncompressed_len}, loading into Postgres"); - - ComputeNode::get_maintenance_client(&self.tokio_conn_conf) + // Client connection and prewarm info querying are fast and therefore don't need + // cancellation + let client = ComputeNode::get_maintenance_client(&self.tokio_conn_conf) .await - .context("connecting to postgres")? - .query_one("select neon.prewarm_local_cache($1)", &[&uncompressed]) - .await - .context("loading LFC state into postgres") - .map(|_| ())?; + .context("connecting to postgres")?; + let pg_token = client.cancel_token(); - Ok(true) + let params: Vec<&(dyn postgres_types::ToSql + Sync)> = vec![&uncompressed]; + select! 
{ + res = client.query_one("select neon.prewarm_local_cache($1)", &params) => res, + _ = token.cancelled() => { + pg_token.cancel_query(postgres::NoTls).await + .context("cancelling neon.prewarm_local_cache()")?; + return Ok(LfcPrewarmState::Cancelled) + } + } + .context("loading LFC state into postgres") + .map(|_| ())?; + + Ok(LfcPrewarmState::Completed) } /// If offload request is ongoing, return false, true otherwise @@ -200,20 +225,20 @@ impl ComputeNode { async fn offload_lfc_with_state_update(&self) { crate::metrics::LFC_OFFLOADS.inc(); - - let Err(err) = self.offload_lfc_impl().await else { - self.state.lock().unwrap().lfc_offload_state = LfcOffloadState::Completed; - return; + let state = match self.offload_lfc_impl().await { + Ok(state) => state, + Err(err) => { + crate::metrics::LFC_OFFLOAD_ERRORS.inc(); + error!(%err, "could not offload LFC"); + let error = format!("{err:#}"); + LfcOffloadState::Failed { error } + } }; - crate::metrics::LFC_OFFLOAD_ERRORS.inc(); - error!(%err, "could not offload LFC state to endpoint storage"); - self.state.lock().unwrap().lfc_offload_state = LfcOffloadState::Failed { - error: format!("{err:#}"), - }; + self.state.lock().unwrap().lfc_offload_state = state; } - async fn offload_lfc_impl(&self) -> Result<()> { + async fn offload_lfc_impl(&self) -> Result<LfcOffloadState> { let EndpointStoragePair { url, token } = self.endpoint_storage_pair(None)?; info!(%url, "requesting LFC state from Postgres"); @@ -228,7 +253,7 @@ impl ComputeNode { .context("deserializing LFC state")?; let Some(state) = state else { info!(%url, "empty LFC state, not exporting"); - return Ok(()); + return Ok(LfcOffloadState::Skipped); }; let mut compressed = Vec::new(); @@ -242,7 +267,7 @@ impl ComputeNode { let request = Client::new().put(url).bearer_auth(token).body(compressed); match request.send().await { - Ok(res) if res.status() == StatusCode::OK => Ok(()), + Ok(res) if res.status() == StatusCode::OK => Ok(LfcOffloadState::Completed), Ok(res) => bail!( "Request to endpoint storage failed with status: {}", res.status() @@ -250,4 +275,8 @@ impl ComputeNode { Err(err) => Err(err).context("writing to endpoint storage"), } } + + pub fn cancel_prewarm(self: &Arc<Self>) { + self.state.lock().unwrap().lfc_prewarm_token.cancel(); + } } diff --git a/compute_tools/src/http/openapi_spec.yaml b/compute_tools/src/http/openapi_spec.yaml index ab729d62b5..27e610a87d 100644 --- a/compute_tools/src/http/openapi_spec.yaml +++ b/compute_tools/src/http/openapi_spec.yaml @@ -139,6 +139,15 @@ paths: application/json: schema: $ref: "#/components/schemas/LfcPrewarmState" + delete: + tags: + - Prewarm + summary: Cancel ongoing LFC prewarm + description: "" + operationId: cancelLfcPrewarm + responses: + 202: + description: Prewarm cancelled /lfc/offload: post: @@ -636,7 +645,7 @@ components: properties: status: description: LFC offload status - enum: [not_offloaded, offloading, completed, failed] + enum: [not_offloaded, offloading, completed, skipped, failed] type: string error: description: LFC offload error, if any diff --git a/compute_tools/src/http/routes/lfc.rs b/compute_tools/src/http/routes/lfc.rs index e98bd781a2..7483198723 100644 --- a/compute_tools/src/http/routes/lfc.rs +++ b/compute_tools/src/http/routes/lfc.rs @@ -46,3 +46,8 @@ pub(in crate::http) async fn offload(compute: Compute) -> Response { ) } } + +pub(in crate::http) async fn cancel_prewarm(compute: Compute) -> StatusCode { + compute.cancel_prewarm(); + StatusCode::ACCEPTED +} diff --git a/compute_tools/src/http/server.rs
b/compute_tools/src/http/server.rs index 2fd3121f4f..869fdef11d 100644 --- a/compute_tools/src/http/server.rs +++ b/compute_tools/src/http/server.rs @@ -99,7 +99,12 @@ impl From<&Server> for Router> { ); let authenticated_router = Router::>::new() - .route("/lfc/prewarm", get(lfc::prewarm_state).post(lfc::prewarm)) + .route( + "/lfc/prewarm", + get(lfc::prewarm_state) + .post(lfc::prewarm) + .delete(lfc::cancel_prewarm), + ) .route("/lfc/offload", get(lfc::offload_state).post(lfc::offload)) .route("/promote", post(promote::promote)) .route("/check_writability", post(check_writability::is_writable)) diff --git a/libs/compute_api/src/responses.rs b/libs/compute_api/src/responses.rs index a27301e45e..a918644e4c 100644 --- a/libs/compute_api/src/responses.rs +++ b/libs/compute_api/src/responses.rs @@ -68,11 +68,15 @@ pub enum LfcPrewarmState { /// We tried to fetch the corresponding LFC state from the endpoint storage, /// but received `Not Found 404`. This should normally happen only during the /// first endpoint start after creation with `autoprewarm: true`. + /// This may also happen if LFC is turned off or not initialized /// /// During the orchestrated prewarm via API, when a caller explicitly /// provides the LFC state key to prewarm from, it's the caller responsibility /// to handle this status as an error state in this case. Skipped, + /// LFC prewarm was cancelled. Some pages in LFC cache may be prewarmed if query + /// has started working before cancellation + Cancelled, } impl Display for LfcPrewarmState { @@ -83,6 +87,7 @@ impl Display for LfcPrewarmState { LfcPrewarmState::Completed => f.write_str("Completed"), LfcPrewarmState::Skipped => f.write_str("Skipped"), LfcPrewarmState::Failed { error } => write!(f, "Error({error})"), + LfcPrewarmState::Cancelled => f.write_str("Cancelled"), } } } @@ -97,6 +102,7 @@ pub enum LfcOffloadState { Failed { error: String, }, + Skipped, } #[derive(Serialize, Debug, Clone, PartialEq)] diff --git a/test_runner/fixtures/endpoint/http.py b/test_runner/fixtures/endpoint/http.py index d235ac2143..c77a372017 100644 --- a/test_runner/fixtures/endpoint/http.py +++ b/test_runner/fixtures/endpoint/http.py @@ -78,20 +78,26 @@ class EndpointHttpClient(requests.Session): json: dict[str, str] = res.json() return json - def prewarm_lfc(self, from_endpoint_id: str | None = None): + def prewarm_lfc(self, from_endpoint_id: str | None = None) -> dict[str, str]: """ Prewarm LFC cache from given endpoint and wait till it finishes or errors """ params = {"from_endpoint": from_endpoint_id} if from_endpoint_id else dict() self.post(self.prewarm_url, params=params).raise_for_status() - self.prewarm_lfc_wait() + return self.prewarm_lfc_wait() - def prewarm_lfc_wait(self): + def cancel_prewarm_lfc(self): + """ + Cancel LFC prewarm if any is ongoing + """ + self.delete(self.prewarm_url).raise_for_status() + + def prewarm_lfc_wait(self) -> dict[str, str]: """ Wait till LFC prewarm returns with error or success. 
If prewarm was not requested before calling this function, it will error """ - statuses = "failed", "completed", "skipped" + statuses = "failed", "completed", "skipped", "cancelled" def prewarmed(): json = self.prewarm_lfc_status() @@ -101,6 +107,7 @@ class EndpointHttpClient(requests.Session): wait_until(prewarmed, timeout=60) res = self.prewarm_lfc_status() assert res["status"] != "failed", res + return res def offload_lfc_status(self) -> dict[str, str]: res = self.get(self.offload_url) @@ -108,29 +115,31 @@ class EndpointHttpClient(requests.Session): json: dict[str, str] = res.json() return json - def offload_lfc(self): + def offload_lfc(self) -> dict[str, str]: """ Offload LFC cache to endpoint storage and wait till offload finishes or errors """ self.post(self.offload_url).raise_for_status() - self.offload_lfc_wait() + return self.offload_lfc_wait() - def offload_lfc_wait(self): + def offload_lfc_wait(self) -> dict[str, str]: """ Wait till LFC offload returns with error or success. If offload was not requested before calling this function, it will error """ + statuses = "failed", "completed", "skipped" def offloaded(): json = self.offload_lfc_status() status, err = json["status"], json.get("error") - assert status in ["failed", "completed"], f"{status}, {err=}" + assert status in statuses, f"{status}, {err=}" wait_until(offloaded, timeout=60) res = self.offload_lfc_status() assert res["status"] != "failed", res + return res - def promote(self, promote_spec: dict[str, Any], disconnect: bool = False): + def promote(self, promote_spec: dict[str, Any], disconnect: bool = False) -> dict[str, str]: url = f"http://localhost:{self.external_port}/promote" if disconnect: try: # send first request to start promote and disconnect diff --git a/test_runner/regress/test_lfc_prewarm.py b/test_runner/regress/test_lfc_prewarm.py index 2bbe8c3e97..a96f18177c 100644 --- a/test_runner/regress/test_lfc_prewarm.py +++ b/test_runner/regress/test_lfc_prewarm.py @@ -1,6 +1,6 @@ import random -import threading from enum import StrEnum +from threading import Thread from time import sleep from typing import Any @@ -47,19 +47,23 @@ def offload_lfc(method: PrewarmMethod, client: EndpointHttpClient, cur: Cursor) # With autoprewarm, we need to be sure LFC was offloaded after all writes # finish, so we sleep. 
Otherwise we'll have less prewarmed pages than we want sleep(AUTOOFFLOAD_INTERVAL_SECS) - client.offload_lfc_wait() - return + offload_res = client.offload_lfc_wait() + log.info(offload_res) + return offload_res if method == PrewarmMethod.COMPUTE_CTL: status = client.prewarm_lfc_status() assert status["status"] == "not_prewarmed" assert "error" not in status - client.offload_lfc() + offload_res = client.offload_lfc() + log.info(offload_res) assert client.prewarm_lfc_status()["status"] == "not_prewarmed" + parsed = prom_parse(client) desired = {OFFLOAD_LABEL: 1, PREWARM_LABEL: 0, OFFLOAD_ERR_LABEL: 0, PREWARM_ERR_LABEL: 0} assert parsed == desired, f"{parsed=} != {desired=}" - return + + return offload_res raise AssertionError(f"{method} not in PrewarmMethod") @@ -68,21 +72,30 @@ def prewarm_endpoint( method: PrewarmMethod, client: EndpointHttpClient, cur: Cursor, lfc_state: str | None ): if method == PrewarmMethod.AUTOPREWARM: - client.prewarm_lfc_wait() + prewarm_res = client.prewarm_lfc_wait() + log.info(prewarm_res) elif method == PrewarmMethod.COMPUTE_CTL: - client.prewarm_lfc() + prewarm_res = client.prewarm_lfc() + log.info(prewarm_res) + return prewarm_res elif method == PrewarmMethod.POSTGRES: cur.execute("select neon.prewarm_local_cache(%s)", (lfc_state,)) -def check_prewarmed( +def check_prewarmed_contains( method: PrewarmMethod, client: EndpointHttpClient, desired_status: dict[str, str | int] ): if method == PrewarmMethod.AUTOPREWARM: - assert client.prewarm_lfc_status() == desired_status + prewarm_status = client.prewarm_lfc_status() + for k in desired_status: + assert desired_status[k] == prewarm_status[k] + assert prom_parse(client)[PREWARM_LABEL] == 1 elif method == PrewarmMethod.COMPUTE_CTL: - assert client.prewarm_lfc_status() == desired_status + prewarm_status = client.prewarm_lfc_status() + for k in desired_status: + assert desired_status[k] == prewarm_status[k] + desired = {OFFLOAD_LABEL: 0, PREWARM_LABEL: 1, PREWARM_ERR_LABEL: 0, OFFLOAD_ERR_LABEL: 0} assert prom_parse(client) == desired @@ -149,9 +162,6 @@ def test_lfc_prewarm(neon_simple_env: NeonEnv, method: PrewarmMethod): log.info(f"Used LFC size: {lfc_used_pages}") pg_cur.execute("select * from neon.get_prewarm_info()") total, prewarmed, skipped, _ = pg_cur.fetchall()[0] - log.info(f"Prewarm info: {total=} {prewarmed=} {skipped=}") - progress = (prewarmed + skipped) * 100 // total - log.info(f"Prewarm progress: {progress}%") assert lfc_used_pages > 10000 assert total > 0 assert prewarmed > 0 @@ -161,7 +171,54 @@ def test_lfc_prewarm(neon_simple_env: NeonEnv, method: PrewarmMethod): assert lfc_cur.fetchall()[0][0] == n_records * (n_records + 1) / 2 desired = {"status": "completed", "total": total, "prewarmed": prewarmed, "skipped": skipped} - check_prewarmed(method, client, desired) + check_prewarmed_contains(method, client, desired) + + +@pytest.mark.skipif(not USE_LFC, reason="LFC is disabled, skipping") +def test_lfc_prewarm_cancel(neon_simple_env: NeonEnv): + """ + Test we can cancel LFC prewarm and prewarm successfully after + """ + env = neon_simple_env + n_records = 1000000 + cfg = [ + "autovacuum = off", + "shared_buffers=1MB", + "neon.max_file_cache_size=1GB", + "neon.file_cache_size_limit=1GB", + "neon.file_cache_prewarm_limit=1000", + ] + endpoint = env.endpoints.create_start(branch_name="main", config_lines=cfg) + + pg_conn = endpoint.connect() + pg_cur = pg_conn.cursor() + pg_cur.execute("create schema neon; create extension neon with schema neon") + pg_cur.execute("create database lfc") + + lfc_conn 
= endpoint.connect(dbname="lfc") + lfc_cur = lfc_conn.cursor() + log.info(f"Inserting {n_records} rows") + lfc_cur.execute("create table t(pk integer primary key, payload text default repeat('?', 128))") + lfc_cur.execute(f"insert into t (pk) values (generate_series(1,{n_records}))") + log.info(f"Inserted {n_records} rows") + + client = endpoint.http_client() + method = PrewarmMethod.COMPUTE_CTL + offload_lfc(method, client, pg_cur) + + endpoint.stop() + endpoint.start() + + thread = Thread(target=lambda: prewarm_endpoint(method, client, pg_cur, None)) + thread.start() + # wait 2 seconds to ensure we cancel prewarm SQL query + sleep(2) + client.cancel_prewarm_lfc() + thread.join() + assert client.prewarm_lfc_status()["status"] == "cancelled" + + prewarm_endpoint(method, client, pg_cur, None) + assert client.prewarm_lfc_status()["status"] == "completed" @pytest.mark.skipif(not USE_LFC, reason="LFC is disabled, skipping") @@ -178,9 +235,8 @@ def test_lfc_prewarm_empty(neon_simple_env: NeonEnv): cur = conn.cursor() cur.execute("create schema neon; create extension neon with schema neon") method = PrewarmMethod.COMPUTE_CTL - offload_lfc(method, client, cur) - prewarm_endpoint(method, client, cur, None) - assert client.prewarm_lfc_status()["status"] == "skipped" + assert offload_lfc(method, client, cur)["status"] == "skipped" + assert prewarm_endpoint(method, client, cur, None)["status"] == "skipped" # autoprewarm isn't needed as we prewarm manually @@ -251,11 +307,11 @@ def test_lfc_prewarm_under_workload(neon_simple_env: NeonEnv, method: PrewarmMet workload_threads = [] for _ in range(n_threads): - t = threading.Thread(target=workload) + t = Thread(target=workload) workload_threads.append(t) t.start() - prewarm_thread = threading.Thread(target=prewarm) + prewarm_thread = Thread(target=prewarm) prewarm_thread.start() def prewarmed(): From 975b95f4cd9db3d25587f265f0587ba4fa123ce5 Mon Sep 17 00:00:00 2001 From: Aleksandr Sarantsev <99037063+ephemeralsad@users.noreply.github.com> Date: Thu, 31 Jul 2025 12:34:47 +0400 Subject: [PATCH 31/34] Introduce deletion API improvement RFC (#12484) ## Problem The deletion logic had become difficult to understand and maintain. ## Summary of changes - Added an RFC detailing proposed improvements to all deletion-related APIs. --------- Co-authored-by: Aleksandr Sarantsev --- ...025-07-07-node-deletion-api-improvement.md | 246 ++++++++++++++++++ 1 file changed, 246 insertions(+) create mode 100644 docs/rfcs/2025-07-07-node-deletion-api-improvement.md diff --git a/docs/rfcs/2025-07-07-node-deletion-api-improvement.md b/docs/rfcs/2025-07-07-node-deletion-api-improvement.md new file mode 100644 index 0000000000..47dadaee35 --- /dev/null +++ b/docs/rfcs/2025-07-07-node-deletion-api-improvement.md @@ -0,0 +1,246 @@ +# Node deletion API improvement + +Created on 2025-07-07 +Implemented on _TBD_ + +## Summary + +This RFC describes improvements to the storage controller API for gracefully deleting pageserver +nodes. + +## Motivation + +The basic node deletion API introduced in [#8226](https://github.com/neondatabase/neon/issues/8333) +has several limitations: + +- Deleted nodes can re-add themselves if they restart (e.g., a flaky node that keeps restarting and +we cannot reach via SSH to stop the pageserver). This issue has been resolved by tombstone +mechanism in [#12036](https://github.com/neondatabase/neon/issues/12036) +- Process of node deletion is not graceful, i.e. 
it just imitates a node failure + +In this context, "graceful" node deletion means that users do not experience any disruption or +negative effects, provided the system remains in a healthy state (i.e., the remaining pageservers +can handle the workload and all requirements are met). To achieve this, the system must perform +live migration of all tenant shards from the node being deleted while the node is still running +and continue processing all incoming requests. The node is removed only after all tenant shards +have been safely migrated. + +Although live migrations can be achieved with the drain functionality, it leads to incorrect shard +placement, such as not matching availability zones. This results in unnecessary work to optimize +the placement that was just recently performed. + +If we delete a node before its tenant shards are fully moved, the new node won't have all the +needed data (e.g. heatmaps) ready. This means user requests to the new node will be much slower at +first. If there are many tenant shards, this slowdown affects a huge amount of users. + +Graceful node deletion is more complicated and can introduce new issues. It takes longer because +live migration of each tenant shard can last several minutes. Using non-blocking accessors may +also cause deletion to wait if other processes are holding inner state lock. It also gets trickier +because we need to handle other requests, like drain and fill, at the same time. + +## Impacted components (e.g. pageserver, safekeeper, console, etc) + +- storage controller +- pageserver (indirectly) + +## Proposed implementation + +### Tombstones + +To resolve the problem of deleted nodes re-adding themselves, a tombstone mechanism was introduced +as part of the node stored information. Each node has a separate `NodeLifecycle` field with two +possible states: `Active` and `Deleted`. When node deletion completes, the database row is not +deleted but instead has its `NodeLifecycle` column switched to `Deleted`. Nodes with `Deleted` +lifecycle are treated as if the row is absent for most handlers, with several exceptions: reattach +and register functionality must be aware of tombstones. Additionally, new debug handlers are +available for listing and deleting tombstones via the `/debug/v1/tombstone` path. + +### Gracefulness + +The problem of making node deletion graceful is complex and involves several challenges: + +- **Cancellable**: The operation must be cancellable to allow administrators to abort the process +if needed, e.g. if run by mistake. +- **Non-blocking**: We don't want to block deployment operations like draining/filling on the node +deletion process. We need clear policies for handling concurrent operations: what happens when a +drain/fill request arrives while deletion is in progress, and what happens when a delete request +arrives while drain/fill is in progress. +- **Persistent**: If the storage controller restarts during this long-running operation, we must +preserve progress and automatically resume the deletion process after the storage controller +restarts. +- **Migrated correctly**: We cannot simply use the existing drain mechanism for nodes scheduled +for deletion, as this would move shards to irrelevant locations. The drain process expects the +node to return, so it only moves shards to backup locations, not to their preferred AZs. It also +leaves secondary locations unmoved. This could result in unnecessary load on the storage +controller and inefficient resource utilization. 
+- **Force option**: Administrators need the ability to force immediate, non-graceful deletion when +time constraints or emergency situations require it, bypassing the normal graceful migration +process. + +See below for a detailed breakdown of the proposed changes and mechanisms. + +#### Node lifecycle + +New `NodeLifecycle` enum and a matching database field with these values: +- `Active`: The normal state. All operations are allowed. +- `ScheduledForDeletion`: The node is marked to be deleted soon. Deletion may be in progress or +will happen later, but the node will eventually be removed. All operations are allowed. +- `Deleted`: The node is fully deleted. No operations are allowed, and the node cannot be brought +back. The only action left is to remove its record from the database. Any attempt to register a +node in this state will fail. + +This state persists across storage controller restarts. + +**State transition** +``` + +--------------------+ + +---| Active |<---------------------+ + | +--------------------+ | + | ^ | + | start_node_delete | cancel_node_delete | + v | | + +----------------------------------+ | + | ScheduledForDeletion | | + +----------------------------------+ | + | | + | node_register | + | | + | delete_node (at the finish) | + | | + v | + +---------+ tombstone_delete +----------+ + | Deleted |-------------------------------->| no row | + +---------+ +----------+ +``` + +#### NodeSchedulingPolicy::Deleting + +A `Deleting` variant to the `NodeSchedulingPolicy` enum. This means the deletion function is +running for the node right now. Only one node can have the `Deleting` policy at a time. + +The `NodeSchedulingPolicy::Deleting` state is persisted in the database. However, after a storage +controller restart, any node previously marked as `Deleting` will have its scheduling policy reset +to `Pause`. The policy will only transition back to `Deleting` when the deletion operation is +actively started again, as triggered by the node's `NodeLifecycle::ScheduledForDeletion` state. + +`NodeSchedulingPolicy` transition details: +1. When `node_delete` begins, set the policy to `NodeSchedulingPolicy::Deleting`. +2. If `node_delete` is cancelled (for example, due to a concurrent drain operation), revert the +policy to its previous value. The policy is persisted in storcon DB. +3. After `node_delete` completes, the final value of the scheduling policy is irrelevant, since +`NodeLifecycle::Deleted` prevents any further access to this field. + +The deletion process cannot be initiated for nodes currently undergoing deployment-related +operations (`Draining`, `Filling`, or `PauseForRestart` policies). Deletion will only be triggered +once the node transitions to either the `Active` or `Pause` state. + +#### OperationTracker + +A replacement for `Option ongoing_operation`, the `OperationTracker` is a +dedicated service state object responsible for managing all long-running node operations (drain, +fill, delete) with robust concurrency control. + +Key responsibilities: +- Orchestrates the execution of operations +- Supports cancellation of currently running operations +- Enforces operation constraints, e.g. allowing only single drain/fill operation at a time +- Persists deletion state, enabling recovery of pending deletions across restarts +- Ensures thread safety across concurrent requests + +#### Attached tenant shard processing + +When deleting a node, handle each attached tenant shard as follows: + +1. Pick the best node to become the new attached (the candidate). +2. 
If the candidate already has this shard as a secondary: + - Create a new secondary for the shard on another suitable node. + Otherwise: + - Create a secondary for the shard on the candidate node. +3. Wait until all secondaries are ready and pre-warmed. +4. Promote the candidate's secondary to attached. +5. Remove the secondary from the node being deleted. + +This process safely moves all attached shards before deleting the node. + +#### Secondary tenant shard processing + +When deleting a node, handle each secondary tenant shard as follows: + +1. Choose the best node to become the new secondary. +2. Create a secondary for the shard on that node. +3. Wait until the new secondary is ready. +4. Remove the secondary from the node being deleted. + +This ensures all secondary shards are safely moved before deleting the node. + +### Reliability, failure modes and corner cases + +In case of a storage controller failure and following restart, the system behavior depends on the +`NodeLifecycle` state: + +- If `NodeLifecycle` is `Active`: No action is taken for this node. +- If `NodeLifecycle` is `Deleted`: The node will not be re-added. +- If `NodeLifecycle` is `ScheduledForDeletion`: A deletion background task will be launched for +this node. + +In case of a pageserver node failure during deletion, the behavior depends on the `force` flag: +- If `force` is set: The node deletion will proceed regardless of the node's availability. +- If `force` is not set: The deletion will be retried a limited number of times. If the node +remains unavailable, the deletion process will pause and automatically resume when the node +becomes healthy again. + +### Operations concurrency + +The following sections describe the behavior when different types of requests arrive at the storage +controller and how they interact with ongoing operations. + +#### Delete request + +Handler: `PUT /control/v1/node/:node_id/delete` + +1. If node lifecycle is `NodeLifecycle::ScheduledForDeletion`: + - Return `200 OK`: there is already an ongoing deletion request for this node +2. Update & persist lifecycle to `NodeLifecycle::ScheduledForDeletion` +3. Persist current scheduling policy +4. If there is no active operation (drain/fill/delete): + - Run deletion process for this node + +#### Cancel delete request + +Handler: `DELETE /control/v1/node/:node_id/delete` + +1. If node lifecycle is not `NodeLifecycle::ScheduledForDeletion`: + - Return `404 Not Found`: there is no current deletion request for this node +2. If the active operation is deleting this node, cancel it +3. Update & persist lifecycle to `NodeLifecycle::Active` +4. Restore the last scheduling policy from persistence + +#### Drain/fill request + +1. If there are already ongoing drain/fill processes: + - Return `409 Conflict`: queueing of drain/fill processes is not supported +2. If there is an ongoing delete process: + - Cancel it and wait until it is cancelled +3. Run the drain/fill process +4. After the drain/fill process is cancelled or finished: + - Try to find another candidate to delete and run the deletion process for that node + +#### Drain/fill cancel request + +1. If the active operation is not the related process: + - Return `400 Bad Request`: cancellation request is incorrect, operations are not the same +2. Cancel the active operation +3. 
Try to find another candidate to delete and run the deletion process for that node + +## Definition of Done + +- [x] Fix flaky node scenario and introduce related debug handlers +- [ ] Node deletion intent is persistent - a node will eventually be deleted after a deletion +request regardless of draining/filling requests and restarts +- [ ] Node deletion can be graceful - deletion completes only after moving all tenant shards to +recommended locations +- [ ] Deploying does not break due to long deletions - drain/fill operations override deletion +process and deletion resumes after drain/fill completes +- [ ] `force` flag is implemented and provides fast, failure-tolerant node removal (e.g., when a +pageserver node does not respond) +- [ ] Legacy delete handler code is removed from storage_controller, test_runner, and storcon_cli From edd60730c89433dd8b6990bdaf9fdad34213c411 Mon Sep 17 00:00:00 2001 From: Dmitrii Kovalkov <34828390+DimasKovas@users.noreply.github.com> Date: Thu, 31 Jul 2025 13:29:25 +0400 Subject: [PATCH 32/34] safekeeper: use last_log_term in mconf switch + choose most advanced sk in pull timeline (#12778) ## Problem I discovered two bugs related to safekeeper migration, which together might lead to data loss during the migration. The second bug is from a hadron patch and might lead to data loss during the safekeeper restore in hadron as well. 1. `switch_membership` returns the current `term` instead of `last_log_term`. It is used to choose the `sync_position` in the algorithm, so we might choose the wrong one and break the correctness guarantees. 2. The current `term` is used to choose the most advanced SK in `pull_timeline` with higher priority than `flush_lsn`. It is incorrect because the most advanced safekeeper is the one with the highest `(last_log_term, flush_lsn)` pair.
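To make the orderings concrete, here is a small illustrative sketch (Python pseudocode with simplified shapes and field names, not the actual Rust code changed by this patch) of how the most advanced safekeeper should be chosen and how a quorum-based `sync_position` can be derived from `(last_log_term, flush_lsn)` pairs:

```python
# Illustration only: simplified stand-ins for the safekeeper status responses.
def most_advanced(statuses):
    # Compare by (last_log_term, flush_lsn); the current `term` may have been bumped
    # on a lagging safekeeper, so it must not dominate the comparison.
    return max(statuses, key=lambda s: (s["last_log_term"], s["flush_lsn"]))

def sync_position(positions, quorum):
    # positions: (last_log_term, flush_lsn) pairs from the successful responses.
    # Any entry strictly above the returned position cannot be on a quorum,
    # so the result is equal to or greater than the commit position.
    if len(positions) < quorum:
        raise RuntimeError("not enough successful responses to reach quorum")
    return sorted(positions)[quorum - 1]

statuses = [
    {"last_log_term": 3, "flush_lsn": 0x30, "term": 103},  # lagging, but its term was bumped
    {"last_log_term": 3, "flush_lsn": 0x80, "term": 3},
    {"last_log_term": 3, "flush_lsn": 0x90, "term": 3},
]
assert most_advanced(statuses)["flush_lsn"] == 0x90
assert sync_position([(3, 0x30), (3, 0x80), (3, 0x90)], quorum=2) == (3, 0x80)
```

Ranking by the bumped `term` instead would select the lagging safekeeper here.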
The compute might bump term on the least advanced sk, making it the best choice to pull from, and thus making committed log entries "uncommitted" after `pull_timeline` Part of https://databricks.atlassian.net/browse/LKB-1017 ## Summary of changes - Return `last_log_term` in `switch_membership` - Use `(last_log_term, flush_lsn)` as a primary key for choosing the most advanced sk in `pull_timeline` and deny pulling if the `max_term` is higher than on the most advanced sk (hadron only) - Write tests for both cases - Retry `sync_safekeepers` in `compute_ctl` - Take into the account the quorum size when calculating `sync_position` --- compute_tools/src/compute.rs | 40 +++- safekeeper/src/pull_timeline.rs | 24 ++- safekeeper/src/timeline.rs | 6 +- .../src/service/safekeeper_service.rs | 42 ++++- test_runner/fixtures/neon_fixtures.py | 1 + .../regress/test_safekeeper_migration.py | 174 ++++++++++++++++++ 6 files changed, 275 insertions(+), 12 deletions(-) diff --git a/compute_tools/src/compute.rs b/compute_tools/src/compute.rs index 5b1cdb9805..1df837e1e6 100644 --- a/compute_tools/src/compute.rs +++ b/compute_tools/src/compute.rs @@ -35,6 +35,9 @@ use tokio::{spawn, sync::watch, task::JoinHandle, time}; use tokio_util::sync::CancellationToken; use tracing::{Instrument, debug, error, info, instrument, warn}; use url::Url; +use utils::backoff::{ + DEFAULT_BASE_BACKOFF_SECONDS, DEFAULT_MAX_BACKOFF_SECONDS, exponential_backoff_duration, +}; use utils::id::{TenantId, TimelineId}; use utils::lsn::Lsn; use utils::measured_stream::MeasuredReader; @@ -1557,6 +1560,41 @@ impl ComputeNode { Ok(lsn) } + fn sync_safekeepers_with_retries(&self, storage_auth_token: Option) -> Result { + let max_retries = 5; + let mut attempts = 0; + loop { + let result = self.sync_safekeepers(storage_auth_token.clone()); + match &result { + Ok(_) => { + if attempts > 0 { + tracing::info!("sync_safekeepers succeeded after {attempts} retries"); + } + return result; + } + Err(e) if attempts < max_retries => { + tracing::info!( + "sync_safekeepers failed, will retry (attempt {attempts}): {e:#}" + ); + } + Err(err) => { + tracing::warn!( + "sync_safekeepers still failed after {attempts} retries, giving up: {err:?}" + ); + return result; + } + } + // sleep and retry + let backoff = exponential_backoff_duration( + attempts, + DEFAULT_BASE_BACKOFF_SECONDS, + DEFAULT_MAX_BACKOFF_SECONDS, + ); + std::thread::sleep(backoff); + attempts += 1; + } + } + /// Do all the preparations like PGDATA directory creation, configuration, /// safekeepers sync, basebackup, etc. #[instrument(skip_all)] @@ -1592,7 +1630,7 @@ impl ComputeNode { lsn } else { info!("starting safekeepers syncing"); - self.sync_safekeepers(pspec.storage_auth_token.clone()) + self.sync_safekeepers_with_retries(pspec.storage_auth_token.clone()) .with_context(|| "failed to sync safekeepers")? }; info!("safekeepers synced at LSN {}", lsn); diff --git a/safekeeper/src/pull_timeline.rs b/safekeeper/src/pull_timeline.rs index 43232db950..4febc7656e 100644 --- a/safekeeper/src/pull_timeline.rs +++ b/safekeeper/src/pull_timeline.rs @@ -612,19 +612,25 @@ pub async fn handle_request( } } + let max_term = statuses + .iter() + .map(|(status, _)| status.acceptor_state.term) + .max() + .unwrap(); + // Find the most advanced safekeeper let (status, i) = statuses .into_iter() .max_by_key(|(status, _)| { ( status.acceptor_state.epoch, + status.flush_lsn, /* BEGIN_HADRON */ // We need to pull from the SK with the highest term. 
// This is because another compute may come online and vote the same highest term again on the other two SKs. // Then, there will be 2 computes running on the same term. status.acceptor_state.term, /* END_HADRON */ - status.flush_lsn, status.commit_lsn, ) }) @@ -634,6 +640,22 @@ assert!(status.tenant_id == request.tenant_id); assert!(status.timeline_id == request.timeline_id); + // TODO(diko): This is a hadron-only check to make sure that we pull the timeline + // from the safekeeper with the highest term during timeline restore. + // We could avoid returning the error by calling bump_term after pull_timeline. + // However, this is not a big deal because we retry the pull_timeline requests. + // The check should be removed together with removing custom hadron logic for + // safekeeper restore. + if wait_for_peer_timeline_status && status.acceptor_state.term != max_term { + return Err(ApiError::PreconditionFailed( + format!( + "chosen safekeeper {} has term {}, but the most advanced term is {}", + safekeeper_host, status.acceptor_state.term, max_term + ) + .into(), + )); + } + match pull_timeline( status, safekeeper_host, diff --git a/safekeeper/src/timeline.rs b/safekeeper/src/timeline.rs index e083a49428..25ac8e5bd3 100644 --- a/safekeeper/src/timeline.rs +++ b/safekeeper/src/timeline.rs @@ -195,12 +195,14 @@ impl StateSK { to: Configuration, ) -> Result<TimelineMembershipSwitchResponse> { let result = self.state_mut().membership_switch(to).await?; + let flush_lsn = self.flush_lsn(); + let last_log_term = self.state().acceptor_state.get_last_log_term(flush_lsn); Ok(TimelineMembershipSwitchResponse { previous_conf: result.previous_conf, current_conf: result.current_conf, - last_log_term: self.state().acceptor_state.term, - flush_lsn: self.flush_lsn(), + last_log_term, + flush_lsn, }) } diff --git a/storage_controller/src/service/safekeeper_service.rs b/storage_controller/src/service/safekeeper_service.rs index a60ebb85c6..fab1342d5d 100644 --- a/storage_controller/src/service/safekeeper_service.rs +++ b/storage_controller/src/service/safekeeper_service.rs @@ -24,12 +24,12 @@ use pageserver_api::controller_api::{ }; use pageserver_api::models::{SafekeeperInfo, SafekeepersInfo, TimelineInfo}; use safekeeper_api::PgVersionId; +use safekeeper_api::Term; use safekeeper_api::membership::{self, MemberSet, SafekeeperGeneration}; use safekeeper_api::models::{ PullTimelineRequest, TimelineLocateResponse, TimelineMembershipSwitchRequest, TimelineMembershipSwitchResponse, }; -use safekeeper_api::{INITIAL_TERM, Term}; use safekeeper_client::mgmt_api; use tokio::task::JoinSet; use tokio_util::sync::CancellationToken; @@ -1298,13 +1298,7 @@ impl Service { ) .await?; - let mut sync_position = (INITIAL_TERM, Lsn::INVALID); - for res in results.into_iter().flatten() { - let sk_position = (res.last_log_term, res.flush_lsn); - if sync_position < sk_position { - sync_position = sk_position; - } - } + let sync_position = Self::get_sync_position(&results)?; tracing::info!( %generation, @@ -1598,4 +1592,36 @@ impl Service { Ok(()) } + + /// Get membership switch responses from all safekeepers and return the sync position. + /// + /// Sync position is a position equal to or greater than the commit position. + /// It is guaranteed that all WAL entries with (last_log_term, flush_lsn) + /// greater than the sync position are not committed (= not on a quorum). + /// + /// Returns an error if there is no quorum of successful responses.
+ fn get_sync_position( + responses: &[mgmt_api::Result], + ) -> Result<(Term, Lsn), ApiError> { + let quorum_size = responses.len() / 2 + 1; + + let mut wal_positions = responses + .iter() + .flatten() + .map(|res| (res.last_log_term, res.flush_lsn)) + .collect::>(); + + // Should be already checked if the responses are from tenant_timeline_set_membership_quorum. + if wal_positions.len() < quorum_size { + return Err(ApiError::InternalServerError(anyhow::anyhow!( + "not enough successful responses to get sync position: {}/{}", + wal_positions.len(), + quorum_size, + ))); + } + + wal_positions.sort(); + + Ok(wal_positions[quorum_size - 1]) + } } diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index 11d54f7831..c3dfc78218 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -2313,6 +2313,7 @@ class NeonStorageController(MetricsGetter, LogUtils): timeline_id: TimelineId, new_sk_set: list[int], ): + log.info(f"migrate_safekeepers({tenant_id}, {timeline_id}, {new_sk_set})") response = self.request( "POST", f"{self.api}/v1/tenant/{tenant_id}/timeline/{timeline_id}/safekeeper_migrate", diff --git a/test_runner/regress/test_safekeeper_migration.py b/test_runner/regress/test_safekeeper_migration.py index 2ceeea37a5..97a6ece446 100644 --- a/test_runner/regress/test_safekeeper_migration.py +++ b/test_runner/regress/test_safekeeper_migration.py @@ -286,3 +286,177 @@ def test_sk_generation_aware_tombstones(neon_env_builder: NeonEnvBuilder): assert re.match(r".*Timeline .* deleted.*", exc.value.response.text) # The timeline should remain deleted. expect_deleted(second_sk) + + +def test_safekeeper_migration_stale_timeline(neon_env_builder: NeonEnvBuilder): + """ + Test that safekeeper migration handles stale timeline correctly by migrating to + a safekeeper with a stale timeline. + 1. Check that we are waiting for the stale timeline to catch up with the commit lsn. + The migration might fail if there is no compute to advance the WAL. + 2. Check that we rely on last_log_term (and not the current term) when waiting for the + sync_position on step 7. + 3. Check that migration succeeds if the compute is running. + """ + neon_env_builder.num_safekeepers = 2 + neon_env_builder.storage_controller_config = { + "timelines_onto_safekeepers": True, + "timeline_safekeeper_count": 1, + } + env = neon_env_builder.init_start() + env.pageserver.allowed_errors.extend(PAGESERVER_ALLOWED_ERRORS) + env.storage_controller.allowed_errors.append(".*not enough successful .* to reach quorum.*") + + mconf = env.storage_controller.timeline_locate(env.initial_tenant, env.initial_timeline) + + active_sk = env.get_safekeeper(mconf["sk_set"][0]) + other_sk = [sk for sk in env.safekeepers if sk.id != active_sk.id][0] + + ep = env.endpoints.create("main", tenant_id=env.initial_tenant) + ep.start(safekeeper_generation=1, safekeepers=[active_sk.id]) + ep.safe_psql("CREATE TABLE t(a int)") + ep.safe_psql("INSERT INTO t VALUES (0)") + + # Pull the timeline to other_sk, so other_sk now has a "stale" timeline on it. + other_sk.pull_timeline([active_sk], env.initial_tenant, env.initial_timeline) + + # Advance the WAL on active_sk. + ep.safe_psql("INSERT INTO t VALUES (1)") + + # The test is more tricky if we have the same last_log_term but different term/flush_lsn. + # Stop the active_sk during the endpoint shutdown because otherwise compute_ctl runs + # sync_safekeepers and advances last_log_term on active_sk. 
+ active_sk.stop() + ep.stop(mode="immediate") + active_sk.start() + + active_sk_status = active_sk.http_client().timeline_status( + env.initial_tenant, env.initial_timeline + ) + other_sk_status = other_sk.http_client().timeline_status( + env.initial_tenant, env.initial_timeline + ) + + # other_sk should have the same last_log_term, but a stale flush_lsn. + assert active_sk_status.last_log_term == other_sk_status.last_log_term + assert active_sk_status.flush_lsn > other_sk_status.flush_lsn + + commit_lsn = active_sk_status.flush_lsn + + # Bump the term on other_sk to make it higher than active_sk. + # This is to make sure we don't use current term instead of last_log_term in the algorithm. + other_sk.http_client().term_bump( + env.initial_tenant, env.initial_timeline, active_sk_status.term + 100 + ) + + # TODO(diko): now it fails because the timeline on other_sk is stale and there is no compute + # to catch up it with active_sk. It might be fixed in https://databricks.atlassian.net/browse/LKB-946 + # if we delete stale timelines before starting the migration. + # But the rest of the test is still valid: we should not lose committed WAL after the migration. + with pytest.raises( + StorageControllerApiException, match="not enough successful .* to reach quorum" + ): + env.storage_controller.migrate_safekeepers( + env.initial_tenant, env.initial_timeline, [other_sk.id] + ) + + mconf = env.storage_controller.timeline_locate(env.initial_tenant, env.initial_timeline) + assert mconf["new_sk_set"] == [other_sk.id] + assert mconf["sk_set"] == [active_sk.id] + assert mconf["generation"] == 2 + + # Start the endpoint, so it advances the WAL on other_sk. + ep.start(safekeeper_generation=2, safekeepers=[active_sk.id, other_sk.id]) + # Now the migration should succeed. + env.storage_controller.migrate_safekeepers( + env.initial_tenant, env.initial_timeline, [other_sk.id] + ) + + # Check that we didn't lose committed WAL. + assert ( + other_sk.http_client().timeline_status(env.initial_tenant, env.initial_timeline).flush_lsn + >= commit_lsn + ) + assert ep.safe_psql("SELECT * FROM t") == [(0,), (1,)] + + +def test_pull_from_most_advanced_sk(neon_env_builder: NeonEnvBuilder): + """ + Test that we pull the timeline from the most advanced safekeeper during the + migration and do not lose committed WAL. + """ + neon_env_builder.num_safekeepers = 4 + neon_env_builder.storage_controller_config = { + "timelines_onto_safekeepers": True, + "timeline_safekeeper_count": 3, + } + env = neon_env_builder.init_start() + env.pageserver.allowed_errors.extend(PAGESERVER_ALLOWED_ERRORS) + + mconf = env.storage_controller.timeline_locate(env.initial_tenant, env.initial_timeline) + + sk_set = mconf["sk_set"] + assert len(sk_set) == 3 + + other_sk = [sk.id for sk in env.safekeepers if sk.id not in sk_set][0] + + ep = env.endpoints.create("main", tenant_id=env.initial_tenant) + ep.start(safekeeper_generation=1, safekeepers=sk_set) + ep.safe_psql("CREATE TABLE t(a int)") + ep.safe_psql("INSERT INTO t VALUES (0)") + + # Stop one sk, so we have a lagging WAL on it. + env.get_safekeeper(sk_set[0]).stop() + # Advance the WAL on the other sks. + ep.safe_psql("INSERT INTO t VALUES (1)") + + # Stop other sks to make sure compute_ctl doesn't advance the last_log_term on them during shutdown. + for sk_id in sk_set[1:]: + env.get_safekeeper(sk_id).stop() + ep.stop(mode="immediate") + for sk_id in sk_set: + env.get_safekeeper(sk_id).start() + + # Bump the term on the lagging sk to make sure we don't use it to choose the most advanced sk. 
+ env.get_safekeeper(sk_set[0]).http_client().term_bump( + env.initial_tenant, env.initial_timeline, 100 + ) + + def get_commit_lsn(sk_set: list[int]): + flush_lsns = [] + last_log_terms = [] + for sk_id in sk_set: + sk = env.get_safekeeper(sk_id) + status = sk.http_client().timeline_status(env.initial_tenant, env.initial_timeline) + flush_lsns.append(status.flush_lsn) + last_log_terms.append(status.last_log_term) + + # In this test we assume that all sks have the same last_log_term. + assert len(set(last_log_terms)) == 1 + + flush_lsns.sort(reverse=True) + commit_lsn = flush_lsns[len(sk_set) // 2] + + log.info(f"sk_set: {sk_set}, flush_lsns: {flush_lsns}, commit_lsn: {commit_lsn}") + return commit_lsn + + commit_lsn_before_migration = get_commit_lsn(sk_set) + + # Make two migrations, so the lagging sk stays in the sk_set, but other sks are replaced. + new_sk_set1 = [sk_set[0], sk_set[1], other_sk] # remove sk_set[2], add other_sk + new_sk_set2 = [sk_set[0], other_sk, sk_set[2]] # remove sk_set[1], add sk_set[2] back + env.storage_controller.migrate_safekeepers( + env.initial_tenant, env.initial_timeline, new_sk_set1 + ) + env.storage_controller.migrate_safekeepers( + env.initial_tenant, env.initial_timeline, new_sk_set2 + ) + + commit_lsn_after_migration = get_commit_lsn(new_sk_set2) + + # We should not lose committed WAL. + # If we have choosen the lagging sk to pull the timeline from, this might fail. + assert commit_lsn_before_migration <= commit_lsn_after_migration + + ep.start(safekeeper_generation=5, safekeepers=new_sk_set2) + assert ep.safe_psql("SELECT * FROM t") == [(0,), (1,)] From f3ee6e818de91cde533199d56426dda4b1d16da5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Krzysztof=20Szafra=C5=84ski?= Date: Thu, 31 Jul 2025 11:53:48 +0200 Subject: [PATCH 33/34] [proxy] Correctly classify ConnectErrors (#12793) As is, e.g. quota errors on wake compute are logged as "compute" errors. --- proxy/src/serverless/backend.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/proxy/src/serverless/backend.rs b/proxy/src/serverless/backend.rs index 511bdc4e42..5b356c8460 100644 --- a/proxy/src/serverless/backend.rs +++ b/proxy/src/serverless/backend.rs @@ -458,7 +458,7 @@ pub(crate) enum LocalProxyConnError { impl ReportableError for HttpConnError { fn get_error_kind(&self) -> ErrorKind { match self { - HttpConnError::ConnectError(_) => ErrorKind::Compute, + HttpConnError::ConnectError(e) => e.get_error_kind(), HttpConnError::ConnectionClosedAbruptly(_) => ErrorKind::Compute, HttpConnError::PostgresConnectionError(p) => match p.as_db_error() { // user provided a wrong database name From 8fe75961206877811c337f448ee9f0076c68598d Mon Sep 17 00:00:00 2001 From: Alexey Kondratov Date: Thu, 31 Jul 2025 12:11:30 +0200 Subject: [PATCH 34/34] chore(compute_tools): Delete unused anon_ext_fn_reassign.sql (#12787) It's an anon v1 failed launch artifact, I suppose. 
--- compute_tools/src/sql/anon_ext_fn_reassign.sql | 13 ------------- 1 file changed, 13 deletions(-) delete mode 100644 compute_tools/src/sql/anon_ext_fn_reassign.sql diff --git a/compute_tools/src/sql/anon_ext_fn_reassign.sql b/compute_tools/src/sql/anon_ext_fn_reassign.sql deleted file mode 100644 index 714a6db1e9..0000000000 --- a/compute_tools/src/sql/anon_ext_fn_reassign.sql +++ /dev/null @@ -1,13 +0,0 @@ -DO $$ -DECLARE - query varchar; -BEGIN - FOR query IN - SELECT pg_catalog.format('ALTER FUNCTION %I(%s) OWNER TO {db_owner};', p.oid::regproc, pg_catalog.pg_get_function_identity_arguments(p.oid)) - FROM pg_catalog.pg_proc p - WHERE p.pronamespace OPERATOR(pg_catalog.=) 'anon'::regnamespace::oid - LOOP - EXECUTE query; - END LOOP; -END -$$;