diff --git a/.github/actionlint.yml b/.github/actionlint.yml index 3142a36fa0..25b2fc702a 100644 --- a/.github/actionlint.yml +++ b/.github/actionlint.yml @@ -31,6 +31,7 @@ config-variables: - NEON_PROD_AWS_ACCOUNT_ID - PGREGRESS_PG16_PROJECT_ID - PGREGRESS_PG17_PROJECT_ID + - PREWARM_PGBENCH_SIZE - REMOTE_STORAGE_AZURE_CONTAINER - REMOTE_STORAGE_AZURE_REGION - SLACK_CICD_CHANNEL_ID diff --git a/.github/workflows/benchmarking.yml b/.github/workflows/benchmarking.yml index 79371ec704..df80bad579 100644 --- a/.github/workflows/benchmarking.yml +++ b/.github/workflows/benchmarking.yml @@ -219,6 +219,7 @@ jobs: --ignore test_runner/performance/test_cumulative_statistics_persistence.py --ignore test_runner/performance/test_perf_many_relations.py --ignore test_runner/performance/test_perf_oltp_large_tenant.py + --ignore test_runner/performance/test_lfc_prewarm.py env: BENCHMARK_CONNSTR: ${{ steps.create-neon-project.outputs.dsn }} VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}" @@ -410,6 +411,77 @@ jobs: env: SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }} + prewarm-test: + if: ${{ github.event.inputs.run_only_pgvector_tests == 'false' || github.event.inputs.run_only_pgvector_tests == null }} + permissions: + contents: write + statuses: write + id-token: write # aws-actions/configure-aws-credentials + env: + PGBENCH_SIZE: ${{ vars.PREWARM_PGBENCH_SIZE }} + POSTGRES_DISTRIB_DIR: /tmp/neon/pg_install + DEFAULT_PG_VERSION: 17 + TEST_OUTPUT: /tmp/test_output + BUILD_TYPE: remote + SAVE_PERF_REPORT: ${{ github.event.inputs.save_perf_report || ( github.ref_name == 'main' ) }} + PLATFORM: "neon-staging" + + runs-on: [ self-hosted, us-east-2, x64 ] + container: + image: ghcr.io/neondatabase/build-tools:pinned-bookworm + credentials: + username: ${{ github.actor }} + password: ${{ secrets.GITHUB_TOKEN }} + options: --init + + steps: + - name: Harden the runner (Audit all outbound calls) + uses: step-security/harden-runner@4d991eb9b905ef189e4c376166672c3f2f230481 # 
v2.11.0 + with: + egress-policy: audit + + - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 + + - name: Configure AWS credentials + uses: aws-actions/configure-aws-credentials@e3dd6a429d7300a6a4c196c26e071d42e0343502 # v4.0.2 + with: + aws-region: eu-central-1 + role-to-assume: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} + role-duration-seconds: 18000 # 5 hours + + - name: Download Neon artifact + uses: ./.github/actions/download + with: + name: neon-${{ runner.os }}-${{ runner.arch }}-release-artifact + path: /tmp/neon/ + prefix: latest + aws-oidc-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} + + - name: Run prewarm benchmark + uses: ./.github/actions/run-python-test-set + with: + build_type: ${{ env.BUILD_TYPE }} + test_selection: performance/test_lfc_prewarm.py + run_in_parallel: false + save_perf_report: ${{ env.SAVE_PERF_REPORT }} + extra_params: -m remote_cluster --timeout 5400 + pg_version: ${{ env.DEFAULT_PG_VERSION }} + aws-oidc-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} + env: + VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}" + PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}" + NEON_API_KEY: ${{ secrets.NEON_STAGING_API_KEY }} + + - name: Create Allure report + id: create-allure-report + if: ${{ !cancelled() }} + uses: ./.github/actions/allure-report-generate + with: + store-test-results-into-db: true + aws-oidc-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} + env: + REGRESS_TEST_RESULT_CONNSTR_NEW: ${{ secrets.REGRESS_TEST_RESULT_CONNSTR_NEW }} + generate-matrices: if: ${{ github.event.inputs.run_only_pgvector_tests == 'false' || github.event.inputs.run_only_pgvector_tests == null }} # Create matrices for the benchmarking jobs, so we run benchmarks on rds only once a week (on Saturday) diff --git a/.gitmodules b/.gitmodules index d1330bf28c..e381fb079e 100644 --- a/.gitmodules +++ b/.gitmodules @@ -1,16 +1,16 @@ [submodule "vendor/postgres-v14"] path = vendor/postgres-v14 - url = 
https://github.com/neondatabase/postgres.git + url = ../postgres.git branch = REL_14_STABLE_neon [submodule "vendor/postgres-v15"] path = vendor/postgres-v15 - url = https://github.com/neondatabase/postgres.git + url = ../postgres.git branch = REL_15_STABLE_neon [submodule "vendor/postgres-v16"] path = vendor/postgres-v16 - url = https://github.com/neondatabase/postgres.git + url = ../postgres.git branch = REL_16_STABLE_neon [submodule "vendor/postgres-v17"] path = vendor/postgres-v17 - url = https://github.com/neondatabase/postgres.git + url = ../postgres.git branch = REL_17_STABLE_neon diff --git a/Cargo.lock b/Cargo.lock index caed814d5f..2f36790d30 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -4294,7 +4294,9 @@ dependencies = [ "humantime-serde", "pageserver_api", "pageserver_client", + "pageserver_client_grpc", "pageserver_page_api", + "pprof", "rand 0.8.5", "reqwest", "serde", @@ -4323,6 +4325,7 @@ dependencies = [ "pageserver_api", "postgres_ffi", "remote_storage", + "serde", "serde_json", "svg_fmt", "thiserror 1.0.69", @@ -4499,6 +4502,7 @@ name = "pageserver_client_grpc" version = "0.1.0" dependencies = [ "anyhow", + "arc-swap", "bytes", "compute_api", "futures", @@ -4506,6 +4510,7 @@ dependencies = [ "pageserver_page_api", "tokio", "tokio-stream", + "tokio-util", "tonic 0.13.1", "tracing", "utils", @@ -5285,6 +5290,7 @@ dependencies = [ "async-trait", "atomic-take", "aws-config", + "aws-credential-types", "aws-sdk-iam", "aws-sigv4", "base64 0.22.1", @@ -5324,6 +5330,7 @@ dependencies = [ "itoa", "jose-jwa", "jose-jwk", + "json", "lasso", "measured", "metrics", @@ -6987,6 +6994,7 @@ dependencies = [ "pageserver_api", "pageserver_client", "reqwest", + "safekeeper_api", "serde_json", "storage_controller_client", "tokio", @@ -7556,6 +7564,7 @@ dependencies = [ "futures-core", "pin-project-lite", "tokio", + "tokio-util", ] [[package]] diff --git a/Cargo.toml b/Cargo.toml index 14f2cfcb56..df2064a4a7 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -201,7 +201,7 @@ 
tokio-epoll-uring = { git = "https://github.com/neondatabase/tokio-epoll-uring.g tokio-io-timeout = "1.2.0" tokio-postgres-rustls = "0.12.0" tokio-rustls = { version = "0.26.0", default-features = false, features = ["tls12", "ring"]} -tokio-stream = "0.1" +tokio-stream = { version = "0.1", features = ["sync"] } tokio-tar = "0.3" tokio-util = { version = "0.7.10", features = ["io", "io-util", "rt"] } toml = "0.8" @@ -262,6 +262,7 @@ neon-shmem = { version = "0.1", path = "./libs/neon-shmem/" } pageserver = { path = "./pageserver" } pageserver_api = { version = "0.1", path = "./libs/pageserver_api/" } pageserver_client = { path = "./pageserver/client" } +pageserver_client_grpc = { path = "./pageserver/client_grpc" } pageserver_compaction = { version = "0.1", path = "./pageserver/compaction/" } pageserver_page_api = { path = "./pageserver/page_api" } postgres_backend = { version = "0.1", path = "./libs/postgres_backend/" } diff --git a/compute_tools/README.md b/compute_tools/README.md index 8d84031efc..49f1368f0e 100644 --- a/compute_tools/README.md +++ b/compute_tools/README.md @@ -46,11 +46,14 @@ stateDiagram-v2 Configuration --> Failed : Failed to configure the compute Configuration --> Running : Compute has been configured Empty --> Init : Compute spec is immediately available - Empty --> TerminationPending : Requested termination + Empty --> TerminationPendingFast : Requested termination + Empty --> TerminationPendingImmediate : Requested termination Init --> Failed : Failed to start Postgres Init --> Running : Started Postgres - Running --> TerminationPending : Requested termination - TerminationPending --> Terminated : Terminated compute + Running --> TerminationPendingFast : Requested termination + Running --> TerminationPendingImmediate : Requested termination + TerminationPendingFast --> Terminated : Terminated compute with 30s delay for cplane to inspect status + TerminationPendingImmediate --> Terminated : Terminated compute immediately Failed --> [*] : Compute exited 
Terminated --> [*] : Compute exited ``` diff --git a/compute_tools/src/compute.rs b/compute_tools/src/compute.rs index 0496d38e67..8f42cf699b 100644 --- a/compute_tools/src/compute.rs +++ b/compute_tools/src/compute.rs @@ -956,14 +956,20 @@ impl ComputeNode { None }; - let mut delay_exit = false; let mut state = self.state.lock().unwrap(); state.terminate_flush_lsn = lsn; - if let ComputeStatus::TerminationPending { mode } = state.status { + + let delay_exit = state.status == ComputeStatus::TerminationPendingFast; + if state.status == ComputeStatus::TerminationPendingFast + || state.status == ComputeStatus::TerminationPendingImmediate + { + info!( + "Changing compute status from {} to {}", + state.status, + ComputeStatus::Terminated + ); state.status = ComputeStatus::Terminated; self.state_changed.notify_all(); - // we were asked to terminate gracefully, don't exit to avoid restart - delay_exit = mode == compute_api::responses::TerminateMode::Fast } drop(state); @@ -1034,6 +1040,8 @@ impl ComputeNode { PageserverProtocol::Grpc => self.try_get_basebackup_grpc(spec, lsn)?, }; + self.fix_zenith_signal_neon_signal()?; + let mut state = self.state.lock().unwrap(); state.metrics.pageserver_connect_micros = connected.duration_since(started).as_micros() as u64; @@ -1043,6 +1051,27 @@ impl ComputeNode { Ok(()) } + /// Copy the Zenith signal file to the Neon signal file location. + /// This makes Compute compatible with older PageServers that don't yet + /// know about the Zenith->Neon rename. + fn fix_zenith_signal_neon_signal(&self) -> Result<()> { + let datadir = Path::new(&self.params.pgdata); + + let neonsig = datadir.join("neon.signal"); + + if neonsig.is_file() { + return Ok(()); + } + + let zenithsig = datadir.join("zenith.signal"); + + if zenithsig.is_file() { + fs::copy(zenithsig, neonsig)?; + } + + Ok(()) + } + /// Fetches a basebackup via gRPC. The connstring must use grpc://. 
Returns the timestamp when /// the connection was established, and the (compressed) size of the basebackup. fn try_get_basebackup_grpc(&self, spec: &ParsedSpec, lsn: Lsn) -> Result<(Instant, usize)> { @@ -1805,6 +1834,8 @@ impl ComputeNode { tls_config, )?; + self.pg_reload_conf()?; + if !spec.skip_pg_catalog_updates { let max_concurrent_connections = spec.reconfigure_concurrency; // Temporarily reset max_cluster_size in config @@ -1824,10 +1855,9 @@ impl ComputeNode { Ok(()) })?; + self.pg_reload_conf()?; } - self.pg_reload_conf()?; - let unknown_op = "unknown".to_string(); let op_id = spec.operation_uuid.as_ref().unwrap_or(&unknown_op); info!( @@ -1900,7 +1930,8 @@ impl ComputeNode { // exit loop ComputeStatus::Failed - | ComputeStatus::TerminationPending { .. } + | ComputeStatus::TerminationPendingFast + | ComputeStatus::TerminationPendingImmediate | ComputeStatus::Terminated => break 'cert_update, // wait @@ -2456,7 +2487,7 @@ pub async fn installed_extensions(conf: tokio_postgres::Config) -> Result<()> { serde_json::to_string(&extensions).expect("failed to serialize extensions list") ); } - Err(err) => error!("could not get installed extensions: {err:?}"), + Err(err) => error!("could not get installed extensions: {err}"), } Ok(()) } diff --git a/compute_tools/src/compute_prewarm.rs b/compute_tools/src/compute_prewarm.rs index 3f6f9a7ecc..d014a5bb72 100644 --- a/compute_tools/src/compute_prewarm.rs +++ b/compute_tools/src/compute_prewarm.rs @@ -70,7 +70,7 @@ impl ComputeNode { } }; let row = match client - .query_one("select * from get_prewarm_info()", &[]) + .query_one("select * from neon.get_prewarm_info()", &[]) .await { Ok(row) => row, @@ -146,7 +146,7 @@ impl ComputeNode { ComputeNode::get_maintenance_client(&self.tokio_conn_conf) .await .context("connecting to postgres")? 
- .query_one("select prewarm_local_cache($1)", &[&uncompressed]) + .query_one("select neon.prewarm_local_cache($1)", &[&uncompressed]) .await .context("loading LFC state into postgres") .map(|_| ()) @@ -196,7 +196,7 @@ impl ComputeNode { ComputeNode::get_maintenance_client(&self.tokio_conn_conf) .await .context("connecting to postgres")? - .query_one("select get_local_cache_state()", &[]) + .query_one("select neon.get_local_cache_state()", &[]) .await .context("querying LFC state")? .try_get::(0) diff --git a/compute_tools/src/http/openapi_spec.yaml b/compute_tools/src/http/openapi_spec.yaml index 3c58b284b3..93a357e160 100644 --- a/compute_tools/src/http/openapi_spec.yaml +++ b/compute_tools/src/http/openapi_spec.yaml @@ -371,9 +371,28 @@ paths: summary: Terminate Postgres and wait for it to exit description: "" operationId: terminate + parameters: + - name: mode + in: query + description: "Terminate mode: fast (wait 30s before returning) and immediate" + required: false + schema: + type: string + enum: ["fast", "immediate"] + default: fast responses: 200: description: Result + content: + application/json: + schema: + $ref: "#/components/schemas/TerminateResponse" + 201: + description: Result if compute is already terminated + content: + application/json: + schema: + $ref: "#/components/schemas/TerminateResponse" 412: description: "wrong state" content: @@ -530,11 +549,14 @@ components: type: string enum: - empty - - init - - failed - - running - configuration_pending + - init + - running - configuration + - failed + - termination_pending_fast + - termination_pending_immediate + - terminated example: running ExtensionInstallRequest: @@ -660,6 +682,17 @@ components: description: Role name. 
example: "neon" + TerminateResponse: + type: object + required: + - lsn + properties: + lsn: + type: string + nullable: true + description: "last WAL flush LSN" + example: "0/028F10D8" + SetRoleGrantsResponse: type: object required: diff --git a/compute_tools/src/http/routes/terminate.rs b/compute_tools/src/http/routes/terminate.rs index 32d90a5990..5b30b020c8 100644 --- a/compute_tools/src/http/routes/terminate.rs +++ b/compute_tools/src/http/routes/terminate.rs @@ -3,7 +3,7 @@ use crate::http::JsonResponse; use axum::extract::State; use axum::response::Response; use axum_extra::extract::OptionalQuery; -use compute_api::responses::{ComputeStatus, TerminateResponse}; +use compute_api::responses::{ComputeStatus, TerminateMode, TerminateResponse}; use http::StatusCode; use serde::Deserialize; use std::sync::Arc; @@ -12,7 +12,7 @@ use tracing::info; #[derive(Deserialize, Default)] pub struct TerminateQuery { - mode: compute_api::responses::TerminateMode, + mode: TerminateMode, } /// Terminate the compute. 
@@ -24,16 +24,16 @@ pub(in crate::http) async fn terminate( { let mut state = compute.state.lock().unwrap(); if state.status == ComputeStatus::Terminated { - return JsonResponse::success(StatusCode::CREATED, state.terminate_flush_lsn); + let response = TerminateResponse { + lsn: state.terminate_flush_lsn, + }; + return JsonResponse::success(StatusCode::CREATED, response); } if !matches!(state.status, ComputeStatus::Empty | ComputeStatus::Running) { return JsonResponse::invalid_status(state.status); } - state.set_status( - ComputeStatus::TerminationPending { mode }, - &compute.state_changed, - ); + state.set_status(mode.into(), &compute.state_changed); } forward_termination_signal(false); diff --git a/compute_tools/src/installed_extensions.rs b/compute_tools/src/installed_extensions.rs index 411e03b7ec..90e1a17be4 100644 --- a/compute_tools/src/installed_extensions.rs +++ b/compute_tools/src/installed_extensions.rs @@ -2,6 +2,7 @@ use std::collections::HashMap; use anyhow::Result; use compute_api::responses::{InstalledExtension, InstalledExtensions}; +use tokio_postgres::error::Error as PostgresError; use tokio_postgres::{Client, Config, NoTls}; use crate::metrics::INSTALLED_EXTENSIONS; @@ -10,7 +11,7 @@ use crate::metrics::INSTALLED_EXTENSIONS; /// and to make database listing query here more explicit. /// /// Limit the number of databases to 500 to avoid excessive load. -async fn list_dbs(client: &mut Client) -> Result> { +async fn list_dbs(client: &mut Client) -> Result, PostgresError> { // `pg_database.datconnlimit = -2` means that the database is in the // invalid state let databases = client @@ -37,7 +38,9 @@ async fn list_dbs(client: &mut Client) -> Result> { /// Same extension can be installed in multiple databases with different versions, /// so we report a separate metric (number of databases where it is installed) /// for each extension version. 
-pub async fn get_installed_extensions(mut conf: Config) -> Result { +pub async fn get_installed_extensions( + mut conf: Config, +) -> Result { conf.application_name("compute_ctl:get_installed_extensions"); let databases: Vec = { let (mut client, connection) = conf.connect(NoTls).await?; diff --git a/compute_tools/src/metrics.rs b/compute_tools/src/metrics.rs index 91dedbb42a..6e4df73c0f 100644 --- a/compute_tools/src/metrics.rs +++ b/compute_tools/src/metrics.rs @@ -108,7 +108,7 @@ pub(crate) static LFC_PREWARMS: Lazy = Lazy::new(|| { pub(crate) static LFC_PREWARM_ERRORS: Lazy = Lazy::new(|| { register_int_counter!( "compute_ctl_lfc_prewarm_errors_total", - "Total number of LFC prewarms errors requested by compute_ctl or autoprewarm option", + "Total number of LFC prewarm errors", ) .expect("failed to define a metric") }); @@ -124,7 +124,7 @@ pub(crate) static LFC_OFFLOADS: Lazy = Lazy::new(|| { pub(crate) static LFC_OFFLOAD_ERRORS: Lazy = Lazy::new(|| { register_int_counter!( "compute_ctl_lfc_offload_errors_total", - "Total number of LFC offload errors requested by compute_ctl or lfc_offload_period_seconds option", + "Total number of LFC offload errors", ) .expect("failed to define a metric") }); diff --git a/compute_tools/src/migrations/0002-alter_roles.sql b/compute_tools/src/migrations/0002-alter_roles.sql index 6cb49f873f..8fc371eb8f 100644 --- a/compute_tools/src/migrations/0002-alter_roles.sql +++ b/compute_tools/src/migrations/0002-alter_roles.sql @@ -1,3 +1,16 @@ +-- On December 8th, 2023, an engineering escalation (INC-110) was opened after +-- it was found that BYPASSRLS was being applied to all roles. 
+-- +-- PR that introduced the issue: https://github.com/neondatabase/neon/pull/5657 +-- Subsequent commit on main: https://github.com/neondatabase/neon/commit/ad99fa5f0393e2679e5323df653c508ffa0ac072 +-- +-- NOBYPASSRLS and INHERIT are the defaults for a Postgres role, but because it +-- isn't easy to know if a Postgres cluster is affected by the issue, we need to +-- keep the migration around for a long time, if not indefinitely, so any +-- cluster can be fixed. +-- +-- Branching is the gift that keeps on giving... + DO $$ DECLARE role_name text; diff --git a/compute_tools/src/migrations/0012-grant_pg_signal_backend_to_neon_superuser.sql b/compute_tools/src/migrations/0012-grant_pg_signal_backend_to_neon_superuser.sql new file mode 100644 index 0000000000..36e31544be --- /dev/null +++ b/compute_tools/src/migrations/0012-grant_pg_signal_backend_to_neon_superuser.sql @@ -0,0 +1 @@ +GRANT pg_signal_backend TO neon_superuser WITH ADMIN OPTION; diff --git a/compute_tools/src/migrations/tests/0004-grant_pg_monitor_to_neon_superuser.sql b/compute_tools/src/migrations/tests/0004-grant_pg_monitor_to_neon_superuser.sql index deb7a364af..3464a2b1cf 100644 --- a/compute_tools/src/migrations/tests/0004-grant_pg_monitor_to_neon_superuser.sql +++ b/compute_tools/src/migrations/tests/0004-grant_pg_monitor_to_neon_superuser.sql @@ -6,14 +6,18 @@ BEGIN admin_option AS admin INTO monitor FROM pg_auth_members - WHERE roleid = 'neon_superuser'::regrole - AND member = 'pg_monitor'::regrole; + WHERE roleid = 'pg_monitor'::regrole + AND member = 'neon_superuser'::regrole; - IF NOT monitor.member THEN + IF monitor IS NULL THEN + RAISE EXCEPTION 'no entry in pg_auth_members for neon_superuser and pg_monitor'; + END IF; + + IF monitor.admin IS NULL OR NOT monitor.member THEN RAISE EXCEPTION 'neon_superuser is not a member of pg_monitor'; END IF; - IF NOT monitor.admin THEN + IF monitor.admin IS NULL OR NOT monitor.admin THEN RAISE EXCEPTION 'neon_superuser cannot grant pg_monitor'; END IF; 
END $$; diff --git a/compute_tools/src/migrations/tests/0012-grant_pg_signal_backend_to_neon_superuser.sql b/compute_tools/src/migrations/tests/0012-grant_pg_signal_backend_to_neon_superuser.sql new file mode 100644 index 0000000000..e62b742d30 --- /dev/null +++ b/compute_tools/src/migrations/tests/0012-grant_pg_signal_backend_to_neon_superuser.sql @@ -0,0 +1,23 @@ +DO $$ +DECLARE + signal_backend record; +BEGIN + SELECT pg_has_role('neon_superuser', 'pg_signal_backend', 'member') AS member, + admin_option AS admin + INTO signal_backend + FROM pg_auth_members + WHERE roleid = 'pg_signal_backend'::regrole + AND member = 'neon_superuser'::regrole; + + IF signal_backend IS NULL THEN + RAISE EXCEPTION 'no entry in pg_auth_members for neon_superuser and pg_signal_backend'; + END IF; + + IF signal_backend.member IS NULL OR NOT signal_backend.member THEN + RAISE EXCEPTION 'neon_superuser is not a member of pg_signal_backend'; + END IF; + + IF signal_backend.admin IS NULL OR NOT signal_backend.admin THEN + RAISE EXCEPTION 'neon_superuser cannot grant pg_signal_backend'; + END IF; +END $$; diff --git a/compute_tools/src/monitor.rs b/compute_tools/src/monitor.rs index 8a2f6addad..fa01545856 100644 --- a/compute_tools/src/monitor.rs +++ b/compute_tools/src/monitor.rs @@ -84,7 +84,8 @@ impl ComputeMonitor { if matches!( compute_status, ComputeStatus::Terminated - | ComputeStatus::TerminationPending { .. 
} + | ComputeStatus::TerminationPendingFast + | ComputeStatus::TerminationPendingImmediate | ComputeStatus::Failed ) { info!( diff --git a/compute_tools/src/spec.rs b/compute_tools/src/spec.rs index 43cfbb48f7..b6382b2f56 100644 --- a/compute_tools/src/spec.rs +++ b/compute_tools/src/spec.rs @@ -197,6 +197,7 @@ pub async fn handle_migrations(client: &mut Client) -> Result<()> { include_str!( "./migrations/0011-grant_pg_show_replication_origin_status_to_neon_superuser.sql" ), + include_str!("./migrations/0012-grant_pg_signal_backend_to_neon_superuser.sql"), ]; MigrationRunner::new(client, &migrations) diff --git a/control_plane/src/broker.rs b/control_plane/src/broker.rs index f43f459636..988b08e875 100644 --- a/control_plane/src/broker.rs +++ b/control_plane/src/broker.rs @@ -36,7 +36,7 @@ impl StorageBroker { pub async fn start(&self, retry_timeout: &Duration) -> anyhow::Result<()> { let broker = &self.env.broker; - print!("Starting neon broker at {}", broker.client_url()); + println!("Starting neon broker at {}", broker.client_url()); let mut args = Vec::new(); diff --git a/control_plane/src/endpoint.rs b/control_plane/src/endpoint.rs index 74ab15dc97..91a62b0ca4 100644 --- a/control_plane/src/endpoint.rs +++ b/control_plane/src/endpoint.rs @@ -32,7 +32,8 @@ //! config.json - passed to `compute_ctl` //! pgdata/ //! postgresql.conf - copy of postgresql.conf created by `compute_ctl` -//! zenith.signal +//! neon.signal +//! zenith.signal - copy of neon.signal, for backward compatibility //! //! ``` //! @@ -922,7 +923,8 @@ impl Endpoint { ComputeStatus::Empty | ComputeStatus::ConfigurationPending | ComputeStatus::Configuration - | ComputeStatus::TerminationPending { .. 
} + | ComputeStatus::TerminationPendingFast + | ComputeStatus::TerminationPendingImmediate | ComputeStatus::Terminated => { bail!("unexpected compute status: {:?}", state.status) } diff --git a/control_plane/src/local_env.rs b/control_plane/src/local_env.rs index d0611113e8..d34dd39f61 100644 --- a/control_plane/src/local_env.rs +++ b/control_plane/src/local_env.rs @@ -217,6 +217,9 @@ pub struct NeonStorageControllerConf { pub posthog_config: Option, pub kick_secondary_downloads: Option, + + #[serde(with = "humantime_serde")] + pub shard_split_request_timeout: Option, } impl NeonStorageControllerConf { @@ -250,6 +253,7 @@ impl Default for NeonStorageControllerConf { timeline_safekeeper_count: None, posthog_config: None, kick_secondary_downloads: None, + shard_split_request_timeout: None, } } } diff --git a/control_plane/src/pageserver.rs b/control_plane/src/pageserver.rs index 3f66960edd..843ead807d 100644 --- a/control_plane/src/pageserver.rs +++ b/control_plane/src/pageserver.rs @@ -303,7 +303,7 @@ impl PageServerNode { async fn start_node(&self, retry_timeout: &Duration) -> anyhow::Result<()> { // TODO: using a thread here because start_process() is not async but we need to call check_status() let datadir = self.repo_path(); - print!( + println!( "Starting pageserver node {} at '{}' in {:?}, retrying for {:?}", self.conf.id, self.pg_connection_config.raw_address(), @@ -452,6 +452,12 @@ impl PageServerNode { .map(|x| x.parse::()) .transpose() .context("Failed to parse 'image_creation_threshold' as non zero integer")?, + // HADRON + image_layer_force_creation_period: settings + .remove("image_layer_force_creation_period") + .map(humantime::parse_duration) + .transpose() + .context("Failed to parse 'image_layer_force_creation_period' as duration")?, image_layer_creation_check_threshold: settings .remove("image_layer_creation_check_threshold") .map(|x| x.parse::()) diff --git a/control_plane/src/safekeeper.rs b/control_plane/src/safekeeper.rs index 
da9dafd8e9..2ba2f3ebe4 100644 --- a/control_plane/src/safekeeper.rs +++ b/control_plane/src/safekeeper.rs @@ -127,7 +127,7 @@ impl SafekeeperNode { extra_opts: &[String], retry_timeout: &Duration, ) -> anyhow::Result<()> { - print!( + println!( "Starting safekeeper at '{}' in '{}', retrying for {:?}", self.pg_connection_config.raw_address(), self.datadir_path().display(), diff --git a/control_plane/src/storage_controller.rs b/control_plane/src/storage_controller.rs index bb83a6319c..f996f39967 100644 --- a/control_plane/src/storage_controller.rs +++ b/control_plane/src/storage_controller.rs @@ -648,6 +648,13 @@ impl StorageController { args.push(format!("--timeline-safekeeper-count={sk_cnt}")); } + if let Some(duration) = self.config.shard_split_request_timeout { + args.push(format!( + "--shard-split-request-timeout={}", + humantime::Duration::from(duration) + )); + } + let mut envs = vec![ ("LD_LIBRARY_PATH".to_owned(), pg_lib_dir.to_string()), ("DYLD_LIBRARY_PATH".to_owned(), pg_lib_dir.to_string()), @@ -660,7 +667,7 @@ impl StorageController { )); } - println!("Starting storage controller"); + println!("Starting storage controller at {scheme}://{host}:{listen_port}"); background_process::start_process( COMMAND, diff --git a/control_plane/storcon_cli/Cargo.toml b/control_plane/storcon_cli/Cargo.toml index ce89116691..61d48b2469 100644 --- a/control_plane/storcon_cli/Cargo.toml +++ b/control_plane/storcon_cli/Cargo.toml @@ -14,6 +14,7 @@ humantime.workspace = true pageserver_api.workspace = true pageserver_client.workspace = true reqwest.workspace = true +safekeeper_api.workspace=true serde_json = { workspace = true, features = ["raw_value"] } storage_controller_client.workspace = true tokio.workspace = true diff --git a/control_plane/storcon_cli/src/main.rs b/control_plane/storcon_cli/src/main.rs index 701c4b3b2e..fcc5549beb 100644 --- a/control_plane/storcon_cli/src/main.rs +++ b/control_plane/storcon_cli/src/main.rs @@ -11,7 +11,7 @@ use 
pageserver_api::controller_api::{ PlacementPolicy, SafekeeperDescribeResponse, SafekeeperSchedulingPolicyRequest, ShardSchedulingPolicy, ShardsPreferredAzsRequest, ShardsPreferredAzsResponse, SkSchedulingPolicy, TenantCreateRequest, TenantDescribeResponse, TenantPolicyRequest, - TenantShardMigrateRequest, TenantShardMigrateResponse, + TenantShardMigrateRequest, TenantShardMigrateResponse, TimelineSafekeeperMigrateRequest, }; use pageserver_api::models::{ EvictionPolicy, EvictionPolicyLayerAccessThreshold, ShardParameters, TenantConfig, @@ -21,6 +21,7 @@ use pageserver_api::models::{ use pageserver_api::shard::{ShardStripeSize, TenantShardId}; use pageserver_client::mgmt_api::{self}; use reqwest::{Certificate, Method, StatusCode, Url}; +use safekeeper_api::models::TimelineLocateResponse; use storage_controller_client::control_api::Client; use utils::id::{NodeId, TenantId, TimelineId}; @@ -279,6 +280,23 @@ enum Command { #[arg(long)] concurrency: Option, }, + /// Locate safekeepers for a timeline from the storcon DB. 
+ TimelineLocate { + #[arg(long)] + tenant_id: TenantId, + #[arg(long)] + timeline_id: TimelineId, + }, + /// Migrate a timeline to a new set of safekeepers + TimelineSafekeeperMigrate { + #[arg(long)] + tenant_id: TenantId, + #[arg(long)] + timeline_id: TimelineId, + /// Example: --new-sk-set 1,2,3 + #[arg(long, required = true, value_delimiter = ',')] + new_sk_set: Vec, + }, } #[derive(Parser)] @@ -458,6 +476,7 @@ async fn main() -> anyhow::Result<()> { listen_http_port, listen_https_port, availability_zone_id: AvailabilityZone(availability_zone_id), + node_ip_addr: None, }), ) .await?; @@ -1324,7 +1343,7 @@ async fn main() -> anyhow::Result<()> { concurrency, } => { let mut path = format!( - "/v1/tenant/{tenant_shard_id}/timeline/{timeline_id}/download_heatmap_layers", + "v1/tenant/{tenant_shard_id}/timeline/{timeline_id}/download_heatmap_layers", ); if let Some(c) = concurrency { @@ -1335,6 +1354,41 @@ async fn main() -> anyhow::Result<()> { .dispatch::<(), ()>(Method::POST, path, None) .await?; } + Command::TimelineLocate { + tenant_id, + timeline_id, + } => { + let path = format!("debug/v1/tenant/{tenant_id}/timeline/{timeline_id}/locate"); + + let resp = storcon_client + .dispatch::<(), TimelineLocateResponse>(Method::GET, path, None) + .await?; + + let sk_set = resp.sk_set.iter().map(|id| id.0 as i64).collect::>(); + let new_sk_set = resp + .new_sk_set + .as_ref() + .map(|ids| ids.iter().map(|id| id.0 as i64).collect::>()); + + println!("generation = {}", resp.generation); + println!("sk_set = {sk_set:?}"); + println!("new_sk_set = {new_sk_set:?}"); + } + Command::TimelineSafekeeperMigrate { + tenant_id, + timeline_id, + new_sk_set, + } => { + let path = format!("v1/tenant/{tenant_id}/timeline/{timeline_id}/safekeeper_migrate"); + + storcon_client + .dispatch::<_, ()>( + Method::POST, + path, + Some(TimelineSafekeeperMigrateRequest { new_sk_set }), + ) + .await?; + } } Ok(()) diff --git a/docs/core_changes.md b/docs/core_changes.md index 
1388317728..abfd20af26 100644 --- a/docs/core_changes.md +++ b/docs/core_changes.md @@ -129,9 +129,10 @@ segment to bootstrap the WAL writing, but it doesn't contain the checkpoint reco changes in xlog.c, to allow starting the compute node without reading the last checkpoint record from WAL. -This includes code to read the `zenith.signal` file, which tells the startup code the LSN to start -at. When the `zenith.signal` file is present, the startup uses that LSN instead of the last -checkpoint's LSN. The system is known to be consistent at that LSN, without any WAL redo. +This includes code to read the `neon.signal` (also `zenith.signal`) file, which tells the startup +code the LSN to start at. When the `neon.signal` file is present, the startup uses that LSN +instead of the last checkpoint's LSN. The system is known to be consistent at that LSN, without +any WAL redo. ### How to get rid of the patch diff --git a/libs/compute_api/src/responses.rs b/libs/compute_api/src/responses.rs index e10c381fb4..2fe233214a 100644 --- a/libs/compute_api/src/responses.rs +++ b/libs/compute_api/src/responses.rs @@ -121,6 +121,15 @@ pub enum TerminateMode { Immediate, } +impl From for ComputeStatus { + fn from(mode: TerminateMode) -> Self { + match mode { + TerminateMode::Fast => ComputeStatus::TerminationPendingFast, + TerminateMode::Immediate => ComputeStatus::TerminationPendingImmediate, + } + } +} + #[derive(Serialize, Clone, Copy, Debug, Deserialize, PartialEq, Eq)] #[serde(rename_all = "snake_case")] pub enum ComputeStatus { @@ -141,7 +150,9 @@ pub enum ComputeStatus { // control-plane to terminate it. 
Failed, // Termination requested - TerminationPending { mode: TerminateMode }, + TerminationPendingFast, + // Termination requested, without waiting 30s before returning from /terminate + TerminationPendingImmediate, // Terminated Postgres Terminated, } @@ -160,7 +171,10 @@ impl Display for ComputeStatus { ComputeStatus::Running => f.write_str("running"), ComputeStatus::Configuration => f.write_str("configuration"), ComputeStatus::Failed => f.write_str("failed"), - ComputeStatus::TerminationPending { .. } => f.write_str("termination-pending"), + ComputeStatus::TerminationPendingFast => f.write_str("termination-pending-fast"), + ComputeStatus::TerminationPendingImmediate => { + f.write_str("termination-pending-immediate") + } ComputeStatus::Terminated => f.write_str("terminated"), } } diff --git a/libs/http-utils/src/endpoint.rs b/libs/http-utils/src/endpoint.rs index f32ced1180..a61bf8e08a 100644 --- a/libs/http-utils/src/endpoint.rs +++ b/libs/http-utils/src/endpoint.rs @@ -20,6 +20,7 @@ use tokio_stream::wrappers::ReceiverStream; use tokio_util::io::ReaderStream; use tracing::{Instrument, debug, info, info_span, warn}; use utils::auth::{AuthError, Claims, SwappableJwtAuth}; +use utils::metrics_collector::{METRICS_COLLECTOR, METRICS_STALE_MILLIS}; use crate::error::{ApiError, api_error_handler, route_error_handler}; use crate::request::{get_query_param, parse_query_param}; @@ -250,9 +251,28 @@ impl std::io::Write for ChannelWriter { } } -pub async fn prometheus_metrics_handler(_req: Request) -> Result, ApiError> { +pub async fn prometheus_metrics_handler( + req: Request, + force_metric_collection_on_scrape: bool, +) -> Result, ApiError> { SERVE_METRICS_COUNT.inc(); + // HADRON + let requested_use_latest = parse_query_param(&req, "use_latest")?; + + let use_latest = match requested_use_latest { + None => force_metric_collection_on_scrape, + Some(true) => true, + Some(false) => { + if force_metric_collection_on_scrape { + // We don't cache in this case + true + } 
else { + false + } + } + }; + let started_at = std::time::Instant::now(); let (tx, rx) = mpsc::channel(1); @@ -277,12 +297,18 @@ pub async fn prometheus_metrics_handler(_req: Request) -> Result) -> Result { tracing::info!( @@ -303,6 +333,7 @@ pub async fn prometheus_metrics_handler(_req: Request) -> Result, + #[serde(skip_serializing_if = "Option::is_none")] + pub image_layer_generation_large_timeline_threshold: Option, + pub force_metric_collection_on_scrape: bool, } #[derive(Debug, Clone, PartialEq, Eq, serde::Serialize, serde::Deserialize)] @@ -560,6 +564,11 @@ pub struct TenantConfigToml { pub gc_period: Duration, // Delta layer churn threshold to create L1 image layers. pub image_creation_threshold: usize, + // HADRON + // When the timeout is reached, PageServer will (1) force compact any remaining L0 deltas and + // (2) create image layers if there are any L1 deltas. + #[serde(with = "humantime_serde")] + pub image_layer_force_creation_period: Option, // Determines how much history is retained, to allow // branching and read replicas at an older point in time. // The unit is time. 
 @@ -758,6 +767,7 @@ impl Default for ConfigToml { disk_usage_based_eviction: DiskUsageEvictionTaskConfig::default(), test_remote_failures: (0), + test_remote_failures_probability: (100), ondemand_download_behavior_treat_error_as_warn: (false), @@ -821,6 +831,8 @@ impl Default for ConfigToml { }, basebackup_cache_config: None, posthog_config: None, + image_layer_generation_large_timeline_threshold: Some(2 * 1024 * 1024 * 1024), + force_metric_collection_on_scrape: true, } } } @@ -914,6 +926,7 @@ impl Default for TenantConfigToml { gc_period: humantime::parse_duration(DEFAULT_GC_PERIOD) .expect("cannot parse default gc period"), image_creation_threshold: DEFAULT_IMAGE_CREATION_THRESHOLD, + image_layer_force_creation_period: None, pitr_interval: humantime::parse_duration(DEFAULT_PITR_INTERVAL) .expect("cannot parse default PITR interval"), walreceiver_connect_timeout: humantime::parse_duration( diff --git a/libs/pageserver_api/src/controller_api.rs b/libs/pageserver_api/src/controller_api.rs index a8c7083b17..8f86b03f72 100644 --- a/libs/pageserver_api/src/controller_api.rs +++ b/libs/pageserver_api/src/controller_api.rs @@ -1,5 +1,6 @@ use std::collections::{HashMap, HashSet}; use std::fmt::Display; +use std::net::IpAddr; use std::str::FromStr; use std::time::{Duration, Instant}; @@ -10,7 +11,7 @@ use serde::{Deserialize, Serialize}; use utils::id::{NodeId, TenantId, TimelineId}; use utils::lsn::Lsn; -use crate::models::{PageserverUtilization, ShardParameters, TenantConfig}; +use crate::models::{PageserverUtilization, ShardParameters, TenantConfig, TimelineInfo}; use crate::shard::{ShardStripeSize, TenantShardId}; #[derive(Serialize, Deserialize, Debug)] @@ -60,6 +61,11 @@ pub struct NodeRegisterRequest { pub listen_https_port: Option, pub availability_zone_id: AvailabilityZone, + + // Reachable IP address of the PS/SK registering, if known. + // Hadron Cluster Coordinator will update the DNS record of the registering node + // with this IP address. 
+ pub node_ip_addr: Option, } #[derive(Serialize, Deserialize)] @@ -126,6 +132,13 @@ pub struct TenantDescribeResponse { pub config: TenantConfig, } +#[derive(Serialize, Deserialize, Debug)] +pub struct TenantTimelineDescribeResponse { + pub shards: Vec, + #[serde(skip_serializing_if = "Option::is_none")] + pub image_consistent_lsn: Option, +} + #[derive(Serialize, Deserialize, Debug)] pub struct NodeShardResponse { pub node_id: NodeId, @@ -538,6 +551,39 @@ pub struct SafekeeperDescribeResponse { pub scheduling_policy: SkSchedulingPolicy, } +#[derive(Serialize, Deserialize, Clone, Debug)] +pub struct TimelineSafekeeperPeer { + pub node_id: NodeId, + pub listen_http_addr: String, + pub http_port: i32, +} + +#[derive(Serialize, Deserialize, Clone, Debug)] +pub struct SCSafekeeperTimeline { + // SC does not know the tenant id. + pub timeline_id: TimelineId, + pub peers: Vec, +} + +#[derive(Serialize, Deserialize, Clone, Debug)] +pub struct SCSafekeeperTimelinesResponse { + pub timelines: Vec, + pub safekeeper_peers: Vec, +} + +#[derive(Serialize, Deserialize, Clone, Debug)] +pub struct SafekeeperTimeline { + pub tenant_id: TenantId, + pub timeline_id: TimelineId, + pub peers: Vec, +} + +#[derive(Serialize, Deserialize, Clone, Debug)] +pub struct SafekeeperTimelinesResponse { + pub timelines: Vec, + pub safekeeper_peers: Vec, +} + #[derive(Serialize, Deserialize, Clone)] pub struct SafekeeperSchedulingPolicyRequest { pub scheduling_policy: SkSchedulingPolicy, diff --git a/libs/pageserver_api/src/models.rs b/libs/pageserver_api/src/models.rs index 6735320484..11e02a8550 100644 --- a/libs/pageserver_api/src/models.rs +++ b/libs/pageserver_api/src/models.rs @@ -597,6 +597,9 @@ pub struct TenantConfigPatch { pub gc_period: FieldPatch, #[serde(skip_serializing_if = "FieldPatch::is_noop")] pub image_creation_threshold: FieldPatch, + // HADRON + #[serde(skip_serializing_if = "FieldPatch::is_noop")] + pub image_layer_force_creation_period: FieldPatch, 
#[serde(skip_serializing_if = "FieldPatch::is_noop")] pub pitr_interval: FieldPatch, #[serde(skip_serializing_if = "FieldPatch::is_noop")] @@ -700,6 +703,11 @@ pub struct TenantConfig { #[serde(skip_serializing_if = "Option::is_none")] pub image_creation_threshold: Option, + // HADRON + #[serde(skip_serializing_if = "Option::is_none")] + #[serde(with = "humantime_serde")] + pub image_layer_force_creation_period: Option, + #[serde(skip_serializing_if = "Option::is_none")] #[serde(with = "humantime_serde")] pub pitr_interval: Option, @@ -798,6 +806,7 @@ impl TenantConfig { mut gc_horizon, mut gc_period, mut image_creation_threshold, + mut image_layer_force_creation_period, mut pitr_interval, mut walreceiver_connect_timeout, mut lagging_wal_timeout, @@ -861,6 +870,11 @@ impl TenantConfig { patch .image_creation_threshold .apply(&mut image_creation_threshold); + // HADRON + patch + .image_layer_force_creation_period + .map(|v| humantime::parse_duration(&v))? + .apply(&mut image_layer_force_creation_period); patch .pitr_interval .map(|v| humantime::parse_duration(&v))? @@ -942,6 +956,7 @@ impl TenantConfig { gc_horizon, gc_period, image_creation_threshold, + image_layer_force_creation_period, pitr_interval, walreceiver_connect_timeout, lagging_wal_timeout, @@ -1016,6 +1031,9 @@ impl TenantConfig { image_creation_threshold: self .image_creation_threshold .unwrap_or(global_conf.image_creation_threshold), + image_layer_force_creation_period: self + .image_layer_force_creation_period + .or(global_conf.image_layer_force_creation_period), pitr_interval: self.pitr_interval.unwrap_or(global_conf.pitr_interval), walreceiver_connect_timeout: self .walreceiver_connect_timeout @@ -1604,6 +1622,9 @@ pub struct TimelineInfo { /// Whether the timeline is invisible in synthetic size calculations. pub is_invisible: Option, + // HADRON: the largest LSN below which all page updates have been included in the image layers. 
+ #[serde(skip_serializing_if = "Option::is_none")] + pub image_consistent_lsn: Option, } #[derive(Debug, Clone, Serialize, Deserialize)] diff --git a/libs/remote_storage/Cargo.toml b/libs/remote_storage/Cargo.toml index 69316fd493..0ae13552b8 100644 --- a/libs/remote_storage/Cargo.toml +++ b/libs/remote_storage/Cargo.toml @@ -43,6 +43,7 @@ itertools.workspace = true sync_wrapper = { workspace = true, features = ["futures"] } byteorder = "1.4" +rand = "0.8.5" [dev-dependencies] camino-tempfile.workspace = true diff --git a/libs/remote_storage/src/lib.rs b/libs/remote_storage/src/lib.rs index ed416b2811..5885c3e791 100644 --- a/libs/remote_storage/src/lib.rs +++ b/libs/remote_storage/src/lib.rs @@ -732,9 +732,15 @@ impl GenericRemoteStorage { }) } - pub fn unreliable_wrapper(s: Self, fail_first: u64) -> Self { - Self::Unreliable(Arc::new(UnreliableWrapper::new(s, fail_first))) + /* BEGIN_HADRON */ + pub fn unreliable_wrapper(s: Self, fail_first: u64, fail_probability: u64) -> Self { + Self::Unreliable(Arc::new(UnreliableWrapper::new( + s, + fail_first, + fail_probability, + ))) } + /* END_HADRON */ /// See [`RemoteStorage::upload`], which this method calls with `None` as metadata. pub async fn upload_storage_object( diff --git a/libs/remote_storage/src/simulate_failures.rs b/libs/remote_storage/src/simulate_failures.rs index f9856a5856..e895380192 100644 --- a/libs/remote_storage/src/simulate_failures.rs +++ b/libs/remote_storage/src/simulate_failures.rs @@ -1,6 +1,8 @@ //! This module provides a wrapper around a real RemoteStorage implementation that //! causes the first N attempts at each upload or download operatio to fail. For //! testing purposes. +use rand::Rng; +use std::cmp; use std::collections::HashMap; use std::collections::hash_map::Entry; use std::num::NonZeroU32; @@ -25,6 +27,13 @@ pub struct UnreliableWrapper { // Tracks how many failed attempts of each operation has been made. 
 attempts: Mutex>, + + /* BEGIN_HADRON */ + // This is the probability of failure for each operation, in the range [0, 100]. + // The probability defaults to 100, which means that all operations will fail. + // Storage will fail by probability up to attempts_to_fail times. + attempt_failure_probability: u64, + /* END_HADRON */ } /// Used to identify retries of different unique operation. @@ -40,7 +49,11 @@ enum RemoteOp { } impl UnreliableWrapper { - pub fn new(inner: crate::GenericRemoteStorage, attempts_to_fail: u64) -> Self { + pub fn new( + inner: crate::GenericRemoteStorage, + attempts_to_fail: u64, + attempt_failure_probability: u64, + ) -> Self { assert!(attempts_to_fail > 0); let inner = match inner { GenericRemoteStorage::AwsS3(s) => GenericRemoteStorage::AwsS3(s), @@ -51,9 +64,11 @@ impl UnreliableWrapper { panic!("Can't wrap unreliable wrapper unreliably") } }; + let actual_attempt_failure_probability = cmp::min(attempt_failure_probability, 100); UnreliableWrapper { inner, attempts_to_fail, + attempt_failure_probability: actual_attempt_failure_probability, attempts: Mutex::new(HashMap::new()), } } @@ -66,6 +81,7 @@ impl UnreliableWrapper { /// fn attempt(&self, op: RemoteOp) -> anyhow::Result { let mut attempts = self.attempts.lock().unwrap(); + let mut rng = rand::thread_rng(); match attempts.entry(op) { Entry::Occupied(mut e) => { @@ -75,15 +91,19 @@ impl UnreliableWrapper { *p }; - if attempts_before_this >= self.attempts_to_fail { - // let it succeed - e.remove(); - Ok(attempts_before_this) - } else { + /* BEGIN_HADRON */ + // If there are more attempts to fail, fail the request by probability. 
+ if (attempts_before_this < self.attempts_to_fail) + && (rng.gen_range(0..=100) < self.attempt_failure_probability) + { let error = anyhow::anyhow!("simulated failure of remote operation {:?}", e.key()); Err(error) + } else { + e.remove(); + Ok(attempts_before_this) } + /* END_HADRON */ } Entry::Vacant(e) => { let error = anyhow::anyhow!("simulated failure of remote operation {:?}", e.key()); diff --git a/libs/safekeeper_api/src/models.rs b/libs/safekeeper_api/src/models.rs index e87232474b..59e112654b 100644 --- a/libs/safekeeper_api/src/models.rs +++ b/libs/safekeeper_api/src/models.rs @@ -11,7 +11,7 @@ use utils::id::{NodeId, TenantId, TenantTimelineId, TimelineId}; use utils::lsn::Lsn; use utils::pageserver_feedback::PageserverFeedback; -use crate::membership::Configuration; +use crate::membership::{Configuration, SafekeeperGeneration}; use crate::{ServerInfo, Term}; #[derive(Debug, Serialize, Deserialize)] @@ -311,3 +311,12 @@ pub struct PullTimelineResponse { pub safekeeper_host: Option, // TODO: add more fields? } + +/// Response to a timeline locate request. +/// Storcon-only API. 
+#[derive(Serialize, Deserialize, Clone, Debug)] +pub struct TimelineLocateResponse { + pub generation: SafekeeperGeneration, + pub sk_set: Vec, + pub new_sk_set: Option>, +} diff --git a/libs/utils/src/env.rs b/libs/utils/src/env.rs index 2a85f54a01..0b3b5e6c4f 100644 --- a/libs/utils/src/env.rs +++ b/libs/utils/src/env.rs @@ -44,3 +44,63 @@ where } } } + +/* BEGIN_HADRON */ +pub enum DeploymentMode { + Local, + Dev, + Staging, + Prod, +} + +pub fn get_deployment_mode() -> Option { + match std::env::var("DEPLOYMENT_MODE") { + Ok(env) => match env.as_str() { + "development" => Some(DeploymentMode::Dev), + "staging" => Some(DeploymentMode::Staging), + "production" => Some(DeploymentMode::Prod), + _ => { + tracing::error!("Unexpected DEPLOYMENT_MODE: {}", env); + None + } + }, + Err(_) => { + // tracing::error!("DEPLOYMENT_MODE not set"); + None + } + } +} + +pub fn is_dev_or_staging() -> bool { + matches!( + get_deployment_mode(), + Some(DeploymentMode::Dev) | Some(DeploymentMode::Staging) + ) +} + +pub enum TestingMode { + Chaos, + Stress, +} + +pub fn get_test_mode() -> Option { + match std::env::var("HADRON_TEST_MODE") { + Ok(env) => match env.as_str() { + "chaos" => Some(TestingMode::Chaos), + "stress" => Some(TestingMode::Stress), + _ => { + tracing::error!("Unexpected HADRON_TEST_MODE: {}", env); + None + } + }, + Err(_) => { + tracing::error!("HADRON_TEST_MODE not set"); + None + } + } +} + +pub fn is_chaos_testing() -> bool { + matches!(get_test_mode(), Some(TestingMode::Chaos)) +} +/* END_HADRON */ diff --git a/libs/utils/src/ip_address.rs b/libs/utils/src/ip_address.rs new file mode 100644 index 0000000000..d0834d0ba5 --- /dev/null +++ b/libs/utils/src/ip_address.rs @@ -0,0 +1,73 @@ +use std::env::{VarError, var}; +use std::error::Error; +use std::net::IpAddr; +use std::str::FromStr; + +/// Name of the environment variable containing the reachable IP address of the node. 
If set, the IP address contained in this +/// environment variable is used as the reachable IP address of the pageserver or safekeeper node during node registration. +/// In a Kubernetes environment, this environment variable should be set by Kubernetes to the Pod IP (specified in the Pod +/// template). +pub const HADRON_NODE_IP_ADDRESS: &str = "HADRON_NODE_IP_ADDRESS"; + +/// Read the reachable IP address of this page server from env var HADRON_NODE_IP_ADDRESS. +/// In Kubernetes this environment variable is set to the Pod IP (specified in the Pod template). +pub fn read_node_ip_addr_from_env() -> Result, Box> { + match var(HADRON_NODE_IP_ADDRESS) { + Ok(v) => { + if let Ok(addr) = IpAddr::from_str(&v) { + Ok(Some(addr)) + } else { + Err(format!("Invalid IP address string: {v}. Cannot be parsed as either an IPv4 or an IPv6 address.").into()) + } + } + Err(VarError::NotPresent) => Ok(None), + Err(e) => Err(e.into()), + } +} + +#[cfg(test)] +mod tests { + use super::*; + use std::env; + use std::net::{Ipv4Addr, Ipv6Addr}; + + #[test] + fn test_read_node_ip_addr_from_env() { + // SAFETY: test code + unsafe { + // Test with a valid IPv4 address + env::set_var(HADRON_NODE_IP_ADDRESS, "192.168.1.1"); + let result = read_node_ip_addr_from_env().unwrap(); + assert_eq!(result, Some(IpAddr::V4(Ipv4Addr::new(192, 168, 1, 1)))); + + // Test with a valid IPv6 address + env::set_var( + HADRON_NODE_IP_ADDRESS, + "2001:0db8:85a3:0000:0000:8a2e:0370:7334", + ); + } + let result = read_node_ip_addr_from_env().unwrap(); + assert_eq!( + result, + Some(IpAddr::V6( + Ipv6Addr::from_str("2001:0db8:85a3:0000:0000:8a2e:0370:7334").unwrap() + )) + ); + + // Test with an invalid IP address + // SAFETY: test code + unsafe { + env::set_var(HADRON_NODE_IP_ADDRESS, "invalid_ip"); + } + let result = read_node_ip_addr_from_env(); + assert!(result.is_err()); + + // Test with no environment variable set + // SAFETY: test code + unsafe { + env::remove_var(HADRON_NODE_IP_ADDRESS); + } + let result = 
read_node_ip_addr_from_env().unwrap(); + assert_eq!(result, None); + } +} diff --git a/libs/utils/src/lib.rs b/libs/utils/src/lib.rs index 11f787562c..69771be5dc 100644 --- a/libs/utils/src/lib.rs +++ b/libs/utils/src/lib.rs @@ -26,6 +26,9 @@ pub mod auth; // utility functions and helper traits for unified unique id generation/serialization etc. pub mod id; +// utility functions to obtain reachable IP addresses in PS/SK nodes. +pub mod ip_address; + pub mod shard; mod hex; @@ -99,6 +102,8 @@ pub mod elapsed_accum; #[cfg(target_os = "linux")] pub mod linux_socket_ioctl; +pub mod metrics_collector; + // Re-export used in macro. Avoids adding git-version as dep in target crates. #[doc(hidden)] pub use git_version; diff --git a/libs/utils/src/logging.rs b/libs/utils/src/logging.rs index 5828a400a0..d67c0f123b 100644 --- a/libs/utils/src/logging.rs +++ b/libs/utils/src/logging.rs @@ -1,4 +1,5 @@ use std::future::Future; +use std::pin::Pin; use std::str::FromStr; use std::time::Duration; @@ -7,7 +8,7 @@ use metrics::{IntCounter, IntCounterVec}; use once_cell::sync::Lazy; use strum_macros::{EnumString, VariantNames}; use tokio::time::Instant; -use tracing::info; +use tracing::{info, warn}; /// Logs a critical error, similarly to `tracing::error!`. This will: /// @@ -377,10 +378,11 @@ impl std::fmt::Debug for SecretString { /// /// TODO: consider upgrading this to a warning, but currently it fires too often. 
#[inline] -pub async fn log_slow(name: &str, threshold: Duration, f: std::pin::Pin<&mut F>) -> O -where - F: Future, -{ +pub async fn log_slow( + name: &str, + threshold: Duration, + f: Pin<&mut impl Future>, +) -> O { monitor_slow_future( threshold, threshold, // period = threshold @@ -394,16 +396,42 @@ where if !is_slow { return; } + let elapsed = elapsed_total.as_secs_f64(); if ready { - info!( - "slow {name} completed after {:.3}s", - elapsed_total.as_secs_f64() - ); + info!("slow {name} completed after {elapsed:.3}s"); } else { - info!( - "slow {name} still running after {:.3}s", - elapsed_total.as_secs_f64() - ); + info!("slow {name} still running after {elapsed:.3}s"); + } + }, + ) + .await +} + +/// Logs a periodic warning if a future is slow to complete. +#[inline] +pub async fn warn_slow( + name: &str, + threshold: Duration, + f: Pin<&mut impl Future>, +) -> O { + monitor_slow_future( + threshold, + threshold, // period = threshold + f, + |MonitorSlowFutureCallback { + ready, + is_slow, + elapsed_total, + elapsed_since_last_callback: _, + }| { + if !is_slow { + return; + } + let elapsed = elapsed_total.as_secs_f64(); + if ready { + warn!("slow {name} completed after {elapsed:.3}s"); + } else { + warn!("slow {name} still running after {elapsed:.3}s"); } }, ) @@ -416,7 +444,7 @@ where pub async fn monitor_slow_future( threshold: Duration, period: Duration, - mut fut: std::pin::Pin<&mut F>, + mut fut: Pin<&mut F>, mut cb: impl FnMut(MonitorSlowFutureCallback), ) -> O where diff --git a/libs/utils/src/metrics_collector.rs b/libs/utils/src/metrics_collector.rs new file mode 100644 index 0000000000..9e57fcd643 --- /dev/null +++ b/libs/utils/src/metrics_collector.rs @@ -0,0 +1,75 @@ +use std::{ + sync::{Arc, RwLock}, + time::{Duration, Instant}, +}; + +use metrics::{IntGauge, proto::MetricFamily, register_int_gauge}; +use once_cell::sync::Lazy; + +pub static METRICS_STALE_MILLIS: Lazy = Lazy::new(|| { + register_int_gauge!( + 
"metrics_metrics_stale_milliseconds", + "The current metrics stale time in milliseconds" + ) + .expect("failed to define a metric") +}); + +#[derive(Debug)] +pub struct CollectedMetrics { + pub metrics: Vec, + pub collected_at: Instant, +} + +impl CollectedMetrics { + fn new(metrics: Vec) -> Self { + Self { + metrics, + collected_at: Instant::now(), + } + } +} + +#[derive(Debug)] +pub struct MetricsCollector { + last_collected: RwLock>, +} + +impl MetricsCollector { + pub fn new() -> Self { + Self { + last_collected: RwLock::new(Arc::new(CollectedMetrics::new(vec![]))), + } + } + + #[tracing::instrument(name = "metrics_collector", skip_all)] + pub fn run_once(&self, cache_metrics: bool) -> Arc { + let started = Instant::now(); + let metrics = metrics::gather(); + let collected = Arc::new(CollectedMetrics::new(metrics)); + if cache_metrics { + let mut guard = self.last_collected.write().unwrap(); + *guard = collected.clone(); + } + tracing::info!( + "Collected {} metric families in {} ms", + collected.metrics.len(), + started.elapsed().as_millis() + ); + collected + } + + pub fn last_collected(&self) -> Arc { + self.last_collected.read().unwrap().clone() + } +} + +impl Default for MetricsCollector { + fn default() -> Self { + Self::new() + } +} + +// Interval for metrics collection. 
 Currently hard-coded to be the same as the metrics scrape interval from the obs agent +pub static METRICS_COLLECTION_INTERVAL: Duration = Duration::from_secs(30); + +pub static METRICS_COLLECTOR: Lazy = Lazy::new(MetricsCollector::default); diff --git a/libs/walproposer/src/api_bindings.rs b/libs/walproposer/src/api_bindings.rs index 7c6abf252e..5f856a44d4 100644 --- a/libs/walproposer/src/api_bindings.rs +++ b/libs/walproposer/src/api_bindings.rs @@ -428,6 +428,12 @@ pub fn empty_shmem() -> crate::bindings::WalproposerShmemState { shard_number: 0, }; + let empty_wal_rate_limiter = crate::bindings::WalRateLimiter { + should_limit: crate::bindings::pg_atomic_uint32 { value: 0 }, + sent_bytes: 0, + last_recorded_time_us: 0, + }; + crate::bindings::WalproposerShmemState { propEpochStartLsn: crate::bindings::pg_atomic_uint64 { value: 0 }, donor_name: [0; 64], @@ -441,6 +447,7 @@ pub fn empty_shmem() -> crate::bindings::WalproposerShmemState { num_shards: 0, replica_promote: false, min_ps_feedback: empty_feedback, + wal_rate_limiter: empty_wal_rate_limiter, } } diff --git a/pageserver/client/src/mgmt_api.rs b/pageserver/client/src/mgmt_api.rs index af4be23b9b..fe1ddc2e7d 100644 --- a/pageserver/client/src/mgmt_api.rs +++ b/pageserver/client/src/mgmt_api.rs @@ -1,4 +1,4 @@ -use std::collections::HashMap; +use std::collections::{BTreeMap, HashMap}; use std::error::Error as _; use std::time::Duration; @@ -251,6 +251,70 @@ impl Client { Ok(()) } + pub async fn tenant_timeline_compact( + &self, + tenant_shard_id: TenantShardId, + timeline_id: TimelineId, + force_image_layer_creation: bool, + must_force_image_layer_creation: bool, + scheduled: bool, + wait_until_done: bool, + ) -> Result<()> { + let mut path = reqwest::Url::parse(&format!( + "{}/v1/tenant/{tenant_shard_id}/timeline/{timeline_id}/compact", + self.mgmt_api_endpoint + )) + .expect("Cannot build URL"); + + if force_image_layer_creation { + path.query_pairs_mut() + .append_pair("force_image_layer_creation", 
"true"); + } + + if must_force_image_layer_creation { + path.query_pairs_mut() + .append_pair("must_force_image_layer_creation", "true"); + } + + if scheduled { + path.query_pairs_mut().append_pair("scheduled", "true"); + } + if wait_until_done { + path.query_pairs_mut() + .append_pair("wait_until_scheduled_compaction_done", "true"); + path.query_pairs_mut() + .append_pair("wait_until_uploaded", "true"); + } + self.request(Method::PUT, path, ()).await?; + Ok(()) + } + + /* BEGIN_HADRON */ + pub async fn tenant_timeline_describe( + &self, + tenant_shard_id: &TenantShardId, + timeline_id: &TimelineId, + ) -> Result { + let mut path = reqwest::Url::parse(&format!( + "{}/v1/tenant/{tenant_shard_id}/timeline/{timeline_id}", + self.mgmt_api_endpoint + )) + .expect("Cannot build URL"); + path.query_pairs_mut() + .append_pair("include-image-consistent-lsn", "true"); + + let response: reqwest::Response = self.request(Method::GET, path, ()).await?; + let body = response.json().await.map_err(Error::ReceiveBody)?; + Ok(body) + } + + pub async fn list_tenant_visible_size(&self) -> Result> { + let uri = format!("{}/v1/list_tenant_visible_size", self.mgmt_api_endpoint); + let resp = self.get(&uri).await?; + resp.json().await.map_err(Error::ReceiveBody) + } + /* END_HADRON */ + pub async fn tenant_scan_remote_storage( &self, tenant_id: TenantId, diff --git a/pageserver/client_grpc/Cargo.toml b/pageserver/client_grpc/Cargo.toml index 84e27abb84..e2741ad839 100644 --- a/pageserver/client_grpc/Cargo.toml +++ b/pageserver/client_grpc/Cargo.toml @@ -4,8 +4,12 @@ version = "0.1.0" edition.workspace = true license.workspace = true +[features] +testing = ["pageserver_api/testing"] + [dependencies] anyhow.workspace = true +arc-swap.workspace = true bytes.workspace = true compute_api.workspace = true futures.workspace = true @@ -13,6 +17,7 @@ pageserver_api.workspace = true pageserver_page_api.workspace = true tokio.workspace = true tokio-stream.workspace = true +tokio-util.workspace = true 
tonic.workspace = true tracing.workspace = true utils.workspace = true diff --git a/pageserver/client_grpc/src/client.rs b/pageserver/client_grpc/src/client.rs index 63852868c3..3a9edc7092 100644 --- a/pageserver/client_grpc/src/client.rs +++ b/pageserver/client_grpc/src/client.rs @@ -1,11 +1,16 @@ use std::collections::HashMap; use std::num::NonZero; +use std::pin::pin; use std::sync::Arc; +use std::time::{Duration, Instant}; use anyhow::anyhow; +use arc_swap::ArcSwap; use futures::stream::FuturesUnordered; use futures::{FutureExt as _, StreamExt as _}; -use tracing::instrument; +use tonic::codec::CompressionEncoding; +use tracing::{debug, instrument}; +use utils::logging::warn_slow; use crate::pool::{ChannelPool, ClientGuard, ClientPool, StreamGuard, StreamPool}; use crate::retry::Retry; @@ -19,28 +24,40 @@ use utils::shard::{ShardCount, ShardIndex, ShardNumber}; /// Max number of concurrent clients per channel (i.e. TCP connection). New channels will be spun up /// when full. /// +/// Normal requests are small, and we don't pipeline them, so we can afford a large number of +/// streams per connection. +/// /// TODO: tune all of these constants, and consider making them configurable. -/// TODO: consider separate limits for unary and streaming clients, so we don't fill up channels -/// with only streams. -const MAX_CLIENTS_PER_CHANNEL: NonZero = NonZero::new(16).unwrap(); +const MAX_CLIENTS_PER_CHANNEL: NonZero = NonZero::new(64).unwrap(); -/// Max number of concurrent unary request clients per shard. -const MAX_UNARY_CLIENTS: NonZero = NonZero::new(64).unwrap(); +/// Max number of concurrent bulk GetPage streams per channel (i.e. TCP connection). These use a +/// dedicated channel pool with a lower client limit, to avoid TCP-level head-of-line blocking and +/// transmission delays. This also concentrates large window sizes on a smaller set of +/// streams/connections, presumably reducing memory use. 
+const MAX_BULK_CLIENTS_PER_CHANNEL: NonZero = NonZero::new(16).unwrap(); -/// Max number of concurrent GetPage streams per shard. The max number of concurrent GetPage -/// requests is given by `MAX_STREAMS * MAX_STREAM_QUEUE_DEPTH`. -const MAX_STREAMS: NonZero = NonZero::new(64).unwrap(); +/// The batch size threshold at which a GetPage request will use the bulk stream pool. +/// +/// The gRPC initial window size is 64 KB. Each page is 8 KB, so let's avoid increasing the window +/// size for the normal stream pool, and route requests for >= 5 pages (>32 KB) to the bulk pool. +const BULK_THRESHOLD_BATCH_SIZE: usize = 5; -/// Max number of pipelined requests per stream. -const MAX_STREAM_QUEUE_DEPTH: NonZero = NonZero::new(2).unwrap(); +/// The overall request call timeout, including retries and pool acquisition. +/// TODO: should we retry forever? Should the caller decide? +const CALL_TIMEOUT: Duration = Duration::from_secs(60); -/// Max number of concurrent bulk GetPage streams per shard, used e.g. for prefetches. Because these -/// are more throughput-oriented, we have a smaller limit but higher queue depth. -const MAX_BULK_STREAMS: NonZero = NonZero::new(16).unwrap(); +/// The per-request (retry attempt) timeout, including any lazy connection establishment. +const REQUEST_TIMEOUT: Duration = Duration::from_secs(10); -/// Max number of pipelined requests per bulk stream. These are more throughput-oriented and thus -/// get a larger queue depth. -const MAX_BULK_STREAM_QUEUE_DEPTH: NonZero = NonZero::new(4).unwrap(); +/// The initial request retry backoff duration. The first retry does not back off. +/// TODO: use a different backoff for ResourceExhausted (rate limiting)? Needs server support. +const BASE_BACKOFF: Duration = Duration::from_millis(5); + +/// The maximum request retry backoff duration. +const MAX_BACKOFF: Duration = Duration::from_secs(5); + +/// Threshold and interval for warning about slow operation. 
+const SLOW_THRESHOLD: Duration = Duration::from_secs(3); /// A rich Pageserver gRPC client for a single tenant timeline. This client is more capable than the /// basic `page_api::Client` gRPC client, and supports: @@ -48,48 +65,113 @@ const MAX_BULK_STREAM_QUEUE_DEPTH: NonZero = NonZero::new(4).unwrap(); /// * Sharded tenants across multiple Pageservers. /// * Pooling of connections, clients, and streams for efficient resource use. /// * Concurrent use by many callers. -/// * Internal handling of GetPage bidirectional streams, with pipelining and error handling. +/// * Internal handling of GetPage bidirectional streams. /// * Automatic retries. /// * Observability. /// +/// The client has dedicated connection/client/stream pools per shard, for resource reuse. These +/// pools are unbounded: we allow scaling out as many concurrent streams as needed to serve all +/// concurrent callers, which mostly eliminates head-of-line blocking. Idle streams are fairly +/// cheap: the server task currently uses 26 KB of memory, so we can comfortably fit 100,000 +/// concurrent idle streams (2.5 GB memory). The worst case degenerates to the old libpq case with +/// one stream per backend, but without the TCP connection overhead. In the common case we expect +/// significantly lower stream counts due to stream sharing, driven e.g. by idle backends, LFC hits, +/// read coalescing, sharding (backends typically only talk to one shard at a time), etc. +/// /// TODO: this client does not support base backups or LSN leases, as these are only used by /// compute_ctl. Consider adding this, but LSN leases need concurrent requests on all shards. pub struct PageserverClient { - // TODO: support swapping out the shard map, e.g. via an ArcSwap. - shards: Shards, - retry: Retry, + /// The tenant ID. + tenant_id: TenantId, + /// The timeline ID. + timeline_id: TimelineId, + /// The JWT auth token for this tenant, if any. + auth_token: Option, + /// The compression to use, if any. 
+ compression: Option, + /// The shards for this tenant. + shards: ArcSwap, } impl PageserverClient { /// Creates a new Pageserver client for a given tenant and timeline. Uses the Pageservers given - /// in the shard map, which must be complete and must use gRPC URLs. + /// in the shard spec, which must be complete and must use gRPC URLs. pub fn new( tenant_id: TenantId, timeline_id: TimelineId, - shard_map: HashMap, - stripe_size: ShardStripeSize, + shard_spec: ShardSpec, auth_token: Option, + compression: Option, ) -> anyhow::Result { - let shards = Shards::new(tenant_id, timeline_id, shard_map, stripe_size, auth_token)?; + let shards = Shards::new( + tenant_id, + timeline_id, + shard_spec, + auth_token.clone(), + compression, + )?; Ok(Self { - shards, - retry: Retry, + tenant_id, + timeline_id, + auth_token, + compression, + shards: ArcSwap::new(Arc::new(shards)), }) } + /// Updates the shards from the given shard spec. In-flight requests will complete using the + /// existing shards, but may retry with the new shards if they fail. + /// + /// TODO: verify that in-flight requests are allowed to complete, and that the old pools are + /// properly spun down and dropped afterwards. + pub fn update_shards(&self, shard_spec: ShardSpec) -> anyhow::Result<()> { + // Validate the shard spec. We should really use `ArcSwap::rcu` for this, to avoid races + // with concurrent updates, but that involves creating a new `Shards` on every attempt, + // which spins up a bunch of Tokio tasks and such. These should already be checked elsewhere + // in the stack, and if they're violated then we already have problems elsewhere, so a + // best-effort but possibly-racy check is okay here. 
+ let old = self.shards.load_full(); + if shard_spec.count < old.count { + return Err(anyhow!( + "can't reduce shard count from {} to {}", + old.count, + shard_spec.count + )); + } + if !old.count.is_unsharded() && shard_spec.stripe_size != old.stripe_size { + return Err(anyhow!( + "can't change stripe size from {} to {}", + old.stripe_size, + shard_spec.stripe_size + )); + } + + let shards = Shards::new( + self.tenant_id, + self.timeline_id, + shard_spec, + self.auth_token.clone(), + self.compression, + )?; + self.shards.store(Arc::new(shards)); + Ok(()) + } + /// Returns whether a relation exists. #[instrument(skip_all, fields(rel=%req.rel, lsn=%req.read_lsn))] pub async fn check_rel_exists( &self, req: page_api::CheckRelExistsRequest, ) -> tonic::Result { - self.retry - .with(async || { - // Relation metadata is only available on shard 0. - let mut client = self.shards.get_zero().client().await?; - client.check_rel_exists(req).await - }) - .await + debug!("sending request: {req:?}"); + let resp = Self::with_retries(CALL_TIMEOUT, async |_| { + // Relation metadata is only available on shard 0. + let mut client = self.shards.load_full().get_zero().client().await?; + Self::with_timeout(REQUEST_TIMEOUT, client.check_rel_exists(req)).await + }) + .await?; + debug!("received response: {resp:?}"); + Ok(resp) } /// Returns the total size of a database, as # of bytes. @@ -98,17 +180,20 @@ impl PageserverClient { &self, req: page_api::GetDbSizeRequest, ) -> tonic::Result { - self.retry - .with(async || { - // Relation metadata is only available on shard 0. - let mut client = self.shards.get_zero().client().await?; - client.get_db_size(req).await - }) - .await + debug!("sending request: {req:?}"); + let resp = Self::with_retries(CALL_TIMEOUT, async |_| { + // Relation metadata is only available on shard 0. 
+ let mut client = self.shards.load_full().get_zero().client().await?; + Self::with_timeout(REQUEST_TIMEOUT, client.get_db_size(req)).await + }) + .await?; + debug!("received response: {resp:?}"); + Ok(resp) } - /// Fetches pages. The `request_id` must be unique across all in-flight requests. Automatically - /// splits requests that straddle shard boundaries, and assembles the responses. + /// Fetches pages. The `request_id` must be unique across all in-flight requests, and the + /// `attempt` must be 0 (incremented on retry). Automatically splits requests that straddle + /// shard boundaries, and assembles the responses. /// /// Unlike `page_api::Client`, this automatically converts `status_code` into `tonic::Status` /// errors. All responses will have `GetPageStatusCode::Ok`. @@ -128,72 +213,101 @@ impl PageserverClient { if req.block_numbers.is_empty() { return Err(tonic::Status::invalid_argument("no block number")); } + // The request attempt must be 0. The client will increment it internally. + if req.request_id.attempt != 0 { + return Err(tonic::Status::invalid_argument("request attempt must be 0")); + } + debug!("sending request: {req:?}"); + + // The shards may change while we're fetching pages. We execute the request using a stable + // view of the shards (especially important for requests that span shards), but retry the + // top-level (pre-split) request to pick up shard changes. This can lead to unnecessary + // retries and re-splits in some cases where requests span shards, but these are expected to + // be rare. + // + // TODO: the gRPC server and client doesn't yet properly support shard splits. Revisit this + // once we figure out how to handle these. 
+ let resp = Self::with_retries(CALL_TIMEOUT, async |attempt| { + let mut req = req.clone(); + req.request_id.attempt = attempt as u32; + let shards = self.shards.load_full(); + Self::with_timeout(REQUEST_TIMEOUT, Self::get_page_with_shards(req, &shards)).await + }) + .await?; + + debug!("received response: {resp:?}"); + Ok(resp) + } + + /// Fetches pages using the given shards. This uses a stable view of the shards, regardless of + /// concurrent shard updates. Does not retry internally, but is retried by `get_page()`. + async fn get_page_with_shards( + req: page_api::GetPageRequest, + shards: &Shards, + ) -> tonic::Result { // Fast path: request is for a single shard. if let Some(shard_id) = - GetPageSplitter::is_single_shard(&req, self.shards.count, self.shards.stripe_size) + GetPageSplitter::for_single_shard(&req, shards.count, shards.stripe_size) { - return self.get_page_for_shard(shard_id, req).await; + return Self::get_page_with_shard(req, shards.get(shard_id)?).await; } // Request spans multiple shards. Split it, dispatch concurrent per-shard requests, and // reassemble the responses. - // - // TODO: when we support shard map updates, we need to detect when it changes and re-split - // the request on errors. - let mut splitter = GetPageSplitter::split(req, self.shards.count, self.shards.stripe_size); + let mut splitter = GetPageSplitter::split(req, shards.count, shards.stripe_size); - let mut shard_requests: FuturesUnordered<_> = splitter - .drain_requests() - .map(|(shard_id, shard_req)| { - // NB: each request will retry internally. - self.get_page_for_shard(shard_id, shard_req) - .map(move |result| result.map(|resp| (shard_id, resp))) - }) - .collect(); + let mut shard_requests = FuturesUnordered::new(); + for (shard_id, shard_req) in splitter.drain_requests() { + let future = Self::get_page_with_shard(shard_req, shards.get(shard_id)?) 
+ .map(move |result| result.map(|resp| (shard_id, resp))); + shard_requests.push(future); + } while let Some((shard_id, shard_response)) = shard_requests.next().await.transpose()? { splitter.add_response(shard_id, shard_response)?; } - splitter.assemble_response() + splitter.get_response() } - /// Fetches pages that belong to the given shard. - #[instrument(skip_all, fields(shard = %shard_id))] - async fn get_page_for_shard( - &self, - shard_id: ShardIndex, + /// Fetches pages on the given shard. Does not retry internally. + async fn get_page_with_shard( req: page_api::GetPageRequest, + shard: &Shard, ) -> tonic::Result { - let resp = self - .retry - .with(async || { - let stream = self - .shards - .get(shard_id)? - .stream(req.request_class.is_bulk()) - .await; - let resp = stream.send(req.clone()).await?; + let mut stream = shard.stream(Self::is_bulk(&req)).await?; + let resp = stream.send(req.clone()).await?; - // Convert per-request errors into a tonic::Status. - if resp.status_code != page_api::GetPageStatusCode::Ok { - return Err(tonic::Status::new( - resp.status_code.into(), - resp.reason.unwrap_or_else(|| String::from("unknown error")), - )); - } + // Convert per-request errors into a tonic::Status. + if resp.status_code != page_api::GetPageStatusCode::Ok { + return Err(tonic::Status::new( + resp.status_code.into(), + resp.reason.unwrap_or_else(|| String::from("unknown error")), + )); + } - Ok(resp) - }) - .await?; - - // Make sure we got the right number of pages. - // NB: check outside of the retry loop, since we don't want to retry this. - let (expected, actual) = (req.block_numbers.len(), resp.page_images.len()); - if expected != actual { + // Check that we received the expected pages. 
+ if req.rel != resp.rel { return Err(tonic::Status::internal(format!( - "expected {expected} pages for shard {shard_id}, got {actual}", + "shard {} returned wrong relation, expected {} got {}", + shard.id, req.rel, resp.rel + ))); + } + if !req + .block_numbers + .iter() + .copied() + .eq(resp.pages.iter().map(|p| p.block_number)) + { + return Err(tonic::Status::internal(format!( + "shard {} returned wrong pages, expected {:?} got {:?}", + shard.id, + req.block_numbers, + resp.pages + .iter() + .map(|page| page.block_number) + .collect::>() ))); } @@ -206,13 +320,15 @@ impl PageserverClient { &self, req: page_api::GetRelSizeRequest, ) -> tonic::Result { - self.retry - .with(async || { - // Relation metadata is only available on shard 0. - let mut client = self.shards.get_zero().client().await?; - client.get_rel_size(req).await - }) - .await + debug!("sending request: {req:?}"); + let resp = Self::with_retries(CALL_TIMEOUT, async |_| { + // Relation metadata is only available on shard 0. + let mut client = self.shards.load_full().get_zero().client().await?; + Self::with_timeout(REQUEST_TIMEOUT, client.get_rel_size(req)).await + }) + .await?; + debug!("received response: {resp:?}"); + Ok(resp) } /// Fetches an SLRU segment. @@ -221,51 +337,91 @@ impl PageserverClient { &self, req: page_api::GetSlruSegmentRequest, ) -> tonic::Result { - self.retry - .with(async || { - // SLRU segments are only available on shard 0. - let mut client = self.shards.get_zero().client().await?; - client.get_slru_segment(req).await - }) - .await + debug!("sending request: {req:?}"); + let resp = Self::with_retries(CALL_TIMEOUT, async |_| { + // SLRU segments are only available on shard 0. + let mut client = self.shards.load_full().get_zero().client().await?; + Self::with_timeout(REQUEST_TIMEOUT, client.get_slru_segment(req)).await + }) + .await?; + debug!("received response: {resp:?}"); + Ok(resp) + } + + /// Runs the given async closure with retries up to the given timeout. 
Only certain gRPC status + /// codes are retried, see [`Retry::should_retry`]. Returns `DeadlineExceeded` on timeout. + async fn with_retries(timeout: Duration, f: F) -> tonic::Result + where + F: FnMut(usize) -> O, // pass attempt number, starting at 0 + O: Future>, + { + Retry { + timeout: Some(timeout), + base_backoff: BASE_BACKOFF, + max_backoff: MAX_BACKOFF, + } + .with(f) + .await + } + + /// Runs the given future with a timeout. Returns `DeadlineExceeded` on timeout. + async fn with_timeout( + timeout: Duration, + f: impl Future>, + ) -> tonic::Result { + let started = Instant::now(); + tokio::time::timeout(timeout, f).await.map_err(|_| { + tonic::Status::deadline_exceeded(format!( + "request timed out after {:.3}s", + started.elapsed().as_secs_f64() + )) + })? + } + + /// Returns true if the request is considered a bulk request and should use the bulk pool. + fn is_bulk(req: &page_api::GetPageRequest) -> bool { + req.block_numbers.len() >= BULK_THRESHOLD_BATCH_SIZE } } -/// Tracks the tenant's shards. -struct Shards { +/// Shard specification for a PageserverClient. +pub struct ShardSpec { + /// Maps shard indices to gRPC URLs. + /// + /// INVARIANT: every shard 0..count is present, and shard 0 is always present. + /// INVARIANT: every URL is valid and uses grpc:// scheme. + urls: HashMap, /// The shard count. /// /// NB: this is 0 for unsharded tenants, following `ShardIndex::unsharded()` convention. count: ShardCount, - /// The stripe size. Only used for sharded tenants. + /// The stripe size for these shards. stripe_size: ShardStripeSize, - /// Shards by shard index. - /// - /// NB: unsharded tenants use count 0, like `ShardIndex::unsharded()`. - /// - /// INVARIANT: every shard 0..count is present. - /// INVARIANT: shard 0 is always present. - map: HashMap, } -impl Shards { - /// Creates a new set of shards based on a shard map. 
- fn new( - tenant_id: TenantId, - timeline_id: TimelineId, - shard_map: HashMap, - stripe_size: ShardStripeSize, - auth_token: Option, +impl ShardSpec { + /// Creates a new shard spec with the given URLs and stripe size. All shards must be given. + /// The stripe size may be omitted for unsharded tenants. + pub fn new( + urls: HashMap, + stripe_size: Option, ) -> anyhow::Result { - let count = match shard_map.len() { + // Compute the shard count. + let count = match urls.len() { 0 => return Err(anyhow!("no shards provided")), 1 => ShardCount::new(0), // NB: unsharded tenants use 0, like `ShardIndex::unsharded()` n if n > u8::MAX as usize => return Err(anyhow!("too many shards: {n}")), n => ShardCount::new(n as u8), }; - let mut map = HashMap::new(); - for (shard_id, url) in shard_map { + // Determine the stripe size. It doesn't matter for unsharded tenants. + if stripe_size.is_none() && !count.is_unsharded() { + return Err(anyhow!("stripe size must be given for sharded tenants")); + } + let stripe_size = stripe_size.unwrap_or_default(); + + // Validate the shard spec. + for (shard_id, url) in &urls { // The shard index must match the computed shard count, even for unsharded tenants. if shard_id.shard_count != count { return Err(anyhow!("invalid shard index {shard_id}, expected {count}")); @@ -276,21 +432,72 @@ impl Shards { } // The above conditions guarantee that we have all shards 0..count: len() matches count, // shard number < count, and numbers are unique (via hashmap). - let shard = Shard::new(url, tenant_id, timeline_id, shard_id, auth_token.clone())?; - map.insert(shard_id, shard); + + // Validate the URL. + if PageserverProtocol::from_connstring(url)? != PageserverProtocol::Grpc { + return Err(anyhow!("invalid shard URL {url}: must use gRPC")); + } } Ok(Self { + urls, count, stripe_size, - map, + }) + } +} + +/// Tracks the tenant's shards. +struct Shards { + /// Shards by shard index. + /// + /// INVARIANT: every shard 0..count is present. 
+ /// INVARIANT: shard 0 is always present. + by_index: HashMap, + /// The shard count. + /// + /// NB: this is 0 for unsharded tenants, following `ShardIndex::unsharded()` convention. + count: ShardCount, + /// The stripe size. Only used for sharded tenants. + stripe_size: ShardStripeSize, +} + +impl Shards { + /// Creates a new set of shards based on a shard spec. + fn new( + tenant_id: TenantId, + timeline_id: TimelineId, + shard_spec: ShardSpec, + auth_token: Option, + compression: Option, + ) -> anyhow::Result { + // NB: the shard spec has already been validated when constructed. + let mut shards = HashMap::with_capacity(shard_spec.urls.len()); + for (shard_id, url) in shard_spec.urls { + shards.insert( + shard_id, + Shard::new( + url, + tenant_id, + timeline_id, + shard_id, + auth_token.clone(), + compression, + )?, + ); + } + + Ok(Self { + by_index: shards, + count: shard_spec.count, + stripe_size: shard_spec.stripe_size, }) } /// Looks up the given shard. #[allow(clippy::result_large_err)] // TODO: check perf impact fn get(&self, shard_id: ShardIndex) -> tonic::Result<&Shard> { - self.map + self.by_index .get(&shard_id) .ok_or_else(|| tonic::Status::not_found(format!("unknown shard {shard_id}"))) } @@ -302,21 +509,31 @@ impl Shards { } } -/// A single shard. Uses dedicated resource pools with the following structure: +/// A single shard. Has dedicated resource pools with the following structure: /// -/// * Channel pool: unbounded. -/// * Unary client pool: MAX_UNARY_CLIENTS. -/// * Stream client pool: unbounded. -/// * Stream pool: MAX_STREAMS and MAX_STREAM_QUEUE_DEPTH. -/// * Bulk channel pool: unbounded. +/// * Channel pool: MAX_CLIENTS_PER_CHANNEL. +/// * Client pool: unbounded. +/// * Stream pool: unbounded. +/// * Bulk channel pool: MAX_BULK_CLIENTS_PER_CHANNEL. /// * Bulk client pool: unbounded. -/// * Bulk stream pool: MAX_BULK_STREAMS and MAX_BULK_STREAM_QUEUE_DEPTH. +/// * Bulk stream pool: unbounded. 
+/// +/// We use a separate bulk channel pool with a lower concurrency limit for large batch requests. +/// This avoids TCP-level head-of-line blocking, and also concentrates large window sizes on a +/// smaller set of streams/connections, which presumably reduces memory use. Neither of these pools +/// are bounded, nor do they pipeline requests, so the latency characteristics should be mostly +/// similar (except for TCP transmission time). +/// +/// TODO: since we never use bounded pools, we could consider removing the pool limiters. However, +/// the code is fairly trivial, so we may as well keep them around for now in case we need them. struct Shard { + /// The shard ID. + id: ShardIndex, /// Unary gRPC client pool. client_pool: Arc, /// GetPage stream pool. stream_pool: Arc, - /// GetPage stream pool for bulk requests, e.g. prefetches. + /// GetPage stream pool for bulk requests. bulk_stream_pool: Arc, } @@ -328,56 +545,36 @@ impl Shard { timeline_id: TimelineId, shard_id: ShardIndex, auth_token: Option, + compression: Option, ) -> anyhow::Result { - // Sanity-check that the URL uses gRPC. - if PageserverProtocol::from_connstring(&url)? != PageserverProtocol::Grpc { - return Err(anyhow!("invalid shard URL {url}: must use gRPC")); - } - - // Common channel pool for unary and stream requests. Bounded by client/stream pools. - let channel_pool = ChannelPool::new(url.clone(), MAX_CLIENTS_PER_CHANNEL)?; - - // Client pool for unary requests. + // Shard pools for unary requests and non-bulk GetPage requests. let client_pool = ClientPool::new( - channel_pool.clone(), + ChannelPool::new(url.clone(), MAX_CLIENTS_PER_CHANNEL)?, tenant_id, timeline_id, shard_id, auth_token.clone(), - Some(MAX_UNARY_CLIENTS), + compression, + None, // unbounded ); + let stream_pool = StreamPool::new(client_pool.clone(), None); // unbounded - // GetPage stream pool. Uses a dedicated client pool to avoid starving out unary clients, - // but shares a channel pool with it (as it's unbounded). 
- let stream_pool = StreamPool::new( - ClientPool::new( - channel_pool.clone(), - tenant_id, - timeline_id, - shard_id, - auth_token.clone(), - None, // unbounded, limited by stream pool - ), - Some(MAX_STREAMS), - MAX_STREAM_QUEUE_DEPTH, - ); - - // Bulk GetPage stream pool, e.g. for prefetches. Uses dedicated channel/client/stream pools - // to avoid head-of-line blocking of latency-sensitive requests. + // Bulk GetPage stream pool for large batches (prefetches, sequential scans, vacuum, etc.). let bulk_stream_pool = StreamPool::new( ClientPool::new( - ChannelPool::new(url, MAX_CLIENTS_PER_CHANNEL)?, + ChannelPool::new(url, MAX_BULK_CLIENTS_PER_CHANNEL)?, tenant_id, timeline_id, shard_id, auth_token, - None, // unbounded, limited by stream pool + compression, + None, // unbounded, ), - Some(MAX_BULK_STREAMS), - MAX_BULK_STREAM_QUEUE_DEPTH, + None, // unbounded ); Ok(Self { + id: shard_id, client_pool, stream_pool, bulk_stream_pool, @@ -385,19 +582,23 @@ impl Shard { } /// Returns a pooled client for this shard. + #[instrument(skip_all)] async fn client(&self) -> tonic::Result { - self.client_pool - .get() - .await - .map_err(|err| tonic::Status::internal(format!("failed to get client: {err}"))) + warn_slow( + "client pool acquisition", + SLOW_THRESHOLD, + pin!(self.client_pool.get()), + ) + .await } - /// Returns a pooled stream for this shard. If `bulk` is `true`, uses the dedicated bulk stream - /// pool (e.g. for prefetches). - async fn stream(&self, bulk: bool) -> StreamGuard { - match bulk { - false => self.stream_pool.get().await, - true => self.bulk_stream_pool.get().await, - } + /// Returns a pooled stream for this shard. If `bulk` is `true`, uses the dedicated bulk pool. 
+ #[instrument(skip_all, fields(bulk))] + async fn stream(&self, bulk: bool) -> tonic::Result { + let pool = match bulk { + false => &self.stream_pool, + true => &self.bulk_stream_pool, + }; + warn_slow("stream pool acquisition", SLOW_THRESHOLD, pin!(pool.get())).await } } diff --git a/pageserver/client_grpc/src/lib.rs b/pageserver/client_grpc/src/lib.rs index 3fc7178be2..14fb3fbd5a 100644 --- a/pageserver/client_grpc/src/lib.rs +++ b/pageserver/client_grpc/src/lib.rs @@ -3,4 +3,4 @@ mod pool; mod retry; mod split; -pub use client::PageserverClient; +pub use client::{PageserverClient, ShardSpec}; diff --git a/pageserver/client_grpc/src/pool.rs b/pageserver/client_grpc/src/pool.rs index 5a50004fd1..98a649b4c8 100644 --- a/pageserver/client_grpc/src/pool.rs +++ b/pageserver/client_grpc/src/pool.rs @@ -9,19 +9,36 @@ //! //! * ChannelPool: manages gRPC channels (TCP connections) to a single Pageserver. Multiple clients //! can acquire and use the same channel concurrently (via HTTP/2 stream multiplexing), up to a -//! per-channel client limit. Channels may be closed when they are no longer used by any clients. +//! per-channel client limit. Channels are closed immediately when empty, and indirectly rely on +//! client/stream idle timeouts. //! //! * ClientPool: manages gRPC clients for a single tenant shard. Each client acquires a (shared) //! channel from the ChannelPool for the client's lifetime. A client can only be acquired by a -//! single caller at a time, and is returned to the pool when dropped. Idle clients may be removed -//! from the pool after some time, to free up the channel. +//! single caller at a time, and is returned to the pool when dropped. Idle clients are removed +//! from the pool after a while to free up resources. //! //! * StreamPool: manages bidirectional gRPC GetPage streams. Each stream acquires a client from the -//! ClientPool for the stream's lifetime. Internal streams are not exposed to callers; instead, it -//! 
returns a guard that can be used to send a single request, to properly enforce queue depth and -//! route responses. Internally, the pool will reuse or spin up a suitable stream for the request, -//! possibly pipelining multiple requests from multiple callers on the same stream (up to some -//! queue depth). Idle streams may be removed from the pool after a while to free up the client. +//! ClientPool for the stream's lifetime. A stream can only be acquired by a single caller at a +//! time, and is returned to the pool when dropped. Idle streams are removed from the pool after +//! a while to free up resources. +//! +//! The stream only supports sending a single, synchronous request at a time, and does not support +//! pipelining multiple requests from different callers onto the same stream -- instead, we scale +//! out concurrent streams to improve throughput. There are many reasons for this design choice: +//! +//! * It (mostly) eliminates head-of-line blocking. A single stream is processed sequentially by +//! a single server task, which may block e.g. on layer downloads, LSN waits, etc. +//! +//! * Cancellation becomes trivial, by closing the stream. Otherwise, if a caller goes away +//! (e.g. because of a timeout), the request would still be processed by the server and block +//! requests behind it in the stream. It might even block its own timeout retry. +//! +//! * Stream scheduling becomes significantly simpler and cheaper. +//! +//! * Individual callers can still use client-side batching for pipelining. +//! +//! * Idle streams are cheap. Benchmarks show that an idle GetPage stream takes up about 26 KB +//! per stream (2.5 GB for 100,000 streams), so we can afford to scale out. //! //! Each channel corresponds to one TCP connection. Each client unary request and each stream //! corresponds to one HTTP/2 stream and server task. @@ -29,22 +46,42 @@ //! TODO: error handling (including custom error types). //! TODO: observability. 
-use std::collections::{BTreeMap, HashMap}; +use std::collections::BTreeMap; use std::num::NonZero; use std::ops::{Deref, DerefMut}; +use std::pin::Pin; use std::sync::atomic::{AtomicUsize, Ordering}; use std::sync::{Arc, Mutex, Weak}; +use std::time::{Duration, Instant}; -use futures::StreamExt as _; -use tokio::sync::mpsc::{Receiver, Sender}; -use tokio::sync::{OwnedSemaphorePermit, Semaphore, mpsc, oneshot}; +use futures::{Stream, StreamExt as _}; +use tokio::sync::{OwnedSemaphorePermit, Semaphore, watch}; +use tokio_stream::wrappers::WatchStream; +use tokio_util::sync::CancellationToken; +use tonic::codec::CompressionEncoding; use tonic::transport::{Channel, Endpoint}; -use tracing::{error, warn}; use pageserver_page_api as page_api; use utils::id::{TenantId, TimelineId}; use utils::shard::ShardIndex; +/// Reap clients/streams that have been idle for this long. Channels are reaped immediately when +/// empty, and indirectly rely on the client/stream idle timeouts. +/// +/// A stream's client will be reaped after 2x the idle threshold (first stream the client), but +/// that's okay -- if the stream closes abruptly (e.g. due to timeout or cancellation), we want to +/// keep its client around in the pool for a while. +const REAP_IDLE_THRESHOLD: Duration = match cfg!(any(test, feature = "testing")) { + false => Duration::from_secs(180), + true => Duration::from_secs(1), // exercise reaping in tests +}; + +/// Reap idle resources with this interval. +const REAP_IDLE_INTERVAL: Duration = match cfg!(any(test, feature = "testing")) { + false => Duration::from_secs(10), + true => Duration::from_secs(1), // exercise reaping in tests +}; + /// A gRPC channel pool, for a single Pageserver. A channel is shared by many clients (via HTTP/2 /// stream multiplexing), up to `clients_per_channel` -- a new channel will be spun up beyond this. 
/// The pool does not limit the number of channels, and instead relies on `ClientPool` or @@ -52,7 +89,6 @@ use utils::shard::ShardIndex; /// /// The pool is always wrapped in an outer `Arc`, to allow long-lived guards across tasks/threads. /// -/// TODO: reap idle channels. /// TODO: consider prewarming a set of channels, to avoid initial connection latency. /// TODO: consider adding a circuit breaker for errors and fail fast. pub struct ChannelPool { @@ -108,14 +144,15 @@ impl ChannelPool { let mut channels = self.channels.lock().unwrap(); // Try to find an existing channel with available capacity. We check entries in BTreeMap - // order, to fill up the lower-ordered channels first. The ClientPool also prefers clients - // with lower-ordered channel IDs first. This will cluster clients in lower-ordered + // order, to fill up the lower-ordered channels first. The client/stream pools also prefer + // clients with lower-ordered channel IDs first. This will cluster clients in lower-ordered // channels, and free up higher-ordered channels such that they can be reaped. for (&id, entry) in channels.iter_mut() { assert!( entry.clients <= self.max_clients_per_channel.get(), "channel overflow" ); + assert_ne!(entry.clients, 0, "empty channel not reaped"); if entry.clients < self.max_clients_per_channel.get() { entry.clients += 1; return ChannelGuard { @@ -161,16 +198,22 @@ impl ChannelGuard { } } -/// Returns the channel to the pool. +/// Returns the channel to the pool. The channel is closed when empty. impl Drop for ChannelGuard { fn drop(&mut self) { let Some(pool) = self.pool.upgrade() else { return; // pool was dropped }; + let mut channels = pool.channels.lock().unwrap(); let entry = channels.get_mut(&self.id).expect("unknown channel"); assert!(entry.clients > 0, "channel underflow"); entry.clients -= 1; + + // Reap empty channels immediately. 
+ if entry.clients == 0 { + channels.remove(&self.id); + } } } @@ -179,8 +222,6 @@ impl Drop for ChannelGuard { /// number of concurrent clients to `max_clients` via semaphore. /// /// The pool is always wrapped in an outer `Arc`, to allow long-lived guards across tasks/threads. -/// -/// TODO: reap idle clients. pub struct ClientPool { /// Tenant ID. tenant_id: TenantId, @@ -190,6 +231,8 @@ pub struct ClientPool { shard_id: ShardIndex, /// Authentication token, if any. auth_token: Option, + /// Compression to use. + compression: Option, /// Channel pool to acquire channels from. channel_pool: Arc, /// Limits the max number of concurrent clients for this pool. None if the pool is unbounded. @@ -198,9 +241,10 @@ pub struct ClientPool { /// /// The first client in the map will be acquired next. The map is sorted by client ID, which in /// turn is sorted by its channel ID, such that we prefer acquiring idle clients from - /// lower-ordered channels. This allows us to free up and reap higher-numbered channels as idle - /// clients are reaped. + /// lower-ordered channels. This allows us to free up and reap higher-ordered channels. idle: Mutex>, + /// Reaps idle clients. + idle_reaper: Reaper, /// Unique client ID generator. next_client_id: AtomicUsize, } @@ -212,6 +256,9 @@ struct ClientEntry { client: page_api::Client, /// The channel guard for the channel used by the client. channel_guard: ChannelGuard, + /// The client has been idle since this time. All clients in `ClientPool::idle` are idle by + /// definition, so this is the time when it was added back to the pool. 
+ idle_since: Instant, } impl ClientPool { @@ -224,18 +271,23 @@ impl ClientPool { timeline_id: TimelineId, shard_id: ShardIndex, auth_token: Option, + compression: Option, max_clients: Option>, ) -> Arc { - Arc::new(Self { + let pool = Arc::new(Self { tenant_id, timeline_id, shard_id, auth_token, + compression, channel_pool, idle: Mutex::default(), + idle_reaper: Reaper::new(REAP_IDLE_THRESHOLD, REAP_IDLE_INTERVAL), limiter: max_clients.map(|max| Arc::new(Semaphore::new(max.get()))), next_client_id: AtomicUsize::default(), - }) + }); + pool.idle_reaper.spawn(&pool); + pool } /// Gets a client from the pool, or creates a new one if necessary. Connections are established @@ -245,7 +297,7 @@ impl ClientPool { /// This is moderately performance-sensitive. It is called for every unary request, but these /// establish a new gRPC stream per request so they're already expensive. GetPage requests use /// the `StreamPool` instead. - pub async fn get(self: &Arc) -> anyhow::Result { + pub async fn get(self: &Arc) -> tonic::Result { // Acquire a permit if the pool is bounded. let mut permit = None; if let Some(limiter) = self.limiter.clone() { @@ -263,7 +315,7 @@ impl ClientPool { }); } - // Slow path: construct a new client. + // Construct a new client. let mut channel_guard = self.channel_pool.get(); let client = page_api::Client::new( channel_guard.take(), @@ -271,8 +323,9 @@ impl ClientPool { self.timeline_id, self.shard_id, self.auth_token.clone(), - None, - )?; + self.compression, + ) + .map_err(|err| tonic::Status::internal(format!("failed to create client: {err}")))?; Ok(ClientGuard { pool: Arc::downgrade(self), @@ -287,6 +340,16 @@ impl ClientPool { } } +impl Reapable for ClientPool { + /// Reaps clients that have been idle since before the cutoff. + fn reap_idle(&self, cutoff: Instant) { + self.idle + .lock() + .unwrap() + .retain(|_, entry| entry.idle_since >= cutoff) + } +} + /// A client acquired from the pool. The inner client can be accessed via Deref. 
The client is /// returned to the pool when dropped. pub struct ClientGuard { @@ -317,9 +380,11 @@ impl Drop for ClientGuard { let Some(pool) = self.pool.upgrade() else { return; // pool was dropped }; + let entry = ClientEntry { client: self.client.take().expect("dropped once"), channel_guard: self.channel_guard.take().expect("dropped once"), + idle_since: Instant::now(), }; pool.idle.lock().unwrap().insert(self.id, entry); @@ -330,269 +395,268 @@ impl Drop for ClientGuard { /// A pool of bidirectional gRPC streams. Currently only used for GetPage streams. Each stream /// acquires a client from the inner `ClientPool` for the stream's lifetime. /// -/// Individual streams are not exposed to callers -- instead, the returned guard can be used to send -/// a single request and await the response. Internally, requests are multiplexed across streams and -/// channels. This allows proper queue depth enforcement and response routing. +/// Individual streams only send a single request at a time, and do not pipeline multiple callers +/// onto the same stream. Instead, we scale out the number of concurrent streams. This is primarily +/// to eliminate head-of-line blocking. See the module documentation for more details. /// -/// TODO: reap idle streams. /// TODO: consider making this generic over request and response types; not currently needed. pub struct StreamPool { /// The client pool to acquire clients from. Must be unbounded. client_pool: Arc, - /// All pooled streams. + /// Idle pooled streams. Acquired streams are removed from here and returned on drop. /// - /// Incoming requests will be sent over an existing stream with available capacity. If all - /// streams are full, a new one is spun up and added to the pool (up to `max_streams`). Each - /// stream has an associated Tokio task that processes requests and responses. - streams: Arc>>, - /// The max number of concurrent streams, or None if unbounded. 
- max_streams: Option>, - /// The max number of concurrent requests per stream. - max_queue_depth: NonZero, - /// Limits the max number of concurrent requests, given by `max_streams * max_queue_depth`. - /// None if the pool is unbounded. + /// The first stream in the map will be acquired next. The map is sorted by stream ID, which is + /// equivalent to the client ID and in turn sorted by its channel ID. This way we prefer + /// acquiring idle streams from lower-ordered channels, which allows us to free up and reap + /// higher-ordered channels. + idle: Mutex>, + /// Limits the max number of concurrent streams. None if the pool is unbounded. limiter: Option>, - /// Stream ID generator. - next_stream_id: AtomicUsize, + /// Reaps idle streams. + idle_reaper: Reaper, } -type StreamID = usize; -type RequestSender = Sender<(page_api::GetPageRequest, ResponseSender)>; -type RequestReceiver = Receiver<(page_api::GetPageRequest, ResponseSender)>; -type ResponseSender = oneshot::Sender>; +/// The stream ID. Reuses the inner client ID. +type StreamID = ClientID; +/// A pooled stream. struct StreamEntry { - /// Sends caller requests to the stream task. The stream task exits when this is dropped. - sender: RequestSender, - /// Number of in-flight requests on this stream. This is an atomic to allow decrementing it on - /// completion without acquiring the `StreamPool::streams` lock. - queue_depth: Arc, + /// The bidirectional stream. + stream: BiStream, + /// The time when this stream was last used, i.e. when it was put back into `StreamPool::idle`. + idle_since: Instant, +} + +/// A bidirectional GetPage stream and its client. Can send requests and receive responses. +struct BiStream { + /// The owning client. Holds onto the channel slot while the stream is alive. + client: ClientGuard, + /// Stream for sending requests. Uses a watch channel, so it can only send a single request at a + /// time, and the caller must await the response before sending another request. 
This is + /// enforced by `StreamGuard::send`. + sender: watch::Sender, + /// Stream for receiving responses. + receiver: Pin> + Send>>, } impl StreamPool { - /// Creates a new stream pool, using the given client pool. It will send up to `max_queue_depth` - /// concurrent requests on each stream, and use up to `max_streams` concurrent streams. + /// Creates a new stream pool, using the given client pool. It will use up to `max_streams` + /// concurrent streams. /// /// The client pool must be unbounded. The stream pool will enforce its own limits, and because /// streams are long-lived they can cause persistent starvation if they exhaust the client pool. /// The stream pool should generally have its own dedicated client pool (but it can share a /// channel pool with others since these are always unbounded). - pub fn new( - client_pool: Arc, - max_streams: Option>, - max_queue_depth: NonZero, - ) -> Arc { + pub fn new(client_pool: Arc, max_streams: Option>) -> Arc { assert!(client_pool.limiter.is_none(), "bounded client pool"); - Arc::new(Self { + let pool = Arc::new(Self { client_pool, - streams: Arc::default(), - limiter: max_streams.map(|max_streams| { - Arc::new(Semaphore::new(max_streams.get() * max_queue_depth.get())) - }), - max_streams, - max_queue_depth, - next_stream_id: AtomicUsize::default(), - }) + idle: Mutex::default(), + limiter: max_streams.map(|max_streams| Arc::new(Semaphore::new(max_streams.get()))), + idle_reaper: Reaper::new(REAP_IDLE_THRESHOLD, REAP_IDLE_INTERVAL), + }); + pool.idle_reaper.spawn(&pool); + pool } - /// Acquires an available stream from the pool, or spins up a new stream async if all streams - /// are full. Returns a guard that can be used to send a single request on the stream and await - /// the response, with queue depth quota already acquired. Blocks if the pool is at capacity - /// (i.e. `CLIENT_LIMIT * STREAM_QUEUE_DEPTH` requests in flight). 
+ /// Acquires an available stream from the pool, or spins up a new stream if all streams are + /// full. Returns a guard that can be used to send requests and await the responses. Blocks if + /// the pool is full. /// /// This is very performance-sensitive, as it is on the GetPage hot path. /// - /// TODO: this must do something more sophisticated for performance. We want: - /// - /// * Cheap, concurrent access in the common case where we can use a pooled stream. - /// * Quick acquisition of pooled streams with available capacity. - /// * Prefer streams that belong to lower-numbered channels, to reap idle channels. - /// * Prefer filling up existing streams' queue depth before spinning up new streams. - /// * Don't hold a lock while spinning up new streams. - /// * Allow concurrent clients to join onto streams while they're spun up. - /// * Allow spinning up multiple streams concurrently, but don't overshoot limits. - /// - /// For now, we just do something simple and functional, but very inefficient (linear scan). - pub async fn get(&self) -> StreamGuard { + /// TODO: is a `Mutex` performant enough? Will it become too contended? We can't + /// trivially use e.g. DashMap or sharding, because we want to pop lower-ordered streams first + /// to free up higher-ordered channels. + pub async fn get(self: &Arc) -> tonic::Result { // Acquire a permit if the pool is bounded. let mut permit = None; if let Some(limiter) = self.limiter.clone() { permit = Some(limiter.acquire_owned().await.expect("never closed")); } - let mut streams = self.streams.lock().unwrap(); - // Look for a pooled stream with available capacity. - for entry in streams.values() { - assert!( - entry.queue_depth.load(Ordering::Relaxed) <= self.max_queue_depth.get(), - "stream queue overflow" - ); - if entry - .queue_depth - .fetch_update(Ordering::SeqCst, Ordering::SeqCst, |queue_depth| { - // Increment the queue depth via compare-and-swap. - // TODO: review ordering. 
- (queue_depth < self.max_queue_depth.get()).then_some(queue_depth + 1) - }) - .is_ok() - { - return StreamGuard { - sender: entry.sender.clone(), - queue_depth: entry.queue_depth.clone(), - permit, - }; - } + // Fast path: acquire an idle stream from the pool. + if let Some((_, entry)) = self.idle.lock().unwrap().pop_first() { + return Ok(StreamGuard { + pool: Arc::downgrade(self), + stream: Some(entry.stream), + can_reuse: true, + permit, + }); } - // No available stream, spin up a new one. We install the stream entry in the pool first and - // return the guard, while spinning up the stream task async. This allows other callers to - // join onto this stream and also create additional streams concurrently if this fills up. - let id = self.next_stream_id.fetch_add(1, Ordering::Relaxed); - let queue_depth = Arc::new(AtomicUsize::new(1)); // reserve quota for this caller - let (req_tx, req_rx) = mpsc::channel(self.max_queue_depth.get()); - let entry = StreamEntry { - sender: req_tx.clone(), - queue_depth: queue_depth.clone(), - }; - streams.insert(id, entry); + // Spin up a new stream. Uses a watch channel to send a single request at a time, since + // `StreamGuard::send` enforces this anyway and it avoids unnecessary channel overhead. + let mut client = self.client_pool.get().await?; - if let Some(max_streams) = self.max_streams { - assert!(streams.len() <= max_streams.get(), "stream overflow"); - }; + let (req_tx, req_rx) = watch::channel(page_api::GetPageRequest::default()); + let req_stream = WatchStream::from_changes(req_rx); + let resp_stream = client.get_pages(req_stream).await?; - let client_pool = self.client_pool.clone(); - let streams = self.streams.clone(); - - tokio::spawn(async move { - if let Err(err) = Self::run_stream(client_pool, req_rx).await { - error!("stream failed: {err}"); - } - // Remove stream from pool on exit. 
- let entry = streams.lock().unwrap().remove(&id); - assert!(entry.is_some(), "unknown stream ID: {id}"); - }); - - StreamGuard { - sender: req_tx, - queue_depth, + Ok(StreamGuard { + pool: Arc::downgrade(self), + stream: Some(BiStream { + client, + sender: req_tx, + receiver: Box::pin(resp_stream), + }), + can_reuse: true, permit, - } - } - - /// Runs a stream task. This acquires a client from the `ClientPool` and establishes a - /// bidirectional GetPage stream, then forwards requests and responses between callers and the - /// stream. It does not track or enforce queue depths -- that's done by `get()` since it must be - /// atomic with pool stream acquisition. - /// - /// The task exits when the request channel is closed, or on a stream error. The caller is - /// responsible for removing the stream from the pool on exit. - async fn run_stream( - client_pool: Arc, - mut caller_rx: RequestReceiver, - ) -> anyhow::Result<()> { - // Acquire a client from the pool and create a stream. - let mut client = client_pool.get().await?; - - // NB: use an unbounded channel such that the stream send never blocks. Otherwise, we could - // theoretically deadlock if both the client and server block on sends (since we're not - // reading responses while sending). This is unlikely to happen due to gRPC/TCP buffers and - // low queue depths, but it was seen to happen with the libpq protocol so better safe than - // sorry. It should never buffer more than the queue depth anyway, but using an unbounded - // channel guarantees that it will never block. - let (req_tx, req_rx) = mpsc::unbounded_channel(); - let req_stream = tokio_stream::wrappers::UnboundedReceiverStream::new(req_rx); - let mut resp_stream = client.get_pages(req_stream).await?; - - // Track caller response channels by request ID. If the task returns early, these response - // channels will be dropped and the waiting callers will receive an error. - let mut callers = HashMap::new(); - - // Process requests and responses. 
- loop { - tokio::select! { - // Receive requests from callers and send them to the stream. - req = caller_rx.recv() => { - // Shut down if request channel is closed. - let Some((req, resp_tx)) = req else { - return Ok(()); - }; - - // Store the response channel by request ID. - if callers.contains_key(&req.request_id) { - // Error on request ID duplicates. Ignore callers that went away. - _ = resp_tx.send(Err(tonic::Status::invalid_argument( - format!("duplicate request ID: {}", req.request_id), - ))); - continue; - } - callers.insert(req.request_id, resp_tx); - - // Send the request on the stream. Bail out if the stream is closed. - req_tx.send(req).map_err(|_| { - tonic::Status::unavailable("stream closed") - })?; - } - - // Receive responses from the stream and send them to callers. - resp = resp_stream.next() => { - // Shut down if the stream is closed, and bail out on stream errors. - let Some(resp) = resp.transpose()? else { - return Ok(()) - }; - - // Send the response to the caller. Ignore errors if the caller went away. - let Some(resp_tx) = callers.remove(&resp.request_id) else { - warn!("received response for unknown request ID: {}", resp.request_id); - continue; - }; - _ = resp_tx.send(Ok(resp)); - } - } - } + }) } } -/// A pooled stream reference. Can be used to send a single request, to properly enforce queue -/// depth. Queue depth is already reserved and will be returned on drop. +impl Reapable for StreamPool { + /// Reaps streams that have been idle since before the cutoff. + fn reap_idle(&self, cutoff: Instant) { + self.idle + .lock() + .unwrap() + .retain(|_, entry| entry.idle_since >= cutoff); + } +} + +/// A stream acquired from the pool. Returned to the pool when dropped, unless there are still +/// in-flight requests on the stream, or the stream failed. 
pub struct StreamGuard { - sender: RequestSender, - queue_depth: Arc, + pool: Weak, + stream: Option, // Some until dropped + can_reuse: bool, // returned to pool if true permit: Option, // None if pool is unbounded } impl StreamGuard { - /// Sends a request on the stream and awaits the response. Consumes the guard, since it's only - /// valid for a single request (to enforce queue depth). This also drops the guard on return and - /// returns the queue depth quota to the pool. + /// Sends a request on the stream and awaits the response. If the future is dropped before it + /// resolves (e.g. due to a timeout or cancellation), the stream will be closed to cancel the + /// request and is not returned to the pool. The same is true if the stream errors, in which + /// case the caller can't send further requests on the stream. /// - /// The `GetPageRequest::request_id` must be unique across in-flight requests. + /// We only support sending a single request at a time, to eliminate head-of-line blocking. See + /// module documentation for details. /// /// NB: errors are often returned as `GetPageResponse::status_code` instead of `tonic::Status` /// to avoid tearing down the stream for per-request errors. Callers must check this. pub async fn send( - self, + &mut self, req: page_api::GetPageRequest, ) -> tonic::Result { - let (resp_tx, resp_rx) = oneshot::channel(); + let req_id = req.request_id; + let stream = self.stream.as_mut().expect("not dropped"); - self.sender - .send((req, resp_tx)) - .await + // Mark the stream as not reusable while the request is in flight. We can't return the + // stream to the pool until we receive the response, to avoid head-of-line blocking and + // stale responses. Failed streams can't be reused either. + if !self.can_reuse { + return Err(tonic::Status::internal("stream can't be reused")); + } + self.can_reuse = false; + + // Send the request and receive the response. 
+ // + // NB: this uses a watch channel, so it's unsafe to change this code to pipeline requests. + stream + .sender + .send(req) .map_err(|_| tonic::Status::unavailable("stream closed"))?; - resp_rx + let resp = stream + .receiver + .next() .await - .map_err(|_| tonic::Status::unavailable("stream closed"))? + .ok_or_else(|| tonic::Status::unavailable("stream closed"))??; + + if resp.request_id != req_id { + return Err(tonic::Status::internal(format!( + "response ID {} does not match request ID {}", + resp.request_id, req_id + ))); + } + + // Success, mark the stream as reusable. + self.can_reuse = true; + + Ok(resp) } } impl Drop for StreamGuard { fn drop(&mut self) { - // Release the queue depth reservation on drop. This can prematurely decrement it if dropped - // before the response is received, but that's okay. - let prev_queue_depth = self.queue_depth.fetch_sub(1, Ordering::SeqCst); - assert!(prev_queue_depth > 0, "stream queue underflow"); + let Some(pool) = self.pool.upgrade() else { + return; // pool was dropped + }; + + // If the stream isn't reusable, it can't be returned to the pool. + if !self.can_reuse { + return; + } + + // Place the idle stream back into the pool. + let entry = StreamEntry { + stream: self.stream.take().expect("dropped once"), + idle_since: Instant::now(), + }; + pool.idle + .lock() + .unwrap() + .insert(entry.stream.client.id, entry); _ = self.permit; // returned on drop, referenced for visibility } } + +/// Periodically reaps idle resources from a pool. +struct Reaper { + /// The task check interval. + interval: Duration, + /// The threshold for reaping idle resources. + threshold: Duration, + /// Cancels the reaper task. Cancelled when the reaper is dropped. + cancel: CancellationToken, +} + +impl Reaper { + /// Creates a new reaper. 
+ pub fn new(threshold: Duration, interval: Duration) -> Self { + Self { + cancel: CancellationToken::new(), + threshold, + interval, + } + } + + /// Spawns a task to periodically reap idle resources from the given task pool. The task is + /// cancelled when the reaper is dropped. + pub fn spawn(&self, pool: &Arc) { + // NB: hold a weak pool reference, otherwise the task will prevent dropping the pool. + let pool = Arc::downgrade(pool); + let cancel = self.cancel.clone(); + let (interval, threshold) = (self.interval, self.threshold); + + tokio::spawn(async move { + loop { + tokio::select! { + _ = tokio::time::sleep(interval) => { + let Some(pool) = pool.upgrade() else { + return; // pool was dropped + }; + pool.reap_idle(Instant::now() - threshold); + } + + _ = cancel.cancelled() => return, + } + } + }); + } +} + +impl Drop for Reaper { + fn drop(&mut self) { + self.cancel.cancel(); // cancel reaper task + } +} + +/// A reapable resource pool. +trait Reapable: Send + Sync + 'static { + /// Reaps resources that have been idle since before the given cutoff. + fn reap_idle(&self, cutoff: Instant); +} diff --git a/pageserver/client_grpc/src/retry.rs b/pageserver/client_grpc/src/retry.rs index b0473204d7..8a138711e8 100644 --- a/pageserver/client_grpc/src/retry.rs +++ b/pageserver/client_grpc/src/retry.rs @@ -1,5 +1,6 @@ use std::time::Duration; +use futures::future::pending; use tokio::time::Instant; use tracing::{error, info, warn}; @@ -8,60 +9,54 @@ use utils::backoff::exponential_backoff_duration; /// A retry handler for Pageserver gRPC requests. /// /// This is used instead of backoff::retry for better control and observability. -pub struct Retry; +pub struct Retry { + /// Timeout across all retry attempts. If None, retries forever. + pub timeout: Option, + /// The initial backoff duration. The first retry does not use a backoff. + pub base_backoff: Duration, + /// The maximum backoff duration. 
+ pub max_backoff: Duration, +} impl Retry { - /// The per-request timeout. - // TODO: tune these, and/or make them configurable. Should we retry forever? - const REQUEST_TIMEOUT: Duration = Duration::from_secs(10); - /// The total timeout across all attempts - const TOTAL_TIMEOUT: Duration = Duration::from_secs(60); - /// The initial backoff duration. - const BASE_BACKOFF: Duration = Duration::from_millis(10); - /// The maximum backoff duration. - const MAX_BACKOFF: Duration = Duration::from_secs(10); - /// If true, log successful requests. For debugging. - const LOG_SUCCESS: bool = false; - /// Runs the given async closure with timeouts and retries (exponential backoff). Logs errors, /// using the current tracing span for context. /// - /// Only certain gRPC status codes are retried, see [`Self::should_retry`]. For default - /// timeouts, see [`Self::REQUEST_TIMEOUT`] and [`Self::TOTAL_TIMEOUT`]. + /// Only certain gRPC status codes are retried, see [`Self::should_retry`]. pub async fn with(&self, mut f: F) -> tonic::Result where - F: FnMut() -> O, + F: FnMut(usize) -> O, // pass attempt number, starting at 0 O: Future>, { let started = Instant::now(); - let deadline = started + Self::TOTAL_TIMEOUT; + let deadline = self.timeout.map(|timeout| started + timeout); let mut last_error = None; let mut retries = 0; loop { - // Set up a future to wait for the backoff (if any) and run the request with a timeout. + // Set up a future to wait for the backoff, if any, and run the closure. let backoff_and_try = async { // NB: sleep() always sleeps 1ms, even when given a 0 argument. 
See: // https://github.com/tokio-rs/tokio/issues/6866 - if let Some(backoff) = Self::backoff_duration(retries) { + if let Some(backoff) = self.backoff_duration(retries) { tokio::time::sleep(backoff).await; } - let request_started = Instant::now(); - tokio::time::timeout(Self::REQUEST_TIMEOUT, f()) - .await - .map_err(|_| { - tonic::Status::deadline_exceeded(format!( - "request timed out after {:.3}s", - request_started.elapsed().as_secs_f64() - )) - })? + f(retries).await }; - // Wait for the backoff and request, or bail out if the total timeout is exceeded. + // Set up a future for the timeout, if any. + let timeout = async { + match deadline { + Some(deadline) => tokio::time::sleep_until(deadline).await, + None => pending().await, + } + }; + + // Wait for the backoff and request, or bail out if the timeout is exceeded. let result = tokio::select! { result = backoff_and_try => result, - _ = tokio::time::sleep_until(deadline) => { + _ = timeout => { let last_error = last_error.unwrap_or_else(|| { tonic::Status::deadline_exceeded(format!( "request timed out after {:.3}s", @@ -79,7 +74,7 @@ impl Retry { match result { // Success, return the result. Ok(result) => { - if retries > 0 || Self::LOG_SUCCESS { + if retries > 0 { info!( "request succeeded after {retries} retries in {:.3}s", started.elapsed().as_secs_f64(), @@ -112,12 +107,13 @@ impl Retry { } } - /// Returns the backoff duration for the given retry attempt, or None for no backoff. - fn backoff_duration(retry: usize) -> Option { + /// Returns the backoff duration for the given retry attempt, or None for no backoff. The first + /// attempt and first retry never backs off, so this returns None for 0 and 1 retries. 
+ fn backoff_duration(&self, retries: usize) -> Option { let backoff = exponential_backoff_duration( - retry as u32, - Self::BASE_BACKOFF.as_secs_f64(), - Self::MAX_BACKOFF.as_secs_f64(), + (retries as u32).saturating_sub(1), // first retry does not back off + self.base_backoff.as_secs_f64(), + self.max_backoff.as_secs_f64(), ); (!backoff.is_zero()).then_some(backoff) } @@ -131,7 +127,6 @@ impl Retry { tonic::Code::Aborted => true, tonic::Code::Cancelled => true, tonic::Code::DeadlineExceeded => true, // maybe transient slowness - tonic::Code::Internal => true, // maybe transient failure? tonic::Code::ResourceExhausted => true, tonic::Code::Unavailable => true, @@ -139,6 +134,10 @@ impl Retry { tonic::Code::AlreadyExists => false, tonic::Code::DataLoss => false, tonic::Code::FailedPrecondition => false, + // NB: don't retry Internal. It is intended for serious errors such as invariant + // violations, and is also used for client-side invariant checks that would otherwise + // result in retry loops. + tonic::Code::Internal => false, tonic::Code::InvalidArgument => false, tonic::Code::NotFound => false, tonic::Code::OutOfRange => false, diff --git a/pageserver/client_grpc/src/split.rs b/pageserver/client_grpc/src/split.rs index 5bbcaab393..b7539b900c 100644 --- a/pageserver/client_grpc/src/split.rs +++ b/pageserver/client_grpc/src/split.rs @@ -5,27 +5,24 @@ use bytes::Bytes; use pageserver_api::key::rel_block_to_key; use pageserver_api::shard::{ShardStripeSize, key_to_shard_number}; use pageserver_page_api as page_api; -use utils::shard::{ShardCount, ShardIndex}; +use utils::shard::{ShardCount, ShardIndex, ShardNumber}; /// Splits GetPageRequests that straddle shard boundaries and assembles the responses. /// TODO: add tests for this. pub struct GetPageSplitter { - /// The original request ID. Used for all shard requests. - request_id: page_api::RequestID, /// Split requests by shard index. 
requests: HashMap, - /// Maps the offset in `GetPageRequest::block_numbers` to the owning shard. Used to assemble - /// the response pages in the same order as the original request. + /// The response being assembled. Preallocated with empty pages, to be filled in. + response: page_api::GetPageResponse, + /// Maps the offset in `request.block_numbers` and `response.pages` to the owning shard. Used + /// to assemble the response pages in the same order as the original request. block_shards: Vec, - /// Page responses by shard index. Will be assembled into a single response. - responses: HashMap>, } impl GetPageSplitter { /// Checks if the given request only touches a single shard, and returns the shard ID. This is /// the common case, so we check first in order to avoid unnecessary allocations and overhead. - /// The caller must ensure that the request has at least one block number, or this will panic. - pub fn is_single_shard( + pub fn for_single_shard( req: &page_api::GetPageRequest, count: ShardCount, stripe_size: ShardStripeSize, @@ -35,8 +32,12 @@ impl GetPageSplitter { return Some(ShardIndex::unsharded()); } - // Find the base shard index for the first page, and compare with the rest. - let key = rel_block_to_key(req.rel, *req.block_numbers.first().expect("no pages")); + // Find the first page's shard, for comparison. If there are no pages, just return the first + // shard (caller likely checked already, otherwise the server will reject it). + let Some(&first_page) = req.block_numbers.first() else { + return Some(ShardIndex::new(ShardNumber(0), count)); + }; + let key = rel_block_to_key(req.rel, first_page); let shard_number = key_to_shard_number(count, stripe_size, &key); req.block_numbers @@ -57,19 +58,19 @@ impl GetPageSplitter { ) -> Self { // The caller should make sure we don't split requests unnecessarily. 
debug_assert!( - Self::is_single_shard(&req, count, stripe_size).is_none(), + Self::for_single_shard(&req, count, stripe_size).is_none(), "unnecessary request split" ); // Split the requests by shard index. let mut requests = HashMap::with_capacity(2); // common case let mut block_shards = Vec::with_capacity(req.block_numbers.len()); - for blkno in req.block_numbers { + for &blkno in &req.block_numbers { let key = rel_block_to_key(req.rel, blkno); let shard_number = key_to_shard_number(count, stripe_size, &key); let shard_id = ShardIndex::new(shard_number, count); - let shard_req = requests + requests .entry(shard_id) .or_insert_with(|| page_api::GetPageRequest { request_id: req.request_id, @@ -77,27 +78,47 @@ impl GetPageSplitter { rel: req.rel, read_lsn: req.read_lsn, block_numbers: Vec::new(), - }); - shard_req.block_numbers.push(blkno); + }) + .block_numbers + .push(blkno); block_shards.push(shard_id); } - Self { + // Construct a response to be populated by shard responses. Preallocate empty page slots + // with the expected block numbers. + let response = page_api::GetPageResponse { request_id: req.request_id, - responses: HashMap::with_capacity(requests.len()), + status_code: page_api::GetPageStatusCode::Ok, + reason: None, + rel: req.rel, + pages: req + .block_numbers + .into_iter() + .map(|block_number| { + page_api::Page { + block_number, + image: Bytes::new(), // empty page slot to be filled in + } + }) + .collect(), + }; + + Self { requests, + response, block_shards, } } - /// Drains the per-shard requests, moving them out of the hashmap to avoid extra allocations. + /// Drains the per-shard requests, moving them out of the splitter to avoid extra allocations. pub fn drain_requests( &mut self, ) -> impl Iterator { self.requests.drain() } - /// Adds a response from the given shard. + /// Adds a response from the given shard. The response must match the request ID and have an OK + /// status code. A response must not already exist for the given shard ID. 
#[allow(clippy::result_large_err)] pub fn add_response( &mut self, @@ -105,68 +126,84 @@ impl GetPageSplitter { response: page_api::GetPageResponse, ) -> tonic::Result<()> { // The caller should already have converted status codes into tonic::Status. - assert_eq!(response.status_code, page_api::GetPageStatusCode::Ok); - - // Make sure the response matches the request ID. - if response.request_id != self.request_id { + if response.status_code != page_api::GetPageStatusCode::Ok { return Err(tonic::Status::internal(format!( - "response ID {} does not match request ID {}", - response.request_id, self.request_id + "unexpected non-OK response for shard {shard_id}: {} {}", + response.status_code, + response.reason.unwrap_or_default() ))); } - // Add the response data to the map. - let old = self.responses.insert(shard_id, response.page_images); - - if old.is_some() { + if response.request_id != self.response.request_id { return Err(tonic::Status::internal(format!( - "duplicate response for shard {shard_id}", + "response ID mismatch for shard {shard_id}: expected {}, got {}", + self.response.request_id, response.request_id + ))); + } + + // Place the shard response pages into the assembled response, in request order. 
+ let mut pages = response.pages.into_iter(); + + for (i, &s) in self.block_shards.iter().enumerate() { + if shard_id != s { + continue; + } + + let Some(slot) = self.response.pages.get_mut(i) else { + return Err(tonic::Status::internal(format!( + "no block_shards slot {i} for shard {shard_id}" + ))); + }; + let Some(page) = pages.next() else { + return Err(tonic::Status::internal(format!( + "missing page {} in shard {shard_id} response", + slot.block_number + ))); + }; + if page.block_number != slot.block_number { + return Err(tonic::Status::internal(format!( + "shard {shard_id} returned wrong page at index {i}, expected {} got {}", + slot.block_number, page.block_number + ))); + } + if !slot.image.is_empty() { + return Err(tonic::Status::internal(format!( + "shard {shard_id} returned duplicate page {} at index {i}", + slot.block_number + ))); + } + + *slot = page; + } + + // Make sure we've consumed all pages from the shard response. + if let Some(extra_page) = pages.next() { + return Err(tonic::Status::internal(format!( + "shard {shard_id} returned extra page: {}", + extra_page.block_number ))); } Ok(()) } - /// Assembles the shard responses into a single response. Responses must be present for all - /// relevant shards, and the total number of pages must match the original request. + /// Fetches the final, assembled response. #[allow(clippy::result_large_err)] - pub fn assemble_response(self) -> tonic::Result { - let mut response = page_api::GetPageResponse { - request_id: self.request_id, - status_code: page_api::GetPageStatusCode::Ok, - reason: None, - page_images: Vec::with_capacity(self.block_shards.len()), - }; - - // Set up per-shard page iterators we can pull from. - let mut shard_responses = HashMap::with_capacity(self.responses.len()); - for (shard_id, responses) in self.responses { - shard_responses.insert(shard_id, responses.into_iter()); - } - - // Reassemble the responses in the same order as the original request. 
- for shard_id in &self.block_shards { - let page = shard_responses - .get_mut(shard_id) - .ok_or_else(|| { - tonic::Status::internal(format!("missing response for shard {shard_id}")) - })? - .next() - .ok_or_else(|| { - tonic::Status::internal(format!("missing page from shard {shard_id}")) - })?; - response.page_images.push(page); - } - - // Make sure there are no additional pages. - for (shard_id, mut pages) in shard_responses { - if pages.next().is_some() { + pub fn get_response(self) -> tonic::Result { + // Check that the response is complete. + for (i, page) in self.response.pages.iter().enumerate() { + if page.image.is_empty() { return Err(tonic::Status::internal(format!( - "extra pages returned from shard {shard_id}" + "missing page {} for shard {}", + page.block_number, + self.block_shards + .get(i) + .map(|s| s.to_string()) + .unwrap_or_else(|| "?".to_string()) ))); } } - Ok(response) + Ok(self.response) } } diff --git a/pageserver/ctl/Cargo.toml b/pageserver/ctl/Cargo.toml index 7b70f0dc87..ba34fa1f69 100644 --- a/pageserver/ctl/Cargo.toml +++ b/pageserver/ctl/Cargo.toml @@ -17,6 +17,7 @@ pageserver = { path = ".." } pageserver_api.workspace = true remote_storage = { path = "../../libs/remote_storage" } postgres_ffi.workspace = true +serde.workspace = true thiserror.workspace = true tokio.workspace = true tokio-util.workspace = true diff --git a/pageserver/ctl/src/download_remote_object.rs b/pageserver/ctl/src/download_remote_object.rs new file mode 100644 index 0000000000..aa09774701 --- /dev/null +++ b/pageserver/ctl/src/download_remote_object.rs @@ -0,0 +1,85 @@ +use camino::Utf8PathBuf; +use clap::Parser; +use tokio_util::sync::CancellationToken; + +/// Download a specific object from remote storage to a local file. +/// +/// The remote storage configuration is supplied via the `REMOTE_STORAGE_CONFIG` environment +/// variable, in the same TOML format that the pageserver itself understands. 
This allows the +/// command to work with any cloud supported by the `remote_storage` crate (currently AWS S3, +/// Azure Blob Storage and local files), as long as the credentials are available via the +/// standard environment variables expected by the underlying SDKs. +/// +/// Examples for setting the environment variable: +/// +/// ```bash +/// # AWS S3 (region can also be provided via AWS_REGION) +/// export REMOTE_STORAGE_CONFIG='remote_storage = { bucket_name = "my-bucket", bucket_region = "us-east-2" }' +/// +/// # Azure Blob Storage (account key picked up from AZURE_STORAGE_ACCOUNT_KEY) +/// export REMOTE_STORAGE_CONFIG='remote_storage = { container = "my-container", account = "my-account" }' +/// ``` +#[derive(Parser)] +pub(crate) struct DownloadRemoteObjectCmd { + /// Key / path of the object to download (relative to the remote storage prefix). + /// + /// Examples: + /// "wal/3aa8f.../00000001000000000000000A" + /// "pageserver/v1/tenants//timelines//layer_12345" + pub remote_path: String, + + /// Path of the local file to create. Existing file will be overwritten. + /// + /// Examples: + /// "./segment" + /// "/tmp/layer_12345.parquet" + pub output_file: Utf8PathBuf, +} + +pub(crate) async fn main(cmd: &DownloadRemoteObjectCmd) -> anyhow::Result<()> { + use remote_storage::{DownloadOpts, GenericRemoteStorage, RemotePath, RemoteStorageConfig}; + + // Fetch remote storage configuration from the environment + let config_str = std::env::var("REMOTE_STORAGE_CONFIG").map_err(|_| { + anyhow::anyhow!( + "'REMOTE_STORAGE_CONFIG' environment variable must be set to a valid remote storage TOML config" + ) + })?; + + let config = RemoteStorageConfig::from_toml_str(&config_str)?; + + // Initialise remote storage client + let storage = GenericRemoteStorage::from_config(&config).await?; + + // RemotePath must be relative – leading slashes confuse the parser. 
+ let remote_path_str = cmd.remote_path.trim_start_matches('/'); + let remote_path = RemotePath::from_string(remote_path_str)?; + + let cancel = CancellationToken::new(); + + println!( + "Downloading '{remote_path}' from remote storage bucket {:?} ...", + config.storage.bucket_name() + ); + + // Start the actual download + let download = storage + .download(&remote_path, &DownloadOpts::default(), &cancel) + .await?; + + // Stream to file + let mut reader = tokio_util::io::StreamReader::new(download.download_stream); + let tmp_path = cmd.output_file.with_extension("tmp"); + let mut file = tokio::fs::File::create(&tmp_path).await?; + tokio::io::copy(&mut reader, &mut file).await?; + file.sync_all().await?; + // Atomically move into place + tokio::fs::rename(&tmp_path, &cmd.output_file).await?; + + println!( + "Downloaded to '{}'. Last modified: {:?}, etag: {}", + cmd.output_file, download.last_modified, download.etag + ); + + Ok(()) +} diff --git a/pageserver/ctl/src/index_part.rs b/pageserver/ctl/src/index_part.rs index 838d00e490..9801f3c9dc 100644 --- a/pageserver/ctl/src/index_part.rs +++ b/pageserver/ctl/src/index_part.rs @@ -1,14 +1,16 @@ use std::str::FromStr; -use anyhow::Context; +use anyhow::{Context, Ok}; use camino::Utf8PathBuf; use pageserver::tenant::{ IndexPart, layer_map::{LayerMap, SearchResult}, - remote_timeline_client::remote_layer_path, - storage_layer::{PersistentLayerDesc, ReadableLayerWeak}, + remote_timeline_client::{index::LayerFileMetadata, remote_layer_path}, + storage_layer::{LayerName, LayerVisibilityHint, PersistentLayerDesc, ReadableLayerWeak}, }; use pageserver_api::key::Key; +use serde::Serialize; +use std::collections::BTreeMap; use utils::{ id::{TenantId, TimelineId}, lsn::Lsn, @@ -33,6 +35,31 @@ pub(crate) enum IndexPartCmd { #[arg(long)] lsn: String, }, + /// List all visible delta and image layers at the latest LSN. 
+ ListVisibleLayers { + #[arg(long)] + path: Utf8PathBuf, + }, +} + +fn create_layer_map_from_index_part( + index_part: &IndexPart, + tenant_shard_id: TenantShardId, + timeline_id: TimelineId, +) -> LayerMap { + let mut layer_map = LayerMap::default(); + { + let mut updates = layer_map.batch_update(); + for (key, value) in index_part.layer_metadata.iter() { + updates.insert_historic(PersistentLayerDesc::from_filename( + tenant_shard_id, + timeline_id, + key.clone(), + value.file_size, + )); + } + } + layer_map } async fn search_layers( @@ -49,18 +76,7 @@ async fn search_layers( let bytes = tokio::fs::read(path).await?; IndexPart::from_json_bytes(&bytes).unwrap() }; - let mut layer_map = LayerMap::default(); - { - let mut updates = layer_map.batch_update(); - for (key, value) in index_json.layer_metadata.iter() { - updates.insert_historic(PersistentLayerDesc::from_filename( - tenant_shard_id, - timeline_id, - key.clone(), - value.file_size, - )); - } - } + let layer_map = create_layer_map_from_index_part(&index_json, tenant_shard_id, timeline_id); let key = Key::from_hex(key)?; let lsn = Lsn::from_str(lsn).unwrap(); @@ -98,6 +114,69 @@ async fn search_layers( Ok(()) } +#[derive(Debug, Clone, Serialize)] +struct VisibleLayers { + pub total_images: u64, + pub total_image_bytes: u64, + pub total_deltas: u64, + pub total_delta_bytes: u64, + pub layer_metadata: BTreeMap, +} + +impl VisibleLayers { + pub fn new() -> Self { + Self { + layer_metadata: BTreeMap::new(), + total_images: 0, + total_image_bytes: 0, + total_deltas: 0, + total_delta_bytes: 0, + } + } + + pub fn add_layer(&mut self, name: LayerName, layer: LayerFileMetadata) { + match name { + LayerName::Image(_) => { + self.total_images += 1; + self.total_image_bytes += layer.file_size; + } + LayerName::Delta(_) => { + self.total_deltas += 1; + self.total_delta_bytes += layer.file_size; + } + } + self.layer_metadata.insert(name, layer); + } +} + +async fn list_visible_layers(path: &Utf8PathBuf) -> 
anyhow::Result<()> { + let tenant_id = TenantId::generate(); + let tenant_shard_id = TenantShardId::unsharded(tenant_id); + let timeline_id = TimelineId::generate(); + + let bytes = tokio::fs::read(path).await.context("read file")?; + let index_part = IndexPart::from_json_bytes(&bytes).context("deserialize")?; + let layer_map = create_layer_map_from_index_part(&index_part, tenant_shard_id, timeline_id); + let mut visible_layers = VisibleLayers::new(); + let (layers, _key_space) = layer_map.get_visibility(Vec::new()); + for (layer, visibility) in layers { + if visibility == LayerVisibilityHint::Visible { + visible_layers.add_layer( + layer.layer_name(), + index_part + .layer_metadata + .get(&layer.layer_name()) + .unwrap() + .clone(), + ); + } + } + let output = serde_json::to_string_pretty(&visible_layers).context("serialize output")?; + println!("{output}"); + + Ok(()) +} + pub(crate) async fn main(cmd: &IndexPartCmd) -> anyhow::Result<()> { match cmd { IndexPartCmd::Dump { path } => { @@ -114,5 +193,6 @@ pub(crate) async fn main(cmd: &IndexPartCmd) -> anyhow::Result<()> { key, lsn, } => search_layers(tenant_id, timeline_id, path, key, lsn).await, + IndexPartCmd::ListVisibleLayers { path } => list_visible_layers(path).await, } } diff --git a/pageserver/ctl/src/main.rs b/pageserver/ctl/src/main.rs index 3cd4faaf2e..e84ad2c87f 100644 --- a/pageserver/ctl/src/main.rs +++ b/pageserver/ctl/src/main.rs @@ -4,6 +4,7 @@ //! //! Separate, `metadata` subcommand allows to print and update pageserver's metadata file. 
+mod download_remote_object; mod draw_timeline_dir; mod index_part; mod key; @@ -16,6 +17,7 @@ use std::time::{Duration, SystemTime}; use camino::{Utf8Path, Utf8PathBuf}; use clap::{Parser, Subcommand}; +use download_remote_object::DownloadRemoteObjectCmd; use index_part::IndexPartCmd; use layers::LayerCmd; use page_trace::PageTraceCmd; @@ -63,6 +65,7 @@ enum Commands { /// Debug print a hex key found from logs Key(key::DescribeKeyCommand), PageTrace(PageTraceCmd), + DownloadRemoteObject(DownloadRemoteObjectCmd), } /// Read and update pageserver metadata file @@ -185,6 +188,9 @@ async fn main() -> anyhow::Result<()> { } Commands::Key(dkc) => dkc.execute(), Commands::PageTrace(cmd) => page_trace::main(&cmd)?, + Commands::DownloadRemoteObject(cmd) => { + download_remote_object::main(&cmd).await?; + } }; Ok(()) } diff --git a/pageserver/page_api/proto/page_service.proto b/pageserver/page_api/proto/page_service.proto index 1d6c230916..d113a04a42 100644 --- a/pageserver/page_api/proto/page_service.proto +++ b/pageserver/page_api/proto/page_service.proto @@ -153,7 +153,7 @@ message GetDbSizeResponse { message GetPageRequest { // A request ID. Will be included in the response. Should be unique for // in-flight requests on the stream. - uint64 request_id = 1; + RequestID request_id = 1; // The request class. GetPageClass request_class = 2; // The LSN to read at. @@ -177,6 +177,14 @@ message GetPageRequest { repeated uint32 block_number = 5; } +// A Request ID. Should be unique for in-flight requests on a stream. Included in the response. +message RequestID { + // The base request ID. + uint64 id = 1; + // The request attempt. Starts at 0, incremented on each retry. + uint32 attempt = 2; +} + // A GetPageRequest class. Primarily intended for observability, but may also be // used for prioritization in the future. enum GetPageClass { @@ -199,13 +207,26 @@ enum GetPageClass { // the entire batch is ready, so no one can make use of the individual pages. 
message GetPageResponse { // The original request's ID. - uint64 request_id = 1; - // The response status code. + RequestID request_id = 1; + // The response status code. If not OK, the rel and page fields will be empty. GetPageStatusCode status_code = 2; // A string describing the status, if any. string reason = 3; - // The 8KB page images, in the same order as the request. Empty if status_code != OK. - repeated bytes page_image = 4; + // The relation that the pages belong to. + RelTag rel = 4; + // The page(s), in the same order as the request. + repeated Page page = 5; +} + +// A page. +// +// TODO: it would be slightly more efficient (but less convenient) to have separate arrays of block +// numbers and images, but given the 8KB page size it's probably negligible. Benchmark it anyway. +message Page { + // The page number. + uint32 block_number = 1; + // The materialized page image, as an 8KB byte vector. + bytes image = 2; } // A GetPageResponse status code. diff --git a/pageserver/page_api/src/client.rs b/pageserver/page_api/src/client.rs index 6523d00d3d..f70d0e7b28 100644 --- a/pageserver/page_api/src/client.rs +++ b/pageserver/page_api/src/client.rs @@ -1,4 +1,5 @@ use anyhow::Context as _; +use futures::future::ready; use futures::{Stream, StreamExt as _, TryStreamExt as _}; use tokio::io::AsyncRead; use tokio_util::io::StreamReader; @@ -110,7 +111,7 @@ impl Client { ) -> tonic::Result> + Send + 'static> { let reqs = reqs.map(proto::GetPageRequest::from); let resps = self.inner.get_pages(reqs).await?.into_inner(); - Ok(resps.map_ok(GetPageResponse::from)) + Ok(resps.and_then(|resp| ready(GetPageResponse::try_from(resp).map_err(|err| err.into())))) } /// Returns the size of a relation, as # of blocks. 
diff --git a/pageserver/page_api/src/model.rs b/pageserver/page_api/src/model.rs index d0d3517d41..a3286ecf15 100644 --- a/pageserver/page_api/src/model.rs +++ b/pageserver/page_api/src/model.rs @@ -49,7 +49,7 @@ impl From for tonic::Status { } /// The LSN a request should read at. -#[derive(Clone, Copy, Debug)] +#[derive(Clone, Copy, Debug, Default)] pub struct ReadLsn { /// The request's read LSN. pub request_lsn: Lsn, @@ -329,7 +329,7 @@ impl From for proto::GetDbSizeResponse { } /// Requests one or more pages. -#[derive(Clone, Debug)] +#[derive(Clone, Debug, Default)] pub struct GetPageRequest { /// A request ID. Will be included in the response. Should be unique for in-flight requests on /// the stream. @@ -356,7 +356,10 @@ impl TryFrom for GetPageRequest { return Err(ProtocolError::Missing("block_number")); } Ok(Self { - request_id: pb.request_id, + request_id: pb + .request_id + .ok_or(ProtocolError::Missing("request_id"))? + .into(), request_class: pb.request_class.into(), read_lsn: pb .read_lsn @@ -371,7 +374,7 @@ impl TryFrom for GetPageRequest { impl From for proto::GetPageRequest { fn from(request: GetPageRequest) -> Self { Self { - request_id: request.request_id, + request_id: Some(request.request_id.into()), request_class: request.request_class.into(), read_lsn: Some(request.read_lsn.into()), rel: Some(request.rel.into()), @@ -380,16 +383,60 @@ impl From for proto::GetPageRequest { } } -/// A GetPage request ID. -pub type RequestID = u64; +/// A GetPage request ID and retry attempt. Should be unique for in-flight requests on a stream. +#[derive(Clone, Copy, Debug, Default, PartialEq, Eq, Hash, PartialOrd, Ord)] +pub struct RequestID { + /// The base request ID. + pub id: u64, + /// The request attempt. Starts at 0, incremented on each retry. + pub attempt: u32, +} + +impl RequestID { + /// Creates a new RequestID with the given ID and an initial attempt of 0.
+ pub fn new(id: u64) -> Self { + Self { id, attempt: 0 } + } +} + +impl Display for RequestID { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + write!(f, "{}.{}", self.id, self.attempt) + } +} + +impl From for RequestID { + fn from(pb: proto::RequestId) -> Self { + Self { + id: pb.id, + attempt: pb.attempt, + } + } +} + +impl From for RequestID { + fn from(id: u64) -> Self { + Self::new(id) + } +} + +impl From for proto::RequestId { + fn from(request_id: RequestID) -> Self { + Self { + id: request_id.id, + attempt: request_id.attempt, + } + } +} /// A GetPage request class. -#[derive(Clone, Copy, Debug, strum_macros::Display)] +#[derive(Clone, Copy, Debug, Default, strum_macros::Display)] pub enum GetPageClass { /// Unknown class. For backwards compatibility: used when an older client version sends a class /// that a newer server version has removed. Unknown, /// A normal request. This is the default. + #[default] Normal, /// A prefetch request. NB: can only be classified on pg < 18. Prefetch, @@ -397,19 +444,6 @@ pub enum GetPageClass { Background, } -impl GetPageClass { - /// Returns true if this is considered a bulk request (i.e. more throughput-oriented rather than - /// latency-sensitive). - pub fn is_bulk(&self) -> bool { - match self { - Self::Unknown => false, - Self::Normal => false, - Self::Prefetch => true, - Self::Background => true, - } - } -} - impl From for GetPageClass { fn from(pb: proto::GetPageClass) -> Self { match pb { @@ -456,32 +490,41 @@ impl From for i32 { pub struct GetPageResponse { /// The original request's ID. pub request_id: RequestID, - /// The response status code. + /// The response status code. If not OK, the `rel` and `pages` fields will be empty. pub status_code: GetPageStatusCode, /// A string describing the status, if any. pub reason: Option, - /// The 8KB page images, in the same order as the request. Empty if status != OK. - pub page_images: Vec, + /// The relation that the pages belong to. 
+ pub rel: RelTag, + /// The page(s), in the same order as the request. + pub pages: Vec, } -impl From for GetPageResponse { - fn from(pb: proto::GetPageResponse) -> Self { - Self { - request_id: pb.request_id, +impl TryFrom for GetPageResponse { + type Error = ProtocolError; + + fn try_from(pb: proto::GetPageResponse) -> Result { + Ok(Self { + request_id: pb + .request_id + .ok_or(ProtocolError::Missing("request_id"))? + .into(), status_code: pb.status_code.into(), reason: Some(pb.reason).filter(|r| !r.is_empty()), - page_images: pb.page_image, - } + rel: pb.rel.ok_or(ProtocolError::Missing("rel"))?.try_into()?, + pages: pb.page.into_iter().map(Page::from).collect(), + }) } } impl From for proto::GetPageResponse { fn from(response: GetPageResponse) -> Self { Self { - request_id: response.request_id, + request_id: Some(response.request_id.into()), status_code: response.status_code.into(), reason: response.reason.unwrap_or_default(), - page_image: response.page_images, + rel: Some(response.rel.into()), + page: response.pages.into_iter().map(proto::Page::from).collect(), } } } @@ -514,11 +557,39 @@ impl GetPageResponse { request_id, status_code, reason: Some(status.message().to_string()), - page_images: Vec::new(), + rel: RelTag::default(), + pages: Vec::new(), }) } } +/// A page. +#[derive(Clone, Debug)] +pub struct Page { + /// The page number. + pub block_number: u32, + /// The materialized page image, as an 8KB byte vector. + pub image: Bytes, +} + +impl From for Page { + fn from(pb: proto::Page) -> Self { + Self { + block_number: pb.block_number, + image: pb.image, + } + } +} + +impl From for proto::Page { + fn from(page: Page) -> Self { + Self { + block_number: page.block_number, + image: page.image, + } + } +} + /// A GetPage response status code. /// /// These are effectively equivalent to gRPC statuses.
However, we use a bidirectional stream diff --git a/pageserver/pagebench/Cargo.toml b/pageserver/pagebench/Cargo.toml index f5dfc0db25..609fef2b4f 100644 --- a/pageserver/pagebench/Cargo.toml +++ b/pageserver/pagebench/Cargo.toml @@ -16,6 +16,7 @@ futures.workspace = true hdrhistogram.workspace = true humantime.workspace = true humantime-serde.workspace = true +pprof.workspace = true rand.workspace = true reqwest.workspace = true serde.workspace = true @@ -27,8 +28,9 @@ tokio-util.workspace = true tonic.workspace = true url.workspace = true -pageserver_client.workspace = true pageserver_api.workspace = true +pageserver_client.workspace = true +pageserver_client_grpc.workspace = true pageserver_page_api.workspace = true utils = { path = "../../libs/utils/" } workspace_hack = { version = "0.1", path = "../../workspace_hack" } diff --git a/pageserver/pagebench/src/cmd/getpage_latest_lsn.rs b/pageserver/pagebench/src/cmd/getpage_latest_lsn.rs index f14caf548c..30b30d36f6 100644 --- a/pageserver/pagebench/src/cmd/getpage_latest_lsn.rs +++ b/pageserver/pagebench/src/cmd/getpage_latest_lsn.rs @@ -10,12 +10,14 @@ use anyhow::Context; use async_trait::async_trait; use bytes::Bytes; use camino::Utf8PathBuf; +use futures::stream::FuturesUnordered; use futures::{Stream, StreamExt as _}; use pageserver_api::key::Key; use pageserver_api::keyspace::KeySpaceAccum; use pageserver_api::pagestream_api::{PagestreamGetPageRequest, PagestreamRequest}; use pageserver_api::reltag::RelTag; use pageserver_api::shard::TenantShardId; +use pageserver_client_grpc::{self as client_grpc, ShardSpec}; use pageserver_page_api as page_api; use rand::prelude::*; use tokio::task::JoinSet; @@ -37,6 +39,10 @@ pub(crate) struct Args { /// Pageserver connection string. Supports postgresql:// and grpc:// protocols. 
#[clap(long, default_value = "postgres://postgres@localhost:64000")] page_service_connstring: String, + /// Use the rich gRPC Pageserver client `client_grpc::PageserverClient`, rather than the basic + /// no-frills `page_api::Client`. Only valid with grpc:// connstrings. + #[clap(long)] + rich_client: bool, #[clap(long)] pageserver_jwt: Option, #[clap(long, default_value = "1")] @@ -332,6 +338,7 @@ async fn main_impl( let client: Box = match scheme.as_str() { "postgresql" | "postgres" => { assert!(!args.compression, "libpq does not support compression"); + assert!(!args.rich_client, "rich client requires grpc://"); Box::new( LibpqClient::new(&args.page_service_connstring, worker_id.timeline) .await @@ -339,6 +346,16 @@ async fn main_impl( ) } + "grpc" if args.rich_client => Box::new( + RichGrpcClient::new( + &args.page_service_connstring, + worker_id.timeline, + args.compression, + ) + .await + .unwrap(), + ), + "grpc" => Box::new( GrpcClient::new( &args.page_service_connstring, @@ -657,7 +674,7 @@ impl Client for GrpcClient { blks: Vec, ) -> anyhow::Result<()> { let req = page_api::GetPageRequest { - request_id: req_id, + request_id: req_id.into(), request_class: page_api::GetPageClass::Normal, read_lsn: page_api::ReadLsn { request_lsn: req_lsn, @@ -677,6 +694,79 @@ impl Client for GrpcClient { "unexpected status code: {}", resp.status_code, ); - Ok((resp.request_id, resp.page_images)) + Ok(( + resp.request_id.id, + resp.pages.into_iter().map(|p| p.image).collect(), + )) + } +} + +/// A rich gRPC Pageserver client. 
+struct RichGrpcClient { + inner: Arc, + requests: FuturesUnordered< + Pin> + Send>>, + >, +} + +impl RichGrpcClient { + async fn new( + connstring: &str, + ttid: TenantTimelineId, + compression: bool, + ) -> anyhow::Result { + let inner = Arc::new(client_grpc::PageserverClient::new( + ttid.tenant_id, + ttid.timeline_id, + ShardSpec::new( + [(ShardIndex::unsharded(), connstring.to_string())].into(), + None, + )?, + None, + compression.then_some(tonic::codec::CompressionEncoding::Zstd), + )?); + Ok(Self { + inner, + requests: FuturesUnordered::new(), + }) + } +} + +#[async_trait] +impl Client for RichGrpcClient { + async fn send_get_page( + &mut self, + req_id: u64, + req_lsn: Lsn, + mod_lsn: Lsn, + rel: RelTag, + blks: Vec, + ) -> anyhow::Result<()> { + let req = page_api::GetPageRequest { + request_id: req_id.into(), + request_class: page_api::GetPageClass::Normal, + read_lsn: page_api::ReadLsn { + request_lsn: req_lsn, + not_modified_since_lsn: Some(mod_lsn), + }, + rel, + block_numbers: blks, + }; + let inner = self.inner.clone(); + self.requests.push(Box::pin(async move { + inner + .get_page(req) + .await + .map_err(|err| anyhow::anyhow!("{err}")) + })); + Ok(()) + } + + async fn recv_get_page(&mut self) -> anyhow::Result<(u64, Vec)> { + let resp = self.requests.next().await.unwrap()?; + Ok(( + resp.request_id.id, + resp.pages.into_iter().map(|p| p.image).collect(), + )) } } diff --git a/pageserver/pagebench/src/cmd/idle_streams.rs b/pageserver/pagebench/src/cmd/idle_streams.rs new file mode 100644 index 0000000000..73bc9f3f46 --- /dev/null +++ b/pageserver/pagebench/src/cmd/idle_streams.rs @@ -0,0 +1,127 @@ +use std::sync::Arc; + +use anyhow::anyhow; +use futures::StreamExt; +use tonic::transport::Endpoint; +use tracing::info; + +use pageserver_page_api::{GetPageClass, GetPageRequest, GetPageStatusCode, ReadLsn, RelTag}; +use utils::id::TenantTimelineId; +use utils::lsn::Lsn; +use utils::shard::ShardIndex; + +/// Starts a large number of idle gRPC GetPage 
streams. +#[derive(clap::Parser)] +pub(crate) struct Args { + /// The Pageserver to connect to. Must use grpc://. + #[clap(long, default_value = "grpc://localhost:51051")] + server: String, + /// The Pageserver HTTP API. + #[clap(long, default_value = "http://localhost:9898")] + http_server: String, + /// The number of streams to open. + #[clap(long, default_value = "100000")] + count: usize, + /// Number of streams per connection. + #[clap(long, default_value = "100")] + per_connection: usize, + /// Send a single GetPage request on each stream. + #[clap(long, default_value_t = false)] + send_request: bool, +} + +pub(crate) fn main(args: Args) -> anyhow::Result<()> { + let rt = tokio::runtime::Builder::new_multi_thread() + .enable_all() + .build()?; + + rt.block_on(main_impl(args)) +} + +async fn main_impl(args: Args) -> anyhow::Result<()> { + // Discover a tenant and timeline to use. + let mgmt_api_client = Arc::new(pageserver_client::mgmt_api::Client::new( + reqwest::Client::new(), + args.http_server.clone(), + None, + )); + let timelines: Vec = crate::util::cli::targets::discover( + &mgmt_api_client, + crate::util::cli::targets::Spec { + limit_to_first_n_targets: Some(1), + targets: None, + }, + ) + .await?; + let ttid = timelines + .first() + .ok_or_else(|| anyhow!("no timelines found"))?; + + // Set up the initial client. + let endpoint = Endpoint::from_shared(args.server.clone())?; + + let connect = async || { + pageserver_page_api::Client::new( + endpoint.connect().await?, + ttid.tenant_id, + ttid.timeline_id, + ShardIndex::unsharded(), + None, + None, + ) + }; + + let mut client = connect().await?; + let mut streams = Vec::with_capacity(args.count); + + // Create streams. 
+ for i in 0..args.count { + if i % 100 == 0 { + info!("opened {}/{} streams", i, args.count); + } + if i % args.per_connection == 0 && i > 0 { + client = connect().await?; + } + + let (req_tx, req_rx) = tokio::sync::mpsc::unbounded_channel(); + let req_stream = tokio_stream::wrappers::UnboundedReceiverStream::new(req_rx); + let mut resp_stream = client.get_pages(req_stream).await?; + + // Send request if specified. + if args.send_request { + req_tx.send(GetPageRequest { + request_id: 1.into(), + request_class: GetPageClass::Normal, + read_lsn: ReadLsn { + request_lsn: Lsn::MAX, + not_modified_since_lsn: Some(Lsn(1)), + }, + rel: RelTag { + spcnode: 1664, // pg_global + dbnode: 0, // shared database + relnode: 1262, // pg_authid + forknum: 0, // init + }, + block_numbers: vec![0], + })?; + let resp = resp_stream + .next() + .await + .transpose()? + .ok_or_else(|| anyhow!("no response"))?; + if resp.status_code != GetPageStatusCode::Ok { + return Err(anyhow!("{} response", resp.status_code)); + } + } + + // Hold onto streams to avoid closing them. + streams.push((req_tx, resp_stream)); + } + + info!("opened {} streams, sleeping", args.count); + + // Block forever, to hold the idle streams open for inspection. + futures::future::pending::<()>().await; + + Ok(()) +} diff --git a/pageserver/pagebench/src/main.rs b/pageserver/pagebench/src/main.rs index 5527557450..ceca58e032 100644 --- a/pageserver/pagebench/src/main.rs +++ b/pageserver/pagebench/src/main.rs @@ -1,4 +1,7 @@ +use std::fs::File; + use clap::Parser; +use tracing::info; use utils::logging; /// Re-usable pieces of code that aren't CLI-specific. @@ -17,38 +20,73 @@ mod cmd { pub(super) mod aux_files; pub(super) mod basebackup; pub(super) mod getpage_latest_lsn; + pub(super) mod idle_streams; pub(super) mod ondemand_download_churn; pub(super) mod trigger_initial_size_calculation; } /// Component-level performance test for pageserver. 
#[derive(clap::Parser)] -enum Args { +struct Args { + /// Takes a client CPU profile into profile.svg. The benchmark must exit cleanly before it's + /// written, e.g. via --runtime. + #[arg(long)] + profile: bool, + + #[command(subcommand)] + subcommand: Subcommand, +} + +#[derive(clap::Subcommand)] +enum Subcommand { Basebackup(cmd::basebackup::Args), GetPageLatestLsn(cmd::getpage_latest_lsn::Args), TriggerInitialSizeCalculation(cmd::trigger_initial_size_calculation::Args), OndemandDownloadChurn(cmd::ondemand_download_churn::Args), AuxFiles(cmd::aux_files::Args), + IdleStreams(cmd::idle_streams::Args), } -fn main() { +fn main() -> anyhow::Result<()> { logging::init( logging::LogFormat::Plain, logging::TracingErrorLayerEnablement::Disabled, logging::Output::Stderr, - ) - .unwrap(); + )?; logging::replace_panic_hook_with_tracing_panic_hook().forget(); let args = Args::parse(); - match args { - Args::Basebackup(args) => cmd::basebackup::main(args), - Args::GetPageLatestLsn(args) => cmd::getpage_latest_lsn::main(args), - Args::TriggerInitialSizeCalculation(args) => { + + // Start a CPU profile if requested. + let mut profiler = None; + if args.profile { + profiler = Some( + pprof::ProfilerGuardBuilder::default() + .frequency(1000) + .blocklist(&["libc", "libgcc", "pthread", "vdso"]) + .build()?, + ); + } + + match args.subcommand { + Subcommand::Basebackup(args) => cmd::basebackup::main(args), + Subcommand::GetPageLatestLsn(args) => cmd::getpage_latest_lsn::main(args), + Subcommand::TriggerInitialSizeCalculation(args) => { cmd::trigger_initial_size_calculation::main(args) } - Args::OndemandDownloadChurn(args) => cmd::ondemand_download_churn::main(args), - Args::AuxFiles(args) => cmd::aux_files::main(args), + Subcommand::OndemandDownloadChurn(args) => cmd::ondemand_download_churn::main(args), + Subcommand::AuxFiles(args) => cmd::aux_files::main(args), + Subcommand::IdleStreams(args) => cmd::idle_streams::main(args), + }?; + + // Generate a CPU flamegraph if requested. 
+ if let Some(profiler) = profiler { + let report = profiler.report().build()?; + drop(profiler); // stop profiling + let file = File::create("profile.svg")?; + report.flamegraph(file)?; + info!("wrote CPU profile flamegraph to profile.svg") } - .unwrap() + + Ok(()) } diff --git a/pageserver/src/basebackup.rs b/pageserver/src/basebackup.rs index 36dada1e89..1a44c80e2d 100644 --- a/pageserver/src/basebackup.rs +++ b/pageserver/src/basebackup.rs @@ -114,7 +114,7 @@ where // Compute postgres doesn't have any previous WAL files, but the first // record that it's going to write needs to include the LSN of the // previous record (xl_prev). We include prev_record_lsn in the - // "zenith.signal" file, so that postgres can read it during startup. + // "neon.signal" file, so that postgres can read it during startup. // // We don't keep full history of record boundaries in the page server, // however, only the predecessor of the latest record on each @@ -751,34 +751,39 @@ where // // Add generated pg_control file and bootstrap WAL segment. - // Also send zenith.signal file with extra bootstrap data. + // Also send neon.signal and zenith.signal file with extra bootstrap data. 
// async fn add_pgcontrol_file( &mut self, pg_control_bytes: Bytes, system_identifier: u64, ) -> Result<(), BasebackupError> { - // add zenith.signal file - let mut zenith_signal = String::new(); + // add neon.signal file + let mut neon_signal = String::new(); if self.prev_record_lsn == Lsn(0) { if self.timeline.is_ancestor_lsn(self.lsn) { - write!(zenith_signal, "PREV LSN: none") + write!(neon_signal, "PREV LSN: none") .map_err(|e| BasebackupError::Server(e.into()))?; } else { - write!(zenith_signal, "PREV LSN: invalid") + write!(neon_signal, "PREV LSN: invalid") .map_err(|e| BasebackupError::Server(e.into()))?; } } else { - write!(zenith_signal, "PREV LSN: {}", self.prev_record_lsn) + write!(neon_signal, "PREV LSN: {}", self.prev_record_lsn) .map_err(|e| BasebackupError::Server(e.into()))?; } - self.ar - .append( - &new_tar_header("zenith.signal", zenith_signal.len() as u64)?, - zenith_signal.as_bytes(), - ) - .await - .map_err(|e| BasebackupError::Client(e, "add_pgcontrol_file,zenith.signal"))?; + + // TODO: Remove zenith.signal once all historical computes have been replaced + // ... and thus support the neon.signal file. 
+ for signalfilename in ["neon.signal", "zenith.signal"] { + self.ar + .append( + &new_tar_header(signalfilename, neon_signal.len() as u64)?, + neon_signal.as_bytes(), + ) + .await + .map_err(|e| BasebackupError::Client(e, "add_pgcontrol_file,neon.signal"))?; + } //send pg_control let header = new_tar_header("global/pg_control", pg_control_bytes.len() as u64)?; diff --git a/pageserver/src/bin/pageserver.rs b/pageserver/src/bin/pageserver.rs index 327384fd82..dfb8b437c3 100644 --- a/pageserver/src/bin/pageserver.rs +++ b/pageserver/src/bin/pageserver.rs @@ -29,8 +29,8 @@ use pageserver::task_mgr::{ }; use pageserver::tenant::{TenantSharedResources, mgr, secondary}; use pageserver::{ - CancellableTask, ConsumptionMetricsTasks, HttpEndpointListener, HttpsEndpointListener, http, - page_cache, page_service, task_mgr, virtual_file, + CancellableTask, ConsumptionMetricsTasks, HttpEndpointListener, HttpsEndpointListener, + MetricsCollectionTask, http, page_cache, page_service, task_mgr, virtual_file, }; use postgres_backend::AuthType; use remote_storage::GenericRemoteStorage; @@ -41,6 +41,7 @@ use tracing_utils::OtelGuard; use utils::auth::{JwtAuth, SwappableJwtAuth}; use utils::crashsafe::syncfs; use utils::logging::TracingErrorLayerEnablement; +use utils::metrics_collector::{METRICS_COLLECTION_INTERVAL, METRICS_COLLECTOR}; use utils::sentry_init::init_sentry; use utils::{failpoint_support, logging, project_build_tag, project_git_version, tcp_listener}; @@ -763,6 +764,41 @@ fn start_pageserver( (http_task, https_task) }; + /* BEGIN_HADRON */ + let metrics_collection_task = { + let cancel = shutdown_pageserver.child_token(); + let task = crate::BACKGROUND_RUNTIME.spawn({ + let cancel = cancel.clone(); + let background_jobs_barrier = background_jobs_barrier.clone(); + async move { + if conf.force_metric_collection_on_scrape { + return; + } + + // first wait until background jobs are cleared to launch. + tokio::select! 
{ + _ = cancel.cancelled() => { return; }, + _ = background_jobs_barrier.wait() => {} + }; + let mut interval = tokio::time::interval(METRICS_COLLECTION_INTERVAL); + loop { + tokio::select! { + _ = cancel.cancelled() => { + tracing::info!("cancelled metrics collection task, exiting..."); + break; + }, + _ = interval.tick() => {} + } + tokio::task::spawn_blocking(|| { + METRICS_COLLECTOR.run_once(true); + }); + } + } + }); + MetricsCollectionTask(CancellableTask { task, cancel }) + }; + /* END_HADRON */ + let consumption_metrics_tasks = { let cancel = shutdown_pageserver.child_token(); let task = crate::BACKGROUND_RUNTIME.spawn({ @@ -844,6 +880,7 @@ fn start_pageserver( https_endpoint_listener, page_service, page_service_grpc, + metrics_collection_task, consumption_metrics_tasks, disk_usage_eviction_task, &tenant_manager, @@ -880,17 +917,15 @@ async fn create_remote_storage_client( // If `test_remote_failures` is non-zero, wrap the client with a // wrapper that simulates failures. if conf.test_remote_failures > 0 { - if !cfg!(feature = "testing") { - anyhow::bail!( - "test_remote_failures option is not available because pageserver was compiled without the 'testing' feature" - ); - } info!( "Simulating remote failures for first {} attempts of each op", conf.test_remote_failures ); - remote_storage = - GenericRemoteStorage::unreliable_wrapper(remote_storage, conf.test_remote_failures); + remote_storage = GenericRemoteStorage::unreliable_wrapper( + remote_storage, + conf.test_remote_failures, + conf.test_remote_failures_probability, + ); } Ok(remote_storage) diff --git a/pageserver/src/config.rs b/pageserver/src/config.rs index 99d7e0ca3a..bb73ae1dd5 100644 --- a/pageserver/src/config.rs +++ b/pageserver/src/config.rs @@ -147,7 +147,11 @@ pub struct PageServerConf { pub disk_usage_based_eviction: DiskUsageEvictionTaskConfig, + // The number of allowed failures in remote storage operations. 
pub test_remote_failures: u64, + // The probability of failure in remote storage operations. Only works when test_remote_failures > 0. + // Use 100 for 100% failure, 0 for no failure. + pub test_remote_failures_probability: u64, pub ondemand_download_behavior_treat_error_as_warn: bool, @@ -248,6 +252,14 @@ pub struct PageServerConf { pub timeline_import_config: pageserver_api::config::TimelineImportConfig, pub basebackup_cache_config: Option, + + /// Defines what is a big tenant for the purpose of image layer generation. + /// See Timeline::should_check_if_image_layers_required + pub image_layer_generation_large_timeline_threshold: Option, + + /// Controls whether to collect all metrics on each scrape or to return potentially stale + /// results. + pub force_metric_collection_on_scrape: bool, } /// Token for authentication to safekeepers @@ -392,6 +404,7 @@ impl PageServerConf { synthetic_size_calculation_interval, disk_usage_based_eviction, test_remote_failures, + test_remote_failures_probability, ondemand_download_behavior_treat_error_as_warn, background_task_maximum_delay, control_plane_api, @@ -427,6 +440,8 @@ impl PageServerConf { posthog_config, timeline_import_config, basebackup_cache_config, + image_layer_generation_large_timeline_threshold, + force_metric_collection_on_scrape, } = config_toml; let mut conf = PageServerConf { @@ -461,6 +476,7 @@ impl PageServerConf { synthetic_size_calculation_interval, disk_usage_based_eviction, test_remote_failures, + test_remote_failures_probability, ondemand_download_behavior_treat_error_as_warn, background_task_maximum_delay, control_plane_api: control_plane_api @@ -484,6 +500,8 @@ impl PageServerConf { dev_mode, timeline_import_config, basebackup_cache_config, + image_layer_generation_large_timeline_threshold, + force_metric_collection_on_scrape, // ------------------------------------------------------------ // fields that require additional validation or custom handling diff --git
a/pageserver/src/controller_upcall_client.rs b/pageserver/src/controller_upcall_client.rs index f1f9aaf43c..be1de43d18 100644 --- a/pageserver/src/controller_upcall_client.rs +++ b/pageserver/src/controller_upcall_client.rs @@ -194,6 +194,7 @@ impl StorageControllerUpcallApi for StorageControllerUpcallClient { listen_http_port: m.http_port, listen_https_port: m.https_port, availability_zone_id: az_id.expect("Checked above"), + node_ip_addr: None, }) } Err(e) => { diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs index 3612686b5d..3e844a375d 100644 --- a/pageserver/src/http/routes.rs +++ b/pageserver/src/http/routes.rs @@ -2,12 +2,15 @@ //! Management HTTP API //! use std::cmp::Reverse; -use std::collections::{BinaryHeap, HashMap}; +use std::collections::BTreeMap; +use std::collections::BinaryHeap; +use std::collections::HashMap; use std::str::FromStr; use std::sync::Arc; use std::time::Duration; use anyhow::{Context, Result, anyhow}; +use bytes::Bytes; use enumset::EnumSet; use futures::future::join_all; use futures::{StreamExt, TryFutureExt}; @@ -44,6 +47,7 @@ use pageserver_api::shard::{ShardCount, TenantShardId}; use postgres_ffi::PgMajorVersion; use remote_storage::{DownloadError, GenericRemoteStorage, TimeTravelError}; use scopeguard::defer; +use serde::{Deserialize, Serialize}; use serde_json::json; use tenant_size_model::svg::SvgBranchKind; use tenant_size_model::{SizeResult, StorageModel}; @@ -55,6 +59,7 @@ use utils::auth::SwappableJwtAuth; use utils::generation::Generation; use utils::id::{TenantId, TimelineId}; use utils::lsn::Lsn; +use wal_decoder::models::record::NeonWalRecord; use crate::config::PageServerConf; use crate::context; @@ -75,12 +80,13 @@ use crate::tenant::remote_timeline_client::{ }; use crate::tenant::secondary::SecondaryController; use crate::tenant::size::ModelInputs; +use crate::tenant::storage_layer::ValuesReconstructState; use crate::tenant::storage_layer::{IoConcurrency, LayerAccessStatsReset, LayerName}; 
use crate::tenant::timeline::layer_manager::LayerManagerLockHolder; use crate::tenant::timeline::offload::{OffloadError, offload_timeline}; use crate::tenant::timeline::{ - CompactFlags, CompactOptions, CompactRequest, CompactionError, MarkInvisibleRequest, Timeline, - WaitLsnTimeout, WaitLsnWaiter, import_pgdata, + CompactFlags, CompactOptions, CompactRequest, MarkInvisibleRequest, Timeline, WaitLsnTimeout, + WaitLsnWaiter, import_pgdata, }; use crate::tenant::{ GetTimelineError, LogicalSizeCalculationCause, OffloadedTimeline, PageReconstructError, @@ -395,6 +401,7 @@ async fn build_timeline_info( timeline: &Arc, include_non_incremental_logical_size: bool, force_await_initial_logical_size: bool, + include_image_consistent_lsn: bool, ctx: &RequestContext, ) -> anyhow::Result { crate::tenant::debug_assert_current_span_has_tenant_and_timeline_id(); @@ -419,6 +426,10 @@ async fn build_timeline_info( .await?, ); } + // HADRON + if include_image_consistent_lsn { + info.image_consistent_lsn = Some(timeline.compute_image_consistent_lsn().await?); + } Ok(info) } @@ -508,6 +519,8 @@ async fn build_timeline_info_common( is_invisible: Some(is_invisible), walreceiver_status, + // HADRON + image_consistent_lsn: None, }; Ok(info) } @@ -710,6 +723,8 @@ async fn timeline_list_handler( parse_query_param(&request, "include-non-incremental-logical-size")?; let force_await_initial_logical_size: Option = parse_query_param(&request, "force-await-initial-logical-size")?; + let include_image_consistent_lsn: Option = + parse_query_param(&request, "include-image-consistent-lsn")?; check_permission(&request, Some(tenant_shard_id.tenant_id))?; let state = get_state(&request); @@ -730,6 +745,7 @@ async fn timeline_list_handler( &timeline, include_non_incremental_logical_size.unwrap_or(false), force_await_initial_logical_size.unwrap_or(false), + include_image_consistent_lsn.unwrap_or(false), &ctx, ) .instrument(info_span!("build_timeline_info", timeline_id = %timeline.timeline_id)) @@ -758,6 
+774,9 @@ async fn timeline_and_offloaded_list_handler( parse_query_param(&request, "include-non-incremental-logical-size")?; let force_await_initial_logical_size: Option = parse_query_param(&request, "force-await-initial-logical-size")?; + let include_image_consistent_lsn: Option = + parse_query_param(&request, "include-image-consistent-lsn")?; + check_permission(&request, Some(tenant_shard_id.tenant_id))?; let state = get_state(&request); @@ -778,6 +797,7 @@ async fn timeline_and_offloaded_list_handler( &timeline, include_non_incremental_logical_size.unwrap_or(false), force_await_initial_logical_size.unwrap_or(false), + include_image_consistent_lsn.unwrap_or(false), &ctx, ) .instrument(info_span!("build_timeline_info", timeline_id = %timeline.timeline_id)) @@ -962,6 +982,9 @@ async fn timeline_detail_handler( parse_query_param(&request, "include-non-incremental-logical-size")?; let force_await_initial_logical_size: Option = parse_query_param(&request, "force-await-initial-logical-size")?; + // HADRON + let include_image_consistent_lsn: Option = + parse_query_param(&request, "include-image-consistent-lsn")?; check_permission(&request, Some(tenant_shard_id.tenant_id))?; // Logical size calculation needs downloading. @@ -982,6 +1005,7 @@ async fn timeline_detail_handler( &timeline, include_non_incremental_logical_size.unwrap_or(false), force_await_initial_logical_size.unwrap_or(false), + include_image_consistent_lsn.unwrap_or(false), ctx, ) .await @@ -2500,9 +2524,10 @@ async fn timeline_checkpoint_handler( .compact(&cancel, flags, &ctx) .await .map_err(|e| - match e { - CompactionError::ShuttingDown => ApiError::ShuttingDown, - CompactionError::Other(e) => ApiError::InternalServerError(e), + if e.is_cancel() { + ApiError::ShuttingDown + } else { + ApiError::InternalServerError(e.into_anyhow()) } )?; } @@ -2687,6 +2712,16 @@ async fn deletion_queue_flush( } } +/// Try if `GetPage@Lsn` is successful, useful for manual debugging. 
+#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)] +struct GetPageResponse { + pub page: Bytes, + pub layers_visited: u32, + pub delta_layers_visited: u32, + pub records: Vec<(Lsn, NeonWalRecord)>, + pub img: Option<(Lsn, Bytes)>, +} + async fn getpage_at_lsn_handler( request: Request, cancel: CancellationToken, @@ -2737,21 +2772,24 @@ async fn getpage_at_lsn_handler_inner( // Use last_record_lsn if no lsn is provided let lsn = lsn.unwrap_or_else(|| timeline.get_last_record_lsn()); - let page = timeline.get(key.0, lsn, &ctx).await?; if touch { json_response(StatusCode::OK, ()) } else { - Result::<_, ApiError>::Ok( - Response::builder() - .status(StatusCode::OK) - .header(header::CONTENT_TYPE, "application/octet-stream") - .body(hyper::Body::from(page)) - .unwrap(), - ) + let mut reconstruct_state = ValuesReconstructState::new_with_debug(IoConcurrency::sequential()); + let page = timeline.debug_get(key.0, lsn, &ctx, &mut reconstruct_state).await?; + let response = GetPageResponse { + page, + layers_visited: reconstruct_state.get_layers_visited(), + delta_layers_visited: reconstruct_state.get_delta_layers_visited(), + records: reconstruct_state.debug_state.records.clone(), + img: reconstruct_state.debug_state.img.clone(), + }; + + json_response(StatusCode::OK, response) } } - .instrument(info_span!("timeline_get", tenant_id = %tenant_shard_id.tenant_id, shard_id = %tenant_shard_id.shard_slug(), %timeline_id)) + .instrument(info_span!("timeline_debug_get", tenant_id = %tenant_shard_id.tenant_id, shard_id = %tenant_shard_id.shard_slug(), %timeline_id)) .await } @@ -3213,6 +3251,30 @@ async fn get_utilization( .map_err(ApiError::InternalServerError) } +/// HADRON +async fn list_tenant_visible_size_handler( + request: Request, + _cancel: CancellationToken, +) -> Result, ApiError> { + check_permission(&request, None)?; + let state = get_state(&request); + + let mut map = BTreeMap::new(); + for (tenant_shard_id, slot) in state.tenant_manager.list() { + match 
slot { + TenantSlot::Attached(tenant) => { + let visible_size = tenant.get_visible_size(); + map.insert(tenant_shard_id, visible_size); + } + TenantSlot::Secondary(_) | TenantSlot::InProgress(_) => { + continue; + } + } + } + + json_response(StatusCode::OK, map) +} + async fn list_aux_files( mut request: Request, _cancel: CancellationToken, @@ -3616,6 +3678,7 @@ async fn activate_post_import_handler( let timeline_info = build_timeline_info( &timeline, false, // include_non_incremental_logical_size, false, // force_await_initial_logical_size + false, // include_image_consistent_lsn &ctx, ) .await @@ -3937,9 +4000,14 @@ pub fn make_router( .expect("construct launch timestamp header middleware"), ); + let force_metric_collection_on_scrape = state.conf.force_metric_collection_on_scrape; + + let prometheus_metrics_handler_wrapper = + move |req| prometheus_metrics_handler(req, force_metric_collection_on_scrape); + Ok(router .data(state) - .get("/metrics", |r| request_span(r, prometheus_metrics_handler)) + .get("/metrics", move |r| request_span(r, prometheus_metrics_handler_wrapper)) .get("/profile/cpu", |r| request_span(r, profile_cpu_handler)) .get("/profile/heap", |r| request_span(r, profile_heap_handler)) .get("/v1/status", |r| api_handler(r, status_handler)) @@ -4132,7 +4200,7 @@ pub fn make_router( }) .get( "/v1/tenant/:tenant_shard_id/timeline/:timeline_id/getpage", - |r| testing_api_handler("getpage@lsn", r, getpage_at_lsn_handler), + |r| testing_api_handler("getpage@lsn", r, getpage_at_lsn_handler), ) .get( "/v1/tenant/:tenant_shard_id/timeline/:timeline_id/touchpage", @@ -4145,6 +4213,7 @@ pub fn make_router( .put("/v1/io_engine", |r| api_handler(r, put_io_engine_handler)) .put("/v1/io_mode", |r| api_handler(r, put_io_mode_handler)) .get("/v1/utilization", |r| api_handler(r, get_utilization)) + .get("/v1/list_tenant_visible_size", |r| api_handler(r, list_tenant_visible_size_handler)) .post( "/v1/tenant/:tenant_shard_id/timeline/:timeline_id/ingest_aux_files", 
|r| testing_api_handler("ingest_aux_files", r, ingest_aux_files), diff --git a/pageserver/src/import_datadir.rs b/pageserver/src/import_datadir.rs index 96fe0c1078..409cc2e3c5 100644 --- a/pageserver/src/import_datadir.rs +++ b/pageserver/src/import_datadir.rs @@ -610,13 +610,13 @@ async fn import_file( debug!("imported twophase file"); } else if file_path.starts_with("pg_wal") { debug!("found wal file in base section. ignore it"); - } else if file_path.starts_with("zenith.signal") { + } else if file_path.starts_with("zenith.signal") || file_path.starts_with("neon.signal") { // Parse zenith signal file to set correct previous LSN let bytes = read_all_bytes(reader).await?; - // zenith.signal format is "PREV LSN: prev_lsn" + // neon.signal format is "PREV LSN: prev_lsn" // TODO write serialization and deserialization in the same place. - let zenith_signal = std::str::from_utf8(&bytes)?.trim(); - let prev_lsn = match zenith_signal { + let neon_signal = std::str::from_utf8(&bytes)?.trim(); + let prev_lsn = match neon_signal { "PREV LSN: none" => Lsn(0), "PREV LSN: invalid" => Lsn(0), other => { @@ -624,17 +624,17 @@ async fn import_file( split[1] .trim() .parse::() - .context("can't parse zenith.signal")? + .context("can't parse neon.signal")? } }; - // zenith.signal is not necessarily the last file, that we handle + // neon.signal is not necessarily the last file, that we handle // but it is ok to call `finish_write()`, because final `modification.commit()` // will update lsn once more to the final one. let writer = modification.tline.writer().await; writer.finish_write(prev_lsn); - debug!("imported zenith signal {}", prev_lsn); + debug!("imported neon signal {}", prev_lsn); } else if file_path.starts_with("pg_tblspc") { // TODO Backups exported from neon won't have pg_tblspc, but we will need // this to import arbitrary postgres databases. 
diff --git a/pageserver/src/lib.rs b/pageserver/src/lib.rs index 0dd3c465e0..0864026f6b 100644 --- a/pageserver/src/lib.rs +++ b/pageserver/src/lib.rs @@ -73,6 +73,9 @@ pub struct HttpEndpointListener(pub CancellableTask); pub struct HttpsEndpointListener(pub CancellableTask); pub struct ConsumptionMetricsTasks(pub CancellableTask); pub struct DiskUsageEvictionTask(pub CancellableTask); +// HADRON +pub struct MetricsCollectionTask(pub CancellableTask); + impl CancellableTask { pub async fn shutdown(self) { self.cancel.cancel(); @@ -87,6 +90,7 @@ pub async fn shutdown_pageserver( https_listener: Option, page_service: page_service::Listener, grpc_task: Option, + metrics_collection_task: MetricsCollectionTask, consumption_metrics_worker: ConsumptionMetricsTasks, disk_usage_eviction_task: Option, tenant_manager: &TenantManager, @@ -211,6 +215,14 @@ pub async fn shutdown_pageserver( // Best effort to persist any outstanding deletions, to avoid leaking objects deletion_queue.shutdown(Duration::from_secs(5)).await; + // HADRON + timed( + metrics_collection_task.0.shutdown(), + "shutdown metrics collections metrics", + Duration::from_secs(1), + ) + .await; + timed( consumption_metrics_worker.0.shutdown(), "shutdown consumption metrics", diff --git a/pageserver/src/metrics.rs b/pageserver/src/metrics.rs index eb89e166b2..1b783326a0 100644 --- a/pageserver/src/metrics.rs +++ b/pageserver/src/metrics.rs @@ -2847,6 +2847,24 @@ pub(crate) static MISROUTED_PAGESTREAM_REQUESTS: Lazy = Lazy::new(|| .expect("failed to define a metric") }); +// Global counter for PageStream request results by outcome. Outcomes are divided into 3 categories: +// - success +// - internal_error: errors that indicate bugs in the storage cluster (e.g. page reconstruction errors, misrouted requests, LSN timeout errors) +// - other_error: transient error conditions that are expected in normal operation or indicate bugs with other parts of the system (e.g. 
error due to pageserver shutdown, malformed requests etc.) +pub(crate) static PAGESTREAM_HANDLER_RESULTS_TOTAL: Lazy = Lazy::new(|| { + register_int_counter_vec!( + "pageserver_pagestream_handler_results_total", + "Number of pageserver pagestream handler results by outcome (success, internal_error, other_error)", + &["outcome"] + ) + .expect("failed to define a metric") +}); + +// Constants for pageserver_pagestream_handler_results_total's outcome labels +pub(crate) const PAGESTREAM_HANDLER_OUTCOME_SUCCESS: &str = "success"; +pub(crate) const PAGESTREAM_HANDLER_OUTCOME_INTERNAL_ERROR: &str = "internal_error"; +pub(crate) const PAGESTREAM_HANDLER_OUTCOME_OTHER_ERROR: &str = "other_error"; + // Metrics collected on WAL redo operations // // We collect the time spent in actual WAL redo ('redo'), and time waiting diff --git a/pageserver/src/page_service.rs b/pageserver/src/page_service.rs index 70fdb2e789..1fc7e4eac7 100644 --- a/pageserver/src/page_service.rs +++ b/pageserver/src/page_service.rs @@ -70,7 +70,7 @@ use crate::context::{ }; use crate::metrics::{ self, COMPUTE_COMMANDS_COUNTERS, ComputeCommandKind, GetPageBatchBreakReason, LIVE_CONNECTIONS, - MISROUTED_PAGESTREAM_REQUESTS, SmgrOpTimer, TimelineMetrics, + MISROUTED_PAGESTREAM_REQUESTS, PAGESTREAM_HANDLER_RESULTS_TOTAL, SmgrOpTimer, TimelineMetrics, }; use crate::pgdatadir_mapping::{LsnRange, Version}; use crate::span::{ @@ -1441,20 +1441,57 @@ impl PageServerHandler { let (response_msg, ctx) = match handler_result { Err(e) => match &e.err { PageStreamError::Shutdown => { + // BEGIN HADRON + PAGESTREAM_HANDLER_RESULTS_TOTAL + .with_label_values(&[metrics::PAGESTREAM_HANDLER_OUTCOME_OTHER_ERROR]) + .inc(); + // END HADRON + // If we fail to fulfil a request during shutdown, which may be _because_ of // shutdown, then do not send the error to the client. Instead just drop the // connection. 
span.in_scope(|| info!("dropping connection due to shutdown")); return Err(QueryError::Shutdown); } - PageStreamError::Reconnect(reason) => { - span.in_scope(|| info!("handler requested reconnect: {reason}")); + PageStreamError::Reconnect(_reason) => { + span.in_scope(|| { + // BEGIN HADRON + // We can get here because the compute node is pointing at the wrong PS. We + // already have a metric to keep track of this so suppressing this log to + // reduce log spam. The information in this log message is not going to be that + // helpful given the volume of logs that can be generated. + // info!("handler requested reconnect: {reason}") + // END HADRON + }); + // BEGIN HADRON + PAGESTREAM_HANDLER_RESULTS_TOTAL + .with_label_values(&[ + metrics::PAGESTREAM_HANDLER_OUTCOME_INTERNAL_ERROR, + ]) + .inc(); + // END HADRON return Err(QueryError::Reconnect); } PageStreamError::Read(_) | PageStreamError::LsnTimeout(_) | PageStreamError::NotFound(_) | PageStreamError::BadRequest(_) => { + // BEGIN HADRON + if let PageStreamError::Read(_) | PageStreamError::LsnTimeout(_) = &e.err { + PAGESTREAM_HANDLER_RESULTS_TOTAL + .with_label_values(&[ + metrics::PAGESTREAM_HANDLER_OUTCOME_INTERNAL_ERROR, + ]) + .inc(); + } else { + PAGESTREAM_HANDLER_RESULTS_TOTAL + .with_label_values(&[ + metrics::PAGESTREAM_HANDLER_OUTCOME_OTHER_ERROR, + ]) + .inc(); + } + // END HADRON + // print the all details to the log with {:#}, but for the client the // error message is enough. Do not log if shutting down, as the anyhow::Error // here includes cancellation which is not an error. 
@@ -1472,7 +1509,15 @@ impl PageServerHandler { ) } }, - Ok((response_msg, _op_timer_already_observed, ctx)) => (response_msg, Some(ctx)), + Ok((response_msg, _op_timer_already_observed, ctx)) => { + // BEGIN HADRON + PAGESTREAM_HANDLER_RESULTS_TOTAL + .with_label_values(&[metrics::PAGESTREAM_HANDLER_OUTCOME_SUCCESS]) + .inc(); + // END HADRON + + (response_msg, Some(ctx)) + } }; let ctx = ctx.map(|req_ctx| { @@ -3293,9 +3338,12 @@ impl GrpcPageServiceHandler { } /// Generates a PagestreamRequest header from a ReadLsn and request ID. - fn make_hdr(read_lsn: page_api::ReadLsn, req_id: u64) -> PagestreamRequest { + fn make_hdr( + read_lsn: page_api::ReadLsn, + req_id: Option, + ) -> PagestreamRequest { PagestreamRequest { - reqid: req_id, + reqid: req_id.map(|r| r.id).unwrap_or_default(), request_lsn: read_lsn.request_lsn, not_modified_since: read_lsn .not_modified_since_lsn @@ -3405,7 +3453,7 @@ impl GrpcPageServiceHandler { batch.push(BatchedGetPageRequest { req: PagestreamGetPageRequest { - hdr: Self::make_hdr(req.read_lsn, req.request_id), + hdr: Self::make_hdr(req.read_lsn, Some(req.request_id)), rel: req.rel, blkno, }, @@ -3435,12 +3483,16 @@ impl GrpcPageServiceHandler { request_id: req.request_id, status_code: page_api::GetPageStatusCode::Ok, reason: None, - page_images: Vec::with_capacity(results.len()), + rel: req.rel, + pages: Vec::with_capacity(results.len()), }; for result in results { match result { - Ok((PagestreamBeMessage::GetPage(r), _, _)) => resp.page_images.push(r.page), + Ok((PagestreamBeMessage::GetPage(r), _, _)) => resp.pages.push(page_api::Page { + block_number: r.req.blkno, + image: r.page, + }), Ok((resp, _, _)) => { return Err(tonic::Status::internal(format!( "unexpected response: {resp:?}" @@ -3483,7 +3535,7 @@ impl proto::PageService for GrpcPageServiceHandler { span_record!(rel=%req.rel, lsn=%req.read_lsn); let req = PagestreamExistsRequest { - hdr: Self::make_hdr(req.read_lsn, 0), + hdr: Self::make_hdr(req.read_lsn, None), rel: 
req.rel, }; @@ -3633,7 +3685,7 @@ impl proto::PageService for GrpcPageServiceHandler { span_record!(db_oid=%req.db_oid, lsn=%req.read_lsn); let req = PagestreamDbSizeRequest { - hdr: Self::make_hdr(req.read_lsn, 0), + hdr: Self::make_hdr(req.read_lsn, None), dbnode: req.db_oid, }; @@ -3683,7 +3735,7 @@ impl proto::PageService for GrpcPageServiceHandler { .await? .downgrade(); while let Some(req) = reqs.message().await? { - let req_id = req.request_id; + let req_id = req.request_id.map(page_api::RequestID::from).unwrap_or_default(); let result = Self::get_page(&ctx, &timeline, req, io_concurrency.clone()) .instrument(span.clone()) // propagate request span .await; @@ -3722,7 +3774,7 @@ impl proto::PageService for GrpcPageServiceHandler { span_record!(rel=%req.rel, lsn=%req.read_lsn); let req = PagestreamNblocksRequest { - hdr: Self::make_hdr(req.read_lsn, 0), + hdr: Self::make_hdr(req.read_lsn, None), rel: req.rel, }; @@ -3755,7 +3807,7 @@ impl proto::PageService for GrpcPageServiceHandler { span_record!(kind=%req.kind, segno=%req.segno, lsn=%req.read_lsn); let req = PagestreamGetSlruSegmentRequest { - hdr: Self::make_hdr(req.read_lsn, 0), + hdr: Self::make_hdr(req.read_lsn, None), kind: req.kind as u8, segno: req.segno, }; diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index f576119db8..1a3016e7f1 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -3291,7 +3291,7 @@ impl TenantShard { // Ignore this, we likely raced with unarchival. OffloadError::NotArchived => Ok(()), OffloadError::AlreadyInProgress => Ok(()), - OffloadError::Cancelled => Err(CompactionError::ShuttingDown), + OffloadError::Cancelled => Err(CompactionError::new_cancelled()), // don't break the anyhow chain OffloadError::Other(err) => Err(CompactionError::Other(err)), })?; @@ -3321,16 +3321,13 @@ impl TenantShard { /// Trips the compaction circuit breaker if appropriate. 
pub(crate) fn maybe_trip_compaction_breaker(&self, err: &CompactionError) { - match err { - err if err.is_cancel() => {} - CompactionError::ShuttingDown => (), - CompactionError::Other(err) => { - self.compaction_circuit_breaker - .lock() - .unwrap() - .fail(&CIRCUIT_BREAKERS_BROKEN, err); - } + if err.is_cancel() { + return; } + self.compaction_circuit_breaker + .lock() + .unwrap() + .fail(&CIRCUIT_BREAKERS_BROKEN, err); } /// Cancel scheduled compaction tasks @@ -3396,7 +3393,13 @@ impl TenantShard { .collect_vec(); for timeline in timelines { - timeline.maybe_freeze_ephemeral_layer().await; + // Include a span with the timeline ID. The parent span already has the tenant ID. + let span = + info_span!("maybe_freeze_ephemeral_layer", timeline_id = %timeline.timeline_id); + timeline + .maybe_freeze_ephemeral_layer() + .instrument(span) + .await; } } @@ -4174,6 +4177,15 @@ impl TenantShard { .unwrap_or(self.conf.default_tenant_conf.image_creation_threshold) } + // HADRON + pub fn get_image_creation_timeout(&self) -> Option { + let tenant_conf = self.tenant_conf.load().tenant_conf.clone(); + tenant_conf.image_layer_force_creation_period.or(self + .conf + .default_tenant_conf + .image_layer_force_creation_period) + } + pub fn get_pitr_interval(&self) -> Duration { let tenant_conf = self.tenant_conf.load().tenant_conf.clone(); tenant_conf @@ -5713,6 +5725,16 @@ impl TenantShard { .unwrap_or(0) } + /// HADRON + /// Return the visible size of all timelines in this tenant. + pub(crate) fn get_visible_size(&self) -> u64 { + let timelines = self.timelines.lock().unwrap(); + timelines + .values() + .map(|t| t.metrics.visible_physical_size_gauge.get()) + .sum() + } + /// Builds a new tenant manifest, and uploads it if it differs from the last-known tenant /// manifest in `Self::remote_tenant_manifest`. 
/// @@ -12800,6 +12822,40 @@ mod tests { }, ] ); + + Ok(()) + } + + #[tokio::test] + async fn test_get_force_image_creation_lsn() -> anyhow::Result<()> { + let tenant_conf = pageserver_api::models::TenantConfig { + pitr_interval: Some(Duration::from_secs(7 * 3600)), + image_layer_force_creation_period: Some(Duration::from_secs(3600)), + ..Default::default() + }; + + let tenant_id = TenantId::generate(); + + let harness = TenantHarness::create_custom( + "test_get_force_image_creation_lsn", + tenant_conf, + tenant_id, + ShardIdentity::unsharded(), + Generation::new(1), + ) + .await?; + let (tenant, ctx) = harness.load().await; + let timeline = tenant + .create_test_timeline(TIMELINE_ID, Lsn(0x10), DEFAULT_PG_VERSION, &ctx) + .await?; + timeline.gc_info.write().unwrap().cutoffs.time = Some(Lsn(100)); + { + let writer = timeline.writer().await; + writer.finish_write(Lsn(5000)); + } + + let image_creation_lsn = timeline.get_force_image_creation_lsn().unwrap(); + assert_eq!(image_creation_lsn, Lsn(4300)); Ok(()) } } diff --git a/pageserver/src/tenant/layer_map.rs b/pageserver/src/tenant/layer_map.rs index 23052ccee7..ba02602cfe 100644 --- a/pageserver/src/tenant/layer_map.rs +++ b/pageserver/src/tenant/layer_map.rs @@ -46,10 +46,11 @@ mod historic_layer_coverage; mod layer_coverage; -use std::collections::{HashMap, VecDeque}; +use std::collections::{BTreeMap, HashMap, VecDeque}; use std::iter::Peekable; use std::ops::Range; use std::sync::Arc; +use std::time::Instant; use anyhow::Result; use historic_layer_coverage::BufferedHistoricLayerCoverage; @@ -904,6 +905,103 @@ impl LayerMap { max_stacked_deltas } + /* BEGIN_HADRON */ + /** + * Compute the image consistent LSN, the largest LSN below which all pages have been redone successfully. + * It works by first finding the latest image layers and store them into a map. 
Then for each delta layer, + * find all overlapping image layers in order to potentially increase the image LSN in case there are gaps + * (e.g., if an image is created at LSN 100 but the delta layer spans LSN [150, 200], then we can increase + * image LSN to 150 because there is no WAL record in between). + * Finally, the image consistent LSN is computed by taking the minimum of all image layers. + */ + pub fn compute_image_consistent_lsn(&self, disk_consistent_lsn: Lsn) -> Lsn { + struct ImageLayerInfo { + // creation LSN of the image layer + image_lsn: Lsn, + // the current minimum LSN of newer delta layers with overlapping key ranges + min_delta_lsn: Lsn, + } + let started_at = Instant::now(); + + let min_l0_deltas_lsn = { + let l0_deltas = self.level0_deltas(); + l0_deltas + .iter() + .map(|layer| layer.get_lsn_range().start) + .min() + .unwrap_or(disk_consistent_lsn) + }; + let global_key_range = Key::MIN..Key::MAX; + + // step 1: collect all most recent image layers into a map + // map: end key to image_layer_info + let mut image_map: BTreeMap = BTreeMap::new(); + for (img_range, img) in self.image_coverage(&global_key_range, disk_consistent_lsn) { + let img_lsn = img.map(|layer| layer.get_lsn_range().end).unwrap_or(Lsn(0)); + image_map.insert( + img_range.end, + ImageLayerInfo { + image_lsn: img_lsn, + min_delta_lsn: min_l0_deltas_lsn, + }, + ); + } + + // step 2: go through all delta layers, and update the image layer info with overlapping + // key ranges + for layer in self.historic.iter() { + if !layer.is_delta { + continue; + } + let delta_key_range = layer.get_key_range(); + let delta_lsn_range = layer.get_lsn_range(); + for (img_end_key, img_info) in image_map.range_mut(delta_key_range.start..Key::MAX) { + debug_assert!(img_end_key >= &delta_key_range.start); + if delta_lsn_range.end > img_info.image_lsn { + // the delta layer includes WAL records after the image + // it's possibel that the delta layer's start LSN < image LSN, which will be simply 
ignored by step 3 + img_info.min_delta_lsn = + std::cmp::min(img_info.min_delta_lsn, delta_lsn_range.start); + } + if img_end_key >= &delta_key_range.end { + // we have fully processed all overlapping image layers + break; + } + } + } + + // step 3, go through all image layers and find the image consistent LSN + let mut img_consistent_lsn = min_l0_deltas_lsn.checked_sub(Lsn(1)).unwrap(); + let mut prev_key = Key::MIN; + for (img_key, img_info) in image_map { + tracing::debug!( + "Image layer {:?}:{} has min delta lsn {}", + Range { + start: prev_key, + end: img_key, + }, + img_info.image_lsn, + img_info.min_delta_lsn, + ); + let image_lsn = std::cmp::max( + img_info.image_lsn, + img_info.min_delta_lsn.checked_sub(Lsn(1)).unwrap_or(Lsn(0)), + ); + img_consistent_lsn = std::cmp::min(img_consistent_lsn, image_lsn); + prev_key = img_key; + } + tracing::info!( + "computed image_consistent_lsn {} for disk_consistent_lsn {} in {}ms. Processed {} layrs in total.", + img_consistent_lsn, + disk_consistent_lsn, + started_at.elapsed().as_millis(), + self.historic.len() + ); + img_consistent_lsn + } + + /* END_HADRON */ + /// Return all L0 delta layers pub fn level0_deltas(&self) -> &Vec> { &self.l0_delta_layers @@ -1579,6 +1677,138 @@ mod tests { LayerVisibilityHint::Visible )); } + + /* BEGIN_HADRON */ + #[test] + fn test_compute_image_consistent_lsn() { + let mut layer_map = LayerMap::default(); + + let disk_consistent_lsn = Lsn(1000); + // case 1: empty layer map + let image_consistent_lsn = layer_map.compute_image_consistent_lsn(disk_consistent_lsn); + assert_eq!( + disk_consistent_lsn.checked_sub(Lsn(1)).unwrap(), + image_consistent_lsn + ); + + // case 2: only L0 delta layer + { + let mut updates = layer_map.batch_update(); + updates.insert_historic(PersistentLayerDesc::new_test( + Key::from_i128(0)..Key::from_i128(100), + Lsn(900)..Lsn(990), + true, + )); + + updates.insert_historic(PersistentLayerDesc::new_test( + Key::from_i128(0)..Key::from_i128(100), + 
Lsn(850)..Lsn(899), + true, + )); + } + + // should use min L0 delta LSN - 1 as image consistent LSN + let image_consistent_lsn = layer_map.compute_image_consistent_lsn(disk_consistent_lsn); + assert_eq!(Lsn(849), image_consistent_lsn); + + // case 3: 3 images, no L1 delta + { + let mut updates = layer_map.batch_update(); + updates.insert_historic(PersistentLayerDesc::new_test( + Key::from_i128(0)..Key::from_i128(40), + Lsn(100)..Lsn(100), + false, + )); + + updates.insert_historic(PersistentLayerDesc::new_test( + Key::from_i128(40)..Key::from_i128(70), + Lsn(200)..Lsn(200), + false, + )); + + updates.insert_historic(PersistentLayerDesc::new_test( + Key::from_i128(70)..Key::from_i128(100), + Lsn(150)..Lsn(150), + false, + )); + } + // should use min L0 delta LSN - 1 as image consistent LSN + let image_consistent_lsn = layer_map.compute_image_consistent_lsn(disk_consistent_lsn); + assert_eq!(Lsn(849), image_consistent_lsn); + + // case 4: 3 images with 1 L1 delta + { + let mut updates = layer_map.batch_update(); + updates.insert_historic(PersistentLayerDesc::new_test( + Key::from_i128(0)..Key::from_i128(50), + Lsn(300)..Lsn(350), + true, + )); + } + let image_consistent_lsn = layer_map.compute_image_consistent_lsn(disk_consistent_lsn); + assert_eq!(Lsn(299), image_consistent_lsn); + + // case 5: 3 images with 1 more L1 delta with smaller LSN + { + let mut updates = layer_map.batch_update(); + updates.insert_historic(PersistentLayerDesc::new_test( + Key::from_i128(50)..Key::from_i128(72), + Lsn(200)..Lsn(300), + true, + )); + } + let image_consistent_lsn = layer_map.compute_image_consistent_lsn(disk_consistent_lsn); + assert_eq!(Lsn(199), image_consistent_lsn); + + // case 6: 3 images with more newer L1 deltas (no impact on final results) + { + let mut updates = layer_map.batch_update(); + updates.insert_historic(PersistentLayerDesc::new_test( + Key::from_i128(0)..Key::from_i128(30), + Lsn(400)..Lsn(500), + true, + )); + 
updates.insert_historic(PersistentLayerDesc::new_test( + Key::from_i128(35)..Key::from_i128(100), + Lsn(450)..Lsn(600), + true, + )); + } + let image_consistent_lsn = layer_map.compute_image_consistent_lsn(disk_consistent_lsn); + assert_eq!(Lsn(199), image_consistent_lsn); + + // case 7: 3 images with more older L1 deltas (no impact on final results) + { + let mut updates = layer_map.batch_update(); + updates.insert_historic(PersistentLayerDesc::new_test( + Key::from_i128(0)..Key::from_i128(40), + Lsn(0)..Lsn(50), + true, + )); + + updates.insert_historic(PersistentLayerDesc::new_test( + Key::from_i128(50)..Key::from_i128(100), + Lsn(10)..Lsn(60), + true, + )); + } + let image_consistent_lsn = layer_map.compute_image_consistent_lsn(disk_consistent_lsn); + assert_eq!(Lsn(199), image_consistent_lsn); + + // case 8: 3 images with one more L1 delta with overlapping LSN range + { + let mut updates = layer_map.batch_update(); + updates.insert_historic(PersistentLayerDesc::new_test( + Key::from_i128(0)..Key::from_i128(50), + Lsn(50)..Lsn(250), + true, + )); + } + let image_consistent_lsn = layer_map.compute_image_consistent_lsn(disk_consistent_lsn); + assert_eq!(Lsn(100), image_consistent_lsn); + } + + /* END_HADRON */ } #[cfg(test)] diff --git a/pageserver/src/tenant/mgr.rs b/pageserver/src/tenant/mgr.rs index 15853d3614..52f67abde5 100644 --- a/pageserver/src/tenant/mgr.rs +++ b/pageserver/src/tenant/mgr.rs @@ -1678,6 +1678,8 @@ impl TenantManager { // Phase 6: Release the InProgress on the parent shard drop(parent_slot_guard); + utils::pausable_failpoint!("shard-split-post-finish-pause"); + Ok(child_shards) } diff --git a/pageserver/src/tenant/storage_layer.rs b/pageserver/src/tenant/storage_layer.rs index 9fbb9d2438..43ea8fffa3 100644 --- a/pageserver/src/tenant/storage_layer.rs +++ b/pageserver/src/tenant/storage_layer.rs @@ -75,7 +75,7 @@ where /// the same ValueReconstructState struct in the next 'get_value_reconstruct_data' /// call, to collect more records. 
/// -#[derive(Debug, Default)] +#[derive(Debug, Default, Clone)] pub(crate) struct ValueReconstructState { pub(crate) records: Vec<(Lsn, NeonWalRecord)>, pub(crate) img: Option<(Lsn, Bytes)>, @@ -308,6 +308,9 @@ pub struct ValuesReconstructState { layers_visited: u32, delta_layers_visited: u32, + pub(crate) enable_debug: bool, + pub(crate) debug_state: ValueReconstructState, + pub(crate) io_concurrency: IoConcurrency, num_active_ios: Arc, @@ -657,6 +660,23 @@ impl ValuesReconstructState { layers_visited: 0, delta_layers_visited: 0, io_concurrency, + enable_debug: false, + debug_state: ValueReconstructState::default(), + num_active_ios: Arc::new(AtomicUsize::new(0)), + read_path: None, + } + } + + pub(crate) fn new_with_debug(io_concurrency: IoConcurrency) -> Self { + Self { + keys: HashMap::new(), + keys_done: KeySpaceRandomAccum::new(), + keys_with_image_coverage: None, + layers_visited: 0, + delta_layers_visited: 0, + io_concurrency, + enable_debug: true, + debug_state: ValueReconstructState::default(), num_active_ios: Arc::new(AtomicUsize::new(0)), read_path: None, } @@ -670,6 +690,12 @@ impl ValuesReconstructState { self.io_concurrency.spawn_io(fut).await; } + pub(crate) fn set_debug_state(&mut self, debug_state: &ValueReconstructState) { + if self.enable_debug { + self.debug_state = debug_state.clone(); + } + } + pub(crate) fn on_layer_visited(&mut self, layer: &ReadableLayer) { self.layers_visited += 1; if let ReadableLayer::PersistentLayer(layer) = layer { diff --git a/pageserver/src/tenant/storage_layer/layer_name.rs b/pageserver/src/tenant/storage_layer/layer_name.rs index 0f7995f87b..973852defc 100644 --- a/pageserver/src/tenant/storage_layer/layer_name.rs +++ b/pageserver/src/tenant/storage_layer/layer_name.rs @@ -225,7 +225,7 @@ impl fmt::Display for ImageLayerName { /// storage and object names in remote storage consist of the LayerName plus some extra qualifiers /// that uniquely identify the physical incarnation of a layer (see 
[crate::tenant::remote_timeline_client::remote_layer_path]) /// and [`crate::tenant::storage_layer::layer::local_layer_path`]) -#[derive(Debug, PartialEq, Eq, Hash, Clone)] +#[derive(Debug, PartialEq, Eq, Hash, Clone, Ord, PartialOrd)] pub enum LayerName { Image(ImageLayerName), Delta(DeltaLayerName), diff --git a/pageserver/src/tenant/tasks.rs b/pageserver/src/tenant/tasks.rs index bcece5589a..08fc7d61a5 100644 --- a/pageserver/src/tenant/tasks.rs +++ b/pageserver/src/tenant/tasks.rs @@ -17,17 +17,14 @@ use tracing::*; use utils::backoff::exponential_backoff_duration; use utils::completion::Barrier; use utils::pausable_failpoint; -use utils::sync::gate::GateError; use crate::context::{DownloadBehavior, RequestContext}; use crate::metrics::{self, BackgroundLoopSemaphoreMetricsRecorder, TENANT_TASK_EVENTS}; use crate::task_mgr::{self, BACKGROUND_RUNTIME, TOKIO_WORKER_THREADS, TaskKind}; -use crate::tenant::blob_io::WriteBlobError; use crate::tenant::throttle::Stats; use crate::tenant::timeline::CompactionError; use crate::tenant::timeline::compaction::CompactionOutcome; use crate::tenant::{TenantShard, TenantState}; -use crate::virtual_file::owned_buffers_io::write::FlushTaskError; /// Semaphore limiting concurrent background tasks (across all tenants). 
/// @@ -310,45 +307,12 @@ pub(crate) fn log_compaction_error( task_cancelled: bool, degrade_to_warning: bool, ) { - use CompactionError::*; + let is_cancel = err.is_cancel(); - use crate::tenant::PageReconstructError; - use crate::tenant::upload_queue::NotInitialized; - - let level = match err { - e if e.is_cancel() => return, - ShuttingDown => return, - _ if task_cancelled => Level::INFO, - Other(err) => { - let root_cause = err.root_cause(); - - let upload_queue = root_cause - .downcast_ref::() - .is_some_and(|e| e.is_stopping()); - let timeline = root_cause - .downcast_ref::() - .is_some_and(|e| e.is_cancel()); - let buffered_writer_flush_task_canelled = root_cause - .downcast_ref::() - .is_some_and(|e| e.is_cancel()); - let write_blob_cancelled = root_cause - .downcast_ref::() - .is_some_and(|e| e.is_cancel()); - let gate_closed = root_cause - .downcast_ref::() - .is_some_and(|e| e.is_cancel()); - let is_stopping = upload_queue - || timeline - || buffered_writer_flush_task_canelled - || write_blob_cancelled - || gate_closed; - - if is_stopping { - Level::INFO - } else { - Level::ERROR - } - } + let level = if is_cancel || task_cancelled { + Level::INFO + } else { + Level::ERROR }; if let Some((error_count, sleep_duration)) = retry_info { diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index 6088f40669..73d2d72b59 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -1002,7 +1002,7 @@ impl From for tonic::Status { impl From for CompactionError { fn from(e: CreateImageLayersError) -> Self { match e { - CreateImageLayersError::Cancelled => CompactionError::ShuttingDown, + CreateImageLayersError::Cancelled => CompactionError::new_cancelled(), CreateImageLayersError::Other(e) => { CompactionError::Other(e.context("create image layers")) } @@ -1253,6 +1253,57 @@ impl Timeline { } } + #[inline(always)] + pub(crate) async fn debug_get( + &self, + key: Key, + lsn: Lsn, + ctx: &RequestContext, + 
reconstruct_state: &mut ValuesReconstructState, + ) -> Result { + if !lsn.is_valid() { + return Err(PageReconstructError::Other(anyhow::anyhow!("Invalid LSN"))); + } + + // This check is debug-only because of the cost of hashing, and because it's a double-check: we + // already checked the key against the shard_identity when looking up the Timeline from + // page_service. + debug_assert!(!self.shard_identity.is_key_disposable(&key)); + + let query = VersionedKeySpaceQuery::uniform(KeySpace::single(key..key.next()), lsn); + let vectored_res = self + .debug_get_vectored_impl(query, reconstruct_state, ctx) + .await; + + let key_value = vectored_res?.pop_first(); + match key_value { + Some((got_key, value)) => { + if got_key != key { + error!( + "Expected {}, but singular vectored get returned {}", + key, got_key + ); + Err(PageReconstructError::Other(anyhow!( + "Singular vectored get returned wrong key" + ))) + } else { + value + } + } + None => Err(PageReconstructError::MissingKey(Box::new( + MissingKeyError { + keyspace: KeySpace::single(key..key.next()), + shard: self.shard_identity.get_shard_number(&key), + original_hwm_lsn: lsn, + ancestor_lsn: None, + backtrace: None, + read_path: None, + query: None, + }, + ))), + } + } + pub(crate) const LAYERS_VISITED_WARN_THRESHOLD: u32 = 100; /// Look up multiple page versions at a given LSN @@ -1547,6 +1598,98 @@ impl Timeline { Ok(results) } + // A copy of the get_vectored_impl method except that we store the image and wal records into `reconstruct_state`. + // This is only used in the http getpage call for debugging purpose. 
+ pub(super) async fn debug_get_vectored_impl( + &self, + query: VersionedKeySpaceQuery, + reconstruct_state: &mut ValuesReconstructState, + ctx: &RequestContext, + ) -> Result>, GetVectoredError> { + if query.is_empty() { + return Ok(BTreeMap::default()); + } + + let read_path = if self.conf.enable_read_path_debugging || ctx.read_path_debug() { + Some(ReadPath::new( + query.total_keyspace(), + query.high_watermark_lsn()?, + )) + } else { + None + }; + + reconstruct_state.read_path = read_path; + + let traversal_res: Result<(), _> = self + .get_vectored_reconstruct_data(query.clone(), reconstruct_state, ctx) + .await; + + if let Err(err) = traversal_res { + // Wait for all the spawned IOs to complete. + // See comments on `spawn_io` inside `storage_layer` for more details. + let mut collect_futs = std::mem::take(&mut reconstruct_state.keys) + .into_values() + .map(|state| state.collect_pending_ios()) + .collect::>(); + while collect_futs.next().await.is_some() {} + return Err(err); + }; + + let reconstruct_state = Arc::new(Mutex::new(reconstruct_state)); + let futs = FuturesUnordered::new(); + + for (key, state) in std::mem::take(&mut reconstruct_state.lock().unwrap().keys) { + let req_lsn_for_key = query.map_key_to_lsn(&key); + futs.push({ + let walredo_self = self.myself.upgrade().expect("&self method holds the arc"); + let rc_clone = Arc::clone(&reconstruct_state); + + async move { + assert_eq!(state.situation, ValueReconstructSituation::Complete); + + let converted = match state.collect_pending_ios().await { + Ok(ok) => ok, + Err(err) => { + return (key, Err(err)); + } + }; + DELTAS_PER_READ_GLOBAL.observe(converted.num_deltas() as f64); + + // The walredo module expects the records to be descending in terms of Lsn. + // And we submit the IOs in that order, so, there shuold be no need to sort here. 
+ debug_assert!( + converted + .records + .is_sorted_by_key(|(lsn, _)| std::cmp::Reverse(*lsn)), + "{converted:?}" + ); + { + let mut guard = rc_clone.lock().unwrap(); + guard.set_debug_state(&converted); + } + ( + key, + walredo_self + .reconstruct_value( + key, + req_lsn_for_key, + converted, + RedoAttemptType::ReadPage, + ) + .await, + ) + } + }); + } + + let results = futs + .collect::>>() + .await; + + Ok(results) + } + /// Get last or prev record separately. Same as get_last_record_rlsn().last/prev. pub(crate) fn get_last_record_lsn(&self) -> Lsn { self.last_record_lsn.load().last @@ -1893,6 +2036,8 @@ impl Timeline { // an ephemeral layer open forever when idle. It also freezes layers if the global limit on // ephemeral layer bytes has been breached. pub(super) async fn maybe_freeze_ephemeral_layer(&self) { + debug_assert_current_span_has_tenant_and_timeline_id(); + let Ok(mut write_guard) = self.write_lock.try_lock() else { // If the write lock is held, there is an active wal receiver: rolling open layers // is their responsibility while they hold this lock. @@ -2117,12 +2262,7 @@ impl Timeline { match &result { Ok(_) => self.compaction_failed.store(false, AtomicOrdering::Relaxed), Err(e) if e.is_cancel() => {} - Err(CompactionError::ShuttingDown) => { - // Covered by the `Err(e) if e.is_cancel()` branch. 
- } - Err(CompactionError::Other(_)) => { - self.compaction_failed.store(true, AtomicOrdering::Relaxed) - } + Err(_) => self.compaction_failed.store(true, AtomicOrdering::Relaxed), }; result @@ -2851,6 +2991,18 @@ impl Timeline { .unwrap_or(self.conf.default_tenant_conf.image_creation_threshold) } + // HADRON + fn get_image_layer_force_creation_period(&self) -> Option { + let tenant_conf = self.tenant_conf.load(); + tenant_conf + .tenant_conf + .image_layer_force_creation_period + .or(self + .conf + .default_tenant_conf + .image_layer_force_creation_period) + } + fn get_compaction_algorithm_settings(&self) -> CompactionAlgorithmSettings { let tenant_conf = &self.tenant_conf.load(); tenant_conf @@ -3120,7 +3272,6 @@ impl Timeline { repartition_threshold: 0, last_image_layer_creation_check_at: AtomicLsn::new(0), last_image_layer_creation_check_instant: Mutex::new(None), - last_received_wal: Mutex::new(None), rel_size_latest_cache: RwLock::new(HashMap::new()), rel_size_snapshot_cache: Mutex::new(LruCache::new(relsize_snapshot_cache_capacity)), @@ -5041,6 +5192,7 @@ impl Timeline { .create_image_layers( &partitions, self.initdb_lsn, + None, ImageLayerCreationMode::Initial, ctx, LastImageLayerCreationStatus::Initial, @@ -5312,14 +5464,19 @@ impl Timeline { } // Is it time to create a new image layer for the given partition? True if we want to generate. 
- async fn time_for_new_image_layer(&self, partition: &KeySpace, lsn: Lsn) -> bool { + async fn time_for_new_image_layer( + &self, + partition: &KeySpace, + lsn: Lsn, + force_image_creation_lsn: Option, + ) -> bool { let threshold = self.get_image_creation_threshold(); let guard = self.layers.read(LayerManagerLockHolder::Compaction).await; let Ok(layers) = guard.layer_map() else { return false; }; - + let mut min_image_lsn: Lsn = Lsn::MAX; let mut max_deltas = 0; for part_range in &partition.ranges { let image_coverage = layers.image_coverage(part_range, lsn); @@ -5354,9 +5511,25 @@ impl Timeline { return true; } } + min_image_lsn = min(min_image_lsn, img_lsn); } } + // HADRON + // for child timelines, we consider all pages up to ancestor_LSN are redone successfully by the parent timeline + min_image_lsn = min_image_lsn.max(self.get_ancestor_lsn()); + if min_image_lsn < force_image_creation_lsn.unwrap_or(Lsn(0)) && max_deltas > 0 { + info!( + "forcing image creation for partitioned range {}-{}. Min image LSN: {}, force image creation LSN: {}, num deltas: {}", + partition.ranges[0].start, + partition.ranges[0].end, + min_image_lsn, + force_image_creation_lsn.unwrap(), + max_deltas + ); + return true; + } + debug!( max_deltas, "none of the partitioned ranges had >= {threshold} deltas" @@ -5576,13 +5749,14 @@ impl Timeline { /// Predicate function which indicates whether we should check if new image layers /// are required. Since checking if new image layers are required is expensive in /// terms of CPU, we only do it in the following cases: - /// 1. If the timeline has ingested sufficient WAL to justify the cost + /// 1. If the timeline has ingested sufficient WAL to justify the cost or ... /// 2. If enough time has passed since the last check: /// 1. For large tenants, we wish to perform the check more often since they - /// suffer from the lack of image layers + /// suffer from the lack of image layers. 
Note that we assume sharded tenants + /// to be large since non-zero shards do not track the logical size. /// 2. For small tenants (that can mostly fit in RAM), we use a much longer interval fn should_check_if_image_layers_required(self: &Arc, lsn: Lsn) -> bool { - const LARGE_TENANT_THRESHOLD: u64 = 2 * 1024 * 1024 * 1024; + let large_timeline_threshold = self.conf.image_layer_generation_large_timeline_threshold; let last_checks_at = self.last_image_layer_creation_check_at.load(); let distance = lsn @@ -5593,30 +5767,39 @@ impl Timeline { let distance_based_decision = distance.0 >= min_distance; - let mut time_based_decision = false; let mut last_check_instant = self.last_image_layer_creation_check_instant.lock().unwrap(); - if let CurrentLogicalSize::Exact(logical_size) = self.current_logical_size.current_size() { - let check_required_after = if Into::::into(&logical_size) >= LARGE_TENANT_THRESHOLD - { - self.get_checkpoint_timeout() - } else { - Duration::from_secs(3600 * 48) - }; - - time_based_decision = match *last_check_instant { - Some(last_check) => { - let elapsed = last_check.elapsed(); - elapsed >= check_required_after + let check_required_after = (|| { + if self.shard_identity.is_unsharded() { + if let CurrentLogicalSize::Exact(logical_size) = + self.current_logical_size.current_size() + { + if Some(Into::::into(&logical_size)) < large_timeline_threshold { + return Duration::from_secs(3600 * 48); + } } - None => true, - }; - } + } + + self.get_checkpoint_timeout() + })(); + + let time_based_decision = match *last_check_instant { + Some(last_check) => { + let elapsed = last_check.elapsed(); + elapsed >= check_required_after + } + None => true, + }; // Do the expensive delta layer counting only if this timeline has ingested sufficient // WAL since the last check or a checkpoint timeout interval has elapsed since the last // check. let decision = distance_based_decision || time_based_decision; - + tracing::info!( + "Decided to check image layers: {}. 
Distance-based decision: {}, time-based decision: {}", + decision, + distance_based_decision, + time_based_decision + ); if decision { self.last_image_layer_creation_check_at.store(lsn); *last_check_instant = Some(Instant::now()); @@ -5629,10 +5812,12 @@ impl Timeline { /// true = we have generate all image layers, false = we preempt the process for L0 compaction. /// /// `partition_mode` is only for logging purpose and is not used anywhere in this function. + #[allow(clippy::too_many_arguments)] async fn create_image_layers( self: &Arc, partitioning: &KeyPartitioning, lsn: Lsn, + force_image_creation_lsn: Option, mode: ImageLayerCreationMode, ctx: &RequestContext, last_status: LastImageLayerCreationStatus, @@ -5736,7 +5921,11 @@ impl Timeline { } else if let ImageLayerCreationMode::Try = mode { // check_for_image_layers = false -> skip // check_for_image_layers = true -> check time_for_new_image_layer -> skip/generate - if !check_for_image_layers || !self.time_for_new_image_layer(partition, lsn).await { + if !check_for_image_layers + || !self + .time_for_new_image_layer(partition, lsn, force_image_creation_lsn) + .await + { start = img_range.end; continue; } @@ -6057,26 +6246,88 @@ impl Drop for Timeline { } } -/// Top-level failure to compact. -#[derive(Debug, thiserror::Error)] -pub(crate) enum CompactionError { - #[error("The timeline or pageserver is shutting down")] - ShuttingDown, - #[error(transparent)] - Other(anyhow::Error), -} +pub(crate) use compaction_error::CompactionError; +/// In a private mod to enforce that [`CompactionError::is_cancel`] is used +/// instead of `match`ing on [`CompactionError::ShuttingDown`]. +mod compaction_error { + use utils::sync::gate::GateError; -impl CompactionError { - /// Errors that can be ignored, i.e., cancel and shutdown. 
- pub fn is_cancel(&self) -> bool { - matches!(self, Self::ShuttingDown) + use crate::{ + pgdatadir_mapping::CollectKeySpaceError, + tenant::{PageReconstructError, blob_io::WriteBlobError, upload_queue::NotInitialized}, + virtual_file::owned_buffers_io::write::FlushTaskError, + }; + + /// Top-level failure to compact. Use [`Self::is_cancel`]. + #[derive(Debug, thiserror::Error)] + pub(crate) enum CompactionError { + /// Use [`Self::is_cancel`] instead of checking for this variant. + #[error("The timeline or pageserver is shutting down")] + #[allow(private_interfaces)] + ShuttingDown(ForbidMatching), // private ForbidMatching enforces use of [`Self::is_cancel`]. + #[error(transparent)] + Other(anyhow::Error), } - pub fn from_collect_keyspace(err: CollectKeySpaceError) -> Self { - if err.is_cancel() { - Self::ShuttingDown - } else { - Self::Other(err.into_anyhow()) + #[derive(Debug)] + struct ForbidMatching; + + impl CompactionError { + pub fn new_cancelled() -> Self { + Self::ShuttingDown(ForbidMatching) + } + /// Errors that can be ignored, i.e., cancel and shutdown. + pub fn is_cancel(&self) -> bool { + let other = match self { + CompactionError::ShuttingDown(_) => return true, + CompactionError::Other(other) => other, + }; + + // The write path of compaction in particular often lacks differentiated + // handling errors stemming from cancellation from other errors. + // So, if requested, we also check the ::Other variant by downcasting. + // The list below has been found empirically from flaky tests and production logs. + // The process is simple: on ::Other(), compaction will print the enclosed + // anyhow::Error in debug mode, i.e., with backtrace. That backtrace contains the + // line where the write path / compaction code does undifferentiated error handling + // from a non-anyhow type to an anyhow type. Add the type to the list of downcasts + // below, following the same is_cancel() pattern. 
+ + let root_cause = other.root_cause(); + + let upload_queue = root_cause + .downcast_ref::() + .is_some_and(|e| e.is_stopping()); + let timeline = root_cause + .downcast_ref::() + .is_some_and(|e| e.is_cancel()); + let buffered_writer_flush_task_canelled = root_cause + .downcast_ref::() + .is_some_and(|e| e.is_cancel()); + let write_blob_cancelled = root_cause + .downcast_ref::() + .is_some_and(|e| e.is_cancel()); + let gate_closed = root_cause + .downcast_ref::() + .is_some_and(|e| e.is_cancel()); + upload_queue + || timeline + || buffered_writer_flush_task_canelled + || write_blob_cancelled + || gate_closed + } + pub fn into_anyhow(self) -> anyhow::Error { + match self { + CompactionError::ShuttingDown(ForbidMatching) => anyhow::Error::new(self), + CompactionError::Other(e) => e, + } + } + pub fn from_collect_keyspace(err: CollectKeySpaceError) -> Self { + if err.is_cancel() { + Self::new_cancelled() + } else { + Self::Other(err.into_anyhow()) + } } } } @@ -6088,7 +6339,7 @@ impl From for CompactionError { CompactionError::Other(anyhow::anyhow!(value)) } super::upload_queue::NotInitialized::ShuttingDown - | super::upload_queue::NotInitialized::Stopped => CompactionError::ShuttingDown, + | super::upload_queue::NotInitialized::Stopped => CompactionError::new_cancelled(), } } } @@ -6098,7 +6349,7 @@ impl From for CompactionError { match e { super::storage_layer::layer::DownloadError::TimelineShutdown | super::storage_layer::layer::DownloadError::DownloadCancelled => { - CompactionError::ShuttingDown + CompactionError::new_cancelled() } super::storage_layer::layer::DownloadError::ContextAndConfigReallyDeniesDownloads | super::storage_layer::layer::DownloadError::DownloadRequired @@ -6117,14 +6368,14 @@ impl From for CompactionError { impl From for CompactionError { fn from(_: layer_manager::Shutdown) -> Self { - CompactionError::ShuttingDown + CompactionError::new_cancelled() } } impl From for CompactionError { fn from(e: super::storage_layer::errors::PutError) -> 
Self { if e.is_cancel() { - CompactionError::ShuttingDown + CompactionError::new_cancelled() } else { CompactionError::Other(e.into_anyhow()) } @@ -6223,7 +6474,7 @@ impl Timeline { let mut guard = tokio::select! { guard = self.layers.write(LayerManagerLockHolder::Compaction) => guard, _ = self.cancel.cancelled() => { - return Err(CompactionError::ShuttingDown); + return Err(CompactionError::new_cancelled()); } }; @@ -7050,6 +7301,19 @@ impl Timeline { .unwrap() .clone() } + + /* BEGIN_HADRON */ + pub(crate) async fn compute_image_consistent_lsn(&self) -> anyhow::Result { + let guard = self + .layers + .read(LayerManagerLockHolder::ComputeImageConsistentLsn) + .await; + let layer_map = guard.layer_map()?; + let disk_consistent_lsn = self.get_disk_consistent_lsn(); + + Ok(layer_map.compute_image_consistent_lsn(disk_consistent_lsn)) + } + /* END_HADRON */ } impl Timeline { diff --git a/pageserver/src/tenant/timeline/compaction.rs b/pageserver/src/tenant/timeline/compaction.rs index c263df1eb2..aa1aa937b6 100644 --- a/pageserver/src/tenant/timeline/compaction.rs +++ b/pageserver/src/tenant/timeline/compaction.rs @@ -4,6 +4,7 @@ //! //! The old legacy algorithm is implemented directly in `timeline.rs`. +use std::cmp::min; use std::collections::{BinaryHeap, HashMap, HashSet, VecDeque}; use std::ops::{Deref, Range}; use std::sync::Arc; @@ -572,8 +573,8 @@ impl GcCompactionQueue { } match res { Ok(res) => Ok(res), - Err(CompactionError::ShuttingDown) => Err(CompactionError::ShuttingDown), - Err(CompactionError::Other(_)) => { + Err(e) if e.is_cancel() => Err(e), + Err(_) => { // There are some cases where traditional gc might collect some layer // files causing gc-compaction cannot read the full history of the key. // This needs to be resolved in the long-term by improving the compaction @@ -1260,13 +1261,16 @@ impl Timeline { // Is the timeline being deleted? 
if self.is_stopping() { trace!("Dropping out of compaction on timeline shutdown"); - return Err(CompactionError::ShuttingDown); + return Err(CompactionError::new_cancelled()); } let target_file_size = self.get_checkpoint_distance(); // Define partitioning schema if needed + // HADRON + let force_image_creation_lsn = self.get_force_image_creation_lsn(); + // 1. L0 Compact let l0_outcome = { let timer = self.metrics.compact_time_histo.start_timer(); @@ -1274,6 +1278,7 @@ impl Timeline { .compact_level0( target_file_size, options.flags.contains(CompactFlags::ForceL0Compaction), + force_image_creation_lsn, ctx, ) .await?; @@ -1376,6 +1381,7 @@ impl Timeline { .create_image_layers( &partitioning, lsn, + force_image_creation_lsn, mode, &image_ctx, self.last_image_layer_creation_status @@ -1472,6 +1478,41 @@ impl Timeline { Ok(CompactionOutcome::Done) } + /* BEGIN_HADRON */ + // Get the force image creation LSN based on gc_cutoff_lsn. + // Note that this is an estimation and the workload rate may suddenly change. When that happens, + // the force image creation may be too early or too late, but eventually it should be able to catch up. + pub(crate) fn get_force_image_creation_lsn(self: &Arc) -> Option { + let image_creation_period = self.get_image_layer_force_creation_period()?; + let current_lsn = self.get_last_record_lsn(); + let pitr_lsn = self.gc_info.read().unwrap().cutoffs.time?; + let pitr_interval = self.get_pitr_interval(); + if pitr_lsn == Lsn::INVALID || pitr_interval.is_zero() { + tracing::warn!( + "pitr LSN/interval not found, skipping force image creation LSN calculation" + ); + return None; + } + + let delta_lsn = current_lsn.checked_sub(pitr_lsn).unwrap().0 + * image_creation_period.as_secs() + / pitr_interval.as_secs(); + let force_image_creation_lsn = current_lsn.checked_sub(delta_lsn).unwrap_or(Lsn(0)); + + tracing::info!( + "Tenant shard {} computed force_image_creation_lsn: {}. 
Current lsn: {}, image_layer_force_creation_period: {:?}, GC cutoff: {}, PITR interval: {:?}", + self.tenant_shard_id, + force_image_creation_lsn, + current_lsn, + image_creation_period, + pitr_lsn, + pitr_interval + ); + + Some(force_image_creation_lsn) + } + /* END_HADRON */ + /// Check for layers that are elegible to be rewritten: /// - Shard splitting: After a shard split, ancestor layers beyond pitr_interval, so that /// we don't indefinitely retain keys in this shard that aren't needed. @@ -1624,7 +1665,7 @@ impl Timeline { for (i, layer) in layers_to_rewrite.into_iter().enumerate() { if self.cancel.is_cancelled() { - return Err(CompactionError::ShuttingDown); + return Err(CompactionError::new_cancelled()); } info!(layer=%layer, "rewriting layer after shard split: {}/{}", i, total); @@ -1722,7 +1763,7 @@ impl Timeline { Ok(()) => {}, Err(WaitCompletionError::NotInitialized(ni)) => return Err(CompactionError::from(ni)), Err(WaitCompletionError::UploadQueueShutDownOrStopped) => { - return Err(CompactionError::ShuttingDown); + return Err(CompactionError::new_cancelled()); } }, // Don't wait if there's L0 compaction to do. 
We don't need to update the outcome @@ -1801,6 +1842,7 @@ impl Timeline { self: &Arc, target_file_size: u64, force_compaction_ignore_threshold: bool, + force_compaction_lsn: Option, ctx: &RequestContext, ) -> Result { let CompactLevel0Phase1Result { @@ -1821,6 +1863,7 @@ impl Timeline { stats, target_file_size, force_compaction_ignore_threshold, + force_compaction_lsn, &ctx, ) .instrument(phase1_span) @@ -1843,6 +1886,7 @@ impl Timeline { mut stats: CompactLevel0Phase1StatsBuilder, target_file_size: u64, force_compaction_ignore_threshold: bool, + force_compaction_lsn: Option, ctx: &RequestContext, ) -> Result { let begin = tokio::time::Instant::now(); @@ -1872,11 +1916,28 @@ impl Timeline { return Ok(CompactLevel0Phase1Result::default()); } } else { - debug!( - level0_deltas = level0_deltas.len(), - threshold, "too few deltas to compact" - ); - return Ok(CompactLevel0Phase1Result::default()); + // HADRON + let min_lsn = level0_deltas + .iter() + .map(|a| a.get_lsn_range().start) + .reduce(min); + if force_compaction_lsn.is_some() + && min_lsn.is_some() + && min_lsn.unwrap() < force_compaction_lsn.unwrap() + { + info!( + "forcing L0 compaction of {} L0 deltas. 
Min lsn: {}, force compaction lsn: {}", + level0_deltas.len(), + min_lsn.unwrap(), + force_compaction_lsn.unwrap() + ); + } else { + debug!( + level0_deltas = level0_deltas.len(), + threshold, "too few deltas to compact" + ); + return Ok(CompactLevel0Phase1Result::default()); + } } } @@ -1985,7 +2046,7 @@ impl Timeline { let mut all_keys = Vec::new(); for l in deltas_to_compact.iter() { if self.cancel.is_cancelled() { - return Err(CompactionError::ShuttingDown); + return Err(CompactionError::new_cancelled()); } let delta = l.get_as_delta(ctx).await.map_err(CompactionError::Other)?; let keys = delta @@ -2078,7 +2139,7 @@ impl Timeline { stats.read_lock_held_compute_holes_micros = stats.read_lock_held_key_sort_micros.till_now(); if self.cancel.is_cancelled() { - return Err(CompactionError::ShuttingDown); + return Err(CompactionError::new_cancelled()); } stats.read_lock_drop_micros = stats.read_lock_held_compute_holes_micros.till_now(); @@ -2186,7 +2247,7 @@ impl Timeline { // avoid hitting the cancellation token on every key. in benches, we end up // shuffling an order of million keys per layer, this means we'll check it // around tens of times per layer. - return Err(CompactionError::ShuttingDown); + return Err(CompactionError::new_cancelled()); } let same_key = prev_key == Some(key); @@ -2271,7 +2332,7 @@ impl Timeline { if writer.is_none() { if self.cancel.is_cancelled() { // to be somewhat responsive to cancellation, check for each new layer - return Err(CompactionError::ShuttingDown); + return Err(CompactionError::new_cancelled()); } // Create writer if not initiaized yet writer = Some( @@ -2527,7 +2588,7 @@ impl Timeline { // Is the timeline being deleted? if self.is_stopping() { trace!("Dropping out of compaction on timeline shutdown"); - return Err(CompactionError::ShuttingDown); + return Err(CompactionError::new_cancelled()); } let (dense_ks, _sparse_ks) = self @@ -3189,7 +3250,7 @@ impl Timeline { let gc_lock = async { tokio::select! 
{ guard = self.gc_lock.lock() => Ok(guard), - _ = cancel.cancelled() => Err(CompactionError::ShuttingDown), + _ = cancel.cancelled() => Err(CompactionError::new_cancelled()), } }; @@ -3462,7 +3523,7 @@ impl Timeline { } total_layer_size += layer.layer_desc().file_size; if cancel.is_cancelled() { - return Err(CompactionError::ShuttingDown); + return Err(CompactionError::new_cancelled()); } let should_yield = yield_for_l0 && self @@ -3609,7 +3670,7 @@ impl Timeline { } if cancel.is_cancelled() { - return Err(CompactionError::ShuttingDown); + return Err(CompactionError::new_cancelled()); } let should_yield = yield_for_l0 diff --git a/pageserver/src/tenant/timeline/handle.rs b/pageserver/src/tenant/timeline/handle.rs index 33c97287c0..7bca66190f 100644 --- a/pageserver/src/tenant/timeline/handle.rs +++ b/pageserver/src/tenant/timeline/handle.rs @@ -359,14 +359,14 @@ impl Cache { Err(e) => { // Retry on tenant manager error to handle tenant split more gracefully if attempt < GET_MAX_RETRIES { - tracing::warn!( - "Fail to resolve tenant shard in attempt {}: {:?}. 
Retrying...", - attempt, - e - ); tokio::time::sleep(RETRY_BACKOFF).await; continue; } else { + tracing::warn!( + "Failed to resolve tenant shard after {} attempts: {:?}", + GET_MAX_RETRIES, + e + ); return Err(e); } } diff --git a/pageserver/src/tenant/timeline/layer_manager.rs b/pageserver/src/tenant/timeline/layer_manager.rs index 2eccf48579..d8d81a6c91 100644 --- a/pageserver/src/tenant/timeline/layer_manager.rs +++ b/pageserver/src/tenant/timeline/layer_manager.rs @@ -47,6 +47,7 @@ pub(crate) enum LayerManagerLockHolder { ImportPgData, DetachAncestor, Eviction, + ComputeImageConsistentLsn, #[cfg(test)] Testing, } diff --git a/pageserver/src/walredo.rs b/pageserver/src/walredo.rs index c6d3cafe9a..f053c9ed37 100644 --- a/pageserver/src/walredo.rs +++ b/pageserver/src/walredo.rs @@ -147,6 +147,16 @@ pub enum RedoAttemptType { GcCompaction, } +impl std::fmt::Display for RedoAttemptType { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + match self { + RedoAttemptType::ReadPage => write!(f, "read page"), + RedoAttemptType::LegacyCompaction => write!(f, "legacy compaction"), + RedoAttemptType::GcCompaction => write!(f, "gc compaction"), + } + } +} + /// /// Public interface of WAL redo manager /// @@ -199,6 +209,7 @@ impl PostgresRedoManager { self.conf.wal_redo_timeout, pg_version, max_retry_attempts, + redo_attempt_type, ) .await }; @@ -221,6 +232,7 @@ impl PostgresRedoManager { self.conf.wal_redo_timeout, pg_version, max_retry_attempts, + redo_attempt_type, ) .await } @@ -445,6 +457,7 @@ impl PostgresRedoManager { wal_redo_timeout: Duration, pg_version: PgMajorVersion, max_retry_attempts: u32, + redo_attempt_type: RedoAttemptType, ) -> Result { *(self.last_redo_at.lock().unwrap()) = Some(Instant::now()); @@ -485,17 +498,28 @@ impl PostgresRedoManager { ); if let Err(e) = result.as_ref() { - error!( - "error applying {} WAL records {}..{} ({} bytes) to key {key}, from base image with LSN {} to reconstruct page image at LSN {} n_attempts={}: 
{:?}", - records.len(), - records.first().map(|p| p.0).unwrap_or(Lsn(0)), - records.last().map(|p| p.0).unwrap_or(Lsn(0)), - nbytes, - base_img_lsn, - lsn, - n_attempts, - e, - ); + macro_rules! message { + ($level:tt) => { + $level!( + "error applying {} WAL records {}..{} ({} bytes) to key {} during {}, from base image with LSN {} to reconstruct page image at LSN {} n_attempts={}: {:?}", + records.len(), + records.first().map(|p| p.0).unwrap_or(Lsn(0)), + records.last().map(|p| p.0).unwrap_or(Lsn(0)), + nbytes, + key, + redo_attempt_type, + base_img_lsn, + lsn, + n_attempts, + e, + ) + } + } + match redo_attempt_type { + RedoAttemptType::ReadPage => message!(error), + RedoAttemptType::LegacyCompaction => message!(error), + RedoAttemptType::GcCompaction => message!(warn), + } } result.map_err(Error::Other) diff --git a/pgxn/neon/communicator.c b/pgxn/neon/communicator.c index bd53855eab..158b8940a3 100644 --- a/pgxn/neon/communicator.c +++ b/pgxn/neon/communicator.c @@ -421,7 +421,7 @@ check_getpage_response(PrefetchRequest* slot, NeonResponse* resp) { if (resp->tag != T_NeonGetPageResponse && resp->tag != T_NeonErrorResponse) { - neon_shard_log(slot->shard_no, PANIC, "Unexpected prefetch response %d, ring_receive=%ld, ring_flush=%ld, ring_unused=%ld", + neon_shard_log(slot->shard_no, PANIC, "Unexpected prefetch response %d, ring_receive=" UINT64_FORMAT ", ring_flush=" UINT64_FORMAT ", ring_unused=" UINT64_FORMAT "", resp->tag, MyPState->ring_receive, MyPState->ring_flush, MyPState->ring_unused); } if (neon_protocol_version >= 3) @@ -438,7 +438,7 @@ check_getpage_response(PrefetchRequest* slot, NeonResponse* resp) getpage_resp->req.blkno != slot->buftag.blockNum) { NEON_PANIC_CONNECTION_STATE(slot->shard_no, PANIC, - "Receive unexpected getpage response {reqid=%lx,lsn=%X/%08X, since=%X/%08X, rel=%u/%u/%u.%u, block=%u} to get page request {reqid=%lx,lsn=%X/%08X, since=%X/%08X, rel=%u/%u/%u.%u, block=%u}", + "Receive unexpected getpage response {reqid=" 
UINT64_HEX_FORMAT ",lsn=%X/%08X, since=%X/%08X, rel=%u/%u/%u.%u, block=%u} to get page request {reqid=" UINT64_HEX_FORMAT ",lsn=%X/%08X, since=%X/%08X, rel=%u/%u/%u.%u, block=%u}", resp->reqid, LSN_FORMAT_ARGS(resp->lsn), LSN_FORMAT_ARGS(resp->not_modified_since), RelFileInfoFmt(getpage_resp->req.rinfo), getpage_resp->req.forknum, getpage_resp->req.blkno, slot->reqid, LSN_FORMAT_ARGS(slot->request_lsns.request_lsn), LSN_FORMAT_ARGS(slot->request_lsns.not_modified_since), RelFileInfoFmt(rinfo), slot->buftag.forkNum, slot->buftag.blockNum); } @@ -447,7 +447,7 @@ check_getpage_response(PrefetchRequest* slot, NeonResponse* resp) resp->lsn != slot->request_lsns.request_lsn || resp->not_modified_since != slot->request_lsns.not_modified_since) { - elog(WARNING, NEON_TAG "Error message {reqid=%lx,lsn=%X/%08X, since=%X/%08X} doesn't match exists request {reqid=%lx,lsn=%X/%08X, since=%X/%08X}", + elog(WARNING, NEON_TAG "Error message {reqid=" UINT64_HEX_FORMAT ",lsn=%X/%08X, since=%X/%08X} doesn't match exists request {reqid=" UINT64_HEX_FORMAT ",lsn=%X/%08X, since=%X/%08X}", resp->reqid, LSN_FORMAT_ARGS(resp->lsn), LSN_FORMAT_ARGS(resp->not_modified_since), slot->reqid, LSN_FORMAT_ARGS(slot->request_lsns.request_lsn), LSN_FORMAT_ARGS(slot->request_lsns.not_modified_since)); } @@ -496,9 +496,9 @@ communicator_prefetch_pump_state(void) slot->my_ring_index != MyPState->ring_receive) { neon_shard_log(slot->shard_no, PANIC, - "Incorrect prefetch slot state after receive: status=%d response=%p my=%lu receive=%lu", + "Incorrect prefetch slot state after receive: status=%d response=%p my=" UINT64_FORMAT " receive=" UINT64_FORMAT "", slot->status, slot->response, - (long) slot->my_ring_index, (long) MyPState->ring_receive); + slot->my_ring_index, MyPState->ring_receive); } /* update prefetch state */ MyPState->n_responses_buffered += 1; @@ -789,9 +789,9 @@ prefetch_read(PrefetchRequest *slot) slot->my_ring_index != MyPState->ring_receive) { neon_shard_log(slot->shard_no, PANIC, - 
"Incorrect prefetch read: status=%d response=%p my=%lu receive=%lu", + "Incorrect prefetch read: status=%d response=%p my=" UINT64_FORMAT " receive=" UINT64_FORMAT "", slot->status, slot->response, - (long)slot->my_ring_index, (long)MyPState->ring_receive); + slot->my_ring_index, MyPState->ring_receive); } /* @@ -816,9 +816,9 @@ prefetch_read(PrefetchRequest *slot) slot->my_ring_index != MyPState->ring_receive) { neon_shard_log(shard_no, PANIC, - "Incorrect prefetch slot state after receive: status=%d response=%p my=%lu receive=%lu", + "Incorrect prefetch slot state after receive: status=%d response=%p my=" UINT64_FORMAT " receive=" UINT64_FORMAT "", slot->status, slot->response, - (long) slot->my_ring_index, (long) MyPState->ring_receive); + slot->my_ring_index, MyPState->ring_receive); } /* update prefetch state */ @@ -852,8 +852,8 @@ prefetch_read(PrefetchRequest *slot) * and the prefetch queue was flushed during the receive call */ neon_shard_log(shard_no, LOG, - "No response from reading prefetch entry %lu: %u/%u/%u.%u block %u. This can be caused by a concurrent disconnect", - (long) my_ring_index, + "No response from reading prefetch entry " UINT64_FORMAT ": %u/%u/%u.%u block %u. 
This can be caused by a concurrent disconnect", + my_ring_index, RelFileInfoFmt(BufTagGetNRelFileInfo(buftag)), buftag.forkNum, buftag.blockNum); return false; @@ -1844,7 +1844,7 @@ nm_to_string(NeonMessage *msg) NeonDbSizeResponse *msg_resp = (NeonDbSizeResponse *) msg; appendStringInfoString(&s, "{\"type\": \"NeonDbSizeResponse\""); - appendStringInfo(&s, ", \"db_size\": %ld}", + appendStringInfo(&s, ", \"db_size\": " INT64_FORMAT "}", msg_resp->db_size); appendStringInfoChar(&s, '}'); @@ -2045,7 +2045,7 @@ communicator_exists(NRelFileInfo rinfo, ForkNumber forkNum, neon_request_lsns *r exists_resp->req.forknum != request.forknum) { NEON_PANIC_CONNECTION_STATE(0, PANIC, - "Unexpect response {reqid=%lx,lsn=%X/%08X, since=%X/%08X, rel=%u/%u/%u.%u} to exits request {reqid=%lx,lsn=%X/%08X, since=%X/%08X, rel=%u/%u/%u.%u}", + "Unexpect response {reqid=" UINT64_HEX_FORMAT ",lsn=%X/%08X, since=%X/%08X, rel=%u/%u/%u.%u} to exits request {reqid=" UINT64_HEX_FORMAT ",lsn=%X/%08X, since=%X/%08X, rel=%u/%u/%u.%u}", resp->reqid, LSN_FORMAT_ARGS(resp->lsn), LSN_FORMAT_ARGS(resp->not_modified_since), RelFileInfoFmt(exists_resp->req.rinfo), exists_resp->req.forknum, request.hdr.reqid, LSN_FORMAT_ARGS(request.hdr.lsn), LSN_FORMAT_ARGS(request.hdr.not_modified_since), RelFileInfoFmt(request.rinfo), request.forknum); } @@ -2058,14 +2058,14 @@ communicator_exists(NRelFileInfo rinfo, ForkNumber forkNum, neon_request_lsns *r { if (!equal_requests(resp, &request.hdr)) { - elog(WARNING, NEON_TAG "Error message {reqid=%lx,lsn=%X/%08X, since=%X/%08X} doesn't match exists request {reqid=%lx,lsn=%X/%08X, since=%X/%08X}", + elog(WARNING, NEON_TAG "Error message {reqid=" UINT64_HEX_FORMAT ",lsn=%X/%08X, since=%X/%08X} doesn't match exists request {reqid=" UINT64_HEX_FORMAT ",lsn=%X/%08X, since=%X/%08X}", resp->reqid, LSN_FORMAT_ARGS(resp->lsn), LSN_FORMAT_ARGS(resp->not_modified_since), request.hdr.reqid, LSN_FORMAT_ARGS(request.hdr.lsn), LSN_FORMAT_ARGS(request.hdr.not_modified_since)); } } 
ereport(ERROR, (errcode(ERRCODE_IO_ERROR), - errmsg(NEON_TAG "[reqid %lx] could not read relation existence of rel %u/%u/%u.%u from page server at lsn %X/%08X", + errmsg(NEON_TAG "[reqid " UINT64_HEX_FORMAT "] could not read relation existence of rel %u/%u/%u.%u from page server at lsn %X/%08X", resp->reqid, RelFileInfoFmt(rinfo), forkNum, @@ -2241,7 +2241,7 @@ Retry: case T_NeonErrorResponse: ereport(ERROR, (errcode(ERRCODE_IO_ERROR), - errmsg(NEON_TAG "[shard %d, reqid %lx] could not read block %u in rel %u/%u/%u.%u from page server at lsn %X/%08X", + errmsg(NEON_TAG "[shard %d, reqid " UINT64_HEX_FORMAT "] could not read block %u in rel %u/%u/%u.%u from page server at lsn %X/%08X", slot->shard_no, resp->reqid, blockno, RelFileInfoFmt(rinfo), forkNum, LSN_FORMAT_ARGS(reqlsns->effective_request_lsn)), errdetail("page server returned error: %s", @@ -2294,7 +2294,7 @@ communicator_nblocks(NRelFileInfo rinfo, ForkNumber forknum, neon_request_lsns * relsize_resp->req.forknum != forknum) { NEON_PANIC_CONNECTION_STATE(0, PANIC, - "Unexpect response {reqid=%lx,lsn=%X/%08X, since=%X/%08X, rel=%u/%u/%u.%u} to get relsize request {reqid=%lx,lsn=%X/%08X, since=%X/%08X, rel=%u/%u/%u.%u}", + "Unexpect response {reqid=" UINT64_HEX_FORMAT ",lsn=%X/%08X, since=%X/%08X, rel=%u/%u/%u.%u} to get relsize request {reqid=" UINT64_HEX_FORMAT ",lsn=%X/%08X, since=%X/%08X, rel=%u/%u/%u.%u}", resp->reqid, LSN_FORMAT_ARGS(resp->lsn), LSN_FORMAT_ARGS(resp->not_modified_since), RelFileInfoFmt(relsize_resp->req.rinfo), relsize_resp->req.forknum, request.hdr.reqid, LSN_FORMAT_ARGS(request.hdr.lsn), LSN_FORMAT_ARGS(request.hdr.not_modified_since), RelFileInfoFmt(request.rinfo), forknum); } @@ -2307,14 +2307,14 @@ communicator_nblocks(NRelFileInfo rinfo, ForkNumber forknum, neon_request_lsns * { if (!equal_requests(resp, &request.hdr)) { - elog(WARNING, NEON_TAG "Error message {reqid=%lx,lsn=%X/%08X, since=%X/%08X} doesn't match get relsize request {reqid=%lx,lsn=%X/%08X, since=%X/%08X}", + 
elog(WARNING, NEON_TAG "Error message {reqid=" UINT64_HEX_FORMAT ",lsn=%X/%08X, since=%X/%08X} doesn't match get relsize request {reqid=" UINT64_HEX_FORMAT ",lsn=%X/%08X, since=%X/%08X}", resp->reqid, LSN_FORMAT_ARGS(resp->lsn), LSN_FORMAT_ARGS(resp->not_modified_since), request.hdr.reqid, LSN_FORMAT_ARGS(request.hdr.lsn), LSN_FORMAT_ARGS(request.hdr.not_modified_since)); } } ereport(ERROR, (errcode(ERRCODE_IO_ERROR), - errmsg(NEON_TAG "[reqid %lx] could not read relation size of rel %u/%u/%u.%u from page server at lsn %X/%08X", + errmsg(NEON_TAG "[reqid " UINT64_HEX_FORMAT "] could not read relation size of rel %u/%u/%u.%u from page server at lsn %X/%08X", resp->reqid, RelFileInfoFmt(rinfo), forknum, @@ -2364,7 +2364,7 @@ communicator_dbsize(Oid dbNode, neon_request_lsns *request_lsns) dbsize_resp->req.dbNode != dbNode) { NEON_PANIC_CONNECTION_STATE(0, PANIC, - "Unexpect response {reqid=%lx,lsn=%X/%08X, since=%X/%08X, dbNode=%u} to get DB size request {reqid=%lx,lsn=%X/%08X, since=%X/%08X, dbNode=%u}", + "Unexpect response {reqid=" UINT64_HEX_FORMAT ",lsn=%X/%08X, since=%X/%08X, dbNode=%u} to get DB size request {reqid=" UINT64_HEX_FORMAT ",lsn=%X/%08X, since=%X/%08X, dbNode=%u}", resp->reqid, LSN_FORMAT_ARGS(resp->lsn), LSN_FORMAT_ARGS(resp->not_modified_since), dbsize_resp->req.dbNode, request.hdr.reqid, LSN_FORMAT_ARGS(request.hdr.lsn), LSN_FORMAT_ARGS(request.hdr.not_modified_since), dbNode); } @@ -2377,14 +2377,14 @@ communicator_dbsize(Oid dbNode, neon_request_lsns *request_lsns) { if (!equal_requests(resp, &request.hdr)) { - elog(WARNING, NEON_TAG "Error message {reqid=%lx,lsn=%X/%08X, since=%X/%08X} doesn't match get DB size request {reqid=%lx,lsn=%X/%08X, since=%X/%08X}", + elog(WARNING, NEON_TAG "Error message {reqid=" UINT64_HEX_FORMAT ",lsn=%X/%08X, since=%X/%08X} doesn't match get DB size request {reqid=" UINT64_HEX_FORMAT ",lsn=%X/%08X, since=%X/%08X}", resp->reqid, LSN_FORMAT_ARGS(resp->lsn), LSN_FORMAT_ARGS(resp->not_modified_since), 
request.hdr.reqid, LSN_FORMAT_ARGS(request.hdr.lsn), LSN_FORMAT_ARGS(request.hdr.not_modified_since)); } } ereport(ERROR, (errcode(ERRCODE_IO_ERROR), - errmsg(NEON_TAG "[reqid %lx] could not read db size of db %u from page server at lsn %X/%08X", + errmsg(NEON_TAG "[reqid " UINT64_HEX_FORMAT "] could not read db size of db %u from page server at lsn %X/%08X", resp->reqid, dbNode, LSN_FORMAT_ARGS(request_lsns->effective_request_lsn)), errdetail("page server returned error: %s", @@ -2455,7 +2455,7 @@ communicator_read_slru_segment(SlruKind kind, int64 segno, neon_request_lsns *re slru_resp->req.segno != segno) { NEON_PANIC_CONNECTION_STATE(0, PANIC, - "Unexpect response {reqid=%lx,lsn=%X/%08X, since=%X/%08X, kind=%u, segno=%u} to get SLRU segment request {reqid=%lx,lsn=%X/%08X, since=%X/%08X, kind=%u, segno=%lluu}", + "Unexpect response {reqid=" UINT64_HEX_FORMAT ",lsn=%X/%08X, since=%X/%08X, kind=%u, segno=%u} to get SLRU segment request {reqid=" UINT64_HEX_FORMAT ",lsn=%X/%08X, since=%X/%08X, kind=%u, segno=%lluu}", resp->reqid, LSN_FORMAT_ARGS(resp->lsn), LSN_FORMAT_ARGS(resp->not_modified_since), slru_resp->req.kind, slru_resp->req.segno, request.hdr.reqid, LSN_FORMAT_ARGS(request.hdr.lsn), LSN_FORMAT_ARGS(request.hdr.not_modified_since), kind, (unsigned long long) segno); } @@ -2469,14 +2469,14 @@ communicator_read_slru_segment(SlruKind kind, int64 segno, neon_request_lsns *re { if (!equal_requests(resp, &request.hdr)) { - elog(WARNING, NEON_TAG "Error message {reqid=%lx,lsn=%X/%08X, since=%X/%08X} doesn't match get SLRU segment request {reqid=%lx,lsn=%X/%08X, since=%X/%08X}", + elog(WARNING, NEON_TAG "Error message {reqid=" UINT64_HEX_FORMAT ",lsn=%X/%08X, since=%X/%08X} doesn't match get SLRU segment request {reqid=" UINT64_HEX_FORMAT ",lsn=%X/%08X, since=%X/%08X}", resp->reqid, LSN_FORMAT_ARGS(resp->lsn), LSN_FORMAT_ARGS(resp->not_modified_since), request.hdr.reqid, LSN_FORMAT_ARGS(request.hdr.lsn), LSN_FORMAT_ARGS(request.hdr.not_modified_since)); } } 
ereport(ERROR, (errcode(ERRCODE_IO_ERROR), - errmsg(NEON_TAG "[reqid %lx] could not read SLRU %d segment %llu at lsn %X/%08X", + errmsg(NEON_TAG "[reqid " UINT64_HEX_FORMAT "] could not read SLRU %d segment %llu at lsn %X/%08X", resp->reqid, kind, (unsigned long long) segno, diff --git a/pgxn/neon/file_cache.c b/pgxn/neon/file_cache.c index 8cfa09bc87..2c87f139af 100644 --- a/pgxn/neon/file_cache.c +++ b/pgxn/neon/file_cache.c @@ -162,8 +162,34 @@ typedef struct FileCacheControl dlist_head lru; /* double linked list for LRU replacement * algorithm */ dlist_head holes; /* double linked list of punched holes */ - HyperLogLogState wss_estimation; /* estimation of working set size */ + ConditionVariable cv[N_COND_VARS]; /* turnstile of condition variables */ + + /* + * Estimation of working set size. + * + * This is not guarded by the lock. No locking is needed because all the + * writes to the "registers" are simple 64-bit stores, to update a + * timestamp. We assume that: + * + * - 64-bit stores are atomic. We could enforce that by using + * pg_atomic_uint64 instead of TimestampTz as the datatype in hll.h, but + * for now we just rely on it implicitly. + * + * - Even if they're not, and there is a race between two stores, it + * doesn't matter much which one wins because they're both updating the + * register with the current timestamp. Or you have a race between + * resetting the register and updating it, in which case it also doesn't + * matter much which one wins. + * + * - If they're not atomic, you might get an occasional "torn write" if + * you're really unlucky, but we tolerate that too. It just means that + * the estimate will be a little off, until the register is updated + * again. 
+ */ + HyperLogLogState wss_estimation; + + /* Prewarmer state */ PrewarmWorkerState prewarm_workers[MAX_PREWARM_WORKERS]; size_t n_prewarm_workers; size_t n_prewarm_entries; @@ -205,6 +231,8 @@ bool AmPrewarmWorker; #define LFC_ENABLED() (lfc_ctl->limit != 0) +PGDLLEXPORT void lfc_prewarm_main(Datum main_arg); + /* * Close LFC file if opened. * All backends should close their LFC files once LFC is disabled. @@ -1142,6 +1170,13 @@ lfc_readv_select(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno, CriticalAssert(BufTagGetRelNumber(&tag) != InvalidRelFileNumber); + /* Update working set size estimate for the blocks */ + for (int i = 0; i < nblocks; i++) + { + tag.blockNum = blkno + i; + addSHLL(&lfc_ctl->wss_estimation, hash_bytes((uint8_t const*)&tag, sizeof(tag))); + } + /* * For every chunk that has blocks we're interested in, we * 1. get the chunk header @@ -1220,14 +1255,6 @@ lfc_readv_select(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno, } entry = hash_search_with_hash_value(lfc_hash, &tag, hash, HASH_FIND, NULL); - - /* Approximate working set for the blocks assumed in this entry */ - for (int i = 0; i < blocks_in_chunk; i++) - { - tag.blockNum = blkno + i; - addSHLL(&lfc_ctl->wss_estimation, hash_bytes((uint8_t const*)&tag, sizeof(tag))); - } - if (entry == NULL) { /* Pages are not cached */ @@ -1504,9 +1531,15 @@ lfc_prefetch(NRelFileInfo rinfo, ForkNumber forknum, BlockNumber blkno, return false; CopyNRelFileInfoToBufTag(tag, rinfo); + CriticalAssert(BufTagGetRelNumber(&tag) != InvalidRelFileNumber); tag.forkNum = forknum; - CriticalAssert(BufTagGetRelNumber(&tag) != InvalidRelFileNumber); + /* Update working set size estimate for the blocks */ + if (lfc_prewarm_update_ws_estimation) + { + tag.blockNum = blkno; + addSHLL(&lfc_ctl->wss_estimation, hash_bytes((uint8_t const*)&tag, sizeof(tag))); + } tag.blockNum = blkno - chunk_offs; hash = get_hash_value(lfc_hash, &tag); @@ -1524,19 +1557,13 @@ lfc_prefetch(NRelFileInfo rinfo, ForkNumber 
forknum, BlockNumber blkno, if (lwlsn > lsn) { - elog(DEBUG1, "Skip LFC write for %d because LwLSN=%X/%X is greater than not_nodified_since LSN %X/%X", + elog(DEBUG1, "Skip LFC write for %u because LwLSN=%X/%X is greater than not_nodified_since LSN %X/%X", blkno, LSN_FORMAT_ARGS(lwlsn), LSN_FORMAT_ARGS(lsn)); LWLockRelease(lfc_lock); return false; } entry = hash_search_with_hash_value(lfc_hash, &tag, hash, HASH_ENTER, &found); - - if (lfc_prewarm_update_ws_estimation) - { - tag.blockNum = blkno; - addSHLL(&lfc_ctl->wss_estimation, hash_bytes((uint8_t const*)&tag, sizeof(tag))); - } if (found) { state = GET_STATE(entry, chunk_offs); @@ -1649,9 +1676,15 @@ lfc_writev(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno, return; CopyNRelFileInfoToBufTag(tag, rinfo); + CriticalAssert(BufTagGetRelNumber(&tag) != InvalidRelFileNumber); tag.forkNum = forkNum; - CriticalAssert(BufTagGetRelNumber(&tag) != InvalidRelFileNumber); + /* Update working set size estimate for the blocks */ + for (int i = 0; i < nblocks; i++) + { + tag.blockNum = blkno + i; + addSHLL(&lfc_ctl->wss_estimation, hash_bytes((uint8_t const*)&tag, sizeof(tag))); + } LWLockAcquire(lfc_lock, LW_EXCLUSIVE); @@ -1692,14 +1725,6 @@ lfc_writev(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno, cv = &lfc_ctl->cv[hash % N_COND_VARS]; entry = hash_search_with_hash_value(lfc_hash, &tag, hash, HASH_ENTER, &found); - - /* Approximate working set for the blocks assumed in this entry */ - for (int i = 0; i < blocks_in_chunk; i++) - { - tag.blockNum = blkno + i; - addSHLL(&lfc_ctl->wss_estimation, hash_bytes((uint8_t const*)&tag, sizeof(tag))); - } - if (found) { /* @@ -2135,40 +2160,23 @@ local_cache_pages(PG_FUNCTION_ARGS) SRF_RETURN_DONE(funcctx); } -PG_FUNCTION_INFO_V1(approximate_working_set_size_seconds); -Datum -approximate_working_set_size_seconds(PG_FUNCTION_ARGS) +/* + * Internal implementation of the approximate_working_set_size_seconds() + * function. 
+ */ +int32 +lfc_approximate_working_set_size_seconds(time_t duration, bool reset) { - if (lfc_size_limit != 0) - { - int32 dc; - time_t duration = PG_ARGISNULL(0) ? (time_t)-1 : PG_GETARG_INT32(0); - LWLockAcquire(lfc_lock, LW_SHARED); - dc = (int32) estimateSHLL(&lfc_ctl->wss_estimation, duration); - LWLockRelease(lfc_lock); - PG_RETURN_INT32(dc); - } - PG_RETURN_NULL(); -} + int32 dc; -PG_FUNCTION_INFO_V1(approximate_working_set_size); + if (lfc_size_limit == 0) + return -1; -Datum -approximate_working_set_size(PG_FUNCTION_ARGS) -{ - if (lfc_size_limit != 0) - { - int32 dc; - bool reset = PG_GETARG_BOOL(0); - LWLockAcquire(lfc_lock, reset ? LW_EXCLUSIVE : LW_SHARED); - dc = (int32) estimateSHLL(&lfc_ctl->wss_estimation, (time_t)-1); - if (reset) - memset(lfc_ctl->wss_estimation.regs, 0, sizeof lfc_ctl->wss_estimation.regs); - LWLockRelease(lfc_lock); - PG_RETURN_INT32(dc); - } - PG_RETURN_NULL(); + dc = (int32) estimateSHLL(&lfc_ctl->wss_estimation, duration); + if (reset) + memset(lfc_ctl->wss_estimation.regs, 0, sizeof lfc_ctl->wss_estimation.regs); + return dc; } PG_FUNCTION_INFO_V1(get_local_cache_state); diff --git a/pgxn/neon/file_cache.h b/pgxn/neon/file_cache.h index d5ac55d5ba..14e5d4f753 100644 --- a/pgxn/neon/file_cache.h +++ b/pgxn/neon/file_cache.h @@ -47,7 +47,8 @@ extern bool lfc_prefetch(NRelFileInfo rinfo, ForkNumber forknum, BlockNumber blk extern FileCacheState* lfc_get_state(size_t max_entries); extern void lfc_prewarm(FileCacheState* fcs, uint32 n_workers); -PGDLLEXPORT void lfc_prewarm_main(Datum main_arg); +extern int32 lfc_approximate_working_set_size_seconds(time_t duration, bool reset); + static inline bool lfc_read(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno, diff --git a/pgxn/neon/libpagestore.c b/pgxn/neon/libpagestore.c index 3b6c4247c3..05ba6da663 100644 --- a/pgxn/neon/libpagestore.c +++ b/pgxn/neon/libpagestore.c @@ -1410,7 +1410,7 @@ pg_init_libpagestore(void) "sharding stripe size", NULL, &stripe_size, - 32768, 1, 
INT_MAX, + 2048, 1, INT_MAX, PGC_SIGHUP, GUC_UNIT_BLOCKS, NULL, NULL, NULL); diff --git a/pgxn/neon/neon.c b/pgxn/neon/neon.c index 9e0ca16fed..7b749f1080 100644 --- a/pgxn/neon/neon.c +++ b/pgxn/neon/neon.c @@ -561,6 +561,8 @@ _PG_init(void) PG_FUNCTION_INFO_V1(pg_cluster_size); PG_FUNCTION_INFO_V1(backpressure_lsns); PG_FUNCTION_INFO_V1(backpressure_throttling_time); +PG_FUNCTION_INFO_V1(approximate_working_set_size_seconds); +PG_FUNCTION_INFO_V1(approximate_working_set_size); Datum pg_cluster_size(PG_FUNCTION_ARGS) @@ -607,6 +609,34 @@ backpressure_throttling_time(PG_FUNCTION_ARGS) PG_RETURN_UINT64(BackpressureThrottlingTime()); } +Datum +approximate_working_set_size_seconds(PG_FUNCTION_ARGS) +{ + time_t duration; + int32 dc; + + duration = PG_ARGISNULL(0) ? (time_t) -1 : PG_GETARG_INT32(0); + + dc = lfc_approximate_working_set_size_seconds(duration, false); + if (dc < 0) + PG_RETURN_NULL(); + else + PG_RETURN_INT32(dc); +} + +Datum +approximate_working_set_size(PG_FUNCTION_ARGS) +{ + bool reset = PG_GETARG_BOOL(0); + int32 dc; + + dc = lfc_approximate_working_set_size_seconds(-1, reset); + if (dc < 0) + PG_RETURN_NULL(); + else + PG_RETURN_INT32(dc); +} + #if PG_MAJORVERSION_NUM >= 16 static void neon_shmem_startup_hook(void) diff --git a/pgxn/neon/neon_ddl_handler.c b/pgxn/neon/neon_ddl_handler.c index 2ce7b0086b..1f03e52c67 100644 --- a/pgxn/neon/neon_ddl_handler.c +++ b/pgxn/neon/neon_ddl_handler.c @@ -953,7 +953,9 @@ neon_fmgr_hook(FmgrHookEventType event, FmgrInfo *flinfo, Datum *private) /* * Fire Event Trigger if both function owner and current user are - * superuser, or none of them are. + * superuser. Allow executing Event Trigger function that belongs to a + * superuser when connected as a non-superuser, even when the function is + * SECURITY DEFINER. 
*/ else if (event == FHET_START /* still enable it to pass pg_regress tests */ @@ -976,32 +978,7 @@ neon_fmgr_hook(FmgrHookEventType event, FmgrInfo *flinfo, Datum *private) function_is_owned_by_super = superuser_arg(function_owner); /* - * 1. Refuse to run SECURITY DEFINER function that belongs to a - * superuser when the current user is not a superuser itself. - */ - if (!role_is_super - && function_is_owned_by_super - && function_is_secdef) - { - char *func_name = get_func_name(flinfo->fn_oid); - - ereport(WARNING, - (errmsg("Skipping Event Trigger"), - errdetail("Event Trigger function \"%s\" is owned by \"%s\" " - "and is SECURITY DEFINER", - func_name, - GetUserNameFromId(function_owner, false)))); - - /* - * we can't skip execution directly inside the fmgr_hook so - * instead we change the event trigger function to a noop - * function. - */ - force_noop(flinfo); - } - - /* - * 2. Refuse to run functions that belongs to a non-superuser when the + * Refuse to run functions that belongs to a non-superuser when the * current user is a superuser. * * We could run a SECURITY DEFINER user-function here and be safe with @@ -1009,7 +986,7 @@ neon_fmgr_hook(FmgrHookEventType event, FmgrInfo *flinfo, Datum *private) * infrastructure maintenance operations, where we prefer to skip * running user-defined code. 
*/ - else if (role_is_super && !function_is_owned_by_super) + if (role_is_super && !function_is_owned_by_super) { char *func_name = get_func_name(flinfo->fn_oid); diff --git a/pgxn/neon/neon_pgversioncompat.h b/pgxn/neon/neon_pgversioncompat.h index 787bd552f8..c7574ef0f9 100644 --- a/pgxn/neon/neon_pgversioncompat.h +++ b/pgxn/neon/neon_pgversioncompat.h @@ -165,4 +165,8 @@ extern void InitMaterializedSRF(FunctionCallInfo fcinfo, bits32 flags); extern TimeLineID GetWALInsertionTimeLine(void); #endif +/* format codes not present in PG17-; but available in PG18+ */ +#define INT64_HEX_FORMAT "%" INT64_MODIFIER "x" +#define UINT64_HEX_FORMAT "%" INT64_MODIFIER "x" + #endif /* NEON_PGVERSIONCOMPAT_H */ diff --git a/pgxn/neon/walproposer.h b/pgxn/neon/walproposer.h index 4b223b6b18..e3a4022664 100644 --- a/pgxn/neon/walproposer.h +++ b/pgxn/neon/walproposer.h @@ -376,6 +376,18 @@ typedef struct PageserverFeedback uint32 shard_number; } PageserverFeedback; +/* BEGIN_HADRON */ +typedef struct WalRateLimiter +{ + /* If the value is 1, PG backends will hit backpressure. */ + pg_atomic_uint32 should_limit; + /* The number of bytes sent in the current second. */ + uint64 sent_bytes; + /* The last recorded time in microsecond. 
*/ + TimestampTz last_recorded_time_us; +} WalRateLimiter; +/* END_HADRON */ + typedef struct WalproposerShmemState { pg_atomic_uint64 propEpochStartLsn; @@ -395,6 +407,11 @@ typedef struct WalproposerShmemState /* aggregated feedback with min LSNs across shards */ PageserverFeedback min_ps_feedback; + + /* BEGIN_HADRON */ + /* The WAL rate limiter */ + WalRateLimiter wal_rate_limiter; + /* END_HADRON */ } WalproposerShmemState; /* diff --git a/pgxn/neon/walproposer_pg.c b/pgxn/neon/walproposer_pg.c index 185fc83ace..aaf8f43eeb 100644 --- a/pgxn/neon/walproposer_pg.c +++ b/pgxn/neon/walproposer_pg.c @@ -66,6 +66,9 @@ int wal_acceptor_reconnect_timeout = 1000; int wal_acceptor_connection_timeout = 10000; int safekeeper_proto_version = 3; char *safekeeper_conninfo_options = ""; +/* BEGIN_HADRON */ +int databricks_max_wal_mb_per_second = -1; +/* END_HADRON */ /* Set to true in the walproposer bgw. */ static bool am_walproposer; @@ -252,6 +255,18 @@ nwp_register_gucs(void) PGC_POSTMASTER, 0, NULL, NULL, NULL); + + /* BEGIN_HADRON */ + DefineCustomIntVariable( + "databricks.max_wal_mb_per_second", + "The maximum WAL MB per second allowed. If breached, sending WAL hit the backpressure. 
Setting to -1 disables the limit.", + NULL, + &databricks_max_wal_mb_per_second, + -1, -1, INT_MAX, + PGC_SUSET, + GUC_UNIT_MB, + NULL, NULL, NULL); + /* END_HADRON */ } @@ -393,6 +408,7 @@ assign_neon_safekeepers(const char *newval, void *extra) static uint64 backpressure_lag_impl(void) { + struct WalproposerShmemState* state = NULL; if (max_replication_apply_lag > 0 || max_replication_flush_lag > 0 || max_replication_write_lag > 0) { XLogRecPtr writePtr; @@ -426,6 +442,18 @@ backpressure_lag_impl(void) return (myFlushLsn - applyPtr - max_replication_apply_lag * MB); } } + + /* BEGIN_HADRON */ + if (databricks_max_wal_mb_per_second == -1) { + return 0; + } + + state = GetWalpropShmemState(); + if (state != NULL && pg_atomic_read_u32(&state->wal_rate_limiter.should_limit) == 1) + { + return 1; + } + /* END_HADRON */ return 0; } @@ -472,6 +500,9 @@ WalproposerShmemInit(void) pg_atomic_init_u64(&walprop_shared->mineLastElectedTerm, 0); pg_atomic_init_u64(&walprop_shared->backpressureThrottlingTime, 0); pg_atomic_init_u64(&walprop_shared->currentClusterSize, 0); + /* BEGIN_HADRON */ + pg_atomic_init_u32(&walprop_shared->wal_rate_limiter.should_limit, 0); + /* END_HADRON */ } LWLockRelease(AddinShmemInitLock); @@ -487,6 +518,9 @@ WalproposerShmemInit_SyncSafekeeper(void) pg_atomic_init_u64(&walprop_shared->propEpochStartLsn, 0); pg_atomic_init_u64(&walprop_shared->mineLastElectedTerm, 0); pg_atomic_init_u64(&walprop_shared->backpressureThrottlingTime, 0); + /* BEGIN_HADRON */ + pg_atomic_init_u32(&walprop_shared->wal_rate_limiter.should_limit, 0); + /* END_HADRON */ } #define BACK_PRESSURE_DELAY 10000L // 0.01 sec @@ -521,7 +555,6 @@ backpressure_throttling_impl(void) if (lag == 0) return retry; - old_status = get_ps_display(&len); new_status = (char *) palloc(len + 64 + 1); memcpy(new_status, old_status, len); @@ -1458,6 +1491,8 @@ XLogBroadcastWalProposer(WalProposer *wp) { XLogRecPtr startptr; XLogRecPtr endptr; + struct WalproposerShmemState *state = NULL; + 
TimestampTz now = 0; /* Start from the last sent position */ startptr = sentPtr; @@ -1502,13 +1537,36 @@ XLogBroadcastWalProposer(WalProposer *wp) * that arbitrary LSN is eventually reported as written, flushed and * applied, so that it can measure the elapsed time. */ - LagTrackerWrite(endptr, GetCurrentTimestamp()); + now = GetCurrentTimestamp(); + LagTrackerWrite(endptr, now); /* Do we have any work to do? */ Assert(startptr <= endptr); if (endptr <= startptr) return; + /* BEGIN_HADRON */ + state = GetWalpropShmemState(); + if (databricks_max_wal_mb_per_second != -1 && state != NULL) + { + uint64 max_wal_bytes = (uint64) databricks_max_wal_mb_per_second * 1024 * 1024; + struct WalRateLimiter *limiter = &state->wal_rate_limiter; + + if (now - limiter->last_recorded_time_us > USECS_PER_SEC) + { + /* Reset the rate limiter */ + limiter->last_recorded_time_us = now; + limiter->sent_bytes = 0; + pg_atomic_exchange_u32(&limiter->should_limit, 0); + } + limiter->sent_bytes += (endptr - startptr); + if (limiter->sent_bytes > max_wal_bytes) + { + pg_atomic_exchange_u32(&limiter->should_limit, 1); + } + } + /* END_HADRON */ + WalProposerBroadcast(wp, startptr, endptr); sentPtr = endptr; diff --git a/pgxn/neon_test_utils/neontest.c b/pgxn/neon_test_utils/neontest.c index d37412f674..5f880dfd23 100644 --- a/pgxn/neon_test_utils/neontest.c +++ b/pgxn/neon_test_utils/neontest.c @@ -236,13 +236,13 @@ clear_buffer_cache(PG_FUNCTION_ARGS) bool save_neon_test_evict; /* - * Temporarily set the zenith_test_evict GUC, so that when we pin and + * Temporarily set the neon_test_evict GUC, so that when we pin and * unpin a buffer, the buffer is evicted. We use that hack to evict all * buffers, as there is no explicit "evict this buffer" function in the * buffer manager. 
*/ - save_neon_test_evict = zenith_test_evict; - zenith_test_evict = true; + save_neon_test_evict = neon_test_evict; + neon_test_evict = true; PG_TRY(); { /* Scan through all the buffers */ @@ -273,7 +273,7 @@ clear_buffer_cache(PG_FUNCTION_ARGS) /* * Pin the buffer, and release it again. Because we have - * zenith_test_evict==true, this will evict the page from the + * neon_test_evict==true, this will evict the page from the * buffer cache if no one else is holding a pin on it. */ if (isvalid) @@ -286,7 +286,7 @@ clear_buffer_cache(PG_FUNCTION_ARGS) PG_FINALLY(); { /* restore the GUC */ - zenith_test_evict = save_neon_test_evict; + neon_test_evict = save_neon_test_evict; } PG_END_TRY(); diff --git a/pgxn/typedefs.list b/pgxn/typedefs.list index 760f384212..3ea8b3b091 100644 --- a/pgxn/typedefs.list +++ b/pgxn/typedefs.list @@ -2953,17 +2953,17 @@ XmlTableBuilderData YYLTYPE YYSTYPE YY_BUFFER_STATE -ZenithErrorResponse -ZenithExistsRequest -ZenithExistsResponse -ZenithGetPageRequest -ZenithGetPageResponse -ZenithMessage -ZenithMessageTag -ZenithNblocksRequest -ZenithNblocksResponse -ZenithRequest -ZenithResponse +NeonErrorResponse +NeonExistsRequest +NeonExistsResponse +NeonGetPageRequest +NeonGetPageResponse +NeonMessage +NeonMessageTag +NeonNblocksRequest +NeonNblocksResponse +NeonRequest +NeonResponse _SPI_connection _SPI_plan __AssignProcessToJobObject diff --git a/proxy/Cargo.toml b/proxy/Cargo.toml index ce8610be24..82fe6818e3 100644 --- a/proxy/Cargo.toml +++ b/proxy/Cargo.toml @@ -16,6 +16,7 @@ async-compression.workspace = true async-trait.workspace = true atomic-take.workspace = true aws-config.workspace = true +aws-credential-types.workspace = true aws-sdk-iam.workspace = true aws-sigv4.workspace = true base64.workspace = true @@ -48,6 +49,7 @@ indexmap = { workspace = true, features = ["serde"] } ipnet.workspace = true itertools.workspace = true itoa.workspace = true +json = { path = "../libs/proxy/json" } lasso = { workspace = true, features = 
["multi-threaded"] } measured = { workspace = true, features = ["lasso"] } metrics.workspace = true @@ -127,4 +129,4 @@ rstest.workspace = true walkdir.workspace = true rand_distr = "0.4" tokio-postgres.workspace = true -tracing-test = "0.2" \ No newline at end of file +tracing-test = "0.2" diff --git a/proxy/README.md b/proxy/README.md index e10ff3d710..ff48f9f323 100644 --- a/proxy/README.md +++ b/proxy/README.md @@ -123,6 +123,11 @@ docker exec -it proxy-postgres psql -U postgres -c "CREATE TABLE neon_control_pl docker exec -it proxy-postgres psql -U postgres -c "CREATE ROLE proxy WITH SUPERUSER LOGIN PASSWORD 'password';" ``` +If you want to test query cancellation, redis is also required: +```sh +docker run --detach --name proxy-redis --publish 6379:6379 redis:7.0 +``` + Let's create self-signed certificate by running: ```sh openssl req -new -x509 -days 365 -nodes -text -out server.crt -keyout server.key -subj "/CN=*.local.neon.build" @@ -130,7 +135,10 @@ openssl req -new -x509 -days 365 -nodes -text -out server.crt -keyout server.key Then we need to build proxy with 'testing' feature and run, e.g.: ```sh -RUST_LOG=proxy LOGFMT=text cargo run -p proxy --bin proxy --features testing -- --auth-backend postgres --auth-endpoint 'postgresql://postgres:proxy-postgres@127.0.0.1:5432/postgres' -c server.crt -k server.key +RUST_LOG=proxy LOGFMT=text cargo run -p proxy --bin proxy --features testing -- \ + --auth-backend postgres --auth-endpoint 'postgresql://postgres:proxy-postgres@127.0.0.1:5432/postgres' \ + --redis-auth-type="plain" --redis-plain="redis://127.0.0.1:6379" \ + -c server.crt -k server.key ``` Now from client you can start a new session: diff --git a/proxy/src/batch.rs b/proxy/src/batch.rs index 33e08797f2..cf866ab9a3 100644 --- a/proxy/src/batch.rs +++ b/proxy/src/batch.rs @@ -7,13 +7,17 @@ use std::pin::pin; use std::sync::Mutex; use scopeguard::ScopeGuard; +use tokio::sync::oneshot; use tokio::sync::oneshot::error::TryRecvError; use 
crate::ext::LockExt; +type ProcResult

= Result<

::Res,

::Err>; + pub trait QueueProcessing: Send + 'static { type Req: Send + 'static; type Res: Send; + type Err: Send + Clone; /// Get the desired batch size. fn batch_size(&self, queue_size: usize) -> usize; @@ -24,7 +28,18 @@ pub trait QueueProcessing: Send + 'static { /// If this apply can error, it's expected that errors be forwarded to each Self::Res. /// /// Batching does not need to happen atomically. - fn apply(&mut self, req: Vec) -> impl Future> + Send; + fn apply( + &mut self, + req: Vec, + ) -> impl Future, Self::Err>> + Send; +} + +#[derive(thiserror::Error)] +pub enum BatchQueueError { + #[error(transparent)] + Result(E), + #[error(transparent)] + Cancelled(C), } pub struct BatchQueue { @@ -34,7 +49,7 @@ pub struct BatchQueue { struct BatchJob { req: P::Req, - res: tokio::sync::oneshot::Sender, + res: tokio::sync::oneshot::Sender>, } impl BatchQueue

{ @@ -55,11 +70,11 @@ impl BatchQueue

{ &self, req: P::Req, cancelled: impl Future, - ) -> Result { + ) -> Result> { let (id, mut rx) = self.inner.lock_propagate_poison().register_job(req); let mut cancelled = pin!(cancelled); - let resp = loop { + let resp: Option> = loop { // try become the leader, or try wait for success. let mut processor = tokio::select! { // try become leader. @@ -72,7 +87,7 @@ impl BatchQueue

{ if inner.queue.remove(&id).is_some() { tracing::warn!("batched task cancelled before completion"); } - return Err(cancel); + return Err(BatchQueueError::Cancelled(cancel)); }, }; @@ -96,18 +111,30 @@ impl BatchQueue

{ // good: we didn't get cancelled. ScopeGuard::into_inner(cancel_safety); - if values.len() != resps.len() { - tracing::error!( - "batch: invalid response size, expected={}, got={}", - resps.len(), - values.len() - ); - } + match values { + Ok(values) => { + if values.len() != resps.len() { + tracing::error!( + "batch: invalid response size, expected={}, got={}", + resps.len(), + values.len() + ); + } - // send response values. - for (tx, value) in std::iter::zip(resps, values) { - if tx.send(value).is_err() { - // receiver hung up but that's fine. + // send response values. + for (tx, value) in std::iter::zip(resps, values) { + if tx.send(Ok(value)).is_err() { + // receiver hung up but that's fine. + } + } + } + + Err(err) => { + for tx in resps { + if tx.send(Err(err.clone())).is_err() { + // receiver hung up but that's fine. + } + } } } @@ -129,7 +156,8 @@ impl BatchQueue

{ tracing::debug!(id, "batch: job completed"); - Ok(resp.expect("no response found. batch processer should not panic")) + resp.expect("no response found. batch processer should not panic") + .map_err(BatchQueueError::Result) } } @@ -139,8 +167,8 @@ struct BatchQueueInner { } impl BatchQueueInner

{ - fn register_job(&mut self, req: P::Req) -> (u64, tokio::sync::oneshot::Receiver) { - let (tx, rx) = tokio::sync::oneshot::channel(); + fn register_job(&mut self, req: P::Req) -> (u64, oneshot::Receiver>) { + let (tx, rx) = oneshot::channel(); let id = self.version; @@ -158,7 +186,7 @@ impl BatchQueueInner

{ (id, rx) } - fn get_batch(&mut self, p: &P) -> (Vec, Vec>) { + fn get_batch(&mut self, p: &P) -> (Vec, Vec>>) { let batch_size = p.batch_size(self.queue.len()); let mut reqs = Vec::with_capacity(batch_size); let mut resps = Vec::with_capacity(batch_size); diff --git a/proxy/src/binary/proxy.rs b/proxy/src/binary/proxy.rs index 691709ce2a..16a7dc7b67 100644 --- a/proxy/src/binary/proxy.rs +++ b/proxy/src/binary/proxy.rs @@ -522,15 +522,7 @@ pub async fn run() -> anyhow::Result<()> { maintenance_tasks.spawn(usage_metrics::task_main(metrics_config)); } - if let Either::Left(auth::Backend::ControlPlane(api, ())) = &auth_backend - && let crate::control_plane::client::ControlPlaneClient::ProxyV1(api) = &**api - && let Some(client) = redis_client - { - // project info cache and invalidation of that cache. - let cache = api.caches.project_info.clone(); - maintenance_tasks.spawn(notifications::task_main(client.clone(), cache.clone())); - maintenance_tasks.spawn(async move { cache.clone().gc_worker().await }); - + if let Some(client) = redis_client { // Try to connect to Redis 3 times with 1 + (0..0.1) second interval. // This prevents immediate exit and pod restart, // which can cause hammering of the redis in case of connection issues. @@ -560,6 +552,16 @@ pub async fn run() -> anyhow::Result<()> { } } } + + #[allow(irrefutable_let_patterns)] + if let Either::Left(auth::Backend::ControlPlane(api, ())) = &auth_backend + && let crate::control_plane::client::ControlPlaneClient::ProxyV1(api) = &**api + { + // project info cache and invalidation of that cache. 
+ let cache = api.caches.project_info.clone(); + maintenance_tasks.spawn(notifications::task_main(client, cache.clone())); + maintenance_tasks.spawn(async move { cache.gc_worker().await }); + } } let maintenance = loop { diff --git a/proxy/src/cache/timed_lru.rs b/proxy/src/cache/timed_lru.rs index 183e1ea449..e87cf53ab9 100644 --- a/proxy/src/cache/timed_lru.rs +++ b/proxy/src/cache/timed_lru.rs @@ -14,8 +14,8 @@ use std::time::{Duration, Instant}; use hashlink::{LruCache, linked_hash_map::RawEntryMut}; use tracing::debug; +use super::Cache; use super::common::Cached; -use super::{Cache, timed_lru}; /// An implementation of timed LRU cache with fixed capacity. /// Key properties: @@ -30,7 +30,7 @@ use super::{Cache, timed_lru}; /// /// * There's an API for immediate invalidation (removal) of a cache entry; /// It's useful in case we know for sure that the entry is no longer correct. -/// See [`timed_lru::Cached`] for more information. +/// See [`Cached`] for more information. /// /// * Expired entries are kept in the cache, until they are evicted by the LRU policy, /// or by a successful lookup (i.e. the entry hasn't expired yet). @@ -217,15 +217,18 @@ impl TimedLru { } impl TimedLru { - /// Retrieve a cached entry in convenient wrapper. - pub(crate) fn get(&self, key: &Q) -> Option> + /// Retrieve a cached entry in convenient wrapper, alongside timing information. 
+ pub(crate) fn get_with_created_at( + &self, + key: &Q, + ) -> Option::Value, Instant)>> where K: Borrow + Clone, Q: Hash + Eq + ?Sized, { self.get_raw(key, |key, entry| Cached { token: Some((self, key.clone())), - value: entry.value.clone(), + value: (entry.value.clone(), entry.created_at), }) } } diff --git a/proxy/src/cancellation.rs b/proxy/src/cancellation.rs index 74413f1a7d..77062d3bb4 100644 --- a/proxy/src/cancellation.rs +++ b/proxy/src/cancellation.rs @@ -4,12 +4,11 @@ use std::pin::pin; use std::sync::{Arc, OnceLock}; use std::time::Duration; -use anyhow::anyhow; use futures::FutureExt; use ipnet::{IpNet, Ipv4Net, Ipv6Net}; use postgres_client::RawCancelToken; use postgres_client::tls::MakeTlsConnect; -use redis::{Cmd, FromRedisValue, Value}; +use redis::{Cmd, FromRedisValue, SetExpiry, SetOptions, Value}; use serde::{Deserialize, Serialize}; use thiserror::Error; use tokio::net::TcpStream; @@ -18,7 +17,7 @@ use tracing::{debug, error, info}; use crate::auth::AuthError; use crate::auth::backend::ComputeUserInfo; -use crate::batch::{BatchQueue, QueueProcessing}; +use crate::batch::{BatchQueue, BatchQueueError, QueueProcessing}; use crate::config::ComputeConfig; use crate::context::RequestContext; use crate::control_plane::ControlPlaneApi; @@ -28,23 +27,39 @@ use crate::metrics::{CancelChannelSizeGuard, CancellationRequest, Metrics, Redis use crate::pqproto::CancelKeyData; use crate::rate_limiter::LeakyBucketRateLimiter; use crate::redis::keys::KeyPrefix; -use crate::redis::kv_ops::RedisKVClient; +use crate::redis::kv_ops::{RedisKVClient, RedisKVClientError}; +use crate::util::run_until; type IpSubnetKey = IpNet; -const CANCEL_KEY_TTL: std::time::Duration = std::time::Duration::from_secs(600); -const CANCEL_KEY_REFRESH: std::time::Duration = std::time::Duration::from_secs(570); +const CANCEL_KEY_TTL: Duration = Duration::from_secs(600); +const CANCEL_KEY_REFRESH: Duration = Duration::from_secs(570); // Message types for sending through mpsc channel pub 
enum CancelKeyOp { - StoreCancelKey { + Store { key: CancelKeyData, value: Box, - expire: std::time::Duration, + expire: Duration, }, - GetCancelData { + Refresh { + key: CancelKeyData, + expire: Duration, + }, + Get { key: CancelKeyData, }, + GetOld { + key: CancelKeyData, + }, +} + +#[derive(thiserror::Error, Debug, Clone)] +pub enum PipelineError { + #[error("could not send cmd to redis: {0}")] + RedisKVClient(Arc), + #[error("incorrect number of responses from redis")] + IncorrectNumberOfResponses, } pub struct Pipeline { @@ -60,7 +75,7 @@ impl Pipeline { } } - async fn execute(self, client: &mut RedisKVClient) -> Vec> { + async fn execute(self, client: &mut RedisKVClient) -> Result, PipelineError> { let responses = self.replies; let batch_size = self.inner.len(); @@ -78,43 +93,44 @@ impl Pipeline { batch_size, responses, "successfully completed cancellation jobs", ); - values.into_iter().map(Ok).collect() + Ok(values.into_iter().collect()) } Ok(value) => { error!(batch_size, ?value, "unexpected redis return value"); - std::iter::repeat_with(|| Err(anyhow!("incorrect response type from redis"))) - .take(responses) - .collect() - } - Err(err) => { - std::iter::repeat_with(|| Err(anyhow!("could not send cmd to redis: {err}"))) - .take(responses) - .collect() + Err(PipelineError::IncorrectNumberOfResponses) } + Err(err) => Err(PipelineError::RedisKVClient(Arc::new(err))), } } - fn add_command_with_reply(&mut self, cmd: Cmd) { + fn add_command(&mut self, cmd: Cmd) { self.inner.add_command(cmd); self.replies += 1; } - - fn add_command_no_reply(&mut self, cmd: Cmd) { - self.inner.add_command(cmd).ignore(); - } } impl CancelKeyOp { fn register(&self, pipe: &mut Pipeline) { match self { - CancelKeyOp::StoreCancelKey { key, value, expire } => { + CancelKeyOp::Store { key, value, expire } => { let key = KeyPrefix::Cancel(*key).build_redis_key(); - pipe.add_command_with_reply(Cmd::hset(&key, "data", &**value)); - pipe.add_command_no_reply(Cmd::expire(&key, 
expire.as_secs() as i64)); + pipe.add_command(Cmd::set_options( + &key, + &**value, + SetOptions::default().with_expiration(SetExpiry::EX(expire.as_secs())), + )); } - CancelKeyOp::GetCancelData { key } => { + CancelKeyOp::Refresh { key, expire } => { let key = KeyPrefix::Cancel(*key).build_redis_key(); - pipe.add_command_with_reply(Cmd::hget(key, "data")); + pipe.add_command(Cmd::expire(&key, expire.as_secs() as i64)); + } + CancelKeyOp::GetOld { key } => { + let key = KeyPrefix::Cancel(*key).build_redis_key(); + pipe.add_command(Cmd::hget(key, "data")); + } + CancelKeyOp::Get { key } => { + let key = KeyPrefix::Cancel(*key).build_redis_key(); + pipe.add_command(Cmd::get(key)); } } } @@ -127,13 +143,14 @@ pub struct CancellationProcessor { impl QueueProcessing for CancellationProcessor { type Req = (CancelChannelSizeGuard<'static>, CancelKeyOp); - type Res = anyhow::Result; + type Res = redis::Value; + type Err = PipelineError; fn batch_size(&self, _queue_size: usize) -> usize { self.batch_size } - async fn apply(&mut self, batch: Vec) -> Vec { + async fn apply(&mut self, batch: Vec) -> Result, Self::Err> { if !self.client.credentials_refreshed() { // this will cause a timeout for cancellation operations tracing::debug!( @@ -244,18 +261,18 @@ impl CancellationHandler { &self, key: CancelKeyData, ) -> Result, CancelError> { - let guard = Metrics::get() - .proxy - .cancel_channel_size - .guard(RedisMsgKind::HGet); - let op = CancelKeyOp::GetCancelData { key }; + const TIMEOUT: Duration = Duration::from_secs(5); let Some(tx) = self.tx.get() else { tracing::warn!("cancellation handler is not available"); return Err(CancelError::InternalError); }; - const TIMEOUT: Duration = Duration::from_secs(5); + let guard = Metrics::get() + .proxy + .cancel_channel_size + .guard(RedisMsgKind::Get); + let op = CancelKeyOp::Get { key }; let result = timeout( TIMEOUT, tx.call((guard, op), std::future::pending::()), @@ -264,10 +281,37 @@ impl CancellationHandler { .map_err(|_| { 
tracing::warn!("timed out waiting to receive GetCancelData response"); CancelError::RateLimit - })? - // cannot be cancelled - .unwrap_or_else(|x| match x {}) - .map_err(|e| { + })?; + + // We may still have cancel keys set with HSET "data". + // Check error type and retry with HGET. + // TODO: remove code after HSET is not used anymore. + let result = if let Err(err) = result.as_ref() + && let BatchQueueError::Result(err) = err + && let PipelineError::RedisKVClient(err) = err + && let RedisKVClientError::Redis(err) = &**err + && let Some(errcode) = err.code() + && errcode == "WRONGTYPE" + { + let guard = Metrics::get() + .proxy + .cancel_channel_size + .guard(RedisMsgKind::HGet); + let op = CancelKeyOp::GetOld { key }; + timeout( + TIMEOUT, + tx.call((guard, op), std::future::pending::()), + ) + .await + .map_err(|_| { + tracing::warn!("timed out waiting to receive GetCancelData response"); + CancelError::RateLimit + })? + } else { + result + }; + + let result = result.map_err(|e| { tracing::warn!("failed to receive GetCancelData response: {e}"); CancelError::InternalError })?; @@ -438,39 +482,94 @@ impl Session { let mut cancel = pin!(cancel); + enum State { + Set, + Refresh, + } + let mut state = State::Set; + loop { - let guard = Metrics::get() - .proxy - .cancel_channel_size - .guard(RedisMsgKind::HSet); - let op = CancelKeyOp::StoreCancelKey { - key: self.key, - value: closure_json.clone(), - expire: CANCEL_KEY_TTL, + let guard_op = match state { + State::Set => { + let guard = Metrics::get() + .proxy + .cancel_channel_size + .guard(RedisMsgKind::Set); + let op = CancelKeyOp::Store { + key: self.key, + value: closure_json.clone(), + expire: CANCEL_KEY_TTL, + }; + tracing::debug!( + src=%self.key, + dest=?cancel_closure.cancel_token, + "registering cancellation key" + ); + (guard, op) + } + + State::Refresh => { + let guard = Metrics::get() + .proxy + .cancel_channel_size + .guard(RedisMsgKind::Expire); + let op = CancelKeyOp::Refresh { + key: self.key, + 
expire: CANCEL_KEY_TTL, + }; + tracing::debug!( + src=%self.key, + dest=?cancel_closure.cancel_token, + "refreshing cancellation key" + ); + (guard, op) + } }; - tracing::debug!( - src=%self.key, - dest=?cancel_closure.cancel_token, - "registering cancellation key" - ); - - match tx.call((guard, op), cancel.as_mut()).await { - Ok(Ok(_)) => { + match tx.call(guard_op, cancel.as_mut()).await { + // SET returns OK + Ok(Value::Okay) => { tracing::debug!( src=%self.key, dest=?cancel_closure.cancel_token, "registered cancellation key" ); + state = State::Refresh; + } - // wait before continuing. - tokio::time::sleep(CANCEL_KEY_REFRESH).await; + // EXPIRE returns 1 + Ok(Value::Int(1)) => { + tracing::debug!( + src=%self.key, + dest=?cancel_closure.cancel_token, + "refreshed cancellation key" + ); } + + Ok(_) => { + // Any other response likely means the key expired. + tracing::warn!(src=%self.key, "refreshing cancellation key failed"); + // Re-enter the SET loop to repush full data. + state = State::Set; + } + // retry immediately. - Ok(Err(error)) => { - tracing::warn!(?error, "error registering cancellation key"); + Err(BatchQueueError::Result(error)) => { + tracing::warn!(?error, "error refreshing cancellation key"); + // Small delay to prevent busy loop with high cpu and logging. + tokio::time::sleep(Duration::from_millis(10)).await; + continue; } - Err(Err(_cancelled)) => break, + + Err(BatchQueueError::Cancelled(Err(_cancelled))) => break, + } + + // wait before continuing. break immediately if cancelled. 
+ if run_until(tokio::time::sleep(CANCEL_KEY_REFRESH), cancel.as_mut()) + .await + .is_err() + { + break; } } diff --git a/proxy/src/context/parquet.rs b/proxy/src/context/parquet.rs index b55cc14532..4d8df19476 100644 --- a/proxy/src/context/parquet.rs +++ b/proxy/src/context/parquet.rs @@ -267,7 +267,7 @@ async fn worker_inner( ) -> anyhow::Result<()> { #[cfg(any(test, feature = "testing"))] let storage = if config.test_remote_failures > 0 { - GenericRemoteStorage::unreliable_wrapper(storage, config.test_remote_failures) + GenericRemoteStorage::unreliable_wrapper(storage, config.test_remote_failures, 100) } else { storage }; diff --git a/proxy/src/control_plane/client/cplane_proxy_v1.rs b/proxy/src/control_plane/client/cplane_proxy_v1.rs index fc263b73b1..bb785b8b0c 100644 --- a/proxy/src/control_plane/client/cplane_proxy_v1.rs +++ b/proxy/src/control_plane/client/cplane_proxy_v1.rs @@ -23,12 +23,13 @@ use crate::control_plane::errors::{ ControlPlaneError, GetAuthInfoError, GetEndpointJwksError, WakeComputeError, }; use crate::control_plane::locks::ApiLocks; -use crate::control_plane::messages::{ColdStartInfo, EndpointJwksResponse, Reason}; +use crate::control_plane::messages::{ColdStartInfo, EndpointJwksResponse}; use crate::control_plane::{ AccessBlockerFlags, AuthInfo, AuthSecret, CachedNodeInfo, EndpointAccessControl, NodeInfo, RoleAccessControl, }; use crate::metrics::Metrics; +use crate::proxy::retry::CouldRetry; use crate::rate_limiter::WakeComputeRateLimiter; use crate::types::{EndpointCacheKey, EndpointId, RoleName}; use crate::{compute, http, scram}; @@ -382,16 +383,31 @@ impl super::ControlPlaneApi for NeonControlPlaneClient { macro_rules! 
check_cache { () => { - if let Some(cached) = self.caches.node_info.get(&key) { - let (cached, info) = cached.take_value(); - let info = info.map_err(|c| { - info!(key = &*key, "found cached wake_compute error"); - WakeComputeError::ControlPlane(ControlPlaneError::Message(Box::new(*c))) - })?; + if let Some(cached) = self.caches.node_info.get_with_created_at(&key) { + let (cached, (info, created_at)) = cached.take_value(); + return match info { + Err(mut msg) => { + info!(key = &*key, "found cached wake_compute error"); - debug!(key = &*key, "found cached compute node info"); - ctx.set_project(info.aux.clone()); - return Ok(cached.map(|()| info)); + // if retry_delay_ms is set, reduce it by the amount of time it spent in cache + if let Some(status) = &mut msg.status { + if let Some(retry_info) = &mut status.details.retry_info { + retry_info.retry_delay_ms = retry_info + .retry_delay_ms + .saturating_sub(created_at.elapsed().as_millis() as u64) + } + } + + Err(WakeComputeError::ControlPlane(ControlPlaneError::Message( + msg, + ))) + } + Ok(info) => { + debug!(key = &*key, "found cached compute node info"); + ctx.set_project(info.aux.clone()); + Ok(cached.map(|()| info)) + } + }; } }; } @@ -434,42 +450,29 @@ impl super::ControlPlaneApi for NeonControlPlaneClient { Ok(cached.map(|()| node)) } Err(err) => match err { - WakeComputeError::ControlPlane(ControlPlaneError::Message(err)) => { - let Some(status) = &err.status else { - return Err(WakeComputeError::ControlPlane(ControlPlaneError::Message( - err, - ))); - }; + WakeComputeError::ControlPlane(ControlPlaneError::Message(ref msg)) => { + let retry_info = msg.status.as_ref().and_then(|s| s.details.retry_info); - let reason = status - .details - .error_info - .map_or(Reason::Unknown, |x| x.reason); - - // if we can retry this error, do not cache it. 
- if reason.can_retry() { - return Err(WakeComputeError::ControlPlane(ControlPlaneError::Message( - err, - ))); + // If we can retry this error, do not cache it, + // unless we were given a retry delay. + if msg.could_retry() && retry_info.is_none() { + return Err(err); } - // at this point, we should only have quota errors. debug!( key = &*key, "created a cache entry for the wake compute error" ); - self.caches.node_info.insert_ttl( - key, - Err(err.clone()), - Duration::from_secs(30), - ); + let ttl = retry_info.map_or(Duration::from_secs(30), |r| { + Duration::from_millis(r.retry_delay_ms) + }); - Err(WakeComputeError::ControlPlane(ControlPlaneError::Message( - err, - ))) + self.caches.node_info.insert_ttl(key, Err(msg.clone()), ttl); + + Err(err) } - err => return Err(err), + err => Err(err), }, } } diff --git a/proxy/src/control_plane/errors.rs b/proxy/src/control_plane/errors.rs index f640657d90..12843e48c7 100644 --- a/proxy/src/control_plane/errors.rs +++ b/proxy/src/control_plane/errors.rs @@ -43,28 +43,35 @@ impl UserFacingError for ControlPlaneError { } impl ReportableError for ControlPlaneError { - fn get_error_kind(&self) -> crate::error::ErrorKind { + fn get_error_kind(&self) -> ErrorKind { match self { ControlPlaneError::Message(e) => match e.get_reason() { - Reason::RoleProtected => ErrorKind::User, - Reason::ResourceNotFound => ErrorKind::User, - Reason::ProjectNotFound => ErrorKind::User, - Reason::EndpointNotFound => ErrorKind::User, - Reason::BranchNotFound => ErrorKind::User, + Reason::RoleProtected + | Reason::ResourceNotFound + | Reason::ProjectNotFound + | Reason::EndpointNotFound + | Reason::EndpointDisabled + | Reason::BranchNotFound + | Reason::InvalidEphemeralEndpointOptions => ErrorKind::User, + Reason::RateLimitExceeded => ErrorKind::ServiceRateLimit, - Reason::NonDefaultBranchComputeTimeExceeded => ErrorKind::Quota, - Reason::ActiveTimeQuotaExceeded => ErrorKind::Quota, - Reason::ComputeTimeQuotaExceeded => ErrorKind::Quota, - 
Reason::WrittenDataQuotaExceeded => ErrorKind::Quota, - Reason::DataTransferQuotaExceeded => ErrorKind::Quota, - Reason::LogicalSizeQuotaExceeded => ErrorKind::Quota, - Reason::ConcurrencyLimitReached => ErrorKind::ControlPlane, - Reason::LockAlreadyTaken => ErrorKind::ControlPlane, - Reason::RunningOperations => ErrorKind::ControlPlane, - Reason::ActiveEndpointsLimitExceeded => ErrorKind::ControlPlane, - Reason::Unknown => ErrorKind::ControlPlane, + + Reason::NonDefaultBranchComputeTimeExceeded + | Reason::ActiveTimeQuotaExceeded + | Reason::ComputeTimeQuotaExceeded + | Reason::WrittenDataQuotaExceeded + | Reason::DataTransferQuotaExceeded + | Reason::LogicalSizeQuotaExceeded + | Reason::ActiveEndpointsLimitExceeded => ErrorKind::Quota, + + Reason::ConcurrencyLimitReached + | Reason::LockAlreadyTaken + | Reason::RunningOperations + | Reason::EndpointIdle + | Reason::ProjectUnderMaintenance + | Reason::Unknown => ErrorKind::ControlPlane, }, - ControlPlaneError::Transport(_) => crate::error::ErrorKind::ControlPlane, + ControlPlaneError::Transport(_) => ErrorKind::ControlPlane, } } } @@ -120,10 +127,10 @@ impl UserFacingError for GetAuthInfoError { } impl ReportableError for GetAuthInfoError { - fn get_error_kind(&self) -> crate::error::ErrorKind { + fn get_error_kind(&self) -> ErrorKind { match self { - Self::BadSecret => crate::error::ErrorKind::ControlPlane, - Self::ApiError(_) => crate::error::ErrorKind::ControlPlane, + Self::BadSecret => ErrorKind::ControlPlane, + Self::ApiError(_) => ErrorKind::ControlPlane, } } } diff --git a/proxy/src/control_plane/messages.rs b/proxy/src/control_plane/messages.rs index f0314f91f0..cf193ed268 100644 --- a/proxy/src/control_plane/messages.rs +++ b/proxy/src/control_plane/messages.rs @@ -126,10 +126,16 @@ pub(crate) enum Reason { /// or that the subject doesn't have enough permissions to access the requested endpoint. 
#[serde(rename = "ENDPOINT_NOT_FOUND")] EndpointNotFound, + /// EndpointDisabled indicates that the endpoint has been disabled and does not accept connections. + #[serde(rename = "ENDPOINT_DISABLED")] + EndpointDisabled, /// BranchNotFound indicates that the branch wasn't found, usually due to the provided ID not being correct, /// or that the subject doesn't have enough permissions to access the requested branch. #[serde(rename = "BRANCH_NOT_FOUND")] BranchNotFound, + /// InvalidEphemeralEndpointOptions indicates that the specified LSN or timestamp are wrong. + #[serde(rename = "INVALID_EPHEMERAL_OPTIONS")] + InvalidEphemeralEndpointOptions, /// RateLimitExceeded indicates that the rate limit for the operation has been exceeded. #[serde(rename = "RATE_LIMIT_EXCEEDED")] RateLimitExceeded, @@ -152,6 +158,9 @@ pub(crate) enum Reason { /// LogicalSizeQuotaExceeded indicates that the logical size quota was exceeded. #[serde(rename = "LOGICAL_SIZE_QUOTA_EXCEEDED")] LogicalSizeQuotaExceeded, + /// ActiveEndpointsLimitExceeded indicates that the limit of concurrently active endpoints was exceeded. + #[serde(rename = "ACTIVE_ENDPOINTS_LIMIT_EXCEEDED")] + ActiveEndpointsLimitExceeded, /// RunningOperations indicates that the project already has some running operations /// and scheduling of new ones is prohibited. #[serde(rename = "RUNNING_OPERATIONS")] @@ -162,9 +171,13 @@ pub(crate) enum Reason { /// LockAlreadyTaken indicates that the we attempted to take a lock that was already taken. #[serde(rename = "LOCK_ALREADY_TAKEN")] LockAlreadyTaken, - /// ActiveEndpointsLimitExceeded indicates that the limit of concurrently active endpoints was exceeded. - #[serde(rename = "ACTIVE_ENDPOINTS_LIMIT_EXCEEDED")] - ActiveEndpointsLimitExceeded, + /// EndpointIdle indicates that the endpoint cannot become active, because it's idle. 
+ #[serde(rename = "ENDPOINT_IDLE")] + EndpointIdle, + /// ProjectUnderMaintenance indicates that the project is currently ongoing maintenance, + /// and thus cannot accept connections. + #[serde(rename = "PROJECT_UNDER_MAINTENANCE")] + ProjectUnderMaintenance, #[default] #[serde(other)] Unknown, @@ -184,13 +197,15 @@ impl Reason { pub(crate) fn can_retry(self) -> bool { match self { // do not retry role protected errors - // not a transitive error + // not a transient error Reason::RoleProtected => false, - // on retry, it will still not be found + // on retry, it will still not be found or valid Reason::ResourceNotFound | Reason::ProjectNotFound | Reason::EndpointNotFound - | Reason::BranchNotFound => false, + | Reason::EndpointDisabled + | Reason::BranchNotFound + | Reason::InvalidEphemeralEndpointOptions => false, // we were asked to go away Reason::RateLimitExceeded | Reason::NonDefaultBranchComputeTimeExceeded @@ -200,11 +215,13 @@ impl Reason { | Reason::DataTransferQuotaExceeded | Reason::LogicalSizeQuotaExceeded | Reason::ActiveEndpointsLimitExceeded => false, - // transitive error. control plane is currently busy + // transient error. control plane is currently busy // but might be ready soon Reason::RunningOperations | Reason::ConcurrencyLimitReached - | Reason::LockAlreadyTaken => true, + | Reason::LockAlreadyTaken + | Reason::EndpointIdle + | Reason::ProjectUnderMaintenance => true, // unknown error. better not retry it. 
Reason::Unknown => false, } diff --git a/proxy/src/metrics.rs b/proxy/src/metrics.rs index 9d1a3d4358..bf4d5a11eb 100644 --- a/proxy/src/metrics.rs +++ b/proxy/src/metrics.rs @@ -374,11 +374,10 @@ pub enum Waiting { #[label(singleton = "kind")] #[allow(clippy::enum_variant_names)] pub enum RedisMsgKind { - HSet, - HSetMultiple, + Set, + Get, + Expire, HGet, - HGetAll, - HDel, } #[derive(Default, Clone)] diff --git a/proxy/src/proxy/mod.rs b/proxy/src/proxy/mod.rs index 08c81afa04..02651109e0 100644 --- a/proxy/src/proxy/mod.rs +++ b/proxy/src/proxy/mod.rs @@ -195,15 +195,18 @@ impl NeonOptions { // proxy options: /// `PARAMS_COMPAT` allows opting in to forwarding all startup parameters from client to compute. - pub const PARAMS_COMPAT: &str = "proxy_params_compat"; + pub const PARAMS_COMPAT: &'static str = "proxy_params_compat"; // cplane options: /// `LSN` allows provisioning an ephemeral compute with time-travel to the provided LSN. - const LSN: &str = "lsn"; + const LSN: &'static str = "lsn"; + + /// `TIMESTAMP` allows provisioning an ephemeral compute with time-travel to the provided timestamp. + const TIMESTAMP: &'static str = "timestamp"; /// `ENDPOINT_TYPE` allows configuring an ephemeral compute to be read_only or read_write. - const ENDPOINT_TYPE: &str = "endpoint_type"; + const ENDPOINT_TYPE: &'static str = "endpoint_type"; pub(crate) fn parse_params(params: &StartupMessageParams) -> Self { params @@ -228,6 +231,7 @@ impl NeonOptions { // This is not a cplane option, we know it does not create ephemeral computes. Self::PARAMS_COMPAT => false, Self::LSN => true, + Self::TIMESTAMP => true, Self::ENDPOINT_TYPE => true, // err on the side of caution. any cplane options we don't know about // might lead to ephemeral computes. 
diff --git a/proxy/src/redis/connection_with_credentials_provider.rs b/proxy/src/redis/connection_with_credentials_provider.rs index 35a3fe4334..b0bf332e44 100644 --- a/proxy/src/redis/connection_with_credentials_provider.rs +++ b/proxy/src/redis/connection_with_credentials_provider.rs @@ -4,11 +4,12 @@ use std::time::Duration; use futures::FutureExt; use redis::aio::{ConnectionLike, MultiplexedConnection}; -use redis::{ConnectionInfo, IntoConnectionInfo, RedisConnectionInfo, RedisResult}; +use redis::{ConnectionInfo, IntoConnectionInfo, RedisConnectionInfo, RedisError, RedisResult}; use tokio::task::AbortHandle; use tracing::{error, info, warn}; use super::elasticache::CredentialsProvider; +use crate::redis::elasticache::CredentialsProviderError; enum Credentials { Static(ConnectionInfo), @@ -26,6 +27,14 @@ impl Clone for Credentials { } } +#[derive(thiserror::Error, Debug)] +pub enum ConnectionProviderError { + #[error(transparent)] + Redis(#[from] RedisError), + #[error(transparent)] + CredentialsProvider(#[from] CredentialsProviderError), +} + /// A wrapper around `redis::MultiplexedConnection` that automatically refreshes the token. /// Provides PubSub connection without credentials refresh. 
pub struct ConnectionWithCredentialsProvider { @@ -86,15 +95,18 @@ impl ConnectionWithCredentialsProvider { } } - async fn ping(con: &mut MultiplexedConnection) -> RedisResult<()> { - redis::cmd("PING").query_async(con).await + async fn ping(con: &mut MultiplexedConnection) -> Result<(), ConnectionProviderError> { + redis::cmd("PING") + .query_async(con) + .await + .map_err(Into::into) } pub(crate) fn credentials_refreshed(&self) -> bool { self.credentials_refreshed.load(Ordering::Relaxed) } - pub(crate) async fn connect(&mut self) -> anyhow::Result<()> { + pub(crate) async fn connect(&mut self) -> Result<(), ConnectionProviderError> { let _guard = self.mutex.lock().await; if let Some(con) = self.con.as_mut() { match Self::ping(con).await { @@ -141,7 +153,7 @@ impl ConnectionWithCredentialsProvider { Ok(()) } - async fn get_connection_info(&self) -> anyhow::Result { + async fn get_connection_info(&self) -> Result { match &self.credentials { Credentials::Static(info) => Ok(info.clone()), Credentials::Dynamic(provider, addr) => { @@ -160,7 +172,7 @@ impl ConnectionWithCredentialsProvider { } } - async fn get_client(&self) -> anyhow::Result { + async fn get_client(&self) -> Result { let client = redis::Client::open(self.get_connection_info().await?)?; self.credentials_refreshed.store(true, Ordering::Relaxed); Ok(client) diff --git a/proxy/src/redis/elasticache.rs b/proxy/src/redis/elasticache.rs index 58e3c889a7..6f3b34d381 100644 --- a/proxy/src/redis/elasticache.rs +++ b/proxy/src/redis/elasticache.rs @@ -9,10 +9,12 @@ use aws_config::meta::region::RegionProviderChain; use aws_config::profile::ProfileFileCredentialsProvider; use aws_config::provider_config::ProviderConfig; use aws_config::web_identity_token::WebIdentityTokenCredentialsProvider; +use aws_credential_types::provider::error::CredentialsError; use aws_sdk_iam::config::ProvideCredentials; use aws_sigv4::http_request::{ - self, SignableBody, SignableRequest, SignatureLocation, SigningSettings, + self, 
SignableBody, SignableRequest, SignatureLocation, SigningError, SigningSettings, }; +use aws_sigv4::sign::v4::signing_params::BuildError; use tracing::info; #[derive(Debug)] @@ -40,6 +42,18 @@ impl AWSIRSAConfig { } } +#[derive(thiserror::Error, Debug)] +pub enum CredentialsProviderError { + #[error(transparent)] + AwsCredentials(#[from] CredentialsError), + #[error(transparent)] + AwsSigv4Build(#[from] BuildError), + #[error(transparent)] + AwsSigv4Singing(#[from] SigningError), + #[error(transparent)] + Http(#[from] http::Error), +} + /// Credentials provider for AWS elasticache authentication. /// /// Official documentation: @@ -92,7 +106,9 @@ impl CredentialsProvider { }) } - pub(crate) async fn provide_credentials(&self) -> anyhow::Result<(String, String)> { + pub(crate) async fn provide_credentials( + &self, + ) -> Result<(String, String), CredentialsProviderError> { let aws_credentials = self .credentials_provider .provide_credentials() diff --git a/proxy/src/redis/kv_ops.rs b/proxy/src/redis/kv_ops.rs index cfdbc21839..d1e97b6b09 100644 --- a/proxy/src/redis/kv_ops.rs +++ b/proxy/src/redis/kv_ops.rs @@ -2,9 +2,18 @@ use std::time::Duration; use futures::FutureExt; use redis::aio::ConnectionLike; -use redis::{Cmd, FromRedisValue, Pipeline, RedisResult}; +use redis::{Cmd, FromRedisValue, Pipeline, RedisError, RedisResult}; use super::connection_with_credentials_provider::ConnectionWithCredentialsProvider; +use crate::redis::connection_with_credentials_provider::ConnectionProviderError; + +#[derive(thiserror::Error, Debug)] +pub enum RedisKVClientError { + #[error(transparent)] + Redis(#[from] RedisError), + #[error(transparent)] + ConnectionProvider(#[from] ConnectionProviderError), +} pub struct RedisKVClient { client: ConnectionWithCredentialsProvider, @@ -32,12 +41,13 @@ impl RedisKVClient { Self { client } } - pub async fn try_connect(&mut self) -> anyhow::Result<()> { + pub async fn try_connect(&mut self) -> Result<(), RedisKVClientError> { self.client 
.connect() .boxed() .await .inspect_err(|e| tracing::error!("failed to connect to redis: {e}")) + .map_err(Into::into) } pub(crate) fn credentials_refreshed(&self) -> bool { @@ -47,7 +57,7 @@ impl RedisKVClient { pub(crate) async fn query( &mut self, q: &impl Queryable, - ) -> anyhow::Result { + ) -> Result { let e = match q.query(&mut self.client).await { Ok(t) => return Ok(t), Err(e) => e, diff --git a/proxy/src/serverless/json.rs b/proxy/src/serverless/json.rs index 2e67d07079..ef7c8a4d82 100644 --- a/proxy/src/serverless/json.rs +++ b/proxy/src/serverless/json.rs @@ -1,6 +1,7 @@ +use json::{ListSer, ObjectSer, ValueSer}; use postgres_client::Row; use postgres_client::types::{Kind, Type}; -use serde_json::{Map, Value}; +use serde_json::Value; // // Convert json non-string types to strings, so that they can be passed to Postgres @@ -74,44 +75,40 @@ pub(crate) enum JsonConversionError { UnbalancedString, } -enum OutputMode { - Array(Vec), - Object(Map), +enum OutputMode<'a> { + Array(ListSer<'a>), + Object(ObjectSer<'a>), } -impl OutputMode { - fn key(&mut self, key: &str) -> &mut Value { +impl OutputMode<'_> { + fn key(&mut self, key: &str) -> ValueSer<'_> { match self { - OutputMode::Array(values) => push_entry(values, Value::Null), - OutputMode::Object(map) => map.entry(key.to_string()).or_insert(Value::Null), + OutputMode::Array(values) => values.entry(), + OutputMode::Object(map) => map.key(key), } } - fn finish(self) -> Value { + fn finish(self) { match self { - OutputMode::Array(values) => Value::Array(values), - OutputMode::Object(map) => Value::Object(map), + OutputMode::Array(values) => values.finish(), + OutputMode::Object(map) => map.finish(), } } } -fn push_entry(arr: &mut Vec, t: T) -> &mut T { - arr.push(t); - arr.last_mut().expect("a value was just inserted") -} - // // Convert postgres row with text-encoded values to JSON object // pub(crate) fn pg_text_row_to_json( + output: ValueSer, row: &Row, raw_output: bool, array_mode: bool, -) -> Result { 
+) -> Result<(), JsonConversionError> { let mut entries = if array_mode { - OutputMode::Array(Vec::with_capacity(row.columns().len())) + OutputMode::Array(output.list()) } else { - OutputMode::Object(Map::with_capacity(row.columns().len())) + OutputMode::Object(output.object()) }; for (i, column) in row.columns().iter().enumerate() { @@ -120,53 +117,48 @@ pub(crate) fn pg_text_row_to_json( let value = entries.key(column.name()); match pg_value { - Some(v) if raw_output => *value = Value::String(v.to_string()), + Some(v) if raw_output => value.value(v), Some(v) => pg_text_to_json(value, v, column.type_())?, - None => *value = Value::Null, + None => value.value(json::Null), } } - Ok(entries.finish()) + entries.finish(); + Ok(()) } // // Convert postgres text-encoded value to JSON value // -fn pg_text_to_json( - output: &mut Value, - val: &str, - pg_type: &Type, -) -> Result<(), JsonConversionError> { +fn pg_text_to_json(output: ValueSer, val: &str, pg_type: &Type) -> Result<(), JsonConversionError> { if let Kind::Array(elem_type) = pg_type.kind() { // todo: we should fetch this from postgres. 
let delimiter = ','; - let mut array = vec![]; - pg_array_parse(&mut array, val, elem_type, delimiter)?; - *output = Value::Array(array); + json::value_as_list!(|output| pg_array_parse(output, val, elem_type, delimiter)?); return Ok(()); } match *pg_type { - Type::BOOL => *output = Value::Bool(val == "t"), + Type::BOOL => output.value(val == "t"), Type::INT2 | Type::INT4 => { let val = val.parse::()?; - *output = Value::Number(serde_json::Number::from(val)); + output.value(val); } Type::FLOAT4 | Type::FLOAT8 => { let fval = val.parse::()?; - let num = serde_json::Number::from_f64(fval); - if let Some(num) = num { - *output = Value::Number(num); + if fval.is_finite() { + output.value(fval); } else { // Pass Nan, Inf, -Inf as strings // JS JSON.stringify() does converts them to null, but we // want to preserve them, so we pass them as strings - *output = Value::String(val.to_string()); + output.value(val); } } - Type::JSON | Type::JSONB => *output = serde_json::from_str(val)?, - _ => *output = Value::String(val.to_string()), + // we assume that the string value is valid json. + Type::JSON | Type::JSONB => output.write_raw_json(val.as_bytes()), + _ => output.value(val), } Ok(()) @@ -192,7 +184,7 @@ fn pg_text_to_json( /// gets its own level of curly braces, and delimiters must be written between adjacent /// curly-braced entities of the same level. fn pg_array_parse( - elements: &mut Vec, + elements: &mut ListSer, mut pg_array: &str, elem: &Type, delim: char, @@ -221,7 +213,7 @@ fn pg_array_parse( /// reads a single array from the `pg_array` string and pushes each values to `elements`. /// returns the rest of the `pg_array` string that was not read. 
fn pg_array_parse_inner<'a>( - elements: &mut Vec, + elements: &mut ListSer, mut pg_array: &'a str, elem: &Type, delim: char, @@ -234,7 +226,7 @@ fn pg_array_parse_inner<'a>( let mut q = String::new(); loop { - let value = push_entry(elements, Value::Null); + let value = elements.entry(); pg_array = pg_array_parse_item(value, &mut q, pg_array, elem, delim)?; // check for separator. @@ -260,7 +252,7 @@ fn pg_array_parse_inner<'a>( /// /// `quoted` is a scratch allocation that has no defined output. fn pg_array_parse_item<'a>( - output: &mut Value, + output: ValueSer, quoted: &mut String, mut pg_array: &'a str, elem: &Type, @@ -276,9 +268,8 @@ fn pg_array_parse_item<'a>( if pg_array.starts_with('{') { // nested array. - let mut nested = vec![]; - pg_array = pg_array_parse_inner(&mut nested, pg_array, elem, delim)?; - *output = Value::Array(nested); + pg_array = + json::value_as_list!(|output| pg_array_parse_inner(output, pg_array, elem, delim))?; return Ok(pg_array); } @@ -306,7 +297,7 @@ fn pg_array_parse_item<'a>( // we might have an item string: // check for null if item == "NULL" { - *output = Value::Null; + output.value(json::Null); } else { pg_text_to_json(output, item, elem)?; } @@ -440,15 +431,15 @@ mod tests { } fn pg_text_to_json(val: &str, pg_type: &Type) -> Value { - let mut v = Value::Null; - super::pg_text_to_json(&mut v, val, pg_type).unwrap(); - v + let output = json::value_to_string!(|v| super::pg_text_to_json(v, val, pg_type).unwrap()); + serde_json::from_str(&output).unwrap() } fn pg_array_parse(pg_array: &str, pg_type: &Type) -> Value { - let mut array = vec![]; - super::pg_array_parse(&mut array, pg_array, pg_type, ',').unwrap(); - Value::Array(array) + let output = json::value_to_string!(|v| json::value_as_list!(|v| { + super::pg_array_parse(v, pg_array, pg_type, ',').unwrap(); + })); + serde_json::from_str(&output).unwrap() } #[test] diff --git a/proxy/src/serverless/sql_over_http.rs b/proxy/src/serverless/sql_over_http.rs index 
7a718d0280..8a14f804b6 100644 --- a/proxy/src/serverless/sql_over_http.rs +++ b/proxy/src/serverless/sql_over_http.rs @@ -14,10 +14,7 @@ use hyper::http::{HeaderName, HeaderValue}; use hyper::{Request, Response, StatusCode, header}; use indexmap::IndexMap; use postgres_client::error::{DbError, ErrorPosition, SqlState}; -use postgres_client::{ - GenericClient, IsolationLevel, NoTls, ReadyForQueryStatus, RowStream, Transaction, -}; -use serde::Serialize; +use postgres_client::{GenericClient, IsolationLevel, NoTls, ReadyForQueryStatus, Transaction}; use serde_json::Value; use serde_json::value::RawValue; use tokio::time::{self, Instant}; @@ -687,32 +684,21 @@ impl QueryData { let (inner, mut discard) = client.inner(); let cancel_token = inner.cancel_token(); - match select( + let mut json_buf = vec![]; + + let batch_result = match select( pin!(query_to_json( config, &mut *inner, self, - &mut 0, + json::ValueSer::new(&mut json_buf), parsed_headers )), pin!(cancel.cancelled()), ) .await { - // The query successfully completed. - Either::Left((Ok((status, results)), __not_yet_cancelled)) => { - discard.check_idle(status); - - let json_output = - serde_json::to_string(&results).expect("json serialization should not fail"); - Ok(json_output) - } - // The query failed with an error - Either::Left((Err(e), __not_yet_cancelled)) => { - discard.discard(); - Err(e) - } - // The query was cancelled. + Either::Left((res, __not_yet_cancelled)) => res, Either::Right((_cancelled, query)) => { tracing::info!("cancelling query"); if let Err(err) = cancel_token.cancel_query(NoTls).await { @@ -721,13 +707,7 @@ impl QueryData { // wait for the query cancellation match time::timeout(time::Duration::from_millis(100), query).await { // query successed before it was cancelled. 
- Ok(Ok((status, results))) => { - discard.check_idle(status); - - let json_output = serde_json::to_string(&results) - .expect("json serialization should not fail"); - Ok(json_output) - } + Ok(Ok(status)) => Ok(status), // query failed or was cancelled. Ok(Err(error)) => { let db_error = match &error { @@ -743,14 +723,29 @@ impl QueryData { discard.discard(); } - Err(SqlOverHttpError::Cancelled(SqlOverHttpCancel::Postgres)) + return Err(SqlOverHttpError::Cancelled(SqlOverHttpCancel::Postgres)); } Err(_timeout) => { discard.discard(); - Err(SqlOverHttpError::Cancelled(SqlOverHttpCancel::Postgres)) + return Err(SqlOverHttpError::Cancelled(SqlOverHttpCancel::Postgres)); } } } + }; + + match batch_result { + // The query successfully completed. + Ok(status) => { + discard.check_idle(status); + + let json_output = String::from_utf8(json_buf).expect("json should be valid utf8"); + Ok(json_output) + } + // The query failed with an error + Err(e) => { + discard.discard(); + Err(e) + } } } } @@ -787,7 +782,7 @@ impl BatchQueryData { }) .map_err(SqlOverHttpError::Postgres)?; - let json_output = match query_batch( + let json_output = match query_batch_to_json( config, cancel.child_token(), &mut transaction, @@ -845,24 +840,21 @@ async fn query_batch( transaction: &mut Transaction<'_>, queries: BatchQueryData, parsed_headers: HttpHeaders, -) -> Result { - let mut results = Vec::with_capacity(queries.queries.len()); - let mut current_size = 0; + results: &mut json::ListSer<'_>, +) -> Result<(), SqlOverHttpError> { for stmt in queries.queries { let query = pin!(query_to_json( config, transaction, stmt, - &mut current_size, + results.entry(), parsed_headers, )); let cancelled = pin!(cancel.cancelled()); let res = select(query, cancelled).await; match res { // TODO: maybe we should check that the transaction bit is set here - Either::Left((Ok((_, values)), _cancelled)) => { - results.push(values); - } + Either::Left((Ok(_), _cancelled)) => {} Either::Left((Err(e), _cancelled)) => 
{ return Err(e); } @@ -872,8 +864,22 @@ async fn query_batch( } } - let results = json!({ "results": results }); - let json_output = serde_json::to_string(&results).expect("json serialization should not fail"); + Ok(()) +} + +async fn query_batch_to_json( + config: &'static HttpConfig, + cancel: CancellationToken, + tx: &mut Transaction<'_>, + queries: BatchQueryData, + headers: HttpHeaders, +) -> Result { + let json_output = json::value_to_string!(|obj| json::value_as_object!(|obj| { + let results = obj.key("results"); + json::value_as_list!(|results| { + query_batch(config, cancel, tx, queries, headers, results).await?; + }); + })); Ok(json_output) } @@ -882,54 +888,54 @@ async fn query_to_json( config: &'static HttpConfig, client: &mut T, data: QueryData, - current_size: &mut usize, + output: json::ValueSer<'_>, parsed_headers: HttpHeaders, -) -> Result<(ReadyForQueryStatus, impl Serialize + use), SqlOverHttpError> { +) -> Result { let query_start = Instant::now(); - let query_params = data.params; + let mut output = json::ObjectSer::new(output); let mut row_stream = client - .query_raw_txt(&data.query, query_params) + .query_raw_txt(&data.query, data.params) .await .map_err(SqlOverHttpError::Postgres)?; let query_acknowledged = Instant::now(); - let columns_len = row_stream.statement.columns().len(); - let mut fields = Vec::with_capacity(columns_len); - + let mut json_fields = output.key("fields").list(); for c in row_stream.statement.columns() { - fields.push(json!({ - "name": c.name().to_owned(), - "dataTypeID": c.type_().oid(), - "tableID": c.table_oid(), - "columnID": c.column_id(), - "dataTypeSize": c.type_size(), - "dataTypeModifier": c.type_modifier(), - "format": "text", - })); + let json_field = json_fields.entry(); + json::value_as_object!(|json_field| { + json_field.entry("name", c.name()); + json_field.entry("dataTypeID", c.type_().oid()); + json_field.entry("tableID", c.table_oid()); + json_field.entry("columnID", c.column_id()); + 
json_field.entry("dataTypeSize", c.type_size()); + json_field.entry("dataTypeModifier", c.type_modifier()); + json_field.entry("format", "text"); + }); } + json_fields.finish(); - let raw_output = parsed_headers.raw_output; let array_mode = data.array_mode.unwrap_or(parsed_headers.default_array_mode); + let raw_output = parsed_headers.raw_output; // Manually drain the stream into a vector to leave row_stream hanging // around to get a command tag. Also check that the response is not too // big. - let mut rows = Vec::new(); + let mut rows = 0; + let mut json_rows = output.key("rows").list(); while let Some(row) = row_stream.next().await { let row = row.map_err(SqlOverHttpError::Postgres)?; - *current_size += row.body_len(); // we don't have a streaming response support yet so this is to prevent OOM // from a malicious query (eg a cross join) - if *current_size > config.max_response_size_bytes { + if json_rows.as_buffer().len() > config.max_response_size_bytes { return Err(SqlOverHttpError::ResponseTooLarge( config.max_response_size_bytes, )); } - let row = pg_text_row_to_json(&row, raw_output, array_mode)?; - rows.push(row); + pg_text_row_to_json(json_rows.entry(), &row, raw_output, array_mode)?; + rows += 1; // assumption: parsing pg text and converting to json takes CPU time. // let's assume it is slightly expensive, so we should consume some cooperative budget. @@ -937,16 +943,14 @@ async fn query_to_json( // of rows and never hit the tokio mpsc for a long time (although unlikely). tokio::task::consume_budget().await; } + json_rows.finish(); let query_resp_end = Instant::now(); - let RowStream { - command_tag, - status: ready, - .. 
- } = row_stream; + + let ready = row_stream.status; // grab the command tag and number of rows affected - let command_tag = command_tag.unwrap_or_default(); + let command_tag = row_stream.command_tag.unwrap_or_default(); let mut command_tag_split = command_tag.split(' '); let command_tag_name = command_tag_split.next().unwrap_or_default(); let command_tag_count = if command_tag_name == "INSERT" { @@ -959,7 +963,7 @@ async fn query_to_json( .and_then(|s| s.parse::().ok()); info!( - rows = rows.len(), + rows, ?ready, command_tag, acknowledgement = ?(query_acknowledged - query_start), @@ -967,16 +971,12 @@ async fn query_to_json( "finished executing query" ); - // Resulting JSON format is based on the format of node-postgres result. - let results = json!({ - "command": command_tag_name.to_string(), - "rowCount": command_tag_count, - "rows": rows, - "fields": fields, - "rowAsArray": array_mode, - }); + output.entry("command", command_tag_name); + output.entry("rowCount", command_tag_count); + output.entry("rowAsArray", array_mode); - Ok((ready, results)) + output.finish(); + Ok(ready) } enum Client { diff --git a/proxy/src/util.rs b/proxy/src/util.rs index 7fc2d9fbdb..0291216d94 100644 --- a/proxy/src/util.rs +++ b/proxy/src/util.rs @@ -7,8 +7,16 @@ pub async fn run_until_cancelled( f: F, cancellation_token: &CancellationToken, ) -> Option { - match select(pin!(f), pin!(cancellation_token.cancelled())).await { - Either::Left((f, _)) => Some(f), - Either::Right(((), _)) => None, + run_until(f, cancellation_token.cancelled()).await.ok() +} + +/// Runs the future `f` unless interrupted by future `condition`. 
+pub async fn run_until( + f: F1, + condition: F2, +) -> Result { + match select(pin!(f), pin!(condition)).await { + Either::Left((f1, _)) => Ok(f1), + Either::Right((f2, _)) => Err(f2), } } diff --git a/safekeeper/client/src/mgmt_api.rs b/safekeeper/client/src/mgmt_api.rs index b4bb193a4b..3c8db3029e 100644 --- a/safekeeper/client/src/mgmt_api.rs +++ b/safekeeper/client/src/mgmt_api.rs @@ -6,10 +6,10 @@ use std::error::Error as _; use http_utils::error::HttpErrorBody; -use reqwest::{IntoUrl, Method, StatusCode}; +use reqwest::{IntoUrl, Method, Response, StatusCode}; use safekeeper_api::models::{ self, PullTimelineRequest, PullTimelineResponse, SafekeeperStatus, SafekeeperUtilization, - TimelineCreateRequest, TimelineStatus, + TimelineCreateRequest, }; use utils::id::{NodeId, TenantId, TimelineId}; use utils::logging::SecretString; @@ -161,13 +161,12 @@ impl Client { &self, tenant_id: TenantId, timeline_id: TimelineId, - ) -> Result { + ) -> Result { let uri = format!( "{}/v1/tenant/{}/timeline/{}", self.mgmt_api_endpoint, tenant_id, timeline_id ); - let resp = self.get(&uri).await?; - resp.json().await.map_err(Error::ReceiveBody) + self.get(&uri).await } pub async fn snapshot( diff --git a/safekeeper/src/bin/safekeeper.rs b/safekeeper/src/bin/safekeeper.rs index 8fda625817..79cf2f9149 100644 --- a/safekeeper/src/bin/safekeeper.rs +++ b/safekeeper/src/bin/safekeeper.rs @@ -23,6 +23,7 @@ use safekeeper::defaults::{ DEFAULT_PARTIAL_BACKUP_CONCURRENCY, DEFAULT_PARTIAL_BACKUP_TIMEOUT, DEFAULT_PG_LISTEN_ADDR, DEFAULT_SSL_CERT_FILE, DEFAULT_SSL_CERT_RELOAD_PERIOD, DEFAULT_SSL_KEY_FILE, }; +use safekeeper::hadron; use safekeeper::wal_backup::WalBackup; use safekeeper::{ BACKGROUND_RUNTIME, BROKER_RUNTIME, GlobalTimelines, HTTP_RUNTIME, SafeKeeperConf, @@ -37,6 +38,7 @@ use tracing::*; use utils::auth::{JwtAuth, Scope, SwappableJwtAuth}; use utils::id::NodeId; use utils::logging::{self, LogFormat, SecretString}; +use utils::metrics_collector::{METRICS_COLLECTION_INTERVAL, 
METRICS_COLLECTOR}; use utils::sentry_init::init_sentry; use utils::{pid_file, project_build_tag, project_git_version, tcp_listener}; @@ -243,9 +245,18 @@ struct Args { #[arg(long)] enable_tls_wal_service_api: bool, + /// Controls whether to collect all metrics on each scrape or to return potentially stale + /// results. + #[arg(long, default_value_t = true)] + force_metric_collection_on_scrape: bool, + /// Run in development mode (disables security checks) #[arg(long, help = "Run in development mode (disables security checks)")] dev: bool, + /* BEGIN_HADRON */ + #[arg(long)] + enable_pull_timeline_on_startup: bool, + /* END_HADRON */ } // Like PathBufValueParser, but allows empty string. @@ -428,6 +439,12 @@ async fn main() -> anyhow::Result<()> { ssl_ca_certs, use_https_safekeeper_api: args.use_https_safekeeper_api, enable_tls_wal_service_api: args.enable_tls_wal_service_api, + force_metric_collection_on_scrape: args.force_metric_collection_on_scrape, + /* BEGIN_HADRON */ + advertise_pg_addr_tenant_only: None, + enable_pull_timeline_on_startup: args.enable_pull_timeline_on_startup, + hcc_base_url: None, + /* END_HADRON */ }); // initialize sentry if SENTRY_DSN is provided @@ -522,6 +539,20 @@ async fn start_safekeeper(conf: Arc) -> Result<()> { // Load all timelines from disk to memory. global_timelines.init().await?; + /* BEGIN_HADRON */ + if conf.enable_pull_timeline_on_startup && global_timelines.timelines_count() == 0 { + match hadron::hcc_pull_timelines(&conf, global_timelines.clone()).await { + Ok(_) => { + info!("Successfully pulled all timelines from peer safekeepers"); + } + Err(e) => { + error!("Failed to pull timelines from peer safekeepers: {:?}", e); + return Err(e); + } + } + } + /* END_HADRON */ + // Run everything in current thread rt, if asked. 
if conf.current_thread_runtime { info!("running in current thread runtime"); @@ -640,6 +671,26 @@ async fn start_safekeeper(conf: Arc) -> Result<()> { .map(|res| ("broker main".to_owned(), res)); tasks_handles.push(Box::pin(broker_task_handle)); + /* BEGIN_HADRON */ + if conf.force_metric_collection_on_scrape { + let metrics_handle = current_thread_rt + .as_ref() + .unwrap_or_else(|| BACKGROUND_RUNTIME.handle()) + .spawn(async move { + let mut interval: tokio::time::Interval = + tokio::time::interval(METRICS_COLLECTION_INTERVAL); + loop { + interval.tick().await; + tokio::task::spawn_blocking(|| { + METRICS_COLLECTOR.run_once(true); + }); + } + }) + .map(|res| ("broker main".to_owned(), res)); + tasks_handles.push(Box::pin(metrics_handle)); + } + /* END_HADRON */ + set_build_info_metric(GIT_VERSION, BUILD_TAG); // TODO: update tokio-stream, convert to real async Stream with diff --git a/safekeeper/src/hadron.rs b/safekeeper/src/hadron.rs new file mode 100644 index 0000000000..b41bf2c3da --- /dev/null +++ b/safekeeper/src/hadron.rs @@ -0,0 +1,388 @@ +use pem::Pem; +use safekeeper_api::models::PullTimelineRequest; +use std::{collections::HashMap, env::VarError, net::IpAddr, sync::Arc, time::Duration}; +use tokio::time::sleep; +use tokio_util::sync::CancellationToken; +use url::Url; +use utils::{backoff, id::TenantTimelineId, ip_address}; + +use anyhow::Result; +use pageserver_api::controller_api::{ + AvailabilityZone, NodeRegisterRequest, SafekeeperTimeline, SafekeeperTimelinesResponse, +}; + +use crate::{ + GlobalTimelines, SafeKeeperConf, + metrics::{ + SK_RECOVERY_PULL_TIMELINE_ERRORS, SK_RECOVERY_PULL_TIMELINE_OKS, + SK_RECOVERY_PULL_TIMELINE_SECONDS, SK_RECOVERY_PULL_TIMELINES_SECONDS, + }, + pull_timeline, + timelines_global_map::DeleteOrExclude, +}; + +// Extract information in the SafeKeeperConf to build a NodeRegisterRequest used to register the safekeeper with the HCC. 
+fn build_node_registeration_request( + conf: &SafeKeeperConf, + node_ip_addr: Option, +) -> Result { + let advertise_pg_addr_with_port = conf + .advertise_pg_addr_tenant_only + .as_deref() + .expect("advertise_pg_addr_tenant_only is required to register with HCC"); + + // Extract host/port from the string. + let (advertise_host_addr, pg_port_str) = advertise_pg_addr_with_port.split_at( + advertise_pg_addr_with_port + .rfind(':') + .ok_or(anyhow::anyhow!("Invalid advertise_pg_addr"))?, + ); + // Need the `[1..]` to remove the leading ':'. + let pg_port = pg_port_str[1..] + .parse::() + .map_err(|e| anyhow::anyhow!("Cannot parse PG port: {}", e))?; + + let (_, http_port_str) = conf.listen_http_addr.split_at( + conf.listen_http_addr + .rfind(':') + .ok_or(anyhow::anyhow!("Invalid listen_http_addr"))?, + ); + let http_port = http_port_str[1..] + .parse::() + .map_err(|e| anyhow::anyhow!("Cannot parse HTTP port: {}", e))?; + + Ok(NodeRegisterRequest { + node_id: conf.my_id, + listen_pg_addr: advertise_host_addr.to_string(), + listen_pg_port: pg_port, + listen_http_addr: advertise_host_addr.to_string(), + listen_http_port: http_port, + node_ip_addr, + availability_zone_id: AvailabilityZone("todo".to_string()), + listen_grpc_addr: None, + listen_grpc_port: None, + listen_https_port: None, + }) +} + +// Retrieve the JWT token used for authenticating with HCC from the environment variable. +// Returns None if the token cannot be retrieved. 
+fn get_hcc_auth_token() -> Option { + match std::env::var("HCC_AUTH_TOKEN") { + Ok(v) => { + tracing::info!("Loaded JWT token for authentication with HCC"); + Some(v) + } + Err(VarError::NotPresent) => { + tracing::info!("No JWT token for authentication with HCC detected"); + None + } + Err(_) => { + tracing::info!( + "Failed to either load to detect non-present HCC_AUTH_TOKEN environment variable" + ); + None + } + } +} + +async fn send_safekeeper_register_request( + request_url: &Url, + auth_token: &Option, + request: &NodeRegisterRequest, +) -> Result<()> { + let client = reqwest::Client::new(); + let mut req_builder = client + .post(request_url.clone()) + .header("Content-Type", "application/json"); + if let Some(token) = auth_token { + req_builder = req_builder.bearer_auth(token); + } + req_builder + .json(&request) + .send() + .await? + .error_for_status()?; + Ok(()) +} + +/// Registers this safe keeper with the HCC. +pub async fn register(conf: &SafeKeeperConf) -> Result<()> { + match conf.hcc_base_url.as_ref() { + None => { + tracing::info!("HCC base URL is not set, skipping registration"); + Ok(()) + } + Some(hcc_base_url) => { + // The following operations acquiring the auth token and the node IP address both read environment + // variables. It's fine for now as this `register()` function is only called once during startup. + // If we start to talk to HCC more regularly in the safekeeper we should probably consider + // refactoring things into a "HadronClusterCoordinatorClient" struct. 
+ let auth_token = get_hcc_auth_token(); + let node_ip_addr = + ip_address::read_node_ip_addr_from_env().expect("Error reading node IP address."); + + let request = build_node_registeration_request(conf, node_ip_addr)?; + let cancel = CancellationToken::new(); + let request_url = hcc_base_url.clone().join("/hadron-internal/v1/sk")?; + + backoff::retry( + || async { + send_safekeeper_register_request(&request_url, &auth_token, &request).await + }, + |_| false, + 3, + u32::MAX, + "Calling the HCC safekeeper register API", + &cancel, + ) + .await + .ok_or(anyhow::anyhow!( + "Error in forever retry loop. This error should never be surfaced." + ))? + } + } +} + +async fn safekeeper_list_timelines_request( + conf: &SafeKeeperConf, +) -> Result { + if conf.hcc_base_url.is_none() { + tracing::info!("HCC base URL is not set, skipping registration"); + return Err(anyhow::anyhow!("HCC base URL is not set")); + } + + // The following operations acquiring the auth token and the node IP address both read environment + // variables. It's fine for now as this `register()` function is only called once during startup. + // If we start to talk to HCC more regularly in the safekeeper we should probably consider + // refactoring things into a "HadronClusterCoordinatorClient" struct. + let auth_token = get_hcc_auth_token(); + let method = format!("/control/v1/safekeeper/{}/timelines", conf.my_id.0); + let request_url = conf.hcc_base_url.as_ref().unwrap().clone().join(&method)?; + + let client = reqwest::Client::new(); + let mut req_builder = client + .get(request_url.clone()) + .header("Content-Type", "application/json") + .query(&[("id", conf.my_id.0)]); + if let Some(token) = auth_token { + req_builder = req_builder.bearer_auth(token); + } + let response = req_builder + .send() + .await? + .error_for_status()? + .json::() + .await?; + Ok(response) +} + +// Returns true on success, false otherwise. 
+pub async fn hcc_pull_timeline( + timeline: SafekeeperTimeline, + conf: &SafeKeeperConf, + global_timelines: Arc, + nodeid_http: &HashMap, +) -> bool { + let mut request = PullTimelineRequest { + tenant_id: timeline.tenant_id, + timeline_id: timeline.timeline_id, + http_hosts: Vec::new(), + ignore_tombstone: None, + }; + for host in timeline.peers { + if host.0 == conf.my_id.0 { + continue; + } + if let Some(http_host) = nodeid_http.get(&host.0) { + request.http_hosts.push(http_host.clone()); + } + } + + let ca_certs = match conf + .ssl_ca_certs + .iter() + .map(Pem::contents) + .map(reqwest::Certificate::from_der) + .collect::, _>>() + { + Ok(result) => result, + Err(_) => { + return false; + } + }; + match pull_timeline::handle_request( + request, + conf.sk_auth_token.clone(), + ca_certs, + global_timelines.clone(), + true, + ) + .await + { + Ok(resp) => { + tracing::info!( + "Completed pulling tenant {} timeline {} from SK {:?}", + timeline.tenant_id, + timeline.timeline_id, + resp.safekeeper_host + ); + return true; + } + Err(e) => { + tracing::error!( + "Failed to pull tenant {} timeline {} from SK {}", + timeline.tenant_id, + timeline.timeline_id, + e + ); + + let ttid = TenantTimelineId { + tenant_id: timeline.tenant_id, + timeline_id: timeline.timeline_id, + }; + // Revert the failed timeline pull. + // Notice that not found timeline returns OK also. 
+ match global_timelines + .delete_or_exclude(&ttid, DeleteOrExclude::DeleteLocal) + .await + { + Ok(dr) => { + tracing::info!( + "Deleted tenant {} timeline {} DirExists: {}", + timeline.tenant_id, + timeline.timeline_id, + dr.dir_existed, + ); + } + Err(e) => { + tracing::error!( + "Failed to delete tenant {} timeline {} from global_timelines: {}", + timeline.tenant_id, + timeline.timeline_id, + e + ); + } + } + } + } + false +} + +pub async fn hcc_pull_timeline_till_success( + timeline: SafekeeperTimeline, + conf: &SafeKeeperConf, + global_timelines: Arc, + nodeid_http: &HashMap, +) { + const MAX_PULL_TIMELINE_RETRIES: u64 = 100; + for i in 0..MAX_PULL_TIMELINE_RETRIES { + if hcc_pull_timeline( + timeline.clone(), + conf, + global_timelines.clone(), + nodeid_http, + ) + .await + { + SK_RECOVERY_PULL_TIMELINE_OKS.inc(); + return; + } + tracing::error!( + "Failed to pull timeline {} from SK peers, retrying {}/{}", + timeline.timeline_id, + i + 1, + MAX_PULL_TIMELINE_RETRIES + ); + tokio::time::sleep(std::time::Duration::from_secs(1)).await; + } + SK_RECOVERY_PULL_TIMELINE_ERRORS.inc(); +} + +pub async fn hcc_pull_timelines( + conf: &SafeKeeperConf, + global_timelines: Arc, +) -> Result<()> { + let _timer = SK_RECOVERY_PULL_TIMELINES_SECONDS.start_timer(); + tracing::info!("Start pulling timelines from SK peers"); + + let mut response = SafekeeperTimelinesResponse { + timelines: Vec::new(), + safekeeper_peers: Vec::new(), + }; + for i in 0..100 { + match safekeeper_list_timelines_request(conf).await { + Ok(timelines) => { + response = timelines; + } + Err(e) => { + tracing::error!("Failed to list timelines from HCC: {}", e); + if i == 99 { + return Err(e); + } + } + } + sleep(Duration::from_millis(100)).await; + } + + let mut nodeid_http = HashMap::new(); + for sk in response.safekeeper_peers { + nodeid_http.insert( + sk.node_id.0, + format!("http://{}:{}", sk.listen_http_addr, sk.http_port), + ); + } + tracing::info!("Received {} timelines from HCC", 
response.timelines.len()); + for timeline in response.timelines { + let _timer = SK_RECOVERY_PULL_TIMELINE_SECONDS + .with_label_values(&[ + &timeline.tenant_id.to_string(), + &timeline.timeline_id.to_string(), + ]) + .start_timer(); + hcc_pull_timeline_till_success(timeline, conf, global_timelines.clone(), &nodeid_http) + .await; + } + Ok(()) +} + +#[cfg(test)] +mod tests { + use super::*; + use utils::id::NodeId; + + #[test] + fn test_build_node_registeration_request() { + // Test that: + // 1. We always extract the host name and port used to register with the HCC from the + // `advertise_pg_addr` if it is set. + // 2. The correct ports are extracted from `advertise_pg_addr` and `listen_http_addr`. + let mut conf = SafeKeeperConf::dummy(); + conf.my_id = NodeId(1); + conf.advertise_pg_addr_tenant_only = + Some("safe-keeper-1.safe-keeper.hadron.svc.cluster.local:5454".to_string()); + // `listen_pg_addr` and `listen_pg_addr_tenant_only` are not used for node registration. Set them to a different + // host and port values and make sure that they don't show up in the node registration request. 
+ conf.listen_pg_addr = "0.0.0.0:5456".to_string(); + conf.listen_pg_addr_tenant_only = Some("0.0.0.0:5456".to_string()); + conf.listen_http_addr = "0.0.0.0:7676".to_string(); + let node_ip_addr: Option = Some("127.0.0.1".parse().unwrap()); + + let request = build_node_registeration_request(&conf, node_ip_addr).unwrap(); + assert_eq!(request.node_id, NodeId(1)); + assert_eq!( + request.listen_pg_addr, + "safe-keeper-1.safe-keeper.hadron.svc.cluster.local" + ); + assert_eq!(request.listen_pg_port, 5454); + assert_eq!( + request.listen_http_addr, + "safe-keeper-1.safe-keeper.hadron.svc.cluster.local" + ); + assert_eq!(request.listen_http_port, 7676); + assert_eq!( + request.node_ip_addr, + Some(IpAddr::V4("127.0.0.1".parse().unwrap())) + ); + } +} diff --git a/safekeeper/src/http/routes.rs b/safekeeper/src/http/routes.rs index 384c582678..a0ee2facb5 100644 --- a/safekeeper/src/http/routes.rs +++ b/safekeeper/src/http/routes.rs @@ -241,9 +241,14 @@ async fn timeline_pull_handler(mut request: Request) -> Result, pub availability_zone: Option, pub no_sync: bool, + /* BEGIN_HADRON */ + pub advertise_pg_addr_tenant_only: Option, + pub enable_pull_timeline_on_startup: bool, + pub hcc_base_url: Option, + /* END_HADRON */ pub broker_endpoint: Uri, pub broker_keepalive_interval: Duration, pub heartbeat_timeout: Duration, @@ -134,6 +141,7 @@ pub struct SafeKeeperConf { pub ssl_ca_certs: Vec, pub use_https_safekeeper_api: bool, pub enable_tls_wal_service_api: bool, + pub force_metric_collection_on_scrape: bool, } impl SafeKeeperConf { @@ -183,6 +191,12 @@ impl SafeKeeperConf { ssl_ca_certs: Vec::new(), use_https_safekeeper_api: false, enable_tls_wal_service_api: false, + force_metric_collection_on_scrape: true, + /* BEGIN_HADRON */ + advertise_pg_addr_tenant_only: None, + enable_pull_timeline_on_startup: false, + hcc_base_url: None, + /* END_HADRON */ } } } diff --git a/safekeeper/src/metrics.rs b/safekeeper/src/metrics.rs index 9baa80f73a..e1af51c115 100644 --- 
a/safekeeper/src/metrics.rs +++ b/safekeeper/src/metrics.rs @@ -59,6 +59,15 @@ pub static FLUSH_WAL_SECONDS: Lazy = Lazy::new(|| { .expect("Failed to register safekeeper_flush_wal_seconds histogram") }); /* BEGIN_HADRON */ +// Counter of all ProposerAcceptorMessage requests received +pub static PROPOSER_ACCEPTOR_MESSAGES_TOTAL: Lazy = Lazy::new(|| { + register_int_counter_vec!( + "safekeeper_proposer_acceptor_messages_total", + "Total number of ProposerAcceptorMessage requests received by the Safekeeper.", + &["outcome"] + ) + .expect("Failed to register safekeeper_proposer_acceptor_messages_total counter") +}); pub static WAL_DISK_IO_ERRORS: Lazy = Lazy::new(|| { register_int_counter!( "safekeeper_wal_disk_io_errors", @@ -76,6 +85,43 @@ pub static WAL_STORAGE_LIMIT_ERRORS: Lazy = Lazy::new(|| { ) .expect("Failed to register safekeeper_wal_storage_limit_errors counter") }); +pub static SK_RECOVERY_PULL_TIMELINE_ERRORS: Lazy = Lazy::new(|| { + register_int_counter!( + "safekeeper_recovery_pull_timeline_errors", + concat!( + "Number of errors due to pull_timeline errors during SK lost disk recovery.", + "An increase in this metric indicates pull timelines runs into error." + ) + ) + .expect("Failed to register safekeeper_recovery_pull_timeline_errors counter") +}); +pub static SK_RECOVERY_PULL_TIMELINE_OKS: Lazy = Lazy::new(|| { + register_int_counter!( + "safekeeper_recovery_pull_timeline_oks", + concat!( + "Number of successful pull_timeline during SK lost disk recovery.", + "An increase in this metric indicates pull timelines is successful." 
+ ) + ) + .expect("Failed to register safekeeper_recovery_pull_timeline_oks counter") +}); +pub static SK_RECOVERY_PULL_TIMELINES_SECONDS: Lazy = Lazy::new(|| { + register_histogram!( + "safekeeper_recovery_pull_timelines_seconds", + "Seconds to pull timelines", + DISK_FSYNC_SECONDS_BUCKETS.to_vec() + ) + .expect("Failed to register safekeeper_recovery_pull_timelines_seconds histogram") +}); +pub static SK_RECOVERY_PULL_TIMELINE_SECONDS: Lazy = Lazy::new(|| { + register_histogram_vec!( + "safekeeper_recovery_pull_timeline_seconds", + "Seconds to pull timeline", + &["tenant_id", "timeline_id"], + DISK_FSYNC_SECONDS_BUCKETS.to_vec() + ) + .expect("Failed to register safekeeper_recovery_pull_timeline_seconds histogram vec") +}); /* END_HADRON */ pub static PERSIST_CONTROL_FILE_SECONDS: Lazy = Lazy::new(|| { register_histogram!( diff --git a/safekeeper/src/pull_timeline.rs b/safekeeper/src/pull_timeline.rs index 1c9e5bade5..b4c4877b2c 100644 --- a/safekeeper/src/pull_timeline.rs +++ b/safekeeper/src/pull_timeline.rs @@ -8,6 +8,7 @@ use bytes::Bytes; use camino::Utf8PathBuf; use chrono::{DateTime, Utc}; use futures::{SinkExt, StreamExt, TryStreamExt}; +use http::StatusCode; use http_utils::error::ApiError; use postgres_ffi::{PG_TLI, XLogFileName, XLogSegNo}; use remote_storage::GenericRemoteStorage; @@ -21,10 +22,11 @@ use tokio::fs::OpenOptions; use tokio::io::AsyncWrite; use tokio::sync::mpsc; use tokio::task; +use tokio::time::sleep; use tokio_tar::{Archive, Builder, Header}; use tokio_util::io::{CopyToBytes, SinkWriter}; use tokio_util::sync::PollSender; -use tracing::{error, info, instrument}; +use tracing::{error, info, instrument, warn}; use utils::crashsafe::fsync_async_opt; use utils::id::{NodeId, TenantTimelineId}; use utils::logging::SecretString; @@ -449,6 +451,7 @@ pub async fn handle_request( sk_auth_token: Option, ssl_ca_certs: Vec, global_timelines: Arc, + wait_for_peer_timeline_status: bool, ) -> Result { let existing_tli = 
global_timelines.get(TenantTimelineId::new( request.tenant_id, @@ -472,37 +475,100 @@ pub async fn handle_request( let http_hosts = request.http_hosts.clone(); // Figure out statuses of potential donors. - let responses: Vec> = - futures::future::join_all(http_hosts.iter().map(|url| async { - let cclient = Client::new(http_client.clone(), url.clone(), sk_auth_token.clone()); - let info = cclient - .timeline_status(request.tenant_id, request.timeline_id) - .await?; - Ok(info) - })) - .await; - let mut statuses = Vec::new(); - for (i, response) in responses.into_iter().enumerate() { - match response { - Ok(status) => { - statuses.push((status, i)); - } - Err(e) => { - info!("error fetching status from {}: {e}", http_hosts[i]); + if !wait_for_peer_timeline_status { + let responses: Vec> = + futures::future::join_all(http_hosts.iter().map(|url| async { + let cclient = Client::new(http_client.clone(), url.clone(), sk_auth_token.clone()); + let resp = cclient + .timeline_status(request.tenant_id, request.timeline_id) + .await?; + let info: TimelineStatus = resp + .json() + .await + .context("Failed to deserialize timeline status") + .map_err(|e| mgmt_api::Error::ReceiveErrorBody(e.to_string()))?; + Ok(info) + })) + .await; + + for (i, response) in responses.into_iter().enumerate() { + match response { + Ok(status) => { + statuses.push((status, i)); + } + Err(e) => { + info!("error fetching status from {}: {e}", http_hosts[i]); + } } } - } - // Allow missing responses from up to one safekeeper (say due to downtime) - // e.g. if we created a timeline on PS A and B, with C being offline. Then B goes - // offline and C comes online. Then we want a pull on C with A and B as hosts to work. - let min_required_successful = (http_hosts.len() - 1).max(1); - if statuses.len() < min_required_successful { - return Err(ApiError::InternalServerError(anyhow::anyhow!( - "only got {} successful status responses. 
required: {min_required_successful}", - statuses.len() - ))); + // Allow missing responses from up to one safekeeper (say due to downtime) + // e.g. if we created a timeline on PS A and B, with C being offline. Then B goes + // offline and C comes online. Then we want a pull on C with A and B as hosts to work. + let min_required_successful = (http_hosts.len() - 1).max(1); + if statuses.len() < min_required_successful { + return Err(ApiError::InternalServerError(anyhow::anyhow!( + "only got {} successful status responses. required: {min_required_successful}", + statuses.len() + ))); + } + } else { + let mut retry = true; + // We must get status from all other peers. + // Otherwise, we may run into split-brain scenario. + while retry { + statuses.clear(); + retry = false; + for (i, url) in http_hosts.iter().enumerate() { + let cclient = Client::new(http_client.clone(), url.clone(), sk_auth_token.clone()); + match cclient + .timeline_status(request.tenant_id, request.timeline_id) + .await + { + Ok(resp) => { + if resp.status() == StatusCode::NOT_FOUND { + warn!( + "Timeline {} not found on peer SK {}, no need to pull it", + TenantTimelineId::new(request.tenant_id, request.timeline_id), + url + ); + return Ok(PullTimelineResponse { + safekeeper_host: None, + }); + } + let info: TimelineStatus = resp + .json() + .await + .context("Failed to deserialize timeline status") + .map_err(ApiError::InternalServerError)?; + statuses.push((info, i)); + } + Err(e) => { + match e { + // If we get a 404, it means the timeline doesn't exist on this safekeeper. + // We can ignore this error. 
+ mgmt_api::Error::ApiError(status, _) + if status == StatusCode::NOT_FOUND => + { + warn!( + "Timeline {} not found on peer SK {}, no need to pull it", + TenantTimelineId::new(request.tenant_id, request.timeline_id), + url + ); + return Ok(PullTimelineResponse { + safekeeper_host: None, + }); + } + _ => {} + } + retry = true; + error!("Failed to get timeline status from {}: {:#}", url, e); + } + } + } + sleep(std::time::Duration::from_millis(100)).await; + } } // Find the most advanced safekeeper @@ -511,6 +577,12 @@ pub async fn handle_request( .max_by_key(|(status, _)| { ( status.acceptor_state.epoch, + /* BEGIN_HADRON */ + // We need to pull from the SK with the highest term. + // This is because another compute may come online and vote the same highest term again on the other two SKs. + // Then, there will be 2 computes running on the same term. + status.acceptor_state.term, + /* END_HADRON */ status.flush_lsn, status.commit_lsn, ) diff --git a/safekeeper/src/safekeeper.rs b/safekeeper/src/safekeeper.rs index 4d15fc9de3..09ca041e22 100644 --- a/safekeeper/src/safekeeper.rs +++ b/safekeeper/src/safekeeper.rs @@ -24,7 +24,7 @@ use utils::id::{NodeId, TenantId, TimelineId}; use utils::lsn::Lsn; use utils::pageserver_feedback::PageserverFeedback; -use crate::metrics::MISC_OPERATION_SECONDS; +use crate::metrics::{MISC_OPERATION_SECONDS, PROPOSER_ACCEPTOR_MESSAGES_TOTAL}; use crate::state::TimelineState; use crate::{control_file, wal_storage}; @@ -938,7 +938,7 @@ where &mut self, msg: &ProposerAcceptorMessage, ) -> Result> { - match msg { + let res = match msg { ProposerAcceptorMessage::Greeting(msg) => self.handle_greeting(msg).await, ProposerAcceptorMessage::VoteRequest(msg) => self.handle_vote_request(msg).await, ProposerAcceptorMessage::Elected(msg) => self.handle_elected(msg).await, @@ -949,7 +949,20 @@ where self.handle_append_request(msg, false).await } ProposerAcceptorMessage::FlushWAL => self.handle_flush().await, - } + }; + + // BEGIN HADRON + match &res { 
+ Ok(_) => PROPOSER_ACCEPTOR_MESSAGES_TOTAL + .with_label_values(&["success"]) + .inc(), + Err(_) => PROPOSER_ACCEPTOR_MESSAGES_TOTAL + .with_label_values(&["error"]) + .inc(), + }; + + res + // END HADRON } /// Handle initial message from proposer: check its sanity and send my diff --git a/safekeeper/src/wal_backup.rs b/safekeeper/src/wal_backup.rs index 7e10847a1b..0e8dfd64c3 100644 --- a/safekeeper/src/wal_backup.rs +++ b/safekeeper/src/wal_backup.rs @@ -166,7 +166,7 @@ fn hadron_determine_offloader(mgr: &Manager, state: &StateSnapshot) -> (Option) -> Result<()> { ssl_ca_certs: Vec::new(), use_https_safekeeper_api: false, enable_tls_wal_service_api: false, + force_metric_collection_on_scrape: true, + /* BEGIN_HADRON */ + enable_pull_timeline_on_startup: false, + advertise_pg_addr_tenant_only: None, + hcc_base_url: None, + /* END_HADRON */ }; let mut global = GlobalMap::new(disk, conf.clone())?; diff --git a/storage_controller/src/http.rs b/storage_controller/src/http.rs index e5a3a969d4..62fc212e12 100644 --- a/storage_controller/src/http.rs +++ b/storage_controller/src/http.rs @@ -850,6 +850,31 @@ async fn handle_tenant_describe( json_response(StatusCode::OK, service.tenant_describe(tenant_id)?) 
} +/* BEGIN_HADRON */ +async fn handle_tenant_timeline_describe( + service: Arc, + req: Request, +) -> Result, ApiError> { + check_permissions(&req, Scope::Scrubber)?; + + let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?; + let timeline_id: TimelineId = parse_request_param(&req, "timeline_id")?; + match maybe_forward(req).await { + ForwardOutcome::Forwarded(res) => { + return res; + } + ForwardOutcome::NotForwarded(_req) => {} + }; + + json_response( + StatusCode::OK, + service + .tenant_timeline_describe(tenant_id, timeline_id) + .await?, + ) +} +/* END_HADRON */ + async fn handle_tenant_list( service: Arc, req: Request, @@ -2480,6 +2505,13 @@ pub fn make_router( ) }) // Timeline operations + .get("/control/v1/tenant/:tenant_id/timeline/:timeline_id", |r| { + tenant_service_handler( + r, + handle_tenant_timeline_describe, + RequestName("v1_tenant_timeline_describe"), + ) + }) .delete("/v1/tenant/:tenant_id/timeline/:timeline_id", |r| { tenant_service_handler( r, diff --git a/storage_controller/src/main.rs b/storage_controller/src/main.rs index 2a851dc25b..5d21feeb10 100644 --- a/storage_controller/src/main.rs +++ b/storage_controller/src/main.rs @@ -222,6 +222,9 @@ struct Cli { /// Primarily useful for testing to reduce test execution time. 
#[arg(long, default_value = "false", action=ArgAction::Set)] kick_secondary_downloads: bool, + + #[arg(long)] + shard_split_request_timeout: Option, } enum StrictMode { @@ -470,6 +473,10 @@ async fn async_main() -> anyhow::Result<()> { timeline_safekeeper_count: args.timeline_safekeeper_count, posthog_config: posthog_config.clone(), kick_secondary_downloads: args.kick_secondary_downloads, + shard_split_request_timeout: args + .shard_split_request_timeout + .map(humantime::Duration::into) + .unwrap_or(Duration::MAX), }; // Validate that we can connect to the database diff --git a/storage_controller/src/pageserver_client.rs b/storage_controller/src/pageserver_client.rs index d6fe173eb3..da0687895a 100644 --- a/storage_controller/src/pageserver_client.rs +++ b/storage_controller/src/pageserver_client.rs @@ -86,6 +86,23 @@ impl PageserverClient { ) } + /* BEGIN_HADRON */ + pub(crate) async fn tenant_timeline_describe( + &self, + tenant_shard_id: &TenantShardId, + timeline_id: &TimelineId, + ) -> Result { + measured_request!( + "tenant_timeline_describe", + crate::metrics::Method::Get, + &self.node_id_label, + self.inner + .tenant_timeline_describe(tenant_shard_id, timeline_id,) + .await + ) + } + /* END_HADRON */ + pub(crate) async fn tenant_scan_remote_storage( &self, tenant_id: TenantId, diff --git a/storage_controller/src/service.rs b/storage_controller/src/service.rs index ed6643d641..638cb410fa 100644 --- a/storage_controller/src/service.rs +++ b/storage_controller/src/service.rs @@ -32,7 +32,7 @@ use pageserver_api::controller_api::{ ShardSchedulingPolicy, ShardsPreferredAzsRequest, ShardsPreferredAzsResponse, SkSchedulingPolicy, TenantCreateRequest, TenantCreateResponse, TenantCreateResponseShard, TenantDescribeResponse, TenantDescribeResponseShard, TenantLocateResponse, TenantPolicyRequest, - TenantShardMigrateRequest, TenantShardMigrateResponse, + TenantShardMigrateRequest, TenantShardMigrateResponse, TenantTimelineDescribeResponse, }; use 
pageserver_api::models::{ self, DetachBehavior, LocationConfig, LocationConfigListResponse, LocationConfigMode, LsnLease, @@ -60,6 +60,7 @@ use tokio::sync::mpsc::error::TrySendError; use tokio_util::sync::CancellationToken; use tracing::{Instrument, debug, error, info, info_span, instrument, warn}; use utils::completion::Barrier; +use utils::env; use utils::generation::Generation; use utils::id::{NodeId, TenantId, TimelineId}; use utils::lsn::Lsn; @@ -483,6 +484,9 @@ pub struct Config { /// When set, actively checks and initiates heatmap downloads/uploads. pub kick_secondary_downloads: bool, + + /// Timeout used for HTTP client of split requests. [`Duration::MAX`] if None. + pub shard_split_request_timeout: Duration, } impl From for ApiError { @@ -1984,11 +1988,14 @@ impl Service { }); // Check that there is enough safekeepers configured that we can create new timelines - let test_sk_res = this.safekeepers_for_new_timeline().await; + let test_sk_res_str = match this.safekeepers_for_new_timeline().await { + Ok(v) => format!("Ok({v:?})"), + Err(v) => format!("Err({v:})"), + }; tracing::info!( timeline_safekeeper_count = config.timeline_safekeeper_count, timelines_onto_safekeepers = config.timelines_onto_safekeepers, - "viability test result (test timeline creation on safekeepers): {test_sk_res:?}", + "viability test result (test timeline creation on safekeepers): {test_sk_res_str}", ); Ok(this) @@ -4428,7 +4435,7 @@ impl Service { .await; let mut failed = 0; - for (tid, result) in targeted_tenant_shards.iter().zip(results.into_iter()) { + for (tid, (_, result)) in targeted_tenant_shards.iter().zip(results.into_iter()) { match result { Ok(ok) => { if tid.is_shard_zero() { @@ -4758,6 +4765,7 @@ impl Service { ) .await; + let mut retry_if_not_attached = false; let targets = { let locked = self.inner.read().unwrap(); let mut targets = Vec::new(); @@ -4774,6 +4782,24 @@ impl Service { .expect("Pageservers may not be deleted while referenced"); 
targets.push((*tenant_shard_id, node.clone())); + + if let Some(location) = shard.observed.locations.get(node_id) { + if let Some(ref conf) = location.conf { + if conf.mode != LocationConfigMode::AttachedSingle + && conf.mode != LocationConfigMode::AttachedMulti + { + // If the shard is attached as secondary, we need to retry if 404. + retry_if_not_attached = true; + } + // If the shard is attached as primary, we should succeed. + } else { + // Location conf is not available yet, retry if 404. + retry_if_not_attached = true; + } + } else { + // The shard is not attached to the intended pageserver yet, retry if 404. + retry_if_not_attached = true; + } } } targets @@ -4795,7 +4821,7 @@ impl Service { .await; let mut valid_until = None; - for r in res { + for (node, r) in res { match r { Ok(lease) => { if let Some(ref mut valid_until) = valid_until { @@ -4804,8 +4830,20 @@ impl Service { valid_until = Some(lease.valid_until); } } + Err(mgmt_api::Error::ApiError(StatusCode::NOT_FOUND, _)) + if retry_if_not_attached => + { + // This is expected if the attach is not finished yet. Return 503 so that the client can retry. 
+ return Err(ApiError::ResourceUnavailable( + format!( + "Timeline is not attached to the pageserver {} yet, please retry", + node.get_id() + ) + .into(), + )); + } Err(e) => { - return Err(ApiError::InternalServerError(anyhow::anyhow!(e))); + return Err(passthrough_api_error(&node, e)); } } } @@ -4919,7 +4957,7 @@ impl Service { max_retries: u32, timeout: Duration, cancel: &CancellationToken, - ) -> Vec> + ) -> Vec<(Node, mgmt_api::Result)> where O: Fn(TenantShardId, PageserverClient) -> F + Copy, F: std::future::Future>, @@ -4940,16 +4978,16 @@ impl Service { cancel, ) .await; - (idx, r) + (idx, node, r) }); } - while let Some((idx, r)) = futs.next().await { - results.push((idx, r.unwrap_or(Err(mgmt_api::Error::Cancelled)))); + while let Some((idx, node, r)) = futs.next().await { + results.push((idx, node, r.unwrap_or(Err(mgmt_api::Error::Cancelled)))); } - results.sort_by_key(|(idx, _)| *idx); - results.into_iter().map(|(_, r)| r).collect() + results.sort_by_key(|(idx, _, _)| *idx); + results.into_iter().map(|(_, node, r)| (node, r)).collect() } /// Helper for safely working with the shards in a tenant remotely on pageservers, for example @@ -5172,6 +5210,9 @@ impl Service { match res { Ok(ok) => Ok(ok), Err(mgmt_api::Error::ApiError(StatusCode::CONFLICT, _)) => Ok(StatusCode::CONFLICT), + Err(mgmt_api::Error::ApiError(StatusCode::PRECONDITION_FAILED, msg)) if msg.contains("Requested tenant is missing") => { + Err(ApiError::ResourceUnavailable("Tenant migration in progress".into())) + }, Err(mgmt_api::Error::ApiError(StatusCode::SERVICE_UNAVAILABLE, msg)) => Err(ApiError::ResourceUnavailable(msg.into())), Err(e) => { Err( @@ -5452,6 +5493,92 @@ impl Service { .ok_or_else(|| ApiError::NotFound(anyhow::anyhow!("Tenant {tenant_id} not found").into())) } + /* BEGIN_HADRON */ + pub(crate) async fn tenant_timeline_describe( + &self, + tenant_id: TenantId, + timeline_id: TimelineId, + ) -> Result { + self.tenant_remote_mutation(tenant_id, |locations| async move { + if 
locations.0.is_empty() { + return Err(ApiError::NotFound( + anyhow::anyhow!("Tenant not found").into(), + )); + }; + + let locations: Vec<(TenantShardId, Node)> = locations + .0 + .iter() + .map(|t| (*t.0, t.1.latest.node.clone())) + .collect(); + let mut futs = FuturesUnordered::new(); + + for (shard_id, node) in locations { + futs.push({ + async move { + let result = node + .with_client_retries( + |client| async move { + client + .tenant_timeline_describe(&shard_id, &timeline_id) + .await + }, + &self.http_client, + &self.config.pageserver_jwt_token, + 3, + 3, + Duration::from_secs(30), + &self.cancel, + ) + .await; + (result, shard_id, node.get_id()) + } + }); + } + + let mut results: Vec = Vec::new(); + while let Some((result, tenant_shard_id, node_id)) = futs.next().await { + match result { + Some(Ok(timeline_info)) => results.push(timeline_info), + Some(Err(e)) => { + tracing::warn!( + "Failed to describe tenant {} timeline {} for pageserver {}: {e}", + tenant_shard_id, + timeline_id, + node_id, + ); + return Err(ApiError::ResourceUnavailable(format!("{e}").into())); + } + None => return Err(ApiError::Cancelled), + } + } + let mut image_consistent_lsn: Option = Some(Lsn::MAX); + for timeline_info in &results { + if let Some(tline_image_consistent_lsn) = timeline_info.image_consistent_lsn { + image_consistent_lsn = Some(std::cmp::min( + image_consistent_lsn.unwrap(), + tline_image_consistent_lsn, + )); + } else { + tracing::warn!( + "Timeline {} on shard {} does not have image consistent lsn", + timeline_info.timeline_id, + timeline_info.tenant_id + ); + image_consistent_lsn = None; + break; + } + } + + Ok(TenantTimelineDescribeResponse { + shards: results, + image_consistent_lsn, + }) + }) + .await? + } + /* END_HADRON */ + /// limit & offset are pagination parameters. Since we are walking an in-memory HashMap, `offset` does not /// avoid traversing data, it just avoid returning it. 
This is suitable for our purposes, since our in memory /// maps are small enough to traverse fast, our pagination is just to avoid serializing huge JSON responses @@ -5862,7 +5989,7 @@ impl Service { return; } - for result in self + for (_, result) in self .tenant_for_shards_api( attached, |tenant_shard_id, client| async move { @@ -5881,7 +6008,7 @@ impl Service { } } - for result in self + for (_, result) in self .tenant_for_shards_api( secondary, |tenant_shard_id, client| async move { @@ -6283,18 +6410,39 @@ impl Service { // TODO: issue split calls concurrently (this only matters once we're splitting // N>1 shards into M shards -- initially we're usually splitting 1 shard into N). + // HADRON: set a timeout for splitting individual shards on page servers. + // Currently we do not perform any retry because it's not clear if page server can handle + // partially split shards correctly. + let shard_split_timeout = + if let Some(env::DeploymentMode::Local) = env::get_deployment_mode() { + Duration::from_secs(30) + } else { + self.config.shard_split_request_timeout + }; + let mut http_client_builder = reqwest::ClientBuilder::new() + .pool_max_idle_per_host(0) + .timeout(shard_split_timeout); + + for ssl_ca_cert in &self.config.ssl_ca_certs { + http_client_builder = http_client_builder.add_root_certificate(ssl_ca_cert.clone()); + } + let http_client = http_client_builder + .build() + .expect("Failed to construct HTTP client"); for target in &targets { let ShardSplitTarget { parent_id, node, child_ids, } = target; + let client = PageserverClient::new( node.get_id(), - self.http_client.clone(), + http_client.clone(), node.base_url(), self.config.pageserver_jwt_token.as_deref(), ); + let response = client .tenant_shard_split( *parent_id, @@ -8768,7 +8916,7 @@ impl Service { ) .await; - for ((tenant_shard_id, node, optimization), secondary_status) in + for ((tenant_shard_id, node, optimization), (_, secondary_status)) in 
want_secondary_status.into_iter().zip(results.into_iter()) { match secondary_status { diff --git a/storage_controller/src/service/safekeeper_service.rs b/storage_controller/src/service/safekeeper_service.rs index d7179372b2..7521d7bd86 100644 --- a/storage_controller/src/service/safekeeper_service.rs +++ b/storage_controller/src/service/safekeeper_service.rs @@ -25,7 +25,8 @@ use pageserver_api::models::{SafekeeperInfo, SafekeepersInfo, TimelineInfo}; use safekeeper_api::PgVersionId; use safekeeper_api::membership::{self, MemberSet, SafekeeperGeneration}; use safekeeper_api::models::{ - PullTimelineRequest, TimelineMembershipSwitchRequest, TimelineMembershipSwitchResponse, + PullTimelineRequest, TimelineLocateResponse, TimelineMembershipSwitchRequest, + TimelineMembershipSwitchResponse, }; use safekeeper_api::{INITIAL_TERM, Term}; use safekeeper_client::mgmt_api; @@ -37,21 +38,14 @@ use utils::lsn::Lsn; use super::Service; -#[derive(serde::Serialize, serde::Deserialize, Clone)] -pub struct TimelineLocateResponse { - pub generation: SafekeeperGeneration, - pub sk_set: Vec, - pub new_sk_set: Option>, -} - impl Service { - fn make_member_set(safekeepers: &[Safekeeper]) -> Result { + fn make_member_set(safekeepers: &[Safekeeper]) -> Result { let members = safekeepers .iter() .map(|sk| sk.get_safekeeper_id()) .collect::>(); - MemberSet::new(members).map_err(ApiError::InternalServerError) + MemberSet::new(members) } fn get_safekeepers(&self, ids: &[i64]) -> Result, ApiError> { @@ -86,7 +80,7 @@ impl Service { ) -> Result, ApiError> { let safekeepers = self.get_safekeepers(&timeline_persistence.sk_set)?; - let mset = Self::make_member_set(&safekeepers)?; + let mset = Self::make_member_set(&safekeepers).map_err(ApiError::InternalServerError)?; let mconf = safekeeper_api::membership::Configuration::new(mset); let req = safekeeper_api::models::TimelineCreateRequest { @@ -1111,6 +1105,26 @@ impl Service { } } + if new_sk_set.is_empty() { + return 
Err(ApiError::BadRequest(anyhow::anyhow!( + "new safekeeper set is empty" + ))); + } + + if new_sk_set.len() < self.config.timeline_safekeeper_count { + return Err(ApiError::BadRequest(anyhow::anyhow!( + "new safekeeper set must have at least {} safekeepers", + self.config.timeline_safekeeper_count + ))); + } + + let new_sk_set_i64 = new_sk_set.iter().map(|id| id.0 as i64).collect::>(); + let new_safekeepers = self.get_safekeepers(&new_sk_set_i64)?; + // Construct new member set in advance to validate it. + // E.g. validates that there is no duplicate safekeepers. + let new_sk_member_set = + Self::make_member_set(&new_safekeepers).map_err(ApiError::BadRequest)?; + // TODO(diko): per-tenant lock is too wide. Consider introducing per-timeline locks. let _tenant_lock = trace_shared_lock( &self.tenant_op_locks, @@ -1141,6 +1155,18 @@ impl Service { .map(|&id| NodeId(id as u64)) .collect::>(); + // Validate that we are not migrating to a decomissioned safekeeper. + for sk in new_safekeepers.iter() { + if !cur_sk_set.contains(&sk.get_id()) + && sk.scheduling_policy() == SkSchedulingPolicy::Decomissioned + { + return Err(ApiError::BadRequest(anyhow::anyhow!( + "safekeeper {} is decomissioned", + sk.get_id() + ))); + } + } + tracing::info!( ?cur_sk_set, ?new_sk_set, @@ -1183,11 +1209,8 @@ impl Service { } let cur_safekeepers = self.get_safekeepers(&timeline.sk_set)?; - let cur_sk_member_set = Self::make_member_set(&cur_safekeepers)?; - - let new_sk_set_i64 = new_sk_set.iter().map(|id| id.0 as i64).collect::>(); - let new_safekeepers = self.get_safekeepers(&new_sk_set_i64)?; - let new_sk_member_set = Self::make_member_set(&new_safekeepers)?; + let cur_sk_member_set = + Self::make_member_set(&cur_safekeepers).map_err(ApiError::InternalServerError)?; let joint_config = membership::Configuration { generation, diff --git a/test_runner/fixtures/neon_api.py b/test_runner/fixtures/neon_api.py index 703ee4b91e..a41906c956 100644 --- a/test_runner/fixtures/neon_api.py +++ 
b/test_runner/fixtures/neon_api.py @@ -34,7 +34,9 @@ class NeonAPI: self.retries524 = 0 self.retries4xx = 0 - def __request(self, method: str | bytes, endpoint: str, **kwargs: Any) -> requests.Response: + def __request( + self, method: str | bytes, endpoint: str, retry404: bool = False, **kwargs: Any + ) -> requests.Response: kwargs["headers"] = kwargs.get("headers", {}) kwargs["headers"]["Authorization"] = f"Bearer {self.__neon_api_key}" @@ -55,10 +57,12 @@ class NeonAPI: resp.raise_for_status() break elif resp.status_code >= 400: - if resp.status_code == 422: - if resp.json()["message"] == "branch not ready yet": - retry = True - self.retries4xx += 1 + if resp.status_code == 404 and retry404: + retry = True + self.retries4xx += 1 + elif resp.status_code == 422 and resp.json()["message"] == "branch not ready yet": + retry = True + self.retries4xx += 1 elif resp.status_code == 423 and resp.json()["message"] in { "endpoint is in some transitive state, could not suspend", "project already has running conflicting operations, scheduling of new ones is prohibited", @@ -66,7 +70,7 @@ class NeonAPI: retry = True self.retries4xx += 1 elif resp.status_code == 524: - log.info("The request was timed out, trying to get operations") + log.info("The request was timed out") retry = True self.retries524 += 1 if retry: @@ -203,6 +207,9 @@ class NeonAPI: resp = self.__request( "GET", f"/projects/{project_id}/branches/{branch_id}", + # XXX Retry get parent details to work around the issue + # https://databricks.atlassian.net/browse/LKB-279 + retry404=True, headers={ "Accept": "application/json", }, @@ -317,6 +324,10 @@ class NeonAPI: if endpoint_type: data["endpoint"]["type"] = endpoint_type if settings: + # otherwise we get 400 "settings must not be nil" + # TODO(myrrc): fix on cplane side + if "pg_settings" not in settings: + settings["pg_settings"] = {} data["endpoint"]["settings"] = settings resp = self.__request( diff --git a/test_runner/fixtures/neon_fixtures.py 
b/test_runner/fixtures/neon_fixtures.py index f54d5be635..b9fff05c6c 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -1795,6 +1795,33 @@ def neon_env_builder( record_property("preserve_database_files", builder.preserve_database_files) +@pytest.fixture(scope="function") +def neon_env_builder_local( + neon_env_builder: NeonEnvBuilder, + test_output_dir: Path, + pg_distrib_dir: Path, +) -> NeonEnvBuilder: + """ + Fixture to create a Neon environment for test with its own pg_install copy. + + This allows the test to edit the list of available extensions in the + local instance of Postgres used for the test, and install extensions via + downloading them when a remote extension is tested, for instance, or + copying files around for local extension testing. + """ + test_local_pginstall = test_output_dir / "pg_install" + log.info(f"copy {pg_distrib_dir} to {test_local_pginstall}") + + # We can't copy only the version that we are currently testing because other + # binaries like the storage controller need specific Postgres versions. 
+ shutil.copytree(pg_distrib_dir, test_local_pginstall) + + neon_env_builder.pg_distrib_dir = test_local_pginstall + log.info(f"local neon_env_builder.pg_distrib_dir: {neon_env_builder.pg_distrib_dir}") + + return neon_env_builder + + @dataclass class PageserverPort: pg: int @@ -2315,6 +2342,20 @@ class NeonStorageController(MetricsGetter, LogUtils): response.raise_for_status() return response.json() + # HADRON + def tenant_timeline_describe( + self, + tenant_id: TenantId, + timeline_id: TimelineId, + ): + response = self.request( + "GET", + f"{self.api}/control/v1/tenant/{tenant_id}/timeline/{timeline_id}", + headers=self.headers(TokenScope.ADMIN), + ) + response.raise_for_status() + return response.json() + def nodes(self): """ :return: list of {"id": ""} @@ -5368,6 +5409,7 @@ SKIP_FILES = frozenset( ( "pg_internal.init", "pg.log", + "neon.signal", "zenith.signal", "pg_hba.conf", "postgresql.conf", diff --git a/test_runner/fixtures/pageserver/allowed_errors.py b/test_runner/fixtures/pageserver/allowed_errors.py index 0e4dd571c0..59249f31ad 100755 --- a/test_runner/fixtures/pageserver/allowed_errors.py +++ b/test_runner/fixtures/pageserver/allowed_errors.py @@ -115,8 +115,7 @@ DEFAULT_PAGESERVER_ALLOWED_ERRORS = ( ".*Local data loss suspected.*", # Too many frozen layers error is normal during intensive benchmarks ".*too many frozen layers.*", - # Transient errors when resolving tenant shards by page service - ".*Fail to resolve tenant shard in attempt.*", + ".*Failed to resolve tenant shard after.*", # Expected warnings when pageserver has not refreshed GC info yet ".*pitr LSN/interval not found, skipping force image creation LSN calculation.*", ".*No broker updates received for a while.*", diff --git a/test_runner/fixtures/pageserver/http.py b/test_runner/fixtures/pageserver/http.py index 79cfba8da6..23b9d1c8c9 100644 --- a/test_runner/fixtures/pageserver/http.py +++ b/test_runner/fixtures/pageserver/http.py @@ -333,6 +333,13 @@ class 
PageserverHttpClient(requests.Session, MetricsGetter): res = self.post(f"http://localhost:{self.port}/v1/reload_auth_validation_keys") self.verbose_error(res) + def list_tenant_visible_size(self) -> dict[TenantShardId, int]: + res = self.get(f"http://localhost:{self.port}/v1/list_tenant_visible_size") + self.verbose_error(res) + res_json = res.json() + assert isinstance(res_json, dict) + return res_json + def tenant_list(self) -> list[dict[Any, Any]]: res = self.get(f"http://localhost:{self.port}/v1/tenant") self.verbose_error(res) @@ -1002,7 +1009,7 @@ class PageserverHttpClient(requests.Session, MetricsGetter): def get_metrics_str(self) -> str: """You probably want to use get_metrics() instead.""" - res = self.get(f"http://localhost:{self.port}/metrics") + res = self.get(f"http://localhost:{self.port}/metrics?use_latest=true") self.verbose_error(res) return res.text diff --git a/test_runner/fixtures/port_distributor.py b/test_runner/fixtures/port_distributor.py index 6a829a9399..e51d08e16e 100644 --- a/test_runner/fixtures/port_distributor.py +++ b/test_runner/fixtures/port_distributor.py @@ -3,6 +3,7 @@ from __future__ import annotations import re import socket from contextlib import closing +from itertools import cycle from fixtures.log_helper import log @@ -34,15 +35,23 @@ def can_bind(host: str, port: int) -> bool: class PortDistributor: def __init__(self, base_port: int, port_number: int): - self.iterator = iter(range(base_port, base_port + port_number)) + self.base_port = base_port + self.port_number = port_number + self.cycle = cycle(range(base_port, base_port + port_number)) self.port_map: dict[int, int] = {} def get_port(self) -> int: - for port in self.iterator: + checked = 0 + for port in self.cycle: if can_bind("localhost", port): return port + elif checked < self.port_number: + checked += 1 + else: + break + raise RuntimeError( - "port range configured for test is exhausted, consider enlarging the range" + f"port range 
({self.base_port}..{self.base_port + self.port_number}) configured for test is exhausted, consider enlarging the range" ) def replace_with_new_port(self, value: int | str) -> int | str: diff --git a/test_runner/fixtures/safekeeper/http.py b/test_runner/fixtures/safekeeper/http.py index 942b620be6..ceb00c0f90 100644 --- a/test_runner/fixtures/safekeeper/http.py +++ b/test_runner/fixtures/safekeeper/http.py @@ -143,7 +143,7 @@ class SafekeeperHttpClient(requests.Session, MetricsGetter): def get_metrics_str(self) -> str: """You probably want to use get_metrics() instead.""" - request_result = self.get(f"http://localhost:{self.port}/metrics") + request_result = self.get(f"http://localhost:{self.port}/metrics?use_latest=true") request_result.raise_for_status() return request_result.text diff --git a/test_runner/performance/test_lfc_prewarm.py b/test_runner/performance/test_lfc_prewarm.py new file mode 100644 index 0000000000..6c0083de95 --- /dev/null +++ b/test_runner/performance/test_lfc_prewarm.py @@ -0,0 +1,168 @@ +from __future__ import annotations + +import os +import timeit +import traceback +from concurrent.futures import ThreadPoolExecutor as Exec +from pathlib import Path +from time import sleep +from typing import TYPE_CHECKING, Any, cast + +import pytest +from fixtures.benchmark_fixture import NeonBenchmarker, PgBenchRunResult +from fixtures.log_helper import log +from fixtures.neon_api import NeonAPI, connection_parameters_to_env + +if TYPE_CHECKING: + from fixtures.compare_fixtures import NeonCompare + from fixtures.neon_fixtures import Endpoint, PgBin + from fixtures.pg_version import PgVersion + +from performance.test_perf_pgbench import utc_now_timestamp + +# These tests compare performance for write-heavy and read-heavy workloads of an ordinary endpoint +# compared to the endpoint which saves its LFC and prewarms using it on startup.
+ + +def test_compare_prewarmed_pgbench_perf(neon_compare: NeonCompare): + env = neon_compare.env + env.create_branch("normal") + env.create_branch("prewarmed") + pg_bin = neon_compare.pg_bin + ep_normal: Endpoint = env.endpoints.create_start("normal") + ep_prewarmed: Endpoint = env.endpoints.create_start("prewarmed", autoprewarm=True) + + for ep in [ep_normal, ep_prewarmed]: + connstr: str = ep.connstr() + pg_bin.run(["pgbench", "-i", "-I", "dtGvp", connstr, "-s100"]) + ep.safe_psql("CREATE EXTENSION neon") + client = ep.http_client() + client.offload_lfc() + ep.stop() + ep.start() + client.prewarm_lfc_wait() + + run_start_timestamp = utc_now_timestamp() + t0 = timeit.default_timer() + out = pg_bin.run_capture(["pgbench", "-c10", "-T10", connstr]) + run_duration = timeit.default_timer() - t0 + run_end_timestamp = utc_now_timestamp() + + stdout = Path(f"{out}.stdout").read_text() + res = PgBenchRunResult.parse_from_stdout( + stdout=stdout, + run_duration=run_duration, + run_start_timestamp=run_start_timestamp, + run_end_timestamp=run_end_timestamp, + ) + name: str = cast("str", ep.branch_name) + neon_compare.zenbenchmark.record_pg_bench_result(name, res) + + +@pytest.mark.remote_cluster +@pytest.mark.timeout(2 * 60 * 60) +def test_compare_prewarmed_pgbench_perf_benchmark( + pg_bin: PgBin, + neon_api: NeonAPI, + pg_version: PgVersion, + zenbenchmark: NeonBenchmarker, +): + name = f"Test prewarmed pgbench performance, GITHUB_RUN_ID={os.getenv('GITHUB_RUN_ID')}" + project = neon_api.create_project(pg_version, name) + project_id = project["project"]["id"] + neon_api.wait_for_operation_to_finish(project_id) + err = False + try: + benchmark_impl(pg_bin, neon_api, project, zenbenchmark) + except Exception as e: + err = True + log.error(f"Caught exception: {e}") + log.error(traceback.format_exc()) + finally: + assert not err + neon_api.delete_project(project_id) + + +def benchmark_impl( + pg_bin: PgBin, neon_api: NeonAPI, project: dict[str, Any], zenbenchmark: 
NeonBenchmarker +): + pgbench_size = int(os.getenv("PGBENCH_SIZE") or "3424") # 50GB + offload_secs = 20 + test_duration_min = 5 + pgbench_duration = f"-T{test_duration_min * 60}" + # prewarm API is not publicly exposed. In order to test performance of a + # fully prewarmed endpoint, wait after it restarts. + # The number here is empirical, based on manual runs on staging + prewarmed_sleep_secs = 180 + + branch_id = project["branch"]["id"] + project_id = project["project"]["id"] + normal_env = connection_parameters_to_env( + project["connection_uris"][0]["connection_parameters"] + ) + normal_id = project["endpoints"][0]["id"] + + prewarmed_branch_id = neon_api.create_branch( + project_id, "prewarmed", parent_id=branch_id, add_endpoint=False + )["branch"]["id"] + neon_api.wait_for_operation_to_finish(project_id) + + ep_prewarmed = neon_api.create_endpoint( + project_id, + prewarmed_branch_id, + endpoint_type="read_write", + settings={"autoprewarm": True, "offload_lfc_interval_seconds": offload_secs}, + ) + neon_api.wait_for_operation_to_finish(project_id) + + prewarmed_env = normal_env.copy() + prewarmed_env["PGHOST"] = ep_prewarmed["endpoint"]["host"] + prewarmed_id = ep_prewarmed["endpoint"]["id"] + + def bench(endpoint_name, endpoint_id, env): + pg_bin.run(["pgbench", "-i", "-I", "dtGvp", f"-s{pgbench_size}"], env) + sleep(offload_secs * 2) # ensure LFC is offloaded after pgbench finishes + neon_api.restart_endpoint(project_id, endpoint_id) + sleep(prewarmed_sleep_secs) + + run_start_timestamp = utc_now_timestamp() + t0 = timeit.default_timer() + out = pg_bin.run_capture(["pgbench", "-c10", pgbench_duration, "-Mprepared"], env) + run_duration = timeit.default_timer() - t0 + run_end_timestamp = utc_now_timestamp() + + stdout = Path(f"{out}.stdout").read_text() + res = PgBenchRunResult.parse_from_stdout( + stdout=stdout, + run_duration=run_duration, + run_start_timestamp=run_start_timestamp, + run_end_timestamp=run_end_timestamp, + ) + 
zenbenchmark.record_pg_bench_result(endpoint_name, res) + + with Exec(max_workers=2) as exe: + exe.submit(bench, "normal", normal_id, normal_env) + exe.submit(bench, "prewarmed", prewarmed_id, prewarmed_env) + + +def test_compare_prewarmed_read_perf(neon_compare: NeonCompare): + env = neon_compare.env + env.create_branch("normal") + env.create_branch("prewarmed") + ep_normal: Endpoint = env.endpoints.create_start("normal") + ep_prewarmed: Endpoint = env.endpoints.create_start("prewarmed", autoprewarm=True) + + sql = [ + "CREATE EXTENSION neon", + "CREATE TABLE foo(key serial primary key, t text default 'foooooooooooooooooooooooooooooooooooooooooooooooooooo')", + "INSERT INTO foo SELECT FROM generate_series(1,1000000)", + ] + for ep in [ep_normal, ep_prewarmed]: + ep.safe_psql_many(sql) + client = ep.http_client() + client.offload_lfc() + ep.stop() + ep.start() + client.prewarm_lfc_wait() + with neon_compare.record_duration(f"{ep.branch_name}_run_duration"): + ep.safe_psql("SELECT count(*) from foo") diff --git a/test_runner/random_ops/test_random_ops.py b/test_runner/random_ops/test_random_ops.py index 63da47b555..aa21637159 100644 --- a/test_runner/random_ops/test_random_ops.py +++ b/test_runner/random_ops/test_random_ops.py @@ -13,7 +13,6 @@ from typing import TYPE_CHECKING, Any import pytest from fixtures.log_helper import log -from requests import HTTPError if TYPE_CHECKING: from pathlib import Path @@ -204,26 +203,11 @@ class NeonBranch: self.updated_at = datetime.fromisoformat(res["branch"]["updated_at"]) self.parent_timestamp = datetime.fromisoformat(res["branch"]["parent_timestamp"]) parent_id: str = res["branch"]["parent_id"] - # XXX Retry get parent details to work around the issue - # https://databricks.atlassian.net/browse/LKB-279 - target_time = datetime.now() + timedelta(seconds=30) - while datetime.now() < target_time: - try: - parent_def = self.neon_api.get_branch_details(self.project_id, parent_id) - except HTTPError as he: - if 
he.response.status_code == 404: - log.info("Branch not found, waiting...") - time.sleep(1) - else: - raise HTTPError(he) from he - else: - break - else: - raise RuntimeError(f"Branch {parent_id} not found") - # Creates an object for the parent branch # After the reset operation a new parent branch is created - parent = NeonBranch(self.project, parent_def, True) + parent = NeonBranch( + self.project, self.neon_api.get_branch_details(self.project_id, parent_id), True + ) self.project.branches[parent_id] = parent self.parent = parent parent.children[self.id] = self diff --git a/test_runner/regress/data/test_event_trigger_extension/test_event_trigger_extension--1.0.sql b/test_runner/regress/data/test_event_trigger_extension/test_event_trigger_extension--1.0.sql new file mode 100644 index 0000000000..2b82102802 --- /dev/null +++ b/test_runner/regress/data/test_event_trigger_extension/test_event_trigger_extension--1.0.sql @@ -0,0 +1,32 @@ +\echo Use "CREATE EXTENSION test_event_trigger_extension" to load this file. 
\quit + +CREATE SCHEMA event_trigger; + +create sequence if not exists event_trigger.seq_schema_version as int cycle; + +create or replace function event_trigger.increment_schema_version() + returns event_trigger + security definer + language plpgsql +as $$ +begin + perform pg_catalog.nextval('event_trigger.seq_schema_version'); +end; +$$; + +create or replace function event_trigger.get_schema_version() + returns int + security definer + language sql +as $$ + select last_value from event_trigger.seq_schema_version; +$$; + +-- On DDL event, increment the schema version number +create event trigger event_trigger_watch_ddl + on ddl_command_end + execute procedure event_trigger.increment_schema_version(); + +create event trigger event_trigger_watch_drop + on sql_drop + execute procedure event_trigger.increment_schema_version(); diff --git a/test_runner/regress/data/test_event_trigger_extension/test_event_trigger_extension.control b/test_runner/regress/data/test_event_trigger_extension/test_event_trigger_extension.control new file mode 100644 index 0000000000..4fe8c3341b --- /dev/null +++ b/test_runner/regress/data/test_event_trigger_extension/test_event_trigger_extension.control @@ -0,0 +1,8 @@ +default_version = '1.0' +comment = 'Test extension with Event Trigger' + +# make sure the extension objects are owned by the bootstrap user +# to check that the SECURITY DEFINER event trigger function is still +# called during non-superuser DDL events. 
+superuser = true +trusted = true diff --git a/test_runner/regress/test_attach_tenant_config.py b/test_runner/regress/test_attach_tenant_config.py index 7788faceb4..eaaa3014a5 100644 --- a/test_runner/regress/test_attach_tenant_config.py +++ b/test_runner/regress/test_attach_tenant_config.py @@ -165,6 +165,7 @@ def test_fully_custom_config(positive_env: NeonEnv): "gc_horizon": 23 * (1024 * 1024), "gc_period": "2h 13m", "image_creation_threshold": 7, + "image_layer_force_creation_period": "1m", "pitr_interval": "1m", "lagging_wal_timeout": "23m", "lazy_slru_download": True, diff --git a/test_runner/regress/test_branch_and_gc.py b/test_runner/regress/test_branch_and_gc.py index 8447c9bf2d..148f469a95 100644 --- a/test_runner/regress/test_branch_and_gc.py +++ b/test_runner/regress/test_branch_and_gc.py @@ -7,6 +7,7 @@ from typing import TYPE_CHECKING import pytest from fixtures.common_types import Lsn, TimelineId from fixtures.log_helper import log +from fixtures.neon_fixtures import wait_for_last_flush_lsn from fixtures.pageserver.http import TimelineCreate406 from fixtures.utils import query_scalar, skip_in_debug_build @@ -162,6 +163,9 @@ def test_branch_creation_before_gc(neon_simple_env: NeonEnv): ) lsn = Lsn(res[2][0][0]) + # Wait for all WAL to reach the pageserver, so GC cutoff LSN is greater than `lsn`. + wait_for_last_flush_lsn(env, endpoint0, tenant, b0) + # Use `failpoint=sleep` and `threading` to make the GC iteration triggers *before* the # branch creation task but the individual timeline GC iteration happens *after* # the branch creation task. 
diff --git a/test_runner/regress/test_compaction.py b/test_runner/regress/test_compaction.py index 1570d40ae9..963a19d640 100644 --- a/test_runner/regress/test_compaction.py +++ b/test_runner/regress/test_compaction.py @@ -7,6 +7,7 @@ import time from enum import StrEnum import pytest +from fixtures.common_types import TenantShardId from fixtures.log_helper import log from fixtures.neon_fixtures import ( NeonEnvBuilder, @@ -944,3 +945,204 @@ def test_image_layer_compression(neon_env_builder: NeonEnvBuilder, enabled: bool f"SELECT count(*) FROM foo WHERE id={v} and val=repeat('abcde{v:0>3}', 500)" ) assert res[0][0] == 1 + + +# BEGIN_HADRON +def get_layer_map(env, tenant_shard_id, timeline_id, ps_id): + client = env.pageservers[ps_id].http_client() + layer_map = client.layer_map_info(tenant_shard_id, timeline_id) + image_layer_count = 0 + delta_layer_count = 0 + for layer in layer_map.historic_layers: + if layer.kind == "Image": + image_layer_count += 1 + elif layer.kind == "Delta": + delta_layer_count += 1 + return image_layer_count, delta_layer_count + + +def test_image_layer_creation_time_threshold(neon_env_builder: NeonEnvBuilder): + """ + Tests that image layers can be created when the time threshold is reached on non-0 shards. 
+ """ + tenant_conf = { + "compaction_threshold": "100", + "image_creation_threshold": "100", + "image_layer_creation_check_threshold": "1", + # disable distance based image layer creation check + "checkpoint_distance": 10 * 1024 * 1024 * 1024, + "checkpoint_timeout": "100ms", + "image_layer_force_creation_period": "1s", + "pitr_interval": "10s", + "gc_period": "1s", + "compaction_period": "1s", + "lsn_lease_length": "1s", + } + + # consider every tenant large to run the image layer generation check more eagerly + neon_env_builder.pageserver_config_override = ( + "image_layer_generation_large_timeline_threshold=0" + ) + + neon_env_builder.num_pageservers = 1 + neon_env_builder.num_safekeepers = 1 + env = neon_env_builder.init_start( + initial_tenant_conf=tenant_conf, + initial_tenant_shard_count=2, + initial_tenant_shard_stripe_size=1, + ) + + tenant_id = env.initial_tenant + timeline_id = env.initial_timeline + endpoint = env.endpoints.create_start("main") + endpoint.safe_psql("CREATE TABLE foo (id INTEGER, val text)") + + for v in range(10): + endpoint.safe_psql(f"INSERT INTO foo (id, val) VALUES ({v}, repeat('abcde{v:0>3}', 500))") + + tenant_shard_id = TenantShardId(tenant_id, 1, 2) + + # Generate some rows. 
+ for v in range(20): + endpoint.safe_psql(f"INSERT INTO foo (id, val) VALUES ({v}, repeat('abcde{v:0>3}', 500))") + + # restart page server so that logical size on non-0 shards is missing + env.pageserver.restart() + + (old_images, old_deltas) = get_layer_map(env, tenant_shard_id, timeline_id, 0) + log.info(f"old images: {old_images}, old deltas: {old_deltas}") + + def check_image_creation(): + (new_images, old_deltas) = get_layer_map(env, tenant_shard_id, timeline_id, 0) + log.info(f"images: {new_images}, deltas: {old_deltas}") + assert new_images > old_images + + wait_until(check_image_creation) + + endpoint.stop_and_destroy() + + +def test_image_layer_force_creation_period(neon_env_builder: NeonEnvBuilder): + """ + Tests that page server can force creating new images if image_layer_force_creation_period is enabled + """ + # use large knobs to disable L0 compaction/image creation except for the force image creation + tenant_conf = { + "compaction_threshold": "100", + "image_creation_threshold": "100", + "image_layer_creation_check_threshold": "1", + "checkpoint_distance": 10 * 1024, + "checkpoint_timeout": "1s", + "image_layer_force_creation_period": "1s", + "pitr_interval": "10s", + "gc_period": "1s", + "compaction_period": "1s", + "lsn_lease_length": "1s", + } + + # consider every tenant large to run the image layer generation check more eagerly + neon_env_builder.pageserver_config_override = ( + "image_layer_generation_large_timeline_threshold=0" + ) + + neon_env_builder.num_pageservers = 1 + neon_env_builder.num_safekeepers = 1 + env = neon_env_builder.init_start(initial_tenant_conf=tenant_conf) + + tenant_id = env.initial_tenant + timeline_id = env.initial_timeline + + endpoint = env.endpoints.create_start("main") + endpoint.safe_psql("CREATE TABLE foo (id INTEGER PRIMARY KEY, val text)") + # Generate some rows. 
+ for v in range(10): + endpoint.safe_psql(f"INSERT INTO foo (id, val) VALUES ({v}, repeat('abcde{v:0>3}', 500))") + + # Sleep a bit such that the inserts are considered when calculating the forced image layer creation LSN. + time.sleep(2) + + def check_force_image_creation(): + ps_http = env.pageserver.http_client() + ps_http.timeline_compact(tenant_id, timeline_id) + image, delta = get_layer_map(env, tenant_id, timeline_id, 0) + log.info(f"images: {image}, deltas: {delta}") + assert image > 0 + + env.pageserver.assert_log_contains("forcing L0 compaction of") + env.pageserver.assert_log_contains("forcing image creation for partitioned range") + + wait_until(check_force_image_creation) + + endpoint.stop_and_destroy() + + env.pageserver.allowed_errors.append( + ".*created delta file of size.*larger than double of target.*" + ) + + +def test_image_consistent_lsn(neon_env_builder: NeonEnvBuilder): + """ + Test the /v1/tenant//timeline/ endpoint and the computation of image_consistent_lsn + """ + # use large knobs to disable L0 compaction/image creation except for the force image creation + tenant_conf = { + "compaction_threshold": "100", + "image_creation_threshold": "100", + "image_layer_creation_check_threshold": "1", + "checkpoint_distance": 10 * 1024, + "checkpoint_timeout": "1s", + "image_layer_force_creation_period": "1s", + "pitr_interval": "10s", + "gc_period": "1s", + "compaction_period": "1s", + "lsn_lease_length": "1s", + } + + neon_env_builder.num_pageservers = 2 + neon_env_builder.num_safekeepers = 1 + env = neon_env_builder.init_start( + initial_tenant_conf=tenant_conf, + initial_tenant_shard_count=4, + initial_tenant_shard_stripe_size=1, + ) + + tenant_id = env.initial_tenant + timeline_id = env.initial_timeline + + endpoint = env.endpoints.create_start("main") + endpoint.safe_psql("CREATE TABLE foo (id INTEGER, val text)") + for v in range(10): + endpoint.safe_psql( + f"INSERT INTO foo (id, val) VALUES ({v}, repeat('abcde{v:0>3}', 500))", 
log_query=False + ) + + response = env.storage_controller.tenant_timeline_describe(tenant_id, timeline_id) + shards = response["shards"] + for shard in shards: + assert shard["image_consistent_lsn"] is not None + image_consistent_lsn = response["image_consistent_lsn"] + assert image_consistent_lsn is not None + + # do more writes and wait for image_consistent_lsn to advance + for v in range(100): + endpoint.safe_psql( + f"INSERT INTO foo (id, val) VALUES ({v}, repeat('abcde{v:0>3}', 500))", log_query=False + ) + + def check_image_consistent_lsn_advanced(): + response = env.storage_controller.tenant_timeline_describe(tenant_id, timeline_id) + new_image_consistent_lsn = response["image_consistent_lsn"] + shards = response["shards"] + for shard in shards: + print(f"shard {shard['tenant_id']} image_consistent_lsn{shard['image_consistent_lsn']}") + assert new_image_consistent_lsn != image_consistent_lsn + + wait_until(check_image_consistent_lsn_advanced) + + endpoint.stop_and_destroy() + + for ps in env.pageservers: + ps.allowed_errors.append(".*created delta file of size.*larger than double of target.*") + + +# END_HADRON diff --git a/test_runner/regress/test_download_extensions.py b/test_runner/regress/test_download_extensions.py index fe3b220c67..d7f78afac8 100644 --- a/test_runner/regress/test_download_extensions.py +++ b/test_runner/regress/test_download_extensions.py @@ -2,7 +2,6 @@ from __future__ import annotations import os import platform -import shutil import tarfile from enum import StrEnum from pathlib import Path @@ -31,27 +30,6 @@ if TYPE_CHECKING: from werkzeug.wrappers.request import Request -# use neon_env_builder_local fixture to override the default neon_env_builder fixture -# and use a test-specific pg_install instead of shared one -@pytest.fixture(scope="function") -def neon_env_builder_local( - neon_env_builder: NeonEnvBuilder, - test_output_dir: Path, - pg_distrib_dir: Path, -) -> NeonEnvBuilder: - test_local_pginstall = test_output_dir / 
"pg_install" - log.info(f"copy {pg_distrib_dir} to {test_local_pginstall}") - - # We can't copy only the version that we are currently testing because other - # binaries like the storage controller need specific Postgres versions. - shutil.copytree(pg_distrib_dir, test_local_pginstall) - - neon_env_builder.pg_distrib_dir = test_local_pginstall - log.info(f"local neon_env_builder.pg_distrib_dir: {neon_env_builder.pg_distrib_dir}") - - return neon_env_builder - - @final class RemoteExtension(StrEnum): SQL_ONLY = "test_extension_sql_only" diff --git a/test_runner/regress/test_event_trigger_extension.py b/test_runner/regress/test_event_trigger_extension.py new file mode 100644 index 0000000000..ac4351dcd5 --- /dev/null +++ b/test_runner/regress/test_event_trigger_extension.py @@ -0,0 +1,102 @@ +from __future__ import annotations + +import shutil +from pathlib import Path +from typing import TYPE_CHECKING, cast + +import pytest +from fixtures.log_helper import log +from fixtures.paths import BASE_DIR + +if TYPE_CHECKING: + from pathlib import Path + + from fixtures.neon_fixtures import ( + NeonEnvBuilder, + ) + from fixtures.pg_version import PgVersion + + +# use neon_env_builder_local fixture to override the default neon_env_builder fixture +# and use a test-specific pg_install instead of shared one +@pytest.fixture(scope="function") +def neon_env_builder_event_trigger_extension( + neon_env_builder_local: NeonEnvBuilder, + test_output_dir: Path, + pg_version: PgVersion, +) -> NeonEnvBuilder: + test_local_pginstall = test_output_dir / "pg_install" + + # Now copy the SQL only extension test_event_trigger_extension in the local + # pginstall extension directory on-disk + test_event_trigger_extension_dir = ( + BASE_DIR / "test_runner" / "regress" / "data" / "test_event_trigger_extension" + ) + + test_local_extension_dir = ( + test_local_pginstall / f"v{pg_version}" / "share" / "postgresql" / "extension" + ) + + log.info(f"copy {test_event_trigger_extension_dir} to 
{test_local_extension_dir}") + + for f in [ + test_event_trigger_extension_dir / "test_event_trigger_extension.control", + test_event_trigger_extension_dir / "test_event_trigger_extension--1.0.sql", + ]: + shutil.copy(f, test_local_extension_dir) + + return neon_env_builder_local + + +def test_event_trigger_extension(neon_env_builder_event_trigger_extension: NeonEnvBuilder): + """ + Test installing an extension that contains an Event Trigger. + + The Event Trigger function is owned by the extension owner, which at + CREATE EXTENSION is going to be the Postgres bootstrap user, per the + extension control file where both superuser = true and trusted = true. + + Also this function is SECURITY DEFINER, to allow for making changes to + the extension SQL objects, in our case a sequence. + + This test makes sure that the event trigger function is fired correctly + by non-privileged user DDL actions such as CREATE TABLE. + """ + env = neon_env_builder_event_trigger_extension.init_start() + env.create_branch("test_event_trigger_extension") + + endpoint = env.endpoints.create_start("test_event_trigger_extension") + extension = "test_event_trigger_extension" + database = "test_event_trigger_extension" + + endpoint.safe_psql(f"CREATE DATABASE {database}") + endpoint.safe_psql(f"CREATE EXTENSION {extension}", dbname=database) + + # check that the extension is owned by the bootstrap superuser (cloud_admin) + pg_bootstrap_superuser_name = "cloud_admin" + with endpoint.connect(dbname=database) as pg_conn: + with pg_conn.cursor() as cur: + cur.execute( + f"select rolname from pg_roles r join pg_extension e on r.oid = e.extowner where extname = '{extension}'" + ) + owner = cast("tuple[str]", cur.fetchone())[0] + assert owner == pg_bootstrap_superuser_name, ( + f"extension {extension} is not owned by bootstrap user '{pg_bootstrap_superuser_name}'" + ) + + # test that the SQL-only Event Trigger (SECURITY DEFINER function) runs + # correctly now that the extension has been installed + #
+ # create table to trigger the event trigger, twice, check sequence count + with endpoint.connect(dbname=database) as pg_conn: + log.info("creating SQL objects (tables)") + with pg_conn.cursor() as cur: + cur.execute("CREATE TABLE foo1(id int primary key)") + cur.execute("CREATE TABLE foo2(id int)") + + cur.execute("SELECT event_trigger.get_schema_version()") + res = cast("tuple[int]", cur.fetchone()) + ver = res[0] + + log.info(f"schema version is now {ver}") + assert ver == 2, "schema version is not 2" diff --git a/test_runner/regress/test_lfc_prewarm.py b/test_runner/regress/test_lfc_prewarm.py index ae36bbda79..0f0cf4cc6d 100644 --- a/test_runner/regress/test_lfc_prewarm.py +++ b/test_runner/regress/test_lfc_prewarm.py @@ -1,6 +1,7 @@ import random import threading from enum import StrEnum +from time import sleep from typing import Any import pytest @@ -24,18 +25,7 @@ OFFLOAD_LABEL = "compute_ctl_lfc_offloads_total" OFFLOAD_ERR_LABEL = "compute_ctl_lfc_offload_errors_total" METHOD_VALUES = [e for e in PrewarmMethod] METHOD_IDS = [e.value for e in PrewarmMethod] - - -def check_pinned_entries(cur: Cursor): - """ - Wait till none of LFC buffers are pinned - """ - - def none_pinned(): - cur.execute("select lfc_value from neon_lfc_stats where lfc_key='file_cache_chunks_pinned'") - assert cur.fetchall()[0][0] == 0 - - wait_until(none_pinned) +AUTOOFFLOAD_INTERVAL_SECS = 2 def prom_parse(client: EndpointHttpClient) -> dict[str, float]: @@ -49,9 +39,18 @@ def prom_parse(client: EndpointHttpClient) -> dict[str, float]: def offload_lfc(method: PrewarmMethod, client: EndpointHttpClient, cur: Cursor) -> Any: + if method == PrewarmMethod.POSTGRES: + cur.execute("select neon.get_local_cache_state()") + return cur.fetchall()[0][0] + if method == PrewarmMethod.AUTOPREWARM: + # With autoprewarm, we need to be sure LFC was offloaded after all writes + # finish, so we sleep. 
Otherwise we'll have less prewarmed pages than we want + sleep(AUTOOFFLOAD_INTERVAL_SECS) client.offload_lfc_wait() - elif method == PrewarmMethod.COMPUTE_CTL: + return + + if method == PrewarmMethod.COMPUTE_CTL: status = client.prewarm_lfc_status() assert status["status"] == "not_prewarmed" assert "error" not in status @@ -60,11 +59,9 @@ def offload_lfc(method: PrewarmMethod, client: EndpointHttpClient, cur: Cursor) parsed = prom_parse(client) desired = {OFFLOAD_LABEL: 1, PREWARM_LABEL: 0, OFFLOAD_ERR_LABEL: 0, PREWARM_ERR_LABEL: 0} assert parsed == desired, f"{parsed=} != {desired=}" - elif method == PrewarmMethod.POSTGRES: - cur.execute("select get_local_cache_state()") - return cur.fetchall()[0][0] - else: - raise AssertionError(f"{method} not in PrewarmMethod") + return + + raise AssertionError(f"{method} not in PrewarmMethod") def prewarm_endpoint( @@ -75,7 +72,7 @@ def prewarm_endpoint( elif method == PrewarmMethod.COMPUTE_CTL: client.prewarm_lfc() elif method == PrewarmMethod.POSTGRES: - cur.execute("select prewarm_local_cache(%s)", (lfc_state,)) + cur.execute("select neon.prewarm_local_cache(%s)", (lfc_state,)) def check_prewarmed( @@ -106,21 +103,20 @@ def test_lfc_prewarm(neon_simple_env: NeonEnv, method: PrewarmMethod): "neon.file_cache_size_limit=1GB", "neon.file_cache_prewarm_limit=1000", ] - offload_secs = 2 if method == PrewarmMethod.AUTOPREWARM: endpoint = env.endpoints.create_start( branch_name="main", config_lines=cfg, autoprewarm=True, - offload_lfc_interval_seconds=offload_secs, + offload_lfc_interval_seconds=AUTOOFFLOAD_INTERVAL_SECS, ) else: endpoint = env.endpoints.create_start(branch_name="main", config_lines=cfg) pg_conn = endpoint.connect() pg_cur = pg_conn.cursor() - pg_cur.execute("create extension neon") + pg_cur.execute("create schema neon; create extension neon with schema neon") pg_cur.execute("create database lfc") lfc_conn = endpoint.connect(dbname="lfc") @@ -135,7 +131,7 @@ def test_lfc_prewarm(neon_simple_env: NeonEnv, method: 
PrewarmMethod): endpoint.stop() if method == PrewarmMethod.AUTOPREWARM: - endpoint.start(autoprewarm=True, offload_lfc_interval_seconds=offload_secs) + endpoint.start(autoprewarm=True, offload_lfc_interval_seconds=AUTOOFFLOAD_INTERVAL_SECS) else: endpoint.start() @@ -146,10 +142,12 @@ def test_lfc_prewarm(neon_simple_env: NeonEnv, method: PrewarmMethod): lfc_cur = lfc_conn.cursor() prewarm_endpoint(method, client, pg_cur, lfc_state) - pg_cur.execute("select lfc_value from neon_lfc_stats where lfc_key='file_cache_used_pages'") + pg_cur.execute( + "select lfc_value from neon.neon_lfc_stats where lfc_key='file_cache_used_pages'" + ) lfc_used_pages = pg_cur.fetchall()[0][0] log.info(f"Used LFC size: {lfc_used_pages}") - pg_cur.execute("select * from get_prewarm_info()") + pg_cur.execute("select * from neon.get_prewarm_info()") total, prewarmed, skipped, _ = pg_cur.fetchall()[0] log.info(f"Prewarm info: {total=} {prewarmed=} {skipped=}") progress = (prewarmed + skipped) * 100 // total @@ -162,7 +160,6 @@ def test_lfc_prewarm(neon_simple_env: NeonEnv, method: PrewarmMethod): lfc_cur.execute("select sum(pk) from t") assert lfc_cur.fetchall()[0][0] == n_records * (n_records + 1) / 2 - check_pinned_entries(pg_cur) desired = {"status": "completed", "total": total, "prewarmed": prewarmed, "skipped": skipped} check_prewarmed(method, client, desired) @@ -191,7 +188,7 @@ def test_lfc_prewarm_under_workload(neon_simple_env: NeonEnv, method: PrewarmMet pg_conn = endpoint.connect() pg_cur = pg_conn.cursor() - pg_cur.execute("create extension neon") + pg_cur.execute("create schema neon; create extension neon with schema neon") pg_cur.execute("CREATE DATABASE lfc") lfc_conn = endpoint.connect(dbname="lfc") @@ -243,9 +240,9 @@ def test_lfc_prewarm_under_workload(neon_simple_env: NeonEnv, method: PrewarmMet prewarm_thread.start() def prewarmed(): - assert n_prewarms > 5 + assert n_prewarms > 3 - wait_until(prewarmed) + wait_until(prewarmed, timeout=40) # debug builds don't finish in 
20s running = False for t in workload_threads: @@ -256,7 +253,6 @@ def test_lfc_prewarm_under_workload(neon_simple_env: NeonEnv, method: PrewarmMet total_balance = lfc_cur.fetchall()[0][0] assert total_balance == 0 - check_pinned_entries(pg_cur) if method == PrewarmMethod.POSTGRES: return desired = { diff --git a/test_runner/regress/test_pageserver_api.py b/test_runner/regress/test_pageserver_api.py index 7f9207047e..92889e5de3 100644 --- a/test_runner/regress/test_pageserver_api.py +++ b/test_runner/regress/test_pageserver_api.py @@ -3,6 +3,7 @@ from __future__ import annotations from typing import TYPE_CHECKING from fixtures.common_types import Lsn, TenantId, TimelineId +from fixtures.log_helper import log from fixtures.neon_fixtures import ( DEFAULT_BRANCH_NAME, NeonEnv, @@ -164,3 +165,15 @@ def test_pageserver_http_index_part_force_patch(neon_env_builder: NeonEnvBuilder {"rel_size_migration": "legacy"}, ) assert client.timeline_detail(tenant_id, timeline_id)["rel_size_migration"] == "legacy" + + +def test_pageserver_get_tenant_visible_size(neon_env_builder: NeonEnvBuilder): + neon_env_builder.num_pageservers = 1 + env = neon_env_builder.init_start() + env.create_tenant(shard_count=4) + env.create_tenant(shard_count=2) + + json = env.pageserver.http_client().list_tenant_visible_size() + log.info(f"{json}") + # initial tennat + 2 newly created tenants + assert len(json) == 7 diff --git a/test_runner/regress/test_replica_promotes.py b/test_runner/regress/test_replica_promotes.py index 1f26269f40..8d39ac123a 100644 --- a/test_runner/regress/test_replica_promotes.py +++ b/test_runner/regress/test_replica_promotes.py @@ -60,7 +60,7 @@ def test_replica_promote(neon_simple_env: NeonEnv, method: PromoteMethod): with primary.connect() as primary_conn: primary_cur = primary_conn.cursor() - primary_cur.execute("create extension neon") + primary_cur.execute("create schema neon;create extension neon with schema neon") primary_cur.execute( "create table t(pk bigint GENERATED 
ALWAYS AS IDENTITY, payload integer)" ) @@ -172,7 +172,7 @@ def test_replica_promote_handler_disconnects(neon_simple_env: NeonEnv): secondary: Endpoint = env.endpoints.new_replica_start(origin=primary, endpoint_id="secondary") with primary.connect() as conn, conn.cursor() as cur: - cur.execute("create extension neon") + cur.execute("create schema neon;create extension neon with schema neon") cur.execute("create table t(pk bigint GENERATED ALWAYS AS IDENTITY, payload integer)") cur.execute("INSERT INTO t(payload) SELECT generate_series(1, 100)") cur.execute("show neon.safekeepers") diff --git a/test_runner/regress/test_safekeeper_migration.py b/test_runner/regress/test_safekeeper_migration.py index b82d7b9bb0..170c1a3650 100644 --- a/test_runner/regress/test_safekeeper_migration.py +++ b/test_runner/regress/test_safekeeper_migration.py @@ -2,6 +2,9 @@ from __future__ import annotations from typing import TYPE_CHECKING +import pytest +from fixtures.neon_fixtures import StorageControllerApiException + if TYPE_CHECKING: from fixtures.neon_fixtures import NeonEnvBuilder @@ -75,3 +78,38 @@ def test_safekeeper_migration_simple(neon_env_builder: NeonEnvBuilder): ep.start(safekeeper_generation=1, safekeepers=[3]) assert ep.safe_psql("SELECT * FROM t") == [(i,) for i in range(1, 4)] + + +def test_new_sk_set_validation(neon_env_builder: NeonEnvBuilder): + """ + Test that safekeeper_migrate validates the new_sk_set before starting the migration. + """ + neon_env_builder.num_safekeepers = 3 + neon_env_builder.storage_controller_config = { + "timelines_onto_safekeepers": True, + "timeline_safekeeper_count": 2, + } + env = neon_env_builder.init_start() + + def expect_fail(sk_set: list[int], match: str): + with pytest.raises(StorageControllerApiException, match=match): + env.storage_controller.migrate_safekeepers( + env.initial_tenant, env.initial_timeline, sk_set + ) + # Check that we failed before commiting to the database. 
+ mconf = env.storage_controller.timeline_locate(env.initial_tenant, env.initial_timeline) + assert mconf["generation"] == 1 + + expect_fail([], "safekeeper set is empty") + expect_fail([1], "must have at least 2 safekeepers") + expect_fail([1, 1], "duplicate safekeeper") + expect_fail([1, 100500], "does not exist") + + mconf = env.storage_controller.timeline_locate(env.initial_tenant, env.initial_timeline) + sk_set = mconf["sk_set"] + assert len(sk_set) == 2 + + decom_sk = [sk.id for sk in env.safekeepers if sk.id not in sk_set][0] + env.storage_controller.safekeeper_scheduling_policy(decom_sk, "Decomissioned") + + expect_fail([sk_set[0], decom_sk], "decomissioned") diff --git a/test_runner/regress/test_sharding.py b/test_runner/regress/test_sharding.py index 8ff767eca4..5549105188 100644 --- a/test_runner/regress/test_sharding.py +++ b/test_runner/regress/test_sharding.py @@ -1673,6 +1673,91 @@ def test_shard_resolve_during_split_abort(neon_env_builder: NeonEnvBuilder): # END_HADRON +# HADRON +@pytest.mark.skip(reason="The backpressure change has not been merged yet.") +def test_back_pressure_per_shard(neon_env_builder: NeonEnvBuilder): + """ + Tests back pressure knobs are enforced on the per shard basis instead of at the tenant level. + """ + init_shard_count = 4 + neon_env_builder.num_pageservers = init_shard_count + stripe_size = 1 + + env = neon_env_builder.init_start( + initial_tenant_shard_count=init_shard_count, + initial_tenant_shard_stripe_size=stripe_size, + initial_tenant_conf={ + # disable auto-flush of shards and set max_replication_flush_lag as 15MB. + # The backpressure parameters must be enforced at the shard level to avoid stalling PG. 
+ "checkpoint_distance": 1 * 1024 * 1024 * 1024, + "checkpoint_timeout": "1h", + }, + ) + + endpoint = env.endpoints.create( + "main", + config_lines=[ + "max_replication_write_lag = 0", + "max_replication_apply_lag = 0", + "max_replication_flush_lag = 15MB", + "neon.max_cluster_size = 10GB", + ], + ) + endpoint.respec(skip_pg_catalog_updates=False) # Needed for databricks_system to get created. + endpoint.start() + + # generate 20MB of data + endpoint.safe_psql( + "CREATE TABLE usertable AS SELECT s AS KEY, repeat('a', 1000) as VALUE from generate_series(1, 20000) s;" + ) + res = endpoint.safe_psql( + "SELECT neon.backpressure_throttling_time() as throttling_time", dbname="databricks_system" + )[0] + assert res[0] == 0, f"throttling_time should be 0, but got {res[0]}" + + endpoint.stop() + + +# HADRON +def test_shard_split_page_server_timeout(neon_env_builder: NeonEnvBuilder): + """ + Tests that shard split can correctly handle page server timeouts and abort the split + """ + init_shard_count = 2 + neon_env_builder.num_pageservers = 1 + stripe_size = 1 + + if neon_env_builder.storage_controller_config is None: + neon_env_builder.storage_controller_config = {"shard_split_request_timeout": "5s"} + else: + neon_env_builder.storage_controller_config["shard_split_request_timeout"] = "5s" + + env = neon_env_builder.init_start( + initial_tenant_shard_count=init_shard_count, + initial_tenant_shard_stripe_size=stripe_size, + ) + + env.storage_controller.allowed_errors.extend( + [ + ".*Enqueuing background abort.*", + ".*failpoint.*", + ".*Failed to abort.*", + ".*Exclusive lock by ShardSplit was held.*", + ] + ) + env.pageserver.allowed_errors.extend([".*request was dropped before completing.*"]) + + endpoint1 = env.endpoints.create_start(branch_name="main") + + env.pageserver.http_client().configure_failpoints(("shard-split-post-finish-pause", "pause")) + + with pytest.raises(StorageControllerApiException): + env.storage_controller.tenant_shard_split(env.initial_tenant, 
shard_count=4) + + env.pageserver.http_client().configure_failpoints(("shard-split-post-finish-pause", "off")) + endpoint1.stop_and_destroy() + + def test_sharding_backpressure(neon_env_builder: NeonEnvBuilder): """ Check a scenario when one of the shards is much slower than others. diff --git a/test_runner/regress/test_timeline_detach_ancestor.py b/test_runner/regress/test_timeline_detach_ancestor.py index c0f163db32..45b7af719e 100644 --- a/test_runner/regress/test_timeline_detach_ancestor.py +++ b/test_runner/regress/test_timeline_detach_ancestor.py @@ -209,9 +209,9 @@ def test_ancestor_detach_branched_from( client.timeline_delete(env.initial_tenant, env.initial_timeline) wait_timeline_detail_404(client, env.initial_tenant, env.initial_timeline) - # because we do the fullbackup from ancestor at the branch_lsn, the zenith.signal is always different - # as there is always "PREV_LSN: invalid" for "before" - skip_files = {"zenith.signal"} + # because we do the fullbackup from ancestor at the branch_lsn, the neon.signal and/or zenith.signal is always + # different as there is always "PREV_LSN: invalid" for "before" + skip_files = {"zenith.signal", "neon.signal"} assert_pageserver_backups_equal(fullbackup_before, fullbackup_after, skip_files) @@ -767,7 +767,7 @@ def test_compaction_induced_by_detaches_in_history( env.pageserver, env.initial_tenant, branch_timeline_id, branch_lsn, fullbackup_after ) - # we don't need to skip any files, because zenith.signal will be identical + # we don't need to skip any files, because neon.signal will be identical assert_pageserver_backups_equal(fullbackup_before, fullbackup_after, set()) diff --git a/test_runner/regress/test_wal_restore.py b/test_runner/regress/test_wal_restore.py index 0bb63308bb..573016f772 100644 --- a/test_runner/regress/test_wal_restore.py +++ b/test_runner/regress/test_wal_restore.py @@ -3,6 +3,7 @@ from __future__ import annotations import sys import tarfile import tempfile +from pathlib import Path from 
typing import TYPE_CHECKING import pytest @@ -198,3 +199,115 @@ def test_wal_restore_http(neon_env_builder: NeonEnvBuilder, broken_tenant: bool) # the table is back now! restored = env.endpoints.create_start("main") assert restored.safe_psql("select count(*) from t", user="cloud_admin") == [(300000,)] + + +# BEGIN_HADRON +# TODO: re-enable once CM python is integreated. +# def clear_directory(directory): +# for item in os.listdir(directory): +# item_path = os.path.join(directory, item) +# if os.path.isdir(item_path): +# log.info(f"removing SK directory: {item_path}") +# shutil.rmtree(item_path) +# else: +# log.info(f"removing SK file: {item_path}") +# os.remove(item_path) + + +# def test_sk_pull_timelines( +# neon_env_builder: NeonEnvBuilder, +# ): +# DBNAME = "regression" +# superuser_name = "databricks_superuser" +# neon_env_builder.num_safekeepers = 3 +# neon_env_builder.num_pageservers = 4 +# neon_env_builder.safekeeper_extra_opts = ["--enable-pull-timeline-on-startup"] +# neon_env_builder.enable_safekeeper_remote_storage(s3_storage()) + +# env = neon_env_builder.init_start(initial_tenant_shard_count=4) + +# env.compute_manager.start(base_port=env.compute_manager_port) + +# test_creator = "test_creator" +# test_metastore_id = uuid4() +# test_account_id = uuid4() +# test_workspace_id = 1 +# test_workspace_url = "http://test_workspace_url" +# test_metadata_version = 1 +# test_metadata = { +# "state": "INSTANCE_PROVISIONING", +# "admin_rolename": "admin", +# "admin_password_scram": "abc123456", +# } + +# test_instance_name_1 = "test_instance_1" +# test_instance_read_write_compute_pool_1 = { +# "instance_name": test_instance_name_1, +# "compute_pool_name": "compute_pool_1", +# "creator": test_creator, +# "capacity": 2.0, +# "node_count": 1, +# "metadata_version": 0, +# "metadata": { +# "state": "INSTANCE_PROVISIONING", +# }, +# } + +# test_instance_1_readable_secondaries_enabled = False + +# # Test creation +# create_instance_with_retries( +# env, +# 
test_instance_name_1, +# test_creator, +# test_metastore_id, +# test_account_id, +# test_workspace_id, +# test_workspace_url, +# test_instance_read_write_compute_pool_1, +# test_metadata_version, +# test_metadata, +# test_instance_1_readable_secondaries_enabled, +# ) +# instance = env.compute_manager.get_instance_by_name(test_instance_name_1, test_workspace_id) +# log.info(f"haoyu Instance created: {instance}") +# assert instance["instance_name"] == test_instance_name_1 +# test_instance_id = instance["instance_id"] +# instance_detail = env.compute_manager.describe_instance(test_instance_id) +# log.info(f"haoyu Instance detail: {instance_detail}") + +# env.initial_tenant = instance_detail[0]["tenant_id"] +# env.initial_timeline = instance_detail[0]["timeline_id"] + +# # Connect to postgres and create a database called "regression". +# endpoint = env.endpoints.create_start("main") +# endpoint.safe_psql(f"CREATE ROLE {superuser_name}") +# endpoint.safe_psql(f"CREATE DATABASE {DBNAME}") + +# endpoint.safe_psql("CREATE TABLE usertable ( YCSB_KEY INT, FIELD0 TEXT);") +# # Write some data. ~20 MB. +# num_rows = 0 +# for _i in range(0, 20000): +# endpoint.safe_psql( +# "INSERT INTO usertable SELECT random(), repeat('a', 1000);", log_query=False +# ) +# num_rows += 1 + +# log.info(f"SKs {env.storage_controller.hcc_sk_node_list()}") + +# env.safekeepers[0].stop(immediate=True) +# clear_directory(env.safekeepers[0].data_dir) +# env.safekeepers[0].start() + +# # PG can still write data. ~20 MB. 
+# for _i in range(0, 20000): +# endpoint.safe_psql( +# "INSERT INTO usertable SELECT random(), repeat('a', 1000);", log_query=False +# ) +# num_rows += 1 + +# tuples = endpoint.safe_psql("SELECT COUNT(*) FROM usertable;") +# assert tuples[0][0] == num_rows +# endpoint.stop_and_destroy() + +# END_HADRON diff --git a/vendor/postgres-v14 b/vendor/postgres-v14 index 9085654ee8..8ce1f52303 160000 --- a/vendor/postgres-v14 +++ b/vendor/postgres-v14 @@ -1 +1 @@ -Subproject commit 9085654ee8022d5cc4ca719380a1dc53e5e3246f +Subproject commit 8ce1f52303aec29e098309347b57c01a1962e221 diff --git a/vendor/postgres-v15 b/vendor/postgres-v15 index 8c3249f36c..afd46987f3 160000 --- a/vendor/postgres-v15 +++ b/vendor/postgres-v15 @@ -1 +1 @@ -Subproject commit 8c3249f36c7df6ac0efb8ee9f1baf4aa1b83e5c9 +Subproject commit afd46987f3da50c9146a8aa59380052df0862c06 diff --git a/vendor/postgres-v16 b/vendor/postgres-v16 index 7a4c0eacae..e08c8d5f15 160000 --- a/vendor/postgres-v16 +++ b/vendor/postgres-v16 @@ -1 +1 @@ -Subproject commit 7a4c0eacaeb9b97416542fa19103061c166460b1 +Subproject commit e08c8d5f1576ca0487d14d154510499c5f12adfb diff --git a/vendor/postgres-v17 b/vendor/postgres-v17 index db424d42d7..353c725b0c 160000 --- a/vendor/postgres-v17 +++ b/vendor/postgres-v17 @@ -1 +1 @@ -Subproject commit db424d42d748f8ad91ac00e28db2c7f2efa42f7f +Subproject commit 353c725b0c76cc82b15af21d8360d03391dc6814 diff --git a/vendor/revisions.json b/vendor/revisions.json index b260698c86..992aa405b1 100644 --- a/vendor/revisions.json +++ b/vendor/revisions.json @@ -1,18 +1,18 @@ { "v17": [ "17.5", - "db424d42d748f8ad91ac00e28db2c7f2efa42f7f" + "353c725b0c76cc82b15af21d8360d03391dc6814" ], "v16": [ "16.9", - "7a4c0eacaeb9b97416542fa19103061c166460b1" + "e08c8d5f1576ca0487d14d154510499c5f12adfb" ], "v15": [ "15.13", - "8c3249f36c7df6ac0efb8ee9f1baf4aa1b83e5c9" + "afd46987f3da50c9146a8aa59380052df0862c06" ], "v14": [ "14.18", - "9085654ee8022d5cc4ca719380a1dc53e5e3246f" + 
"8ce1f52303aec29e098309347b57c01a1962e221" ] } diff --git a/workspace_hack/Cargo.toml b/workspace_hack/Cargo.toml index fc01deb92d..c61598cdf6 100644 --- a/workspace_hack/Cargo.toml +++ b/workspace_hack/Cargo.toml @@ -98,7 +98,7 @@ tikv-jemalloc-sys = { version = "0.6", features = ["profiling", "stats", "unpref time = { version = "0.3", features = ["macros", "serde-well-known"] } tokio = { version = "1", features = ["full", "test-util"] } tokio-rustls = { version = "0.26", default-features = false, features = ["logging", "ring", "tls12"] } -tokio-stream = { version = "0.1", features = ["net"] } +tokio-stream = { version = "0.1", features = ["net", "sync"] } tokio-util = { version = "0.7", features = ["codec", "compat", "io-util", "rt"] } toml_edit = { version = "0.22", features = ["serde"] } tower = { version = "0.5", default-features = false, features = ["balance", "buffer", "limit", "log"] }