diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index 864abad574..cc9534f05d 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -87,6 +87,24 @@ jobs: uses: ./.github/workflows/build-build-tools-image.yml secrets: inherit + lint-openapi-spec: + runs-on: ubuntu-22.04 + needs: [ meta, check-permissions ] + # We do need to run this in `.*-rc-pr` because of hotfixes. + if: ${{ contains(fromJSON('["pr", "push-main", "storage-rc-pr", "proxy-rc-pr", "compute-rc-pr"]'), needs.meta.outputs.run-kind) }} + steps: + - name: Harden the runner (Audit all outbound calls) + uses: step-security/harden-runner@4d991eb9b905ef189e4c376166672c3f2f230481 # v2.11.0 + with: + egress-policy: audit + - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 + - uses: docker/login-action@74a5d142397b4f367a81961eba4e8cd7edddf772 # v3.4.0 + with: + registry: ghcr.io + username: ${{ github.actor }} + password: ${{ secrets.GITHUB_TOKEN }} + - run: make lint-openapi-spec + check-codestyle-python: needs: [ meta, check-permissions, build-build-tools-image ] # No need to run on `main` because we this in the merge queue. We do need to run this in `.*-rc-pr` because of hotfixes. 
diff --git a/Cargo.lock b/Cargo.lock index 237defaec3..caed814d5f 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1348,6 +1348,7 @@ dependencies = [ "p256 0.13.2", "pageserver_page_api", "postgres", + "postgres-types", "postgres_initdb", "postgres_versioninfo", "regex", @@ -4339,6 +4340,7 @@ dependencies = [ "arc-swap", "async-compression", "async-stream", + "base64 0.22.1", "bincode", "bit_field", "byteorder", @@ -4492,6 +4494,24 @@ dependencies = [ "workspace_hack", ] +[[package]] +name = "pageserver_client_grpc" +version = "0.1.0" +dependencies = [ + "anyhow", + "bytes", + "compute_api", + "futures", + "pageserver_api", + "pageserver_page_api", + "tokio", + "tokio-stream", + "tonic 0.13.1", + "tracing", + "utils", + "workspace_hack", +] + [[package]] name = "pageserver_compaction" version = "0.1.0" @@ -5684,6 +5704,8 @@ dependencies = [ "azure_identity", "azure_storage", "azure_storage_blobs", + "base64 0.22.1", + "byteorder", "bytes", "camino", "camino-tempfile", diff --git a/Cargo.toml b/Cargo.toml index 840e3c6036..14f2cfcb56 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -8,6 +8,7 @@ members = [ "pageserver/compaction", "pageserver/ctl", "pageserver/client", + "pageserver/client_grpc", "pageserver/pagebench", "pageserver/page_api", "proxy", diff --git a/Makefile b/Makefile index 4b31e26810..d07ac907b4 100644 --- a/Makefile +++ b/Makefile @@ -220,6 +220,15 @@ neon-pgindent: postgres-v17-pg-bsd-indent neon-pg-ext-v17 setup-pre-commit-hook: ln -s -f $(ROOT_PROJECT_DIR)/pre-commit.py .git/hooks/pre-commit +.PHONY: lint-openapi-spec +lint-openapi-spec: + # operation-2xx-response: pageserver timeline delete returns 404 on success + find . -iname "openapi_spec.y*ml" -exec\ + docker run --rm -v ${PWD}:/spec ghcr.io/redocly/cli:1.34.4\ + --skip-rule=operation-operationId --skip-rule=operation-summary --extends=minimal\ + --skip-rule=no-server-example.com --skip-rule=operation-2xx-response\ + lint {} \+ + # Targets for building PostgreSQL are defined in postgres.mk. 
# # But if the caller has indicated that PostgreSQL is already diff --git a/clippy.toml b/clippy.toml index 408232488c..c03059053a 100644 --- a/clippy.toml +++ b/clippy.toml @@ -1,9 +1,12 @@ disallowed-methods = [ "tokio::task::block_in_place", + # Allow this for now, to deny it later once we stop using Handle::block_on completely # "tokio::runtime::Handle::block_on", - # use tokio_epoll_uring_ext instead - "tokio_epoll_uring::thread_local_system", + + # tokio-epoll-uring: + # - allow-invalid because the method doesn't exist on macOS + { path = "tokio_epoll_uring::thread_local_system", replacement = "tokio_epoll_uring_ext module inside pageserver crate", allow-invalid = true } ] disallowed-macros = [ diff --git a/compute_tools/Cargo.toml b/compute_tools/Cargo.toml index 1a03022d89..910bae3bda 100644 --- a/compute_tools/Cargo.toml +++ b/compute_tools/Cargo.toml @@ -66,7 +66,7 @@ url.workspace = true uuid.workspace = true walkdir.workspace = true x509-cert.workspace = true - +postgres-types.workspace = true postgres_versioninfo.workspace = true postgres_initdb.workspace = true compute_api.workspace = true diff --git a/compute_tools/src/compute.rs b/compute_tools/src/compute.rs index e870cecc58..0496d38e67 100644 --- a/compute_tools/src/compute.rs +++ b/compute_tools/src/compute.rs @@ -3,7 +3,7 @@ use chrono::{DateTime, Utc}; use compute_api::privilege::Privilege; use compute_api::responses::{ ComputeConfig, ComputeCtlConfig, ComputeMetrics, ComputeStatus, LfcOffloadState, - LfcPrewarmState, TlsConfig, + LfcPrewarmState, PromoteState, TlsConfig, }; use compute_api::spec::{ ComputeAudit, ComputeFeature, ComputeMode, ComputeSpec, ExtVersion, PageserverProtocol, PgIdent, @@ -29,8 +29,7 @@ use std::sync::atomic::{AtomicU32, AtomicU64, Ordering}; use std::sync::{Arc, Condvar, Mutex, RwLock}; use std::time::{Duration, Instant}; use std::{env, fs}; -use tokio::task::JoinHandle; -use tokio::{spawn, time}; +use tokio::{spawn, sync::watch, task::JoinHandle, time}; use 
tracing::{Instrument, debug, error, info, instrument, warn}; use url::Url; use utils::id::{TenantId, TimelineId}; @@ -175,6 +174,7 @@ pub struct ComputeState { /// WAL flush LSN that is set after terminating Postgres and syncing safekeepers if /// mode == ComputeMode::Primary. None otherwise pub terminate_flush_lsn: Option, + pub promote_state: Option>, pub metrics: ComputeMetrics, } @@ -192,6 +192,7 @@ impl ComputeState { lfc_prewarm_state: LfcPrewarmState::default(), lfc_offload_state: LfcOffloadState::default(), terminate_flush_lsn: None, + promote_state: None, } } @@ -1057,7 +1058,7 @@ impl ComputeNode { }; let (reader, connected) = tokio::runtime::Handle::current().block_on(async move { - let mut client = page_api::Client::new( + let mut client = page_api::Client::connect( shard0_connstr, spec.tenant_id, spec.timeline_id, @@ -2433,19 +2434,11 @@ LIMIT 100", // If the value is -1, we never suspend so set the value to default collection. // If the value is 0, it means default, we will just continue to use the default. 
if spec.suspend_timeout_seconds == -1 || spec.suspend_timeout_seconds == 0 { - info!( - "[NEON_EXT_INT_UPD] Spec Timeout: {}, New Timeout: {}", - spec.suspend_timeout_seconds, DEFAULT_INSTALLED_EXTENSIONS_COLLECTION_INTERVAL - ); self.params.installed_extensions_collection_interval.store( DEFAULT_INSTALLED_EXTENSIONS_COLLECTION_INTERVAL, std::sync::atomic::Ordering::SeqCst, ); } else { - info!( - "[NEON_EXT_INT_UPD] Spec Timeout: {}", - spec.suspend_timeout_seconds - ); self.params.installed_extensions_collection_interval.store( spec.suspend_timeout_seconds as u64, std::sync::atomic::Ordering::SeqCst, diff --git a/compute_tools/src/compute_promote.rs b/compute_tools/src/compute_promote.rs new file mode 100644 index 0000000000..42256faa22 --- /dev/null +++ b/compute_tools/src/compute_promote.rs @@ -0,0 +1,132 @@ +use crate::compute::ComputeNode; +use anyhow::{Context, Result, bail}; +use compute_api::{ + responses::{LfcPrewarmState, PromoteState, SafekeepersLsn}, + spec::ComputeMode, +}; +use std::{sync::Arc, time::Duration}; +use tokio::time::sleep; +use utils::lsn::Lsn; + +impl ComputeNode { + /// Returns only when promote fails or succeeds. If a network error occurs + /// and http client disconnects, this does not stop promotion, and subsequent + /// calls block until promote finishes. 
+ /// Called by control plane on secondary after primary endpoint is terminated + pub async fn promote(self: &Arc, safekeepers_lsn: SafekeepersLsn) -> PromoteState { + let cloned = self.clone(); + let start_promotion = || { + let (tx, rx) = tokio::sync::watch::channel(PromoteState::NotPromoted); + tokio::spawn(async move { + tx.send(match cloned.promote_impl(safekeepers_lsn).await { + Ok(_) => PromoteState::Completed, + Err(err) => { + tracing::error!(%err, "promoting"); + PromoteState::Failed { + error: err.to_string(), + } + } + }) + }); + rx + }; + + let mut task; + // self.state is unlocked after block ends so we lock it in promote_impl + // and task.changed() is reached + { + task = self + .state + .lock() + .unwrap() + .promote_state + .get_or_insert_with(start_promotion) + .clone() + } + task.changed().await.expect("promote sender dropped"); + task.borrow().clone() + } + + // Why do we have to supply safekeepers? + // For secondary we use primary_connection_conninfo so safekeepers field is empty + async fn promote_impl(&self, safekeepers_lsn: SafekeepersLsn) -> Result<()> { + { + let state = self.state.lock().unwrap(); + let mode = &state.pspec.as_ref().unwrap().spec.mode; + if *mode != ComputeMode::Replica { + bail!("{} is not replica", mode.to_type_str()); + } + + // we don't need to query Postgres so not self.lfc_prewarm_state() + match &state.lfc_prewarm_state { + LfcPrewarmState::NotPrewarmed | LfcPrewarmState::Prewarming => { + bail!("prewarm not requested or pending") + } + LfcPrewarmState::Failed { error } => { + tracing::warn!(%error, "replica prewarm failed") + } + _ => {} + } + } + + let client = ComputeNode::get_maintenance_client(&self.tokio_conn_conf) + .await + .context("connecting to postgres")?; + + let primary_lsn = safekeepers_lsn.wal_flush_lsn; + let mut last_wal_replay_lsn: Lsn = Lsn::INVALID; + const RETRIES: i32 = 20; + for i in 0..=RETRIES { + let row = client + .query_one("SELECT pg_last_wal_replay_lsn()", &[]) + .await + 
.context("getting last replay lsn")?; + let lsn: u64 = row.get::(0).into(); + last_wal_replay_lsn = lsn.into(); + if last_wal_replay_lsn >= primary_lsn { + break; + } + tracing::info!("Try {i}, replica lsn {last_wal_replay_lsn}, primary lsn {primary_lsn}"); + sleep(Duration::from_secs(1)).await; + } + if last_wal_replay_lsn < primary_lsn { + bail!("didn't catch up with primary in {RETRIES} retries"); + } + + // using $1 doesn't work with ALTER SYSTEM SET + let safekeepers_sql = format!( + "ALTER SYSTEM SET neon.safekeepers='{}'", + safekeepers_lsn.safekeepers + ); + client + .query(&safekeepers_sql, &[]) + .await + .context("setting safekeepers")?; + client + .query("SELECT pg_reload_conf()", &[]) + .await + .context("reloading postgres config")?; + let row = client + .query_one("SELECT * FROM pg_promote()", &[]) + .await + .context("pg_promote")?; + if !row.get::(0) { + bail!("pg_promote() returned false"); + } + + let client = ComputeNode::get_maintenance_client(&self.tokio_conn_conf) + .await + .context("connecting to postgres")?; + let row = client + .query_one("SHOW transaction_read_only", &[]) + .await + .context("getting transaction_read_only")?; + if row.get::(0) == "on" { + bail!("replica in read only mode after promotion"); + } + + let mut state = self.state.lock().unwrap(); + state.pspec.as_mut().unwrap().spec.mode = ComputeMode::Primary; + Ok(()) + } +} diff --git a/compute_tools/src/http/openapi_spec.yaml b/compute_tools/src/http/openapi_spec.yaml index bbdb7d0917..3c58b284b3 100644 --- a/compute_tools/src/http/openapi_spec.yaml +++ b/compute_tools/src/http/openapi_spec.yaml @@ -83,6 +83,87 @@ paths: schema: $ref: "#/components/schemas/DbsAndRoles" + /promote: + post: + tags: + - Promotion + summary: Promote secondary replica to primary + description: "" + operationId: promoteReplica + requestBody: + description: Promote requests data + required: true + content: + application/json: + schema: + $ref: "#/components/schemas/SafekeepersLsn" + responses: + 
200: + description: Promote succeeded or wasn't started + content: + application/json: + schema: + $ref: "#/components/schemas/PromoteState" + 500: + description: Promote failed + content: + application/json: + schema: + $ref: "#/components/schemas/PromoteState" + + /lfc/prewarm: + post: + summary: Request LFC Prewarm + parameters: + - name: from_endpoint + in: query + schema: + type: string + description: "" + operationId: lfcPrewarm + responses: + 202: + description: LFC prewarm started + 429: + description: LFC prewarm ongoing + get: + tags: + - Prewarm + summary: Get LFC prewarm state + description: "" + operationId: getLfcPrewarmState + responses: + 200: + description: Prewarm state + content: + application/json: + schema: + $ref: "#/components/schemas/LfcPrewarmState" + + /lfc/offload: + post: + summary: Request LFC offload + description: "" + operationId: lfcOffload + responses: + 202: + description: LFC offload started + 429: + description: LFC offload ongoing + get: + tags: + - Prewarm + summary: Get LFC offloading state + description: "" + operationId: getLfcOffloadState + responses: + 200: + description: Offload state + content: + application/json: + schema: + $ref: "#/components/schemas/LfcOffloadState" + /database_schema: get: tags: @@ -335,15 +416,6 @@ components: total_startup_ms: type: integer - Info: - type: object - description: Information about VM/Pod. - required: - - num_cpus - properties: - num_cpus: - type: integer - DbsAndRoles: type: object description: Databases and Roles @@ -497,25 +569,69 @@ components: type: string example: "1.0.0" - InstalledExtensions: + SafekeepersLsn: type: object + required: + - safekeepers + - wal_flush_lsn properties: - extensions: - description: Contains list of installed extensions. 
- type: array - items: - type: object - properties: - extname: - type: string - version: - type: string - items: - type: string - n_databases: - type: integer - owned_by_superuser: - type: integer + safekeepers: + description: Primary replica safekeepers + type: string + wal_flush_lsn: + description: Primary last WAL flush LSN + type: string + + LfcPrewarmState: + type: object + required: + - status + - total + - prewarmed + - skipped + properties: + status: + description: Lfc prewarm status + enum: [not_prewarmed, prewarming, completed, failed] + type: string + error: + description: Lfc prewarm error, if any + type: string + total: + description: Total pages processed + type: integer + prewarmed: + description: Total pages prewarmed + type: integer + skipped: + description: Pages processed but not prewarmed + type: integer + + LfcOffloadState: + type: object + required: + - status + properties: + status: + description: Lfc offload status + enum: [not_offloaded, offloading, completed, failed] + type: string + error: + description: Lfc offload error, if any + type: string + + PromoteState: + type: object + required: + - status + properties: + status: + description: Promote result + enum: [not_promoted, completed, failed] + type: string + error: + description: Promote error, if any + type: string SetRoleGrantsRequest: type: object diff --git a/compute_tools/src/http/routes/mod.rs b/compute_tools/src/http/routes/mod.rs index 432e66a830..dd71f663eb 100644 --- a/compute_tools/src/http/routes/mod.rs +++ b/compute_tools/src/http/routes/mod.rs @@ -14,6 +14,7 @@ pub(in crate::http) mod insights; pub(in crate::http) mod lfc; pub(in crate::http) mod metrics; pub(in crate::http) mod metrics_json; +pub(in crate::http) mod promote; pub(in crate::http) mod status; pub(in crate::http) mod terminate; diff --git a/compute_tools/src/http/routes/promote.rs b/compute_tools/src/http/routes/promote.rs new file mode 100644 index 0000000000..bc5f93b4da --- /dev/null +++ 
b/compute_tools/src/http/routes/promote.rs @@ -0,0 +1,14 @@ +use crate::http::JsonResponse; +use axum::Form; +use http::StatusCode; + +pub(in crate::http) async fn promote( + compute: axum::extract::State>, + Form(safekeepers_lsn): Form, +) -> axum::response::Response { + let state = compute.promote(safekeepers_lsn).await; + if let compute_api::responses::PromoteState::Failed { error } = state { + return JsonResponse::error(StatusCode::INTERNAL_SERVER_ERROR, error); + } + JsonResponse::success(StatusCode::OK, state) +} diff --git a/compute_tools/src/http/server.rs b/compute_tools/src/http/server.rs index d5d2427971..17939e39d4 100644 --- a/compute_tools/src/http/server.rs +++ b/compute_tools/src/http/server.rs @@ -23,7 +23,7 @@ use super::{ middleware::authorize::Authorize, routes::{ check_writability, configure, database_schema, dbs_and_roles, extension_server, extensions, - grants, insights, lfc, metrics, metrics_json, status, terminate, + grants, insights, lfc, metrics, metrics_json, promote, status, terminate, }, }; use crate::compute::ComputeNode; @@ -87,6 +87,7 @@ impl From<&Server> for Router> { let authenticated_router = Router::>::new() .route("/lfc/prewarm", get(lfc::prewarm_state).post(lfc::prewarm)) .route("/lfc/offload", get(lfc::offload_state).post(lfc::offload)) + .route("/promote", post(promote::promote)) .route("/check_writability", post(check_writability::is_writable)) .route("/configure", post(configure::configure)) .route("/database_schema", get(database_schema::get_schema_dump)) diff --git a/compute_tools/src/lib.rs b/compute_tools/src/lib.rs index 3899a1ca76..2d5d4565b7 100644 --- a/compute_tools/src/lib.rs +++ b/compute_tools/src/lib.rs @@ -12,6 +12,7 @@ pub mod logger; pub mod catalog; pub mod compute; pub mod compute_prewarm; +pub mod compute_promote; pub mod disk_quota; pub mod extension_server; pub mod installed_extensions; diff --git a/compute_tools/src/lsn_lease.rs b/compute_tools/src/lsn_lease.rs index 3346c18c0d..bb0828429d 100644 --- 
a/compute_tools/src/lsn_lease.rs +++ b/compute_tools/src/lsn_lease.rs @@ -192,7 +192,7 @@ fn acquire_lsn_lease_grpc( lsn: Lsn, ) -> Result> { tokio::runtime::Handle::current().block_on(async move { - let mut client = page_api::Client::new( + let mut client = page_api::Client::connect( connstring.to_string(), tenant_shard_id.tenant_id, timeline_id, diff --git a/compute_tools/src/migrations/tests/0004-grant_pg_monitor_to_neon_superuser.sql b/compute_tools/src/migrations/tests/0004-grant_pg_monitor_to_neon_superuser.sql index acb8dd417d..deb7a364af 100644 --- a/compute_tools/src/migrations/tests/0004-grant_pg_monitor_to_neon_superuser.sql +++ b/compute_tools/src/migrations/tests/0004-grant_pg_monitor_to_neon_superuser.sql @@ -6,7 +6,7 @@ BEGIN admin_option AS admin INTO monitor FROM pg_auth_members - WHERE roleid = 'pg_monitor'::regrole + WHERE roleid = 'neon_superuser'::regrole AND member = 'pg_monitor'::regrole; IF NOT monitor.member THEN diff --git a/endpoint_storage/src/app.rs b/endpoint_storage/src/app.rs index 42431c0066..a7a18743ef 100644 --- a/endpoint_storage/src/app.rs +++ b/endpoint_storage/src/app.rs @@ -13,6 +13,8 @@ use utils::backoff::retry; pub fn app(state: Arc) -> Router<()> { use axum::routing::{delete as _delete, get as _get}; let delete_prefix = _delete(delete_prefix); + // NB: On any changes do not forget to update the OpenAPI spec + // in /endpoint_storage/src/openapi_spec.yml. 
Router::new() .route( "/{tenant_id}/{timeline_id}/{endpoint_id}/{*path}", diff --git a/endpoint_storage/src/openapi_spec.yml b/endpoint_storage/src/openapi_spec.yml new file mode 100644 index 0000000000..8d9abf902c --- /dev/null +++ b/endpoint_storage/src/openapi_spec.yml @@ -0,0 +1,146 @@ +openapi: "3.0.2" +info: + title: Endpoint Storage API + description: Endpoint Storage API + version: "1.0" + license: + name: "Apache" + url: https://github.com/neondatabase/neon/blob/main/LICENSE +servers: + - url: "" +paths: + /status: + description: Healthcheck endpoint + get: + description: Healthcheck + security: [] + responses: + "200": + description: OK + + /{tenant_id}/{timeline_id}/{endpoint_id}/{key}: + parameters: + - name: tenant_id + in: path + required: true + schema: + type: string + - name: timeline_id + in: path + required: true + schema: + type: string + - name: endpoint_id + in: path + required: true + schema: + type: string + - name: key + in: path + required: true + schema: + type: string + get: + description: Get file from blob storage + responses: + "200": + description: "File stream from blob storage" + content: + application/octet-stream: + schema: + type: string + format: binary + "400": + description: File was not found + "403": + description: JWT does not authorize request to this route + put: + description: Insert file into blob storage. 
If file exists, override it + requestBody: + content: + application/octet-stream: + schema: + type: string + format: binary + responses: + "200": + description: File was inserted successfully + "403": + description: JWT does not authorize request to this route + delete: + description: Delete file from blob storage + responses: + "200": + description: File was successfully deleted or not found + "403": + description: JWT does not authorize request to this route + + /{tenant_id}/{timeline_id}/{endpoint_id}: + parameters: + - name: tenant_id + in: path + required: true + schema: + type: string + - name: timeline_id + in: path + required: true + schema: + type: string + - name: endpoint_id + in: path + required: true + schema: + type: string + delete: + description: Delete endpoint data from blob storage + responses: + "200": + description: Endpoint data was deleted + "403": + description: JWT does not authorize request to this route + + /{tenant_id}/{timeline_id}: + parameters: + - name: tenant_id + in: path + required: true + schema: + type: string + - name: timeline_id + in: path + required: true + schema: + type: string + delete: + description: Delete timeline data from blob storage + responses: + "200": + description: Timeline data was deleted + "403": + description: JWT does not authorize request to this route + + /{tenant_id}: + parameters: + - name: tenant_id + in: path + required: true + schema: + type: string + delete: + description: Delete tenant data from blob storage + responses: + "200": + description: Tenant data was deleted + "403": + description: JWT does not authorize request to this route + +components: + securitySchemes: + JWT: + type: http + scheme: bearer + bearerFormat: JWT + +security: + - JWT: [] diff --git a/libs/compute_api/src/responses.rs b/libs/compute_api/src/responses.rs index a54411b06a..e10c381fb4 100644 --- a/libs/compute_api/src/responses.rs +++ b/libs/compute_api/src/responses.rs @@ -46,7 +46,7 @@ pub struct ExtensionInstallResponse 
{ pub version: ExtVersion, } -#[derive(Serialize, Default, Debug, Clone)] +#[derive(Serialize, Default, Debug, Clone, PartialEq)] #[serde(tag = "status", rename_all = "snake_case")] pub enum LfcPrewarmState { #[default] @@ -58,6 +58,17 @@ pub enum LfcPrewarmState { }, } +impl Display for LfcPrewarmState { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + match self { + LfcPrewarmState::NotPrewarmed => f.write_str("NotPrewarmed"), + LfcPrewarmState::Prewarming => f.write_str("Prewarming"), + LfcPrewarmState::Completed => f.write_str("Completed"), + LfcPrewarmState::Failed { error } => write!(f, "Error({error})"), + } + } +} + #[derive(Serialize, Default, Debug, Clone, PartialEq)] #[serde(tag = "status", rename_all = "snake_case")] pub enum LfcOffloadState { @@ -70,6 +81,23 @@ pub enum LfcOffloadState { }, } +#[derive(Serialize, Debug, Clone, PartialEq)] +#[serde(tag = "status", rename_all = "snake_case")] +/// Response of /promote +pub enum PromoteState { + NotPromoted, + Completed, + Failed { error: String }, +} + +#[derive(Deserialize, Serialize, Default, Debug, Clone)] +#[serde(rename_all = "snake_case")] +/// Result of /safekeepers_lsn +pub struct SafekeepersLsn { + pub safekeepers: String, + pub wal_flush_lsn: utils::lsn::Lsn, +} + /// Response of the /status API #[derive(Serialize, Debug, Deserialize)] #[serde(rename_all = "snake_case")] diff --git a/libs/compute_api/src/spec.rs b/libs/compute_api/src/spec.rs index 60311aa3e6..0eeab2bebc 100644 --- a/libs/compute_api/src/spec.rs +++ b/libs/compute_api/src/spec.rs @@ -442,7 +442,7 @@ pub struct JwksSettings { } /// Protocol used to connect to a Pageserver. Parsed from the connstring scheme. -#[derive(Clone, Copy, Debug, Default)] +#[derive(Clone, Copy, Debug, Default, PartialEq, Eq)] pub enum PageserverProtocol { /// The original protocol based on libpq and COPY. Uses postgresql:// or postgres:// scheme. 
#[default] diff --git a/libs/http-utils/src/request.rs b/libs/http-utils/src/request.rs index 9024a90a82..afb2ae8f47 100644 --- a/libs/http-utils/src/request.rs +++ b/libs/http-utils/src/request.rs @@ -41,17 +41,35 @@ pub fn get_query_param<'a>( Some(q) => q, None => return Ok(None), }; - let mut values = url::form_urlencoded::parse(query.as_bytes()) + let values = url::form_urlencoded::parse(query.as_bytes()) .filter_map(|(k, v)| if k == param_name { Some(v) } else { None }) // we call .next() twice below. If it's None the first time, .fuse() ensures it's None afterwards .fuse(); - let value1 = values.next(); - if values.next().is_some() { - return Err(ApiError::BadRequest(anyhow!( - "param {param_name} specified more than once" - ))); - } + // Work around an issue with Alloy's pyroscope scrape where the "seconds" + // parameter is added several times. https://github.com/grafana/alloy/issues/3026 + // TODO: revert after Alloy is fixed. + let value1 = values + .map(Ok) + .reduce(|acc, i| { + match acc { + Err(_) => acc, + + // It's okay to have duplicates as long as they have the same value. 
+ Ok(ref a) if a == &i.unwrap() => acc, + + _ => Err(ApiError::BadRequest(anyhow!( + "param {param_name} specified more than once" + ))), + } + }) + .transpose()?; + // if values.next().is_some() { + // return Err(ApiError::BadRequest(anyhow!( + // "param {param_name} specified more than once" + // ))); + // } + Ok(value1) } @@ -92,3 +110,39 @@ pub async fn ensure_no_body(request: &mut Request) -> Result<(), ApiError> None => Ok(()), } } + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_get_query_param_duplicate() { + let req = Request::builder() + .uri("http://localhost:12345/testuri?testparam=1") + .body(hyper::Body::empty()) + .unwrap(); + let value = get_query_param(&req, "testparam").unwrap(); + assert_eq!(value.unwrap(), "1"); + + let req = Request::builder() + .uri("http://localhost:12345/testuri?testparam=1&testparam=1") + .body(hyper::Body::empty()) + .unwrap(); + let value = get_query_param(&req, "testparam").unwrap(); + assert_eq!(value.unwrap(), "1"); + + let req = Request::builder() + .uri("http://localhost:12345/testuri") + .body(hyper::Body::empty()) + .unwrap(); + let value = get_query_param(&req, "testparam").unwrap(); + assert!(value.is_none()); + + let req = Request::builder() + .uri("http://localhost:12345/testuri?testparam=1&testparam=2&testparam=3") + .body(hyper::Body::empty()) + .unwrap(); + let value = get_query_param(&req, "testparam"); + assert!(value.is_err()); + } +} diff --git a/libs/pageserver_api/src/config.rs b/libs/pageserver_api/src/config.rs index 00d6b61399..dc7e9aed7f 100644 --- a/libs/pageserver_api/src/config.rs +++ b/libs/pageserver_api/src/config.rs @@ -5,6 +5,7 @@ mod tests; use const_format::formatcp; use posthog_client_lite::PostHogClientConfig; +use utils::serde_percent::Percent; pub const DEFAULT_PG_LISTEN_PORT: u16 = 64000; pub const DEFAULT_PG_LISTEN_ADDR: &str = formatcp!("127.0.0.1:{DEFAULT_PG_LISTEN_PORT}"); pub const DEFAULT_HTTP_LISTEN_PORT: u16 = 9898; @@ -223,7 +224,7 @@ pub struct ConfigToml 
{ pub metric_collection_bucket: Option, #[serde(with = "humantime_serde")] pub synthetic_size_calculation_interval: Duration, - pub disk_usage_based_eviction: Option, + pub disk_usage_based_eviction: DiskUsageEvictionTaskConfig, pub test_remote_failures: u64, pub ondemand_download_behavior_treat_error_as_warn: bool, #[serde(with = "humantime_serde")] @@ -273,6 +274,7 @@ pub struct ConfigToml { } #[derive(Debug, Clone, PartialEq, Eq, serde::Serialize, serde::Deserialize)] +#[serde(default)] pub struct DiskUsageEvictionTaskConfig { pub max_usage_pct: utils::serde_percent::Percent, pub min_avail_bytes: u64, @@ -283,6 +285,21 @@ pub struct DiskUsageEvictionTaskConfig { /// Select sorting for evicted layers #[serde(default)] pub eviction_order: EvictionOrder, + pub enabled: bool, +} + +impl Default for DiskUsageEvictionTaskConfig { + fn default() -> Self { + Self { + max_usage_pct: Percent::new(80).unwrap(), + min_avail_bytes: 2_000_000_000, + period: Duration::from_secs(60), + #[cfg(feature = "testing")] + mock_statvfs: None, + eviction_order: EvictionOrder::default(), + enabled: true, + } + } } #[derive(Debug, Clone, PartialEq, Eq, serde::Serialize, serde::Deserialize)] @@ -738,7 +755,7 @@ impl Default for ConfigToml { metric_collection_bucket: (None), - disk_usage_based_eviction: (None), + disk_usage_based_eviction: DiskUsageEvictionTaskConfig::default(), test_remote_failures: (0), diff --git a/libs/pageserver_api/src/models.rs b/libs/pageserver_api/src/models.rs index 16545364c1..6735320484 100644 --- a/libs/pageserver_api/src/models.rs +++ b/libs/pageserver_api/src/models.rs @@ -384,7 +384,7 @@ pub struct SafekeepersInfo { pub safekeepers: Vec, } -#[derive(Serialize, Deserialize, Clone)] +#[derive(Serialize, Deserialize, Clone, Debug)] pub struct SafekeeperInfo { pub id: NodeId, pub hostname: String, diff --git a/libs/pageserver_api/src/shard.rs b/libs/pageserver_api/src/shard.rs index 5a13aace64..d6f4cd5e66 100644 --- a/libs/pageserver_api/src/shard.rs +++ 
b/libs/pageserver_api/src/shard.rs @@ -332,7 +332,11 @@ fn hash_combine(mut a: u32, mut b: u32) -> u32 { /// /// The mapping of key to shard is not stable across changes to ShardCount: this is intentional /// and will be handled at higher levels when shards are split. -fn key_to_shard_number(count: ShardCount, stripe_size: ShardStripeSize, key: &Key) -> ShardNumber { +pub fn key_to_shard_number( + count: ShardCount, + stripe_size: ShardStripeSize, + key: &Key, +) -> ShardNumber { // Fast path for un-sharded tenants or broadcast keys if count < ShardCount(2) || key_is_shard0(key) { return ShardNumber(0); diff --git a/libs/remote_storage/Cargo.toml b/libs/remote_storage/Cargo.toml index bd18d80915..69316fd493 100644 --- a/libs/remote_storage/Cargo.toml +++ b/libs/remote_storage/Cargo.toml @@ -13,6 +13,7 @@ aws-smithy-async.workspace = true aws-smithy-types.workspace = true aws-config.workspace = true aws-sdk-s3.workspace = true +base64.workspace = true bytes.workspace = true camino = { workspace = true, features = ["serde1"] } humantime-serde.workspace = true @@ -41,6 +42,8 @@ http-body-util.workspace = true itertools.workspace = true sync_wrapper = { workspace = true, features = ["futures"] } +byteorder = "1.4" + [dev-dependencies] camino-tempfile.workspace = true test-context.workspace = true diff --git a/libs/remote_storage/src/azure_blob.rs b/libs/remote_storage/src/azure_blob.rs index e9c24ac723..db30829216 100644 --- a/libs/remote_storage/src/azure_blob.rs +++ b/libs/remote_storage/src/azure_blob.rs @@ -14,17 +14,25 @@ use anyhow::{Context, Result, anyhow}; use azure_core::request_options::{IfMatchCondition, MaxResults, Metadata, Range}; use azure_core::{Continuable, HttpClient, RetryOptions, TransportOptions}; use azure_storage::StorageCredentials; -use azure_storage_blobs::blob::operations::GetBlobBuilder; +use azure_storage_blobs::blob::BlobBlockType; +use azure_storage_blobs::blob::BlockList; use azure_storage_blobs::blob::{Blob, CopyStatus}; use 
azure_storage_blobs::container::operations::ListBlobsBuilder; -use azure_storage_blobs::prelude::{ClientBuilder, ContainerClient}; +use azure_storage_blobs::prelude::ClientBuilder; +use azure_storage_blobs::{blob::operations::GetBlobBuilder, prelude::ContainerClient}; +use base64::{Engine as _, engine::general_purpose::URL_SAFE}; +use byteorder::{BigEndian, ByteOrder}; use bytes::Bytes; +use camino::Utf8Path; use futures::FutureExt; use futures::future::Either; use futures::stream::Stream; use futures_util::{StreamExt, TryStreamExt}; use http_types::{StatusCode, Url}; use scopeguard::ScopeGuard; +use tokio::fs::File; +use tokio::io::AsyncReadExt; +use tokio::io::AsyncSeekExt; use tokio_util::sync::CancellationToken; use tracing::debug; use utils::backoff; @@ -51,6 +59,9 @@ pub struct AzureBlobStorage { // Alternative timeout used for metadata objects which are expected to be small pub small_timeout: Duration, + /* BEGIN_HADRON */ + pub put_block_size_mb: Option, + /* END_HADRON */ } impl AzureBlobStorage { @@ -107,6 +118,9 @@ impl AzureBlobStorage { concurrency_limiter: ConcurrencyLimiter::new(azure_config.concurrency_limit.get()), timeout, small_timeout, + /* BEGIN_HADRON */ + put_block_size_mb: azure_config.put_block_size_mb, + /* END_HADRON */ }) } @@ -583,31 +597,137 @@ impl RemoteStorage for AzureBlobStorage { let started_at = start_measuring_requests(kind); - let op = async { + let mut metadata_map = metadata.unwrap_or([].into()); + let timeline_file_path = metadata_map.0.remove("databricks_azure_put_block"); + + /* BEGIN_HADRON */ + let op = async move { let blob_client = self.client.blob_client(self.relative_path_to_name(to)); + let put_block_size = self.put_block_size_mb.unwrap_or(0) * 1024 * 1024; + if timeline_file_path.is_none() || put_block_size == 0 { + // Use put_block_blob directly. 
+ let from: Pin< + Box> + Send + Sync + 'static>, + > = Box::pin(from); + let from = NonSeekableStream::new(from, data_size_bytes); + let body = azure_core::Body::SeekableStream(Box::new(from)); - let from: Pin> + Send + Sync + 'static>> = - Box::pin(from); + let mut builder = blob_client.put_block_blob(body); + if !metadata_map.0.is_empty() { + builder = builder.metadata(to_azure_metadata(metadata_map)); + } + let fut = builder.into_future(); + let fut = tokio::time::timeout(self.timeout, fut); + let result = fut.await; + match result { + Ok(Ok(_response)) => return Ok(()), + Ok(Err(azure)) => return Err(azure.into()), + Err(_timeout) => return Err(TimeoutOrCancel::Timeout.into()), + }; + } + // Upload chunks concurrently using Put Block. + // Each PutBlock uploads put_block_size bytes of the file. + let mut upload_futures: Vec>> = + vec![]; + let mut block_list = BlockList::default(); + let mut start_bytes = 0u64; + let mut remaining_bytes = data_size_bytes; + let mut block_list_count = 0; - let from = NonSeekableStream::new(from, data_size_bytes); + while remaining_bytes > 0 { + let block_size = std::cmp::min(remaining_bytes, put_block_size); + let end_bytes = start_bytes + block_size as u64; + let block_id = block_list_count; + let timeout = self.timeout; + let blob_client = blob_client.clone(); + let timeline_file = timeline_file_path.clone().unwrap().clone(); - let body = azure_core::Body::SeekableStream(Box::new(from)); + let mut encoded_block_id = [0u8; 8]; + BigEndian::write_u64(&mut encoded_block_id, block_id); + URL_SAFE.encode(encoded_block_id); - let mut builder = blob_client.put_block_blob(body); + // Put one block. 
+ let part_fut = async move { + let mut file = File::open(Utf8Path::new(&timeline_file.clone())).await?; + file.seek(io::SeekFrom::Start(start_bytes)).await?; + let limited_reader = file.take(block_size as u64); + let file_chunk_stream = + tokio_util::io::ReaderStream::with_capacity(limited_reader, 1024 * 1024); + let file_chunk_stream_pin: Pin< + Box> + Send + Sync + 'static>, + > = Box::pin(file_chunk_stream); + let stream_wrapper = NonSeekableStream::new(file_chunk_stream_pin, block_size); + let body = azure_core::Body::SeekableStream(Box::new(stream_wrapper)); + // Azure put block takes URL-encoded block ids and all block ids must have the same byte length. + // https://learn.microsoft.com/en-us/rest/api/storageservices/put-block?tabs=microsoft-entra-id#uri-parameters + let builder = blob_client.put_block(encoded_block_id.to_vec(), body); + let fut = builder.into_future(); + let fut = tokio::time::timeout(timeout, fut); + let result = fut.await; + tracing::debug!( + "azure put block id-{} size {} start {} end {} file {} response {:#?}", + block_id, + block_size, + start_bytes, + end_bytes, + timeline_file, + result + ); + match result { + Ok(Ok(_response)) => Ok(()), + Ok(Err(azure)) => Err(azure), + Err(_timeout) => Err(azure_core::Error::new( + azure_core::error::ErrorKind::Io, + std::io::Error::new( + std::io::ErrorKind::TimedOut, + "Operation timed out", + ), + )), + } + }; + upload_futures.push(tokio::spawn(part_fut)); - if let Some(metadata) = metadata { - builder = builder.metadata(to_azure_metadata(metadata)); + block_list_count += 1; + remaining_bytes -= block_size; + start_bytes += block_size as u64; + + block_list + .blocks + .push(BlobBlockType::Uncommitted(encoded_block_id.to_vec().into())); } + tracing::debug!( + "azure put blocks {} total MB: {} chunk size MB: {}", + block_list_count, + data_size_bytes / 1024 / 1024, + put_block_size / 1024 / 1024 + ); + // Wait for all blocks to be uploaded.
+ let upload_results = futures::future::try_join_all(upload_futures).await; + if upload_results.is_err() { + return Err(anyhow::anyhow!(format!( + "Failed to upload all blocks {:#?}", + upload_results.unwrap_err() + ))); + } + + // Commit the blocks. + let mut builder = blob_client.put_block_list(block_list); + if !metadata_map.0.is_empty() { + builder = builder.metadata(to_azure_metadata(metadata_map)); + } let fut = builder.into_future(); let fut = tokio::time::timeout(self.timeout, fut); + let result = fut.await; + tracing::debug!("azure put block list response {:#?}", result); - match fut.await { + match result { Ok(Ok(_response)) => Ok(()), Ok(Err(azure)) => Err(azure.into()), Err(_timeout) => Err(TimeoutOrCancel::Timeout.into()), } }; + /* END_HADRON */ let res = tokio::select! { res = op => res, @@ -622,7 +742,6 @@ impl RemoteStorage for AzureBlobStorage { crate::metrics::BUCKET_METRICS .req_seconds .observe_elapsed(kind, outcome, started_at); - res } diff --git a/libs/remote_storage/src/config.rs b/libs/remote_storage/src/config.rs index 5bc1f678ae..e13e17d544 100644 --- a/libs/remote_storage/src/config.rs +++ b/libs/remote_storage/src/config.rs @@ -195,8 +195,19 @@ pub struct AzureConfig { pub max_keys_per_list_response: Option, #[serde(default = "default_azure_conn_pool_size")] pub conn_pool_size: usize, + /* BEGIN_HADRON */ + #[serde(default = "default_azure_put_block_size_mb")] + pub put_block_size_mb: Option, + /* END_HADRON */ } +/* BEGIN_HADRON */ +fn default_azure_put_block_size_mb() -> Option { + // Disable parallel upload by default. 
+ Some(0) +} +/* END_HADRON */ + fn default_remote_storage_azure_concurrency_limit() -> NonZeroUsize { NonZeroUsize::new(DEFAULT_REMOTE_STORAGE_AZURE_CONCURRENCY_LIMIT).unwrap() } @@ -213,6 +224,9 @@ impl Debug for AzureConfig { "max_keys_per_list_response", &self.max_keys_per_list_response, ) + /* BEGIN_HADRON */ + .field("put_block_size_mb", &self.put_block_size_mb) + /* END_HADRON */ .finish() } } @@ -352,6 +366,7 @@ timeout = '5s'"; upload_storage_class = 'INTELLIGENT_TIERING' timeout = '7s' conn_pool_size = 8 + put_block_size_mb = 1024 "; let config = parse(toml).unwrap(); @@ -367,6 +382,9 @@ timeout = '5s'"; concurrency_limit: default_remote_storage_azure_concurrency_limit(), max_keys_per_list_response: DEFAULT_MAX_KEYS_PER_LIST_RESPONSE, conn_pool_size: 8, + /* BEGIN_HADRON */ + put_block_size_mb: Some(1024), + /* END_HADRON */ }), timeout: Duration::from_secs(7), small_timeout: RemoteStorageConfig::DEFAULT_SMALL_TIMEOUT diff --git a/libs/remote_storage/tests/common/mod.rs b/libs/remote_storage/tests/common/mod.rs index daab05d91a..fb7d6fd482 100644 --- a/libs/remote_storage/tests/common/mod.rs +++ b/libs/remote_storage/tests/common/mod.rs @@ -165,10 +165,42 @@ pub(crate) async fn upload_remote_data( let (data, data_len) = upload_stream(format!("remote blob data {i}").into_bytes().into()); + + /* BEGIN_HADRON */ + let mut metadata = None; + if matches!(&*task_client, GenericRemoteStorage::AzureBlob(_)) { + let file_path = "/tmp/dbx_upload_tmp_file.txt"; + { + // Open the file in append mode + let mut file = std::fs::OpenOptions::new() + .append(true) + .create(true) // Create the file if it doesn't exist + .open(file_path)?; + // Append some bytes to the file + std::io::Write::write_all( + &mut file, + &format!("remote blob data {i}").into_bytes(), + )?; + file.sync_all()?; + } + metadata = Some(remote_storage::StorageMetadata::from([( + "databricks_azure_put_block", + file_path, + )])); + } + /* END_HADRON */ + task_client - .upload(data, data_len, 
&blob_path, None, &cancel) + .upload(data, data_len, &blob_path, metadata, &cancel) .await?; + // TODO: Check upload is using the put_block upload. + // We cannot consume data here since data is moved inside the upload. + // let total_bytes = data.fold(0, |acc, chunk| async move { + // acc + chunk.map(|bytes| bytes.len()).unwrap_or(0) + // }).await; + // assert_eq!(total_bytes, data_len); + Ok::<_, anyhow::Error>((blob_prefix, blob_path)) }); } diff --git a/libs/remote_storage/tests/test_real_azure.rs b/libs/remote_storage/tests/test_real_azure.rs index 31c9ca3200..4d7caabd39 100644 --- a/libs/remote_storage/tests/test_real_azure.rs +++ b/libs/remote_storage/tests/test_real_azure.rs @@ -219,6 +219,9 @@ async fn create_azure_client( concurrency_limit: NonZeroUsize::new(100).unwrap(), max_keys_per_list_response, conn_pool_size: 8, + /* BEGIN_HADRON */ + put_block_size_mb: Some(1), + /* END_HADRON */ }), timeout: RemoteStorageConfig::DEFAULT_TIMEOUT, small_timeout: RemoteStorageConfig::DEFAULT_SMALL_TIMEOUT, diff --git a/libs/utils/src/shard.rs b/libs/utils/src/shard.rs index f2b81373e2..5a0edf8cea 100644 --- a/libs/utils/src/shard.rs +++ b/libs/utils/src/shard.rs @@ -171,6 +171,12 @@ impl std::fmt::Display for ShardNumber { } } +impl std::fmt::Display for ShardCount { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + self.0.fmt(f) + } +} + impl std::fmt::Display for ShardSlug<'_> { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { write!( diff --git a/pageserver/Cargo.toml b/pageserver/Cargo.toml index 8a2e2ed3be..1fd0dccff0 100644 --- a/pageserver/Cargo.toml +++ b/pageserver/Cargo.toml @@ -112,6 +112,7 @@ twox-hash.workspace = true procfs.workspace = true [dev-dependencies] +base64.workspace = true criterion.workspace = true hex-literal.workspace = true tokio = { workspace = true, features = ["process", "sync", "fs", "rt", "io-util", "time", "test-util"] } diff --git a/pageserver/client_grpc/Cargo.toml 
b/pageserver/client_grpc/Cargo.toml new file mode 100644 index 0000000000..84e27abb84 --- /dev/null +++ b/pageserver/client_grpc/Cargo.toml @@ -0,0 +1,19 @@ +[package] +name = "pageserver_client_grpc" +version = "0.1.0" +edition.workspace = true +license.workspace = true + +[dependencies] +anyhow.workspace = true +bytes.workspace = true +compute_api.workspace = true +futures.workspace = true +pageserver_api.workspace = true +pageserver_page_api.workspace = true +tokio.workspace = true +tokio-stream.workspace = true +tonic.workspace = true +tracing.workspace = true +utils.workspace = true +workspace_hack.workspace = true diff --git a/pageserver/client_grpc/src/client.rs b/pageserver/client_grpc/src/client.rs new file mode 100644 index 0000000000..63852868c3 --- /dev/null +++ b/pageserver/client_grpc/src/client.rs @@ -0,0 +1,403 @@ +use std::collections::HashMap; +use std::num::NonZero; +use std::sync::Arc; + +use anyhow::anyhow; +use futures::stream::FuturesUnordered; +use futures::{FutureExt as _, StreamExt as _}; +use tracing::instrument; + +use crate::pool::{ChannelPool, ClientGuard, ClientPool, StreamGuard, StreamPool}; +use crate::retry::Retry; +use crate::split::GetPageSplitter; +use compute_api::spec::PageserverProtocol; +use pageserver_api::shard::ShardStripeSize; +use pageserver_page_api as page_api; +use utils::id::{TenantId, TimelineId}; +use utils::shard::{ShardCount, ShardIndex, ShardNumber}; + +/// Max number of concurrent clients per channel (i.e. TCP connection). New channels will be spun up +/// when full. +/// +/// TODO: tune all of these constants, and consider making them configurable. +/// TODO: consider separate limits for unary and streaming clients, so we don't fill up channels +/// with only streams. +const MAX_CLIENTS_PER_CHANNEL: NonZero = NonZero::new(16).unwrap(); + +/// Max number of concurrent unary request clients per shard. 
+const MAX_UNARY_CLIENTS: NonZero = NonZero::new(64).unwrap(); + +/// Max number of concurrent GetPage streams per shard. The max number of concurrent GetPage +/// requests is given by `MAX_STREAMS * MAX_STREAM_QUEUE_DEPTH`. +const MAX_STREAMS: NonZero = NonZero::new(64).unwrap(); + +/// Max number of pipelined requests per stream. +const MAX_STREAM_QUEUE_DEPTH: NonZero = NonZero::new(2).unwrap(); + +/// Max number of concurrent bulk GetPage streams per shard, used e.g. for prefetches. Because these +/// are more throughput-oriented, we have a smaller limit but higher queue depth. +const MAX_BULK_STREAMS: NonZero = NonZero::new(16).unwrap(); + +/// Max number of pipelined requests per bulk stream. These are more throughput-oriented and thus +/// get a larger queue depth. +const MAX_BULK_STREAM_QUEUE_DEPTH: NonZero = NonZero::new(4).unwrap(); + +/// A rich Pageserver gRPC client for a single tenant timeline. This client is more capable than the +/// basic `page_api::Client` gRPC client, and supports: +/// +/// * Sharded tenants across multiple Pageservers. +/// * Pooling of connections, clients, and streams for efficient resource use. +/// * Concurrent use by many callers. +/// * Internal handling of GetPage bidirectional streams, with pipelining and error handling. +/// * Automatic retries. +/// * Observability. +/// +/// TODO: this client does not support base backups or LSN leases, as these are only used by +/// compute_ctl. Consider adding this, but LSN leases need concurrent requests on all shards. +pub struct PageserverClient { + // TODO: support swapping out the shard map, e.g. via an ArcSwap. + shards: Shards, + retry: Retry, +} + +impl PageserverClient { + /// Creates a new Pageserver client for a given tenant and timeline. Uses the Pageservers given + /// in the shard map, which must be complete and must use gRPC URLs. 
+ pub fn new( + tenant_id: TenantId, + timeline_id: TimelineId, + shard_map: HashMap, + stripe_size: ShardStripeSize, + auth_token: Option, + ) -> anyhow::Result { + let shards = Shards::new(tenant_id, timeline_id, shard_map, stripe_size, auth_token)?; + Ok(Self { + shards, + retry: Retry, + }) + } + + /// Returns whether a relation exists. + #[instrument(skip_all, fields(rel=%req.rel, lsn=%req.read_lsn))] + pub async fn check_rel_exists( + &self, + req: page_api::CheckRelExistsRequest, + ) -> tonic::Result { + self.retry + .with(async || { + // Relation metadata is only available on shard 0. + let mut client = self.shards.get_zero().client().await?; + client.check_rel_exists(req).await + }) + .await + } + + /// Returns the total size of a database, as # of bytes. + #[instrument(skip_all, fields(db_oid=%req.db_oid, lsn=%req.read_lsn))] + pub async fn get_db_size( + &self, + req: page_api::GetDbSizeRequest, + ) -> tonic::Result { + self.retry + .with(async || { + // Relation metadata is only available on shard 0. + let mut client = self.shards.get_zero().client().await?; + client.get_db_size(req).await + }) + .await + } + + /// Fetches pages. The `request_id` must be unique across all in-flight requests. Automatically + /// splits requests that straddle shard boundaries, and assembles the responses. + /// + /// Unlike `page_api::Client`, this automatically converts `status_code` into `tonic::Status` + /// errors. All responses will have `GetPageStatusCode::Ok`. + #[instrument(skip_all, fields( + req_id = %req.request_id, + class = %req.request_class, + rel = %req.rel, + blkno = %req.block_numbers[0], + blks = %req.block_numbers.len(), + lsn = %req.read_lsn, + ))] + pub async fn get_page( + &self, + req: page_api::GetPageRequest, + ) -> tonic::Result { + // Make sure we have at least one page. + if req.block_numbers.is_empty() { + return Err(tonic::Status::invalid_argument("no block number")); + } + + // Fast path: request is for a single shard. 
+ if let Some(shard_id) = + GetPageSplitter::is_single_shard(&req, self.shards.count, self.shards.stripe_size) + { + return self.get_page_for_shard(shard_id, req).await; + } + + // Request spans multiple shards. Split it, dispatch concurrent per-shard requests, and + // reassemble the responses. + // + // TODO: when we support shard map updates, we need to detect when it changes and re-split + // the request on errors. + let mut splitter = GetPageSplitter::split(req, self.shards.count, self.shards.stripe_size); + + let mut shard_requests: FuturesUnordered<_> = splitter + .drain_requests() + .map(|(shard_id, shard_req)| { + // NB: each request will retry internally. + self.get_page_for_shard(shard_id, shard_req) + .map(move |result| result.map(|resp| (shard_id, resp))) + }) + .collect(); + + while let Some((shard_id, shard_response)) = shard_requests.next().await.transpose()? { + splitter.add_response(shard_id, shard_response)?; + } + + splitter.assemble_response() + } + + /// Fetches pages that belong to the given shard. + #[instrument(skip_all, fields(shard = %shard_id))] + async fn get_page_for_shard( + &self, + shard_id: ShardIndex, + req: page_api::GetPageRequest, + ) -> tonic::Result { + let resp = self + .retry + .with(async || { + let stream = self + .shards + .get(shard_id)? + .stream(req.request_class.is_bulk()) + .await; + let resp = stream.send(req.clone()).await?; + + // Convert per-request errors into a tonic::Status. + if resp.status_code != page_api::GetPageStatusCode::Ok { + return Err(tonic::Status::new( + resp.status_code.into(), + resp.reason.unwrap_or_else(|| String::from("unknown error")), + )); + } + + Ok(resp) + }) + .await?; + + // Make sure we got the right number of pages. + // NB: check outside of the retry loop, since we don't want to retry this. 
+ let (expected, actual) = (req.block_numbers.len(), resp.page_images.len()); + if expected != actual { + return Err(tonic::Status::internal(format!( + "expected {expected} pages for shard {shard_id}, got {actual}", + ))); + } + + Ok(resp) + } + + /// Returns the size of a relation, as # of blocks. + #[instrument(skip_all, fields(rel=%req.rel, lsn=%req.read_lsn))] + pub async fn get_rel_size( + &self, + req: page_api::GetRelSizeRequest, + ) -> tonic::Result { + self.retry + .with(async || { + // Relation metadata is only available on shard 0. + let mut client = self.shards.get_zero().client().await?; + client.get_rel_size(req).await + }) + .await + } + + /// Fetches an SLRU segment. + #[instrument(skip_all, fields(kind=%req.kind, segno=%req.segno, lsn=%req.read_lsn))] + pub async fn get_slru_segment( + &self, + req: page_api::GetSlruSegmentRequest, + ) -> tonic::Result { + self.retry + .with(async || { + // SLRU segments are only available on shard 0. + let mut client = self.shards.get_zero().client().await?; + client.get_slru_segment(req).await + }) + .await + } +} + +/// Tracks the tenant's shards. +struct Shards { + /// The shard count. + /// + /// NB: this is 0 for unsharded tenants, following `ShardIndex::unsharded()` convention. + count: ShardCount, + /// The stripe size. Only used for sharded tenants. + stripe_size: ShardStripeSize, + /// Shards by shard index. + /// + /// NB: unsharded tenants use count 0, like `ShardIndex::unsharded()`. + /// + /// INVARIANT: every shard 0..count is present. + /// INVARIANT: shard 0 is always present. + map: HashMap, +} + +impl Shards { + /// Creates a new set of shards based on a shard map. 
+ fn new( + tenant_id: TenantId, + timeline_id: TimelineId, + shard_map: HashMap, + stripe_size: ShardStripeSize, + auth_token: Option, + ) -> anyhow::Result { + let count = match shard_map.len() { + 0 => return Err(anyhow!("no shards provided")), + 1 => ShardCount::new(0), // NB: unsharded tenants use 0, like `ShardIndex::unsharded()` + n if n > u8::MAX as usize => return Err(anyhow!("too many shards: {n}")), + n => ShardCount::new(n as u8), + }; + + let mut map = HashMap::new(); + for (shard_id, url) in shard_map { + // The shard index must match the computed shard count, even for unsharded tenants. + if shard_id.shard_count != count { + return Err(anyhow!("invalid shard index {shard_id}, expected {count}")); + } + // The shard index's number and count must be consistent. + if !shard_id.is_unsharded() && shard_id.shard_number.0 >= shard_id.shard_count.0 { + return Err(anyhow!("invalid shard index {shard_id}")); + } + // The above conditions guarantee that we have all shards 0..count: len() matches count, + // shard number < count, and numbers are unique (via hashmap). + let shard = Shard::new(url, tenant_id, timeline_id, shard_id, auth_token.clone())?; + map.insert(shard_id, shard); + } + + Ok(Self { + count, + stripe_size, + map, + }) + } + + /// Looks up the given shard. + #[allow(clippy::result_large_err)] // TODO: check perf impact + fn get(&self, shard_id: ShardIndex) -> tonic::Result<&Shard> { + self.map + .get(&shard_id) + .ok_or_else(|| tonic::Status::not_found(format!("unknown shard {shard_id}"))) + } + + /// Returns shard 0. + fn get_zero(&self) -> &Shard { + self.get(ShardIndex::new(ShardNumber(0), self.count)) + .expect("always present") + } +} + +/// A single shard. Uses dedicated resource pools with the following structure: +/// +/// * Channel pool: unbounded. +/// * Unary client pool: MAX_UNARY_CLIENTS. +/// * Stream client pool: unbounded. +/// * Stream pool: MAX_STREAMS and MAX_STREAM_QUEUE_DEPTH. +/// * Bulk channel pool: unbounded.
+/// * Bulk client pool: unbounded. +/// * Bulk stream pool: MAX_BULK_STREAMS and MAX_BULK_STREAM_QUEUE_DEPTH. +struct Shard { + /// Unary gRPC client pool. + client_pool: Arc, + /// GetPage stream pool. + stream_pool: Arc, + /// GetPage stream pool for bulk requests, e.g. prefetches. + bulk_stream_pool: Arc, +} + +impl Shard { + /// Creates a new shard. It has its own dedicated resource pools. + fn new( + url: String, + tenant_id: TenantId, + timeline_id: TimelineId, + shard_id: ShardIndex, + auth_token: Option, + ) -> anyhow::Result { + // Sanity-check that the URL uses gRPC. + if PageserverProtocol::from_connstring(&url)? != PageserverProtocol::Grpc { + return Err(anyhow!("invalid shard URL {url}: must use gRPC")); + } + + // Common channel pool for unary and stream requests. Bounded by client/stream pools. + let channel_pool = ChannelPool::new(url.clone(), MAX_CLIENTS_PER_CHANNEL)?; + + // Client pool for unary requests. + let client_pool = ClientPool::new( + channel_pool.clone(), + tenant_id, + timeline_id, + shard_id, + auth_token.clone(), + Some(MAX_UNARY_CLIENTS), + ); + + // GetPage stream pool. Uses a dedicated client pool to avoid starving out unary clients, + // but shares a channel pool with it (as it's unbounded). + let stream_pool = StreamPool::new( + ClientPool::new( + channel_pool.clone(), + tenant_id, + timeline_id, + shard_id, + auth_token.clone(), + None, // unbounded, limited by stream pool + ), + Some(MAX_STREAMS), + MAX_STREAM_QUEUE_DEPTH, + ); + + // Bulk GetPage stream pool, e.g. for prefetches. Uses dedicated channel/client/stream pools + // to avoid head-of-line blocking of latency-sensitive requests. 
+ let bulk_stream_pool = StreamPool::new( + ClientPool::new( + ChannelPool::new(url, MAX_CLIENTS_PER_CHANNEL)?, + tenant_id, + timeline_id, + shard_id, + auth_token, + None, // unbounded, limited by stream pool + ), + Some(MAX_BULK_STREAMS), + MAX_BULK_STREAM_QUEUE_DEPTH, + ); + + Ok(Self { + client_pool, + stream_pool, + bulk_stream_pool, + }) + } + + /// Returns a pooled client for this shard. + async fn client(&self) -> tonic::Result { + self.client_pool + .get() + .await + .map_err(|err| tonic::Status::internal(format!("failed to get client: {err}"))) + } + + /// Returns a pooled stream for this shard. If `bulk` is `true`, uses the dedicated bulk stream + /// pool (e.g. for prefetches). + async fn stream(&self, bulk: bool) -> StreamGuard { + match bulk { + false => self.stream_pool.get().await, + true => self.bulk_stream_pool.get().await, + } + } +} diff --git a/pageserver/client_grpc/src/lib.rs b/pageserver/client_grpc/src/lib.rs new file mode 100644 index 0000000000..3fc7178be2 --- /dev/null +++ b/pageserver/client_grpc/src/lib.rs @@ -0,0 +1,6 @@ +mod client; +mod pool; +mod retry; +mod split; + +pub use client::PageserverClient; diff --git a/pageserver/client_grpc/src/pool.rs b/pageserver/client_grpc/src/pool.rs new file mode 100644 index 0000000000..5a50004fd1 --- /dev/null +++ b/pageserver/client_grpc/src/pool.rs @@ -0,0 +1,598 @@ +//! This module provides various Pageserver gRPC client resource pools. +//! +//! These pools are designed to reuse gRPC resources (connections, clients, and streams) across +//! multiple concurrent callers (i.e. Postgres backends). This avoids the resource cost and latency +//! of creating dedicated TCP connections and server tasks for every Postgres backend. +//! +//! Each resource has its own, nested pool. The pools are custom-built for the properties of each +//! resource -- they are different enough that a generic pool isn't suitable. +//! +//! * ChannelPool: manages gRPC channels (TCP connections) to a single Pageserver. 
Multiple clients +//! can acquire and use the same channel concurrently (via HTTP/2 stream multiplexing), up to a +//! per-channel client limit. Channels may be closed when they are no longer used by any clients. +//! +//! * ClientPool: manages gRPC clients for a single tenant shard. Each client acquires a (shared) +//! channel from the ChannelPool for the client's lifetime. A client can only be acquired by a +//! single caller at a time, and is returned to the pool when dropped. Idle clients may be removed +//! from the pool after some time, to free up the channel. +//! +//! * StreamPool: manages bidirectional gRPC GetPage streams. Each stream acquires a client from the +//! ClientPool for the stream's lifetime. Internal streams are not exposed to callers; instead, it +//! returns a guard that can be used to send a single request, to properly enforce queue depth and +//! route responses. Internally, the pool will reuse or spin up a suitable stream for the request, +//! possibly pipelining multiple requests from multiple callers on the same stream (up to some +//! queue depth). Idle streams may be removed from the pool after a while to free up the client. +//! +//! Each channel corresponds to one TCP connection. Each client unary request and each stream +//! corresponds to one HTTP/2 stream and server task. +//! +//! TODO: error handling (including custom error types). +//! TODO: observability. + +use std::collections::{BTreeMap, HashMap}; +use std::num::NonZero; +use std::ops::{Deref, DerefMut}; +use std::sync::atomic::{AtomicUsize, Ordering}; +use std::sync::{Arc, Mutex, Weak}; + +use futures::StreamExt as _; +use tokio::sync::mpsc::{Receiver, Sender}; +use tokio::sync::{OwnedSemaphorePermit, Semaphore, mpsc, oneshot}; +use tonic::transport::{Channel, Endpoint}; +use tracing::{error, warn}; + +use pageserver_page_api as page_api; +use utils::id::{TenantId, TimelineId}; +use utils::shard::ShardIndex; + +/// A gRPC channel pool, for a single Pageserver. 
A channel is shared by many clients (via HTTP/2 +/// stream multiplexing), up to `clients_per_channel` -- a new channel will be spun up beyond this. +/// The pool does not limit the number of channels, and instead relies on `ClientPool` or +/// `StreamPool` to limit the number of concurrent clients. +/// +/// The pool is always wrapped in an outer `Arc`, to allow long-lived guards across tasks/threads. +/// +/// TODO: reap idle channels. +/// TODO: consider prewarming a set of channels, to avoid initial connection latency. +/// TODO: consider adding a circuit breaker for errors and fail fast. +pub struct ChannelPool { + /// Pageserver endpoint to connect to. + endpoint: Endpoint, + /// Max number of clients per channel. Beyond this, a new channel will be created. + max_clients_per_channel: NonZero, + /// Open channels. + channels: Mutex>, + /// Channel ID generator. + next_channel_id: AtomicUsize, +} + +type ChannelID = usize; + +struct ChannelEntry { + /// The gRPC channel (i.e. TCP connection). Shared by multiple clients. + channel: Channel, + /// Number of clients using this channel. + clients: usize, +} + +impl ChannelPool { + /// Creates a new channel pool for the given Pageserver endpoint. + pub fn new(endpoint: E, max_clients_per_channel: NonZero) -> anyhow::Result> + where + E: TryInto + Send + Sync + 'static, + >::Error: std::error::Error + Send + Sync, + { + Ok(Arc::new(Self { + endpoint: endpoint.try_into()?, + max_clients_per_channel, + channels: Mutex::default(), + next_channel_id: AtomicUsize::default(), + })) + } + + /// Acquires a gRPC channel for a client. Multiple clients may acquire the same channel. + /// + /// This never blocks (except for mutex acquisition). The channel is connected lazily on first + /// use, and the `ChannelPool` does not have a channel limit. Channels will be re-established + /// automatically on failure (TODO: verify). 
+ /// + /// Callers should not clone the returned channel, and must hold onto the returned guard as long + /// as the channel is in use. It is unfortunately not possible to enforce this: the Protobuf + /// client requires an owned `Channel` and we don't have access to the channel's internal + /// refcount. + /// + /// This is not performance-sensitive. It is only called when creating a new client, and clients + /// are pooled and reused by `ClientPool`. The total number of channels will also be small. O(n) + /// performance is therefore okay. + pub fn get(self: &Arc) -> ChannelGuard { + let mut channels = self.channels.lock().unwrap(); + + // Try to find an existing channel with available capacity. We check entries in BTreeMap + // order, to fill up the lower-ordered channels first. The ClientPool also prefers clients + // with lower-ordered channel IDs first. This will cluster clients in lower-ordered + // channels, and free up higher-ordered channels such that they can be reaped. + for (&id, entry) in channels.iter_mut() { + assert!( + entry.clients <= self.max_clients_per_channel.get(), + "channel overflow" + ); + if entry.clients < self.max_clients_per_channel.get() { + entry.clients += 1; + return ChannelGuard { + pool: Arc::downgrade(self), + id, + channel: Some(entry.channel.clone()), + }; + } + } + + // Create a new channel. We connect lazily on first use, such that we don't block here and + // other clients can join onto the same channel while it's connecting. + let channel = self.endpoint.connect_lazy(); + + let id = self.next_channel_id.fetch_add(1, Ordering::Relaxed); + let entry = ChannelEntry { + channel: channel.clone(), + clients: 1, // account for the guard below + }; + channels.insert(id, entry); + + ChannelGuard { + pool: Arc::downgrade(self), + id, + channel: Some(channel), + } + } +} + +/// Tracks a channel acquired from the pool. The owned inner channel can be obtained with `take()`, +/// since the gRPC client requires an owned `Channel`. 
+pub struct ChannelGuard { + pool: Weak, + id: ChannelID, + channel: Option, +} + +impl ChannelGuard { + /// Returns the inner owned channel. Panics if called more than once. The caller must hold onto + /// the guard as long as the channel is in use, and should not clone it. + pub fn take(&mut self) -> Channel { + self.channel.take().expect("channel already taken") + } +} + +/// Returns the channel to the pool. +impl Drop for ChannelGuard { + fn drop(&mut self) { + let Some(pool) = self.pool.upgrade() else { + return; // pool was dropped + }; + let mut channels = pool.channels.lock().unwrap(); + let entry = channels.get_mut(&self.id).expect("unknown channel"); + assert!(entry.clients > 0, "channel underflow"); + entry.clients -= 1; + } +} + +/// A pool of gRPC clients for a single tenant shard. Each client acquires a channel from the inner +/// `ChannelPool`. A client is only given out to a single caller at a time. The pool limits the total +/// number of concurrent clients to `max_clients` via semaphore. +/// +/// The pool is always wrapped in an outer `Arc`, to allow long-lived guards across tasks/threads. +/// +/// TODO: reap idle clients. +pub struct ClientPool { + /// Tenant ID. + tenant_id: TenantId, + /// Timeline ID. + timeline_id: TimelineId, + /// Shard ID. + shard_id: ShardIndex, + /// Authentication token, if any. + auth_token: Option, + /// Channel pool to acquire channels from. + channel_pool: Arc, + /// Limits the max number of concurrent clients for this pool. None if the pool is unbounded. + limiter: Option>, + /// Idle pooled clients. Acquired clients are removed from here and returned on drop. + /// + /// The first client in the map will be acquired next. The map is sorted by client ID, which in + /// turn is sorted by its channel ID, such that we prefer acquiring idle clients from + /// lower-ordered channels. This allows us to free up and reap higher-numbered channels as idle + /// clients are reaped.
+ idle: Mutex>, + /// Unique client ID generator. + next_client_id: AtomicUsize, +} + +type ClientID = (ChannelID, usize); + +struct ClientEntry { + /// The pooled gRPC client. + client: page_api::Client, + /// The channel guard for the channel used by the client. + channel_guard: ChannelGuard, +} + +impl ClientPool { + /// Creates a new client pool for the given tenant shard. Channels are acquired from the given + /// `ChannelPool`, which must point to a Pageserver that hosts the tenant shard. Allows up to + /// `max_clients` concurrent clients, or unbounded if None. + pub fn new( + channel_pool: Arc, + tenant_id: TenantId, + timeline_id: TimelineId, + shard_id: ShardIndex, + auth_token: Option, + max_clients: Option>, + ) -> Arc { + Arc::new(Self { + tenant_id, + timeline_id, + shard_id, + auth_token, + channel_pool, + idle: Mutex::default(), + limiter: max_clients.map(|max| Arc::new(Semaphore::new(max.get()))), + next_client_id: AtomicUsize::default(), + }) + } + + /// Gets a client from the pool, or creates a new one if necessary. Connections are established + /// lazily and do not block, but this call can block if the pool is at `max_clients`. The client + /// is returned to the pool when the guard is dropped. + /// + /// This is moderately performance-sensitive. It is called for every unary request, but these + /// establish a new gRPC stream per request so they're already expensive. GetPage requests use + /// the `StreamPool` instead. + pub async fn get(self: &Arc) -> anyhow::Result { + // Acquire a permit if the pool is bounded. + let mut permit = None; + if let Some(limiter) = self.limiter.clone() { + permit = Some(limiter.acquire_owned().await.expect("never closed")); + } + + // Fast path: acquire an idle client from the pool. 
+ if let Some((id, entry)) = self.idle.lock().unwrap().pop_first() { + return Ok(ClientGuard { + pool: Arc::downgrade(self), + id, + client: Some(entry.client), + channel_guard: Some(entry.channel_guard), + permit, + }); + } + + // Slow path: construct a new client. + let mut channel_guard = self.channel_pool.get(); + let client = page_api::Client::new( + channel_guard.take(), + self.tenant_id, + self.timeline_id, + self.shard_id, + self.auth_token.clone(), + None, + )?; + + Ok(ClientGuard { + pool: Arc::downgrade(self), + id: ( + channel_guard.id, + self.next_client_id.fetch_add(1, Ordering::Relaxed), + ), + client: Some(client), + channel_guard: Some(channel_guard), + permit, + }) + } +} + +/// A client acquired from the pool. The inner client can be accessed via Deref. The client is +/// returned to the pool when dropped. +pub struct ClientGuard { + pool: Weak, + id: ClientID, + client: Option, // Some until dropped + channel_guard: Option, // Some until dropped + permit: Option, // None if pool is unbounded +} + +impl Deref for ClientGuard { + type Target = page_api::Client; + + fn deref(&self) -> &Self::Target { + self.client.as_ref().expect("not dropped") + } +} + +impl DerefMut for ClientGuard { + fn deref_mut(&mut self) -> &mut Self::Target { + self.client.as_mut().expect("not dropped") + } +} + +/// Returns the client to the pool. +impl Drop for ClientGuard { + fn drop(&mut self) { + let Some(pool) = self.pool.upgrade() else { + return; // pool was dropped + }; + let entry = ClientEntry { + client: self.client.take().expect("dropped once"), + channel_guard: self.channel_guard.take().expect("dropped once"), + }; + pool.idle.lock().unwrap().insert(self.id, entry); + + _ = self.permit; // returned on drop, referenced for visibility + } +} + +/// A pool of bidirectional gRPC streams. Currently only used for GetPage streams. Each stream +/// acquires a client from the inner `ClientPool` for the stream's lifetime. 
+/// +/// Individual streams are not exposed to callers -- instead, the returned guard can be used to send +/// a single request and await the response. Internally, requests are multiplexed across streams and +/// channels. This allows proper queue depth enforcement and response routing. +/// +/// TODO: reap idle streams. +/// TODO: consider making this generic over request and response types; not currently needed. +pub struct StreamPool { + /// The client pool to acquire clients from. Must be unbounded. + client_pool: Arc, + /// All pooled streams. + /// + /// Incoming requests will be sent over an existing stream with available capacity. If all + /// streams are full, a new one is spun up and added to the pool (up to `max_streams`). Each + /// stream has an associated Tokio task that processes requests and responses. + streams: Arc>>, + /// The max number of concurrent streams, or None if unbounded. + max_streams: Option>, + /// The max number of concurrent requests per stream. + max_queue_depth: NonZero, + /// Limits the max number of concurrent requests, given by `max_streams * max_queue_depth`. + /// None if the pool is unbounded. + limiter: Option>, + /// Stream ID generator. + next_stream_id: AtomicUsize, +} + +type StreamID = usize; +type RequestSender = Sender<(page_api::GetPageRequest, ResponseSender)>; +type RequestReceiver = Receiver<(page_api::GetPageRequest, ResponseSender)>; +type ResponseSender = oneshot::Sender>; + +struct StreamEntry { + /// Sends caller requests to the stream task. The stream task exits when this is dropped. + sender: RequestSender, + /// Number of in-flight requests on this stream. This is an atomic to allow decrementing it on + /// completion without acquiring the `StreamPool::streams` lock. + queue_depth: Arc, +} + +impl StreamPool { + /// Creates a new stream pool, using the given client pool. It will send up to `max_queue_depth` + /// concurrent requests on each stream, and use up to `max_streams` concurrent streams. 
+ /// + /// The client pool must be unbounded. The stream pool will enforce its own limits, and because + /// streams are long-lived they can cause persistent starvation if they exhaust the client pool. + /// The stream pool should generally have its own dedicated client pool (but it can share a + /// channel pool with others since these are always unbounded). + pub fn new( + client_pool: Arc, + max_streams: Option>, + max_queue_depth: NonZero, + ) -> Arc { + assert!(client_pool.limiter.is_none(), "bounded client pool"); + Arc::new(Self { + client_pool, + streams: Arc::default(), + limiter: max_streams.map(|max_streams| { + Arc::new(Semaphore::new(max_streams.get() * max_queue_depth.get())) + }), + max_streams, + max_queue_depth, + next_stream_id: AtomicUsize::default(), + }) + } + + /// Acquires an available stream from the pool, or spins up a new stream async if all streams + /// are full. Returns a guard that can be used to send a single request on the stream and await + /// the response, with queue depth quota already acquired. Blocks if the pool is at capacity + /// (i.e. `max_streams * max_queue_depth` requests in flight). + /// + /// This is very performance-sensitive, as it is on the GetPage hot path. + /// + /// TODO: this must do something more sophisticated for performance. We want: + /// + /// * Cheap, concurrent access in the common case where we can use a pooled stream. + /// * Quick acquisition of pooled streams with available capacity. + /// * Prefer streams that belong to lower-numbered channels, to reap idle channels. + /// * Prefer filling up existing streams' queue depth before spinning up new streams. + /// * Don't hold a lock while spinning up new streams. + /// * Allow concurrent clients to join onto streams while they're spun up. + /// * Allow spinning up multiple streams concurrently, but don't overshoot limits. + /// + /// For now, we just do something simple and functional, but very inefficient (linear scan).
+ pub async fn get(&self) -> StreamGuard { + // Acquire a permit if the pool is bounded. + let mut permit = None; + if let Some(limiter) = self.limiter.clone() { + permit = Some(limiter.acquire_owned().await.expect("never closed")); + } + let mut streams = self.streams.lock().unwrap(); + + // Look for a pooled stream with available capacity. + for entry in streams.values() { + assert!( + entry.queue_depth.load(Ordering::Relaxed) <= self.max_queue_depth.get(), + "stream queue overflow" + ); + if entry + .queue_depth + .fetch_update(Ordering::SeqCst, Ordering::SeqCst, |queue_depth| { + // Increment the queue depth via compare-and-swap. + // TODO: review ordering. + (queue_depth < self.max_queue_depth.get()).then_some(queue_depth + 1) + }) + .is_ok() + { + return StreamGuard { + sender: entry.sender.clone(), + queue_depth: entry.queue_depth.clone(), + permit, + }; + } + } + + // No available stream, spin up a new one. We install the stream entry in the pool first and + // return the guard, while spinning up the stream task async. This allows other callers to + // join onto this stream and also create additional streams concurrently if this fills up. + let id = self.next_stream_id.fetch_add(1, Ordering::Relaxed); + let queue_depth = Arc::new(AtomicUsize::new(1)); // reserve quota for this caller + let (req_tx, req_rx) = mpsc::channel(self.max_queue_depth.get()); + let entry = StreamEntry { + sender: req_tx.clone(), + queue_depth: queue_depth.clone(), + }; + streams.insert(id, entry); + + if let Some(max_streams) = self.max_streams { + assert!(streams.len() <= max_streams.get(), "stream overflow"); + }; + + let client_pool = self.client_pool.clone(); + let streams = self.streams.clone(); + + tokio::spawn(async move { + if let Err(err) = Self::run_stream(client_pool, req_rx).await { + error!("stream failed: {err}"); + } + // Remove stream from pool on exit. 
+ let entry = streams.lock().unwrap().remove(&id); + assert!(entry.is_some(), "unknown stream ID: {id}"); + }); + + StreamGuard { + sender: req_tx, + queue_depth, + permit, + } + } + + /// Runs a stream task. This acquires a client from the `ClientPool` and establishes a + /// bidirectional GetPage stream, then forwards requests and responses between callers and the + /// stream. It does not track or enforce queue depths -- that's done by `get()` since it must be + /// atomic with pool stream acquisition. + /// + /// The task exits when the request channel is closed, or on a stream error. The caller is + /// responsible for removing the stream from the pool on exit. + async fn run_stream( + client_pool: Arc, + mut caller_rx: RequestReceiver, + ) -> anyhow::Result<()> { + // Acquire a client from the pool and create a stream. + let mut client = client_pool.get().await?; + + // NB: use an unbounded channel such that the stream send never blocks. Otherwise, we could + // theoretically deadlock if both the client and server block on sends (since we're not + // reading responses while sending). This is unlikely to happen due to gRPC/TCP buffers and + // low queue depths, but it was seen to happen with the libpq protocol so better safe than + // sorry. It should never buffer more than the queue depth anyway, but using an unbounded + // channel guarantees that it will never block. + let (req_tx, req_rx) = mpsc::unbounded_channel(); + let req_stream = tokio_stream::wrappers::UnboundedReceiverStream::new(req_rx); + let mut resp_stream = client.get_pages(req_stream).await?; + + // Track caller response channels by request ID. If the task returns early, these response + // channels will be dropped and the waiting callers will receive an error. + let mut callers = HashMap::new(); + + // Process requests and responses. + loop { + tokio::select! { + // Receive requests from callers and send them to the stream. 
+ req = caller_rx.recv() => { + // Shut down if request channel is closed. + let Some((req, resp_tx)) = req else { + return Ok(()); + }; + + // Store the response channel by request ID. + if callers.contains_key(&req.request_id) { + // Error on request ID duplicates. Ignore callers that went away. + _ = resp_tx.send(Err(tonic::Status::invalid_argument( + format!("duplicate request ID: {}", req.request_id), + ))); + continue; + } + callers.insert(req.request_id, resp_tx); + + // Send the request on the stream. Bail out if the stream is closed. + req_tx.send(req).map_err(|_| { + tonic::Status::unavailable("stream closed") + })?; + } + + // Receive responses from the stream and send them to callers. + resp = resp_stream.next() => { + // Shut down if the stream is closed, and bail out on stream errors. + let Some(resp) = resp.transpose()? else { + return Ok(()) + }; + + // Send the response to the caller. Ignore errors if the caller went away. + let Some(resp_tx) = callers.remove(&resp.request_id) else { + warn!("received response for unknown request ID: {}", resp.request_id); + continue; + }; + _ = resp_tx.send(Ok(resp)); + } + } + } + } +} + +/// A pooled stream reference. Can be used to send a single request, to properly enforce queue +/// depth. Queue depth is already reserved and will be returned on drop. +pub struct StreamGuard { + sender: RequestSender, + queue_depth: Arc, + permit: Option, // None if pool is unbounded +} + +impl StreamGuard { + /// Sends a request on the stream and awaits the response. Consumes the guard, since it's only + /// valid for a single request (to enforce queue depth). This also drops the guard on return and + /// returns the queue depth quota to the pool. + /// + /// The `GetPageRequest::request_id` must be unique across in-flight requests. + /// + /// NB: errors are often returned as `GetPageResponse::status_code` instead of `tonic::Status` + /// to avoid tearing down the stream for per-request errors. Callers must check this. 
+ pub async fn send( + self, + req: page_api::GetPageRequest, + ) -> tonic::Result { + let (resp_tx, resp_rx) = oneshot::channel(); + + self.sender + .send((req, resp_tx)) + .await + .map_err(|_| tonic::Status::unavailable("stream closed"))?; + + resp_rx + .await + .map_err(|_| tonic::Status::unavailable("stream closed"))? + } +} + +impl Drop for StreamGuard { + fn drop(&mut self) { + // Release the queue depth reservation on drop. This can prematurely decrement it if dropped + // before the response is received, but that's okay. + let prev_queue_depth = self.queue_depth.fetch_sub(1, Ordering::SeqCst); + assert!(prev_queue_depth > 0, "stream queue underflow"); + + _ = self.permit; // returned on drop, referenced for visibility + } +} diff --git a/pageserver/client_grpc/src/retry.rs b/pageserver/client_grpc/src/retry.rs new file mode 100644 index 0000000000..b0473204d7 --- /dev/null +++ b/pageserver/client_grpc/src/retry.rs @@ -0,0 +1,151 @@ +use std::time::Duration; + +use tokio::time::Instant; +use tracing::{error, info, warn}; + +use utils::backoff::exponential_backoff_duration; + +/// A retry handler for Pageserver gRPC requests. +/// +/// This is used instead of backoff::retry for better control and observability. +pub struct Retry; + +impl Retry { + /// The per-request timeout. + // TODO: tune these, and/or make them configurable. Should we retry forever? + const REQUEST_TIMEOUT: Duration = Duration::from_secs(10); + /// The total timeout across all attempts + const TOTAL_TIMEOUT: Duration = Duration::from_secs(60); + /// The initial backoff duration. + const BASE_BACKOFF: Duration = Duration::from_millis(10); + /// The maximum backoff duration. + const MAX_BACKOFF: Duration = Duration::from_secs(10); + /// If true, log successful requests. For debugging. + const LOG_SUCCESS: bool = false; + + /// Runs the given async closure with timeouts and retries (exponential backoff). Logs errors, + /// using the current tracing span for context. 
+ /// + /// Only certain gRPC status codes are retried, see [`Self::should_retry`]. For default + /// timeouts, see [`Self::REQUEST_TIMEOUT`] and [`Self::TOTAL_TIMEOUT`]. + pub async fn with(&self, mut f: F) -> tonic::Result + where + F: FnMut() -> O, + O: Future>, + { + let started = Instant::now(); + let deadline = started + Self::TOTAL_TIMEOUT; + let mut last_error = None; + let mut retries = 0; + loop { + // Set up a future to wait for the backoff (if any) and run the request with a timeout. + let backoff_and_try = async { + // NB: sleep() always sleeps 1ms, even when given a 0 argument. See: + // https://github.com/tokio-rs/tokio/issues/6866 + if let Some(backoff) = Self::backoff_duration(retries) { + tokio::time::sleep(backoff).await; + } + + let request_started = Instant::now(); + tokio::time::timeout(Self::REQUEST_TIMEOUT, f()) + .await + .map_err(|_| { + tonic::Status::deadline_exceeded(format!( + "request timed out after {:.3}s", + request_started.elapsed().as_secs_f64() + )) + })? + }; + + // Wait for the backoff and request, or bail out if the total timeout is exceeded. + let result = tokio::select! { + result = backoff_and_try => result, + + _ = tokio::time::sleep_until(deadline) => { + let last_error = last_error.unwrap_or_else(|| { + tonic::Status::deadline_exceeded(format!( + "request timed out after {:.3}s", + started.elapsed().as_secs_f64() + )) + }); + error!( + "giving up after {:.3}s and {retries} retries, last error {:?}: {}", + started.elapsed().as_secs_f64(), last_error.code(), last_error.message(), + ); + return Err(last_error); + } + }; + + match result { + // Success, return the result. + Ok(result) => { + if retries > 0 || Self::LOG_SUCCESS { + info!( + "request succeeded after {retries} retries in {:.3}s", + started.elapsed().as_secs_f64(), + ); + } + + return Ok(result); + } + + // Error, retry or bail out. 
+ Err(status) => { + let (code, message) = (status.code(), status.message()); + let attempt = retries + 1; + + if !Self::should_retry(code) { + // NB: include the attempt here too. This isn't necessarily the first + // attempt, because the error may change between attempts. + error!( + "request failed with {code:?}: {message}, not retrying (attempt {attempt})" + ); + return Err(status); + } + + warn!("request failed with {code:?}: {message}, retrying (attempt {attempt})"); + + retries += 1; + last_error = Some(status); + } + } + } + } + + /// Returns the backoff duration for the given retry attempt, or None for no backoff. + fn backoff_duration(retry: usize) -> Option { + let backoff = exponential_backoff_duration( + retry as u32, + Self::BASE_BACKOFF.as_secs_f64(), + Self::MAX_BACKOFF.as_secs_f64(), + ); + (!backoff.is_zero()).then_some(backoff) + } + + /// Returns true if the given status code should be retried. + fn should_retry(code: tonic::Code) -> bool { + match code { + tonic::Code::Ok => panic!("unexpected Ok status code"), + + // These codes are transient, so retry them. + tonic::Code::Aborted => true, + tonic::Code::Cancelled => true, + tonic::Code::DeadlineExceeded => true, // maybe transient slowness + tonic::Code::Internal => true, // maybe transient failure? + tonic::Code::ResourceExhausted => true, + tonic::Code::Unavailable => true, + + // The following codes will likely continue to fail, so don't retry.
+ tonic::Code::AlreadyExists => false, + tonic::Code::DataLoss => false, + tonic::Code::FailedPrecondition => false, + tonic::Code::InvalidArgument => false, + tonic::Code::NotFound => false, + tonic::Code::OutOfRange => false, + tonic::Code::PermissionDenied => false, + tonic::Code::Unauthenticated => false, + tonic::Code::Unimplemented => false, + tonic::Code::Unknown => false, + } + } +} diff --git a/pageserver/client_grpc/src/split.rs b/pageserver/client_grpc/src/split.rs new file mode 100644 index 0000000000..5bbcaab393 --- /dev/null +++ b/pageserver/client_grpc/src/split.rs @@ -0,0 +1,172 @@ +use std::collections::HashMap; + +use bytes::Bytes; + +use pageserver_api::key::rel_block_to_key; +use pageserver_api::shard::{ShardStripeSize, key_to_shard_number}; +use pageserver_page_api as page_api; +use utils::shard::{ShardCount, ShardIndex}; + +/// Splits GetPageRequests that straddle shard boundaries and assembles the responses. +/// TODO: add tests for this. +pub struct GetPageSplitter { + /// The original request ID. Used for all shard requests. + request_id: page_api::RequestID, + /// Split requests by shard index. + requests: HashMap, + /// Maps the offset in `GetPageRequest::block_numbers` to the owning shard. Used to assemble + /// the response pages in the same order as the original request. + block_shards: Vec, + /// Page responses by shard index. Will be assembled into a single response. + responses: HashMap>, +} + +impl GetPageSplitter { + /// Checks if the given request only touches a single shard, and returns the shard ID. This is + /// the common case, so we check first in order to avoid unnecessary allocations and overhead. + /// The caller must ensure that the request has at least one block number, or this will panic. + pub fn is_single_shard( + req: &page_api::GetPageRequest, + count: ShardCount, + stripe_size: ShardStripeSize, + ) -> Option { + // Fast path: unsharded tenant. 
+ if count.is_unsharded() { + return Some(ShardIndex::unsharded()); + } + + // Find the base shard index for the first page, and compare with the rest. + let key = rel_block_to_key(req.rel, *req.block_numbers.first().expect("no pages")); + let shard_number = key_to_shard_number(count, stripe_size, &key); + + req.block_numbers + .iter() + .skip(1) // computed above + .all(|&blkno| { + let key = rel_block_to_key(req.rel, blkno); + key_to_shard_number(count, stripe_size, &key) == shard_number + }) + .then_some(ShardIndex::new(shard_number, count)) + } + + /// Splits the given request. + pub fn split( + req: page_api::GetPageRequest, + count: ShardCount, + stripe_size: ShardStripeSize, + ) -> Self { + // The caller should make sure we don't split requests unnecessarily. + debug_assert!( + Self::is_single_shard(&req, count, stripe_size).is_none(), + "unnecessary request split" + ); + + // Split the requests by shard index. + let mut requests = HashMap::with_capacity(2); // common case + let mut block_shards = Vec::with_capacity(req.block_numbers.len()); + for blkno in req.block_numbers { + let key = rel_block_to_key(req.rel, blkno); + let shard_number = key_to_shard_number(count, stripe_size, &key); + let shard_id = ShardIndex::new(shard_number, count); + + let shard_req = requests + .entry(shard_id) + .or_insert_with(|| page_api::GetPageRequest { + request_id: req.request_id, + request_class: req.request_class, + rel: req.rel, + read_lsn: req.read_lsn, + block_numbers: Vec::new(), + }); + shard_req.block_numbers.push(blkno); + block_shards.push(shard_id); + } + + Self { + request_id: req.request_id, + responses: HashMap::with_capacity(requests.len()), + requests, + block_shards, + } + } + + /// Drains the per-shard requests, moving them out of the hashmap to avoid extra allocations. + pub fn drain_requests( + &mut self, + ) -> impl Iterator { + self.requests.drain() + } + + /// Adds a response from the given shard. 
+ #[allow(clippy::result_large_err)] + pub fn add_response( + &mut self, + shard_id: ShardIndex, + response: page_api::GetPageResponse, + ) -> tonic::Result<()> { + // The caller should already have converted status codes into tonic::Status. + assert_eq!(response.status_code, page_api::GetPageStatusCode::Ok); + + // Make sure the response matches the request ID. + if response.request_id != self.request_id { + return Err(tonic::Status::internal(format!( + "response ID {} does not match request ID {}", + response.request_id, self.request_id + ))); + } + + // Add the response data to the map. + let old = self.responses.insert(shard_id, response.page_images); + + if old.is_some() { + return Err(tonic::Status::internal(format!( + "duplicate response for shard {shard_id}", + ))); + } + + Ok(()) + } + + /// Assembles the shard responses into a single response. Responses must be present for all + /// relevant shards, and the total number of pages must match the original request. + #[allow(clippy::result_large_err)] + pub fn assemble_response(self) -> tonic::Result { + let mut response = page_api::GetPageResponse { + request_id: self.request_id, + status_code: page_api::GetPageStatusCode::Ok, + reason: None, + page_images: Vec::with_capacity(self.block_shards.len()), + }; + + // Set up per-shard page iterators we can pull from. + let mut shard_responses = HashMap::with_capacity(self.responses.len()); + for (shard_id, responses) in self.responses { + shard_responses.insert(shard_id, responses.into_iter()); + } + + // Reassemble the responses in the same order as the original request. + for shard_id in &self.block_shards { + let page = shard_responses + .get_mut(shard_id) + .ok_or_else(|| { + tonic::Status::internal(format!("missing response for shard {shard_id}")) + })? + .next() + .ok_or_else(|| { + tonic::Status::internal(format!("missing page from shard {shard_id}")) + })?; + response.page_images.push(page); + } + + // Make sure there are no additional pages. 
+ for (shard_id, mut pages) in shard_responses { + if pages.next().is_some() { + return Err(tonic::Status::internal(format!( + "extra pages returned from shard {shard_id}" + ))); + } + } + + Ok(response) + } +} diff --git a/pageserver/ctl/src/index_part.rs b/pageserver/ctl/src/index_part.rs index 6cce2844c7..838d00e490 100644 --- a/pageserver/ctl/src/index_part.rs +++ b/pageserver/ctl/src/index_part.rs @@ -1,10 +1,101 @@ +use std::str::FromStr; + use anyhow::Context; use camino::Utf8PathBuf; -use pageserver::tenant::IndexPart; +use pageserver::tenant::{ + IndexPart, + layer_map::{LayerMap, SearchResult}, + remote_timeline_client::remote_layer_path, + storage_layer::{PersistentLayerDesc, ReadableLayerWeak}, +}; +use pageserver_api::key::Key; +use utils::{ + id::{TenantId, TimelineId}, + lsn::Lsn, + shard::TenantShardId, +}; #[derive(clap::Subcommand)] pub(crate) enum IndexPartCmd { - Dump { path: Utf8PathBuf }, + Dump { + path: Utf8PathBuf, + }, + /// Find all layers that need to be searched to construct the given page at the given LSN. 
+ Search { + #[arg(long)] + tenant_id: String, + #[arg(long)] + timeline_id: String, + #[arg(long)] + path: Utf8PathBuf, + #[arg(long)] + key: String, + #[arg(long)] + lsn: String, + }, +} + +async fn search_layers( + tenant_id: &str, + timeline_id: &str, + path: &Utf8PathBuf, + key: &str, + lsn: &str, +) -> anyhow::Result<()> { + let tenant_id = TenantId::from_str(tenant_id).unwrap(); + let tenant_shard_id = TenantShardId::unsharded(tenant_id); + let timeline_id = TimelineId::from_str(timeline_id).unwrap(); + let index_json = { + let bytes = tokio::fs::read(path).await?; + IndexPart::from_json_bytes(&bytes).unwrap() + }; + let mut layer_map = LayerMap::default(); + { + let mut updates = layer_map.batch_update(); + for (key, value) in index_json.layer_metadata.iter() { + updates.insert_historic(PersistentLayerDesc::from_filename( + tenant_shard_id, + timeline_id, + key.clone(), + value.file_size, + )); + } + } + let key = Key::from_hex(key)?; + + let lsn = Lsn::from_str(lsn).unwrap(); + let mut end_lsn = lsn; + loop { + let result = layer_map.search(key, end_lsn); + match result { + Some(SearchResult { layer, lsn_floor }) => { + let disk_layer = match layer { + ReadableLayerWeak::PersistentLayer(layer) => layer, + ReadableLayerWeak::InMemoryLayer(_) => { + anyhow::bail!("unexpected in-memory layer") + } + }; + + let metadata = index_json + .layer_metadata + .get(&disk_layer.layer_name()) + .unwrap(); + println!( + "{}", + remote_layer_path( + &tenant_id, + &timeline_id, + metadata.shard, + &disk_layer.layer_name(), + metadata.generation + ) + ); + end_lsn = lsn_floor; + } + None => break, + } + } + Ok(()) } pub(crate) async fn main(cmd: &IndexPartCmd) -> anyhow::Result<()> { @@ -16,5 +107,12 @@ pub(crate) async fn main(cmd: &IndexPartCmd) -> anyhow::Result<()> { println!("{output}"); Ok(()) } + IndexPartCmd::Search { + tenant_id, + timeline_id, + path, + key, + lsn, + } => search_layers(tenant_id, timeline_id, path, key, lsn).await, } } diff --git 
a/pageserver/page_api/src/client.rs b/pageserver/page_api/src/client.rs index 65e41540b8..6523d00d3d 100644 --- a/pageserver/page_api/src/client.rs +++ b/pageserver/page_api/src/client.rs @@ -1,23 +1,151 @@ -use anyhow::Result; +use anyhow::Context as _; use futures::{Stream, StreamExt as _, TryStreamExt as _}; use tokio::io::AsyncRead; use tokio_util::io::StreamReader; +use tonic::codec::CompressionEncoding; use tonic::metadata::AsciiMetadataValue; -use tonic::metadata::errors::InvalidMetadataValue; -use tonic::transport::Channel; -use tonic::{Request, Streaming}; +use tonic::service::Interceptor; +use tonic::service::interceptor::InterceptedService; +use tonic::transport::{Channel, Endpoint}; -use utils::id::TenantId; -use utils::id::TimelineId; +use utils::id::{TenantId, TimelineId}; use utils::shard::ShardIndex; -use crate::model; +use crate::model::*; use crate::proto; -/// -/// AuthInterceptor adds tenant, timeline, and auth header to the channel. These -/// headers are required at the pageserver. -/// +/// A basic Pageserver gRPC client, for a single tenant shard. This API uses native Rust domain +/// types from `model` rather than generated Protobuf types. +pub struct Client { + inner: proto::PageServiceClient>, +} + +impl Client { + /// Connects to the given gRPC endpoint. + pub async fn connect( + endpoint: E, + tenant_id: TenantId, + timeline_id: TimelineId, + shard_id: ShardIndex, + auth_token: Option, + compression: Option, + ) -> anyhow::Result + where + E: TryInto + Send + Sync + 'static, + >::Error: std::error::Error + Send + Sync, + { + let endpoint: Endpoint = endpoint.try_into().context("invalid endpoint")?; + let channel = endpoint.connect().await?; + Self::new( + channel, + tenant_id, + timeline_id, + shard_id, + auth_token, + compression, + ) + } + + /// Creates a new client using the given gRPC channel. 
+ pub fn new( + channel: Channel, + tenant_id: TenantId, + timeline_id: TimelineId, + shard_id: ShardIndex, + auth_token: Option, + compression: Option, + ) -> anyhow::Result { + let auth = AuthInterceptor::new(tenant_id, timeline_id, shard_id, auth_token)?; + let mut inner = proto::PageServiceClient::with_interceptor(channel, auth); + + if let Some(compression) = compression { + // TODO: benchmark this (including network latency). + inner = inner + .accept_compressed(compression) + .send_compressed(compression); + } + + Ok(Self { inner }) + } + + /// Returns whether a relation exists. + pub async fn check_rel_exists( + &mut self, + req: CheckRelExistsRequest, + ) -> tonic::Result { + let req = proto::CheckRelExistsRequest::from(req); + let resp = self.inner.check_rel_exists(req).await?.into_inner(); + Ok(resp.into()) + } + + /// Fetches a base backup. + pub async fn get_base_backup( + &mut self, + req: GetBaseBackupRequest, + ) -> tonic::Result> { + let req = proto::GetBaseBackupRequest::from(req); + let chunks = self.inner.get_base_backup(req).await?.into_inner(); + Ok(StreamReader::new( + chunks + .map_ok(|resp| resp.chunk) + .map_err(std::io::Error::other), + )) + } + + /// Returns the total size of a database, as # of bytes. + pub async fn get_db_size(&mut self, req: GetDbSizeRequest) -> tonic::Result { + let req = proto::GetDbSizeRequest::from(req); + let resp = self.inner.get_db_size(req).await?.into_inner(); + Ok(resp.into()) + } + + /// Fetches pages. + /// + /// This is implemented as a bidirectional streaming RPC for performance. Per-request errors are + /// typically returned as status_code instead of errors, to avoid tearing down the entire stream + /// via a tonic::Status error. 
+ pub async fn get_pages( + &mut self, + reqs: impl Stream + Send + 'static, + ) -> tonic::Result> + Send + 'static> { + let reqs = reqs.map(proto::GetPageRequest::from); + let resps = self.inner.get_pages(reqs).await?.into_inner(); + Ok(resps.map_ok(GetPageResponse::from)) + } + + /// Returns the size of a relation, as # of blocks. + pub async fn get_rel_size( + &mut self, + req: GetRelSizeRequest, + ) -> tonic::Result { + let req = proto::GetRelSizeRequest::from(req); + let resp = self.inner.get_rel_size(req).await?.into_inner(); + Ok(resp.into()) + } + + /// Fetches an SLRU segment. + pub async fn get_slru_segment( + &mut self, + req: GetSlruSegmentRequest, + ) -> tonic::Result { + let req = proto::GetSlruSegmentRequest::from(req); + let resp = self.inner.get_slru_segment(req).await?.into_inner(); + Ok(resp.try_into()?) + } + + /// Acquires or extends a lease on the given LSN. This guarantees that the Pageserver won't + /// garbage collect the LSN until the lease expires. Must be acquired on all relevant shards. + /// + /// Returns the lease expiration time, or a FailedPrecondition status if the lease could not be + /// acquired because the LSN has already been garbage collected. + pub async fn lease_lsn(&mut self, req: LeaseLsnRequest) -> tonic::Result { + let req = proto::LeaseLsnRequest::from(req); + let resp = self.inner.lease_lsn(req).await?.into_inner(); + Ok(resp.try_into()?) + } +} + +/// Adds authentication metadata to gRPC requests. 
#[derive(Clone)] struct AuthInterceptor { tenant_id: AsciiMetadataValue, @@ -30,174 +158,29 @@ impl AuthInterceptor { fn new( tenant_id: TenantId, timeline_id: TimelineId, - auth_token: Option, shard_id: ShardIndex, - ) -> Result { - let tenant_ascii: AsciiMetadataValue = tenant_id.to_string().try_into()?; - let timeline_ascii: AsciiMetadataValue = timeline_id.to_string().try_into()?; - let shard_ascii: AsciiMetadataValue = shard_id.to_string().try_into()?; - - let auth_header: Option = match auth_token { - Some(token) => Some(format!("Bearer {token}").try_into()?), - None => None, - }; - + auth_token: Option, + ) -> anyhow::Result { Ok(Self { - tenant_id: tenant_ascii, - shard_id: shard_ascii, - timeline_id: timeline_ascii, - auth_header, + tenant_id: tenant_id.to_string().try_into()?, + timeline_id: timeline_id.to_string().try_into()?, + shard_id: shard_id.to_string().try_into()?, + auth_header: auth_token + .map(|token| format!("Bearer {token}").try_into()) + .transpose()?, }) } } -impl tonic::service::Interceptor for AuthInterceptor { - fn call(&mut self, mut req: tonic::Request<()>) -> Result, tonic::Status> { - req.metadata_mut() - .insert("neon-tenant-id", self.tenant_id.clone()); - req.metadata_mut() - .insert("neon-shard-id", self.shard_id.clone()); - req.metadata_mut() - .insert("neon-timeline-id", self.timeline_id.clone()); - if let Some(auth_header) = &self.auth_header { - req.metadata_mut() - .insert("authorization", auth_header.clone()); +impl Interceptor for AuthInterceptor { + fn call(&mut self, mut req: tonic::Request<()>) -> tonic::Result> { + let metadata = req.metadata_mut(); + metadata.insert("neon-tenant-id", self.tenant_id.clone()); + metadata.insert("neon-timeline-id", self.timeline_id.clone()); + metadata.insert("neon-shard-id", self.shard_id.clone()); + if let Some(ref auth_header) = self.auth_header { + metadata.insert("authorization", auth_header.clone()); } Ok(req) } } - -#[derive(Clone)] -pub struct Client { - client: 
proto::PageServiceClient< - tonic::service::interceptor::InterceptedService, - >, -} - -impl Client { - pub async fn new + Send + Sync + 'static>( - into_endpoint: T, - tenant_id: TenantId, - timeline_id: TimelineId, - shard_id: ShardIndex, - auth_header: Option, - compression: Option, - ) -> anyhow::Result { - let endpoint: tonic::transport::Endpoint = into_endpoint - .try_into() - .map_err(|_e| anyhow::anyhow!("failed to convert endpoint"))?; - let channel = endpoint.connect().await?; - let auth = AuthInterceptor::new(tenant_id, timeline_id, auth_header, shard_id) - .map_err(|e| anyhow::anyhow!(e.to_string()))?; - let mut client = proto::PageServiceClient::with_interceptor(channel, auth); - - if let Some(compression) = compression { - // TODO: benchmark this (including network latency). - client = client - .accept_compressed(compression) - .send_compressed(compression); - } - - Ok(Self { client }) - } - - /// Returns whether a relation exists. - pub async fn check_rel_exists( - &mut self, - req: model::CheckRelExistsRequest, - ) -> Result { - let proto_req = proto::CheckRelExistsRequest::from(req); - - let response = self.client.check_rel_exists(proto_req).await?; - - let proto_resp = response.into_inner(); - Ok(proto_resp.into()) - } - - /// Fetches a base backup. - pub async fn get_base_backup( - &mut self, - req: model::GetBaseBackupRequest, - ) -> Result, tonic::Status> { - let req = proto::GetBaseBackupRequest::from(req); - let chunks = self.client.get_base_backup(req).await?.into_inner(); - let reader = StreamReader::new( - chunks - .map_ok(|resp| resp.chunk) - .map_err(std::io::Error::other), - ); - Ok(reader) - } - - /// Returns the total size of a database, as # of bytes. - pub async fn get_db_size( - &mut self, - req: model::GetDbSizeRequest, - ) -> Result { - let proto_req = proto::GetDbSizeRequest::from(req); - - let response = self.client.get_db_size(proto_req).await?; - Ok(response.into_inner().into()) - } - - /// Fetches pages. 
- /// - /// This is implemented as a bidirectional streaming RPC for performance. - /// Per-request errors are often returned as status_code instead of errors, - /// to avoid tearing down the entire stream via tonic::Status. - pub async fn get_pages( - &mut self, - inbound: ReqSt, - ) -> Result< - impl Stream> + Send + 'static, - tonic::Status, - > - where - ReqSt: Stream + Send + 'static, - { - let outbound_proto = inbound.map(|domain_req| domain_req.into()); - - let req_new = Request::new(outbound_proto); - - let response_stream: Streaming = - self.client.get_pages(req_new).await?.into_inner(); - - let domain_stream = response_stream.map_ok(model::GetPageResponse::from); - - Ok(domain_stream) - } - - /// Returns the size of a relation, as # of blocks. - pub async fn get_rel_size( - &mut self, - req: model::GetRelSizeRequest, - ) -> Result { - let proto_req = proto::GetRelSizeRequest::from(req); - let response = self.client.get_rel_size(proto_req).await?; - let proto_resp = response.into_inner(); - Ok(proto_resp.into()) - } - - /// Fetches an SLRU segment. - pub async fn get_slru_segment( - &mut self, - req: model::GetSlruSegmentRequest, - ) -> Result { - let proto_req = proto::GetSlruSegmentRequest::from(req); - let response = self.client.get_slru_segment(proto_req).await?; - Ok(response.into_inner().try_into()?) - } - - /// Acquires or extends a lease on the given LSN. This guarantees that the Pageserver won't - /// garbage collect the LSN until the lease expires. Must be acquired on all relevant shards. - /// - /// Returns the lease expiration time, or a FailedPrecondition status if the lease could not be - /// acquired because the LSN has already been garbage collected. - pub async fn lease_lsn( - &mut self, - req: model::LeaseLsnRequest, - ) -> Result { - let req = proto::LeaseLsnRequest::from(req); - Ok(self.client.lease_lsn(req).await?.into_inner().try_into()?) 
- } -} diff --git a/pageserver/page_api/src/model.rs b/pageserver/page_api/src/model.rs index 4497fc6fc7..d0d3517d41 100644 --- a/pageserver/page_api/src/model.rs +++ b/pageserver/page_api/src/model.rs @@ -384,7 +384,7 @@ impl From for proto::GetPageRequest { pub type RequestID = u64; /// A GetPage request class. -#[derive(Clone, Copy, Debug)] +#[derive(Clone, Copy, Debug, strum_macros::Display)] pub enum GetPageClass { /// Unknown class. For backwards compatibility: used when an older client version sends a class /// that a newer server version has removed. @@ -397,6 +397,19 @@ pub enum GetPageClass { Background, } +impl GetPageClass { + /// Returns true if this is considered a bulk request (i.e. more throughput-oriented rather than + /// latency-sensitive). + pub fn is_bulk(&self) -> bool { + match self { + Self::Unknown => false, + Self::Normal => false, + Self::Prefetch => true, + Self::Background => true, + } + } +} + impl From for GetPageClass { fn from(pb: proto::GetPageClass) -> Self { match pb { @@ -602,6 +615,21 @@ impl TryFrom for GetPageStatusCode { } } +impl From for tonic::Code { + fn from(status_code: GetPageStatusCode) -> Self { + use tonic::Code; + + match status_code { + GetPageStatusCode::Unknown => Code::Unknown, + GetPageStatusCode::Ok => Code::Ok, + GetPageStatusCode::NotFound => Code::NotFound, + GetPageStatusCode::InvalidRequest => Code::InvalidArgument, + GetPageStatusCode::InternalError => Code::Internal, + GetPageStatusCode::SlowDown => Code::ResourceExhausted, + } + } +} + // Fetches the size of a relation at a given LSN, as # of blocks. Only valid on shard 0, other // shards will error. 
#[derive(Clone, Copy, Debug)] diff --git a/pageserver/pagebench/src/cmd/basebackup.rs b/pageserver/pagebench/src/cmd/basebackup.rs index 4b7a70504a..c14bb73136 100644 --- a/pageserver/pagebench/src/cmd/basebackup.rs +++ b/pageserver/pagebench/src/cmd/basebackup.rs @@ -326,7 +326,7 @@ impl GrpcClient { ttid: TenantTimelineId, compression: bool, ) -> anyhow::Result { - let inner = page_api::Client::new( + let inner = page_api::Client::connect( connstring.to_string(), ttid.tenant_id, ttid.timeline_id, diff --git a/pageserver/pagebench/src/cmd/getpage_latest_lsn.rs b/pageserver/pagebench/src/cmd/getpage_latest_lsn.rs index a297819e9b..f14caf548c 100644 --- a/pageserver/pagebench/src/cmd/getpage_latest_lsn.rs +++ b/pageserver/pagebench/src/cmd/getpage_latest_lsn.rs @@ -625,7 +625,7 @@ impl GrpcClient { ttid: TenantTimelineId, compression: bool, ) -> anyhow::Result { - let mut client = page_api::Client::new( + let mut client = page_api::Client::connect( connstring.to_string(), ttid.tenant_id, ttid.timeline_id, diff --git a/pageserver/src/config.rs b/pageserver/src/config.rs index 6e22f9f36e..99d7e0ca3a 100644 --- a/pageserver/src/config.rs +++ b/pageserver/src/config.rs @@ -28,7 +28,6 @@ use reqwest::Url; use storage_broker::Uri; use utils::id::{NodeId, TimelineId}; use utils::logging::{LogFormat, SecretString}; -use utils::serde_percent::Percent; use crate::tenant::storage_layer::inmemory_layer::IndexEntry; use crate::tenant::{TENANTS_SEGMENT_NAME, TIMELINES_SEGMENT_NAME}; @@ -146,7 +145,7 @@ pub struct PageServerConf { pub metric_collection_bucket: Option, pub synthetic_size_calculation_interval: Duration, - pub disk_usage_based_eviction: Option, + pub disk_usage_based_eviction: DiskUsageEvictionTaskConfig, pub test_remote_failures: u64, @@ -460,16 +459,7 @@ impl PageServerConf { metric_collection_endpoint, metric_collection_bucket, synthetic_size_calculation_interval, - disk_usage_based_eviction: Some(disk_usage_based_eviction.unwrap_or( - DiskUsageEvictionTaskConfig 
{ - max_usage_pct: Percent::new(80).unwrap(), - min_avail_bytes: 2_000_000_000, - period: Duration::from_secs(60), - #[cfg(feature = "testing")] - mock_statvfs: None, - eviction_order: Default::default(), - }, - )), + disk_usage_based_eviction, test_remote_failures, ondemand_download_behavior_treat_error_as_warn, background_task_maximum_delay, @@ -719,8 +709,9 @@ mod tests { use std::time::Duration; use camino::Utf8PathBuf; + use pageserver_api::config::{DiskUsageEvictionTaskConfig, EvictionOrder}; use rstest::rstest; - use utils::id::NodeId; + use utils::{id::NodeId, serde_percent::Percent}; use super::PageServerConf; @@ -820,19 +811,69 @@ mod tests { .expect("parse_and_validate"); } - #[test] - fn test_config_disk_usage_based_eviction_is_valid() { - let input = r#" + #[rstest] + #[ + case::omit_the_whole_config( + DiskUsageEvictionTaskConfig { + max_usage_pct: Percent::new(80).unwrap(), + min_avail_bytes: 2_000_000_000, + period: Duration::from_secs(60), + eviction_order: Default::default(), + #[cfg(feature = "testing")] + mock_statvfs: None, + enabled: true, + }, + r#" control_plane_api = "http://localhost:6666" - "#; + "#, + )] + #[ + case::omit_enabled_field( + DiskUsageEvictionTaskConfig { + max_usage_pct: Percent::new(80).unwrap(), + min_avail_bytes: 1_000_000_000, + period: Duration::from_secs(60), + eviction_order: EvictionOrder::RelativeAccessed { + highest_layer_count_loses_first: true, + }, + #[cfg(feature = "testing")] + mock_statvfs: None, + enabled: true, + }, + r#" + control_plane_api = "http://localhost:6666" + disk_usage_based_eviction = { max_usage_pct = 80, min_avail_bytes = 1000000000, period = "60s" } + "#, + )] + #[case::disabled( + DiskUsageEvictionTaskConfig { + max_usage_pct: Percent::new(80).unwrap(), + min_avail_bytes: 2_000_000_000, + period: Duration::from_secs(60), + eviction_order: EvictionOrder::RelativeAccessed { + highest_layer_count_loses_first: true, + }, + #[cfg(feature = "testing")] + mock_statvfs: None, + enabled: false, + }, 
+ r#" + control_plane_api = "http://localhost:6666" + disk_usage_based_eviction = { enabled = false } + "# + )] + fn test_config_disk_usage_based_eviction_is_valid( + #[case] expected_disk_usage_based_eviction: DiskUsageEvictionTaskConfig, + #[case] input: &str, + ) { let config_toml = toml_edit::de::from_str::(input) .expect("disk_usage_based_eviction is valid"); let workdir = Utf8PathBuf::from("/nonexistent"); let config = PageServerConf::parse_and_validate(NodeId(0), config_toml, &workdir).unwrap(); - let disk_usage_based_eviction = config.disk_usage_based_eviction.unwrap(); - assert_eq!(disk_usage_based_eviction.max_usage_pct.get(), 80); - assert_eq!(disk_usage_based_eviction.min_avail_bytes, 2_000_000_000); - assert_eq!(disk_usage_based_eviction.period, Duration::from_secs(60)); - assert_eq!(disk_usage_based_eviction.eviction_order, Default::default()); + let disk_usage_based_eviction = config.disk_usage_based_eviction; + assert_eq!( + expected_disk_usage_based_eviction, + disk_usage_based_eviction + ); } } diff --git a/pageserver/src/disk_usage_eviction_task.rs b/pageserver/src/disk_usage_eviction_task.rs index f13b3709f5..f1d34664a8 100644 --- a/pageserver/src/disk_usage_eviction_task.rs +++ b/pageserver/src/disk_usage_eviction_task.rs @@ -171,7 +171,8 @@ pub fn launch_disk_usage_global_eviction_task( tenant_manager: Arc, background_jobs_barrier: completion::Barrier, ) -> Option { - let Some(task_config) = &conf.disk_usage_based_eviction else { + let task_config = &conf.disk_usage_based_eviction; + if !task_config.enabled { info!("disk usage based eviction task not configured"); return None; }; @@ -458,6 +459,9 @@ pub(crate) async fn disk_usage_eviction_task_iteration_impl( match next { Ok(Ok(file_size)) => { METRICS.layers_evicted.inc(); + /*BEGIN_HADRON */ + METRICS.bytes_evicted.inc_by(file_size); + /*END_HADRON */ usage_assumed.add_available_bytes(file_size); } Ok(Err(( @@ -1265,6 +1269,7 @@ mod filesystem_level_usage { #[cfg(feature = "testing")] 
mock_statvfs: None, eviction_order: pageserver_api::config::EvictionOrder::default(), + enabled: true, }, total_bytes: 100_000, avail_bytes: 0, diff --git a/pageserver/src/feature_resolver.rs b/pageserver/src/feature_resolver.rs index 6ce4522080..f0178fd9b3 100644 --- a/pageserver/src/feature_resolver.rs +++ b/pageserver/src/feature_resolver.rs @@ -1,4 +1,8 @@ -use std::{collections::HashMap, sync::Arc, time::Duration}; +use std::{ + collections::HashMap, + sync::{Arc, atomic::AtomicBool}, + time::Duration, +}; use arc_swap::ArcSwap; use pageserver_api::config::NodeMetadata; @@ -355,11 +359,17 @@ impl PerTenantProperties { } } -#[derive(Clone)] pub struct TenantFeatureResolver { inner: FeatureResolver, tenant_id: TenantId, - cached_tenant_properties: Arc>>, + cached_tenant_properties: ArcSwap>, + + // Add feature flag on the critical path below. + // + // If a feature flag will be used on the critical path, we will update it in the tenant housekeeping loop instead of + // resolving directly by calling `evaluate_multivariate` or `evaluate_boolean`. Remember to update the flag in the + // housekeeping loop. The user should directly read this atomic flag instead of using the set of evaluate functions. + pub feature_test_remote_size_flag: AtomicBool, } impl TenantFeatureResolver { @@ -367,7 +377,8 @@ impl TenantFeatureResolver { Self { inner, tenant_id, - cached_tenant_properties: Arc::new(ArcSwap::new(Arc::new(HashMap::new()))), + cached_tenant_properties: ArcSwap::new(Arc::new(HashMap::new())), + feature_test_remote_size_flag: AtomicBool::new(false), } } @@ -396,12 +407,14 @@ impl TenantFeatureResolver { self.inner.is_feature_flag_boolean(flag_key) } - pub fn update_cached_tenant_properties(&self, tenant_shard: &TenantShard) { - let mut remote_size_mb = None; + /// Refresh the cached properties and flags on the critical path. 
+ pub fn refresh_properties_and_flags(&self, tenant_shard: &TenantShard) { + let mut remote_size_mb = Some(0.0); for timeline in tenant_shard.list_timelines() { let size = timeline.metrics.resident_physical_size_get(); if size == 0 { remote_size_mb = None; + break; } if let Some(ref mut remote_size_mb) = remote_size_mb { *remote_size_mb += size as f64 / 1024.0 / 1024.0; @@ -410,5 +423,12 @@ impl TenantFeatureResolver { self.cached_tenant_properties.store(Arc::new( PerTenantProperties { remote_size_mb }.into_posthog_properties(), )); + + // BEGIN: Update the feature flag on the critical path. + self.feature_test_remote_size_flag.store( + self.evaluate_boolean("test-remote-size-flag").is_ok(), + std::sync::atomic::Ordering::Relaxed, + ); + // END: Update the feature flag on the critical path. } } diff --git a/pageserver/src/http/openapi_spec.yml b/pageserver/src/http/openapi_spec.yml index e8d1367d6c..3ffc80f19a 100644 --- a/pageserver/src/http/openapi_spec.yml +++ b/pageserver/src/http/openapi_spec.yml @@ -116,26 +116,6 @@ paths: schema: type: string - /v1/tenant/{tenant_id}/timeline: - parameters: - - name: tenant_id - in: path - required: true - schema: - type: string - get: - description: Get timelines for tenant - responses: - "200": - description: TimelineInfo - content: - application/json: - schema: - type: array - items: - $ref: "#/components/schemas/TimelineInfo" - - /v1/tenant/{tenant_id}/timeline/{timeline_id}: parameters: - name: tenant_id @@ -618,7 +598,7 @@ paths: schema: $ref: "#/components/schemas/SecondaryProgress" - /v1/tenant/{tenant_id}/timeline/: + /v1/tenant/{tenant_id}/timeline: parameters: - name: tenant_id in: path @@ -685,6 +665,17 @@ paths: application/json: schema: $ref: "#/components/schemas/Error" + get: + description: Get timelines for tenant + responses: + "200": + description: TimelineInfo + content: + application/json: + schema: + type: array + items: + $ref: "#/components/schemas/TimelineInfo" 
/v1/tenant/{tenant_shard_id}/timeline/{timeline_id}/detach_ancestor: parameters: @@ -767,7 +758,7 @@ paths: $ref: "#/components/schemas/ServiceUnavailableError" - /v1/tenant/: + /v1/tenant: get: description: Get tenants list responses: @@ -847,7 +838,7 @@ paths: items: $ref: "#/components/schemas/TenantInfo" - /v1/tenant/{tenant_id}/config/: + /v1/tenant/{tenant_id}/config: parameters: - name: tenant_id in: path diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs index 02094e6aa9..3612686b5d 100644 --- a/pageserver/src/http/routes.rs +++ b/pageserver/src/http/routes.rs @@ -61,6 +61,7 @@ use crate::context; use crate::context::{DownloadBehavior, RequestContext, RequestContextBuilder}; use crate::deletion_queue::DeletionQueueClient; use crate::feature_resolver::FeatureResolver; +use crate::metrics::LOCAL_DATA_LOSS_SUSPECTED; use crate::pgdatadir_mapping::LsnForTimestamp; use crate::task_mgr::TaskKind; use crate::tenant::config::LocationConf; @@ -2501,10 +2502,7 @@ async fn timeline_checkpoint_handler( .map_err(|e| match e { CompactionError::ShuttingDown => ApiError::ShuttingDown, - CompactionError::Offload(e) => ApiError::InternalServerError(anyhow::anyhow!(e)), - CompactionError::CollectKeySpaceError(e) => ApiError::InternalServerError(anyhow::anyhow!(e)), CompactionError::Other(e) => ApiError::InternalServerError(e), - CompactionError::AlreadyRunning(_) => ApiError::InternalServerError(anyhow::anyhow!(e)), } )?; } @@ -3630,6 +3628,17 @@ async fn activate_post_import_handler( .await } +// [Hadron] Reset gauge metrics that are used to raise alerts. We need this API as a stop-gap measure to reset alerts +// after we manually rectify situations such as local SSD data loss. We will eventually automate this. 
+async fn hadron_reset_alert_gauges( + request: Request, + _cancel: CancellationToken, +) -> Result, ApiError> { + check_permission(&request, None)?; + LOCAL_DATA_LOSS_SUSPECTED.set(0); + json_response(StatusCode::OK, ()) +} + /// Read the end of a tar archive. /// /// A tar archive normally ends with two consecutive blocks of zeros, 512 bytes each. @@ -3682,6 +3691,23 @@ async fn read_tar_eof(mut reader: (impl tokio::io::AsyncRead + Unpin)) -> anyhow Ok(()) } +async fn force_refresh_feature_flag( + request: Request, + _cancel: CancellationToken, +) -> Result, ApiError> { + let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?; + check_permission(&request, Some(tenant_shard_id.tenant_id))?; + + let state = get_state(&request); + let tenant = state + .tenant_manager + .get_attached_tenant_shard(tenant_shard_id)?; + tenant + .feature_resolver + .refresh_properties_and_flags(&tenant); + json_response(StatusCode::OK, ()) +} + async fn tenant_evaluate_feature_flag( request: Request, _cancel: CancellationToken, @@ -3698,7 +3724,7 @@ async fn tenant_evaluate_feature_flag( let tenant = state .tenant_manager .get_attached_tenant_shard(tenant_shard_id)?; - // TODO: the properties we get here might be stale right after it is collected. But such races are rare (updated every 10s) + // TODO: the properties we get here might be stale right after it is collected. But such races are rare (updated every 10s) // and we don't need to worry about it for now. 
let properties = tenant.feature_resolver.collect_properties(); if as_type.as_deref() == Some("boolean") { @@ -4147,6 +4173,9 @@ pub fn make_router( .get("/v1/tenant/:tenant_shard_id/feature_flag/:flag_key", |r| { api_handler(r, tenant_evaluate_feature_flag) }) + .post("/v1/tenant/:tenant_shard_id/force_refresh_feature_flag", |r| { + api_handler(r, force_refresh_feature_flag) + }) .put("/v1/feature_flag/:flag_key", |r| { testing_api_handler("force override feature flag - put", r, force_override_feature_flag_for_testing_put) }) @@ -4156,5 +4185,8 @@ pub fn make_router( .post("/v1/feature_flag_spec", |r| { api_handler(r, update_feature_flag_spec) }) + .post("/hadron-internal/reset_alert_gauges", |r| { + api_handler(r, hadron_reset_alert_gauges) + }) .any(handler_404)) } diff --git a/pageserver/src/metrics.rs b/pageserver/src/metrics.rs index 21faceef49..eb89e166b2 100644 --- a/pageserver/src/metrics.rs +++ b/pageserver/src/metrics.rs @@ -1,3 +1,4 @@ +use std::cell::Cell; use std::collections::HashMap; use std::num::NonZeroUsize; use std::os::fd::RawFd; @@ -102,7 +103,18 @@ pub(crate) static STORAGE_TIME_COUNT_PER_TIMELINE: Lazy = Lazy::n .expect("failed to define a metric") }); -// Buckets for background operation duration in seconds, like compaction, GC, size calculation. 
+/* BEGIN_HADRON */ +pub(crate) static STORAGE_ACTIVE_COUNT_PER_TIMELINE: Lazy = Lazy::new(|| { + register_int_gauge_vec!( + "pageserver_active_storage_operations_count", + "Count of active storage operations with operation, tenant and timeline dimensions", + &["operation", "tenant_id", "shard_id", "timeline_id"], + ) + .expect("failed to define a metric") +}); +/*END_HADRON */ + +// Buckets for background operations like compaction, GC, size calculation const STORAGE_OP_BUCKETS: &[f64] = &[0.010, 0.100, 1.0, 10.0, 100.0, 1000.0]; pub(crate) static STORAGE_TIME_GLOBAL: Lazy = Lazy::new(|| { @@ -2810,6 +2822,31 @@ pub(crate) static WALRECEIVER_CANDIDATES_ADDED: Lazy = pub(crate) static WALRECEIVER_CANDIDATES_REMOVED: Lazy = Lazy::new(|| WALRECEIVER_CANDIDATES_EVENTS.with_label_values(&["remove"])); +pub(crate) static LOCAL_DATA_LOSS_SUSPECTED: Lazy = Lazy::new(|| { + register_int_gauge!( + "pageserver_local_data_loss_suspected", + "Non-zero value indicates that pageserver local data loss is suspected (and highly likely)." + ) + .expect("failed to define a metric") +}); + +// Counter keeping track of misrouted PageStream requests. Spelling out PageStream requests here to distinguish +// it from other types of requests (SK wal replication, http requests, etc.). PageStream requests are used by +// Postgres compute to fetch data from pageservers. +// A misrouted PageStream request is registered if the pageserver cannot find the tenant identified in the +// request, or if the pageserver is not the "primary" serving the tenant shard. These errors almost always identify +// issues with compute configuration, caused by either the compute node itself being stuck in the wrong +// configuration or Storage Controller reconciliation bugs. Misrouted requests are expected during tenant migration +// and/or during recovery following a pageserver failure, but persistently high rates of misrouted requests +// are indicative of bugs (and unavailability). 
+pub(crate) static MISROUTED_PAGESTREAM_REQUESTS: Lazy = Lazy::new(|| { + register_int_counter!( + "pageserver_misrouted_pagestream_requests_total", + "Number of pageserver pagestream requests that were routed to the wrong pageserver" + ) + .expect("failed to define a metric") +}); + // Metrics collected on WAL redo operations // // We collect the time spent in actual WAL redo ('redo'), and time waiting @@ -3048,13 +3085,19 @@ pub(crate) static WAL_REDO_PROCESS_COUNTERS: Lazy = pub(crate) struct StorageTimeMetricsTimer { metrics: StorageTimeMetrics, start: Instant, + stopped: Cell, } impl StorageTimeMetricsTimer { fn new(metrics: StorageTimeMetrics) -> Self { + /*BEGIN_HADRON */ + // record the active operation as the timer starts + metrics.timeline_active_count.inc(); + /*END_HADRON */ Self { metrics, start: Instant::now(), + stopped: Cell::new(false), } } @@ -3070,6 +3113,10 @@ impl StorageTimeMetricsTimer { self.metrics.timeline_sum.inc_by(seconds); self.metrics.timeline_count.inc(); self.metrics.global_histogram.observe(seconds); + /* BEGIN_HADRON*/ + self.stopped.set(true); + self.metrics.timeline_active_count.dec(); + /*END_HADRON */ duration } @@ -3080,6 +3127,16 @@ impl StorageTimeMetricsTimer { } } +/*BEGIN_HADRON */ +impl Drop for StorageTimeMetricsTimer { + fn drop(&mut self) { + if !self.stopped.get() { + self.metrics.timeline_active_count.dec(); + } + } +} +/*END_HADRON */ + pub(crate) struct AlwaysRecordingStorageTimeMetricsTimer(Option); impl Drop for AlwaysRecordingStorageTimeMetricsTimer { @@ -3105,6 +3162,10 @@ pub(crate) struct StorageTimeMetrics { timeline_sum: Counter, /// Number of oeprations, per operation, tenant_id and timeline_id timeline_count: IntCounter, + /*BEGIN_HADRON */ + /// Number of active operations per operation, tenant_id, and timeline_id + timeline_active_count: IntGauge, + /*END_HADRON */ /// Global histogram having only the "operation" label. 
global_histogram: Histogram, } @@ -3124,6 +3185,11 @@ impl StorageTimeMetrics { let timeline_count = STORAGE_TIME_COUNT_PER_TIMELINE .get_metric_with_label_values(&[operation, tenant_id, shard_id, timeline_id]) .unwrap(); + /*BEGIN_HADRON */ + let timeline_active_count = STORAGE_ACTIVE_COUNT_PER_TIMELINE + .get_metric_with_label_values(&[operation, tenant_id, shard_id, timeline_id]) + .unwrap(); + /*END_HADRON */ let global_histogram = STORAGE_TIME_GLOBAL .get_metric_with_label_values(&[operation]) .unwrap(); @@ -3131,6 +3197,7 @@ impl StorageTimeMetrics { StorageTimeMetrics { timeline_sum, timeline_count, + timeline_active_count, global_histogram, } } @@ -3544,6 +3611,14 @@ impl TimelineMetrics { shard_id, timeline_id, ]); + /* BEGIN_HADRON */ + let _ = STORAGE_ACTIVE_COUNT_PER_TIMELINE.remove_label_values(&[ + op, + tenant_id, + shard_id, + timeline_id, + ]); + /*END_HADRON */ } for op in StorageIoSizeOperation::VARIANTS { @@ -4336,6 +4411,9 @@ pub(crate) mod disk_usage_based_eviction { pub(crate) layers_collected: IntCounter, pub(crate) layers_selected: IntCounter, pub(crate) layers_evicted: IntCounter, + /*BEGIN_HADRON */ + pub(crate) bytes_evicted: IntCounter, + /*END_HADRON */ } impl Default for Metrics { @@ -4372,12 +4450,21 @@ pub(crate) mod disk_usage_based_eviction { ) .unwrap(); + /*BEGIN_HADRON */ + let bytes_evicted = register_int_counter!( + "pageserver_disk_usage_based_eviction_evicted_bytes_total", + "Amount of bytes successfully evicted" + ) + .unwrap(); + /*END_HADRON */ + Self { tenant_collection_time, tenant_layer_count, layers_collected, layers_selected, layers_evicted, + bytes_evicted, } } } @@ -4497,6 +4584,7 @@ pub fn preinitialize_metrics( &CIRCUIT_BREAKERS_UNBROKEN, &PAGE_SERVICE_SMGR_FLUSH_INPROGRESS_MICROS_GLOBAL, &WAIT_LSN_IN_PROGRESS_GLOBAL_MICROS, + &MISROUTED_PAGESTREAM_REQUESTS, ] .into_iter() .for_each(|c| { @@ -4534,6 +4622,7 @@ pub fn preinitialize_metrics( // gauges WALRECEIVER_ACTIVE_MANAGERS.get(); + 
LOCAL_DATA_LOSS_SUSPECTED.get(); // histograms [ diff --git a/pageserver/src/page_service.rs b/pageserver/src/page_service.rs index 49928a9036..70fdb2e789 100644 --- a/pageserver/src/page_service.rs +++ b/pageserver/src/page_service.rs @@ -70,7 +70,7 @@ use crate::context::{ }; use crate::metrics::{ self, COMPUTE_COMMANDS_COUNTERS, ComputeCommandKind, GetPageBatchBreakReason, LIVE_CONNECTIONS, - SmgrOpTimer, TimelineMetrics, + MISROUTED_PAGESTREAM_REQUESTS, SmgrOpTimer, TimelineMetrics, }; use crate::pgdatadir_mapping::{LsnRange, Version}; use crate::span::{ @@ -91,7 +91,8 @@ use crate::{CancellableTask, PERF_TRACE_TARGET, timed_after_cancellation}; /// is not yet in state [`TenantState::Active`]. /// /// NB: this is a different value than [`crate::http::routes::ACTIVE_TENANT_TIMEOUT`]. -const ACTIVE_TENANT_TIMEOUT: Duration = Duration::from_millis(30000); +/// HADRON: reduced timeout and we will retry in Cache::get(). +const ACTIVE_TENANT_TIMEOUT: Duration = Duration::from_millis(5000); /// Threshold at which to log slow GetPage requests. const LOG_SLOW_GETPAGE_THRESHOLD: Duration = Duration::from_secs(30); @@ -1128,6 +1129,7 @@ impl PageServerHandler { // Closing the connection by returning ``::Reconnect` has the side effect of rate-limiting above message, via // client's reconnect backoff, as well as hopefully prompting the client to load its updated configuration // and talk to a different pageserver. + MISROUTED_PAGESTREAM_REQUESTS.inc(); return respond_error!( span, PageStreamError::Reconnect( @@ -3351,6 +3353,8 @@ impl GrpcPageServiceHandler { /// NB: errors returned from here are intercepted in get_pages(), and may be converted to a /// GetPageResponse with an appropriate status code to avoid terminating the stream. /// + /// TODO: verify that the requested pages belong to this shard. + /// /// TODO: get_vectored() currently enforces a batch limit of 32. Postgres will typically send /// batches up to effective_io_concurrency = 100. 
Either we have to accept large batches, or /// split them up in the client or server. diff --git a/pageserver/src/pgdatadir_mapping.rs b/pageserver/src/pgdatadir_mapping.rs index 31f38d485f..8532a6938f 100644 --- a/pageserver/src/pgdatadir_mapping.rs +++ b/pageserver/src/pgdatadir_mapping.rs @@ -141,6 +141,23 @@ pub(crate) enum CollectKeySpaceError { Cancelled, } +impl CollectKeySpaceError { + pub(crate) fn is_cancel(&self) -> bool { + match self { + CollectKeySpaceError::Decode(_) => false, + CollectKeySpaceError::PageRead(e) => e.is_cancel(), + CollectKeySpaceError::Cancelled => true, + } + } + pub(crate) fn into_anyhow(self) -> anyhow::Error { + match self { + CollectKeySpaceError::Decode(e) => anyhow::Error::new(e), + CollectKeySpaceError::PageRead(e) => anyhow::Error::new(e), + CollectKeySpaceError::Cancelled => anyhow::Error::new(self), + } + } +} + impl From for CollectKeySpaceError { fn from(err: PageReconstructError) -> Self { match err { diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index 9aabd6341f..f576119db8 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -142,6 +142,9 @@ mod gc_block; mod gc_result; pub(crate) mod throttle; +#[cfg(test)] +pub mod debug; + pub(crate) use timeline::{LogicalSizeCalculationCause, PageReconstructError, Timeline}; pub(crate) use crate::span::debug_assert_current_span_has_tenant_and_timeline_id; @@ -388,7 +391,7 @@ pub struct TenantShard { l0_flush_global_state: L0FlushGlobalState, - pub(crate) feature_resolver: TenantFeatureResolver, + pub(crate) feature_resolver: Arc, } impl std::fmt::Debug for TenantShard { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { @@ -3288,7 +3291,9 @@ impl TenantShard { // Ignore this, we likely raced with unarchival. 
OffloadError::NotArchived => Ok(()), OffloadError::AlreadyInProgress => Ok(()), - err => Err(err), + OffloadError::Cancelled => Err(CompactionError::ShuttingDown), + // don't break the anyhow chain + OffloadError::Other(err) => Err(CompactionError::Other(err)), })?; } @@ -3319,23 +3324,12 @@ impl TenantShard { match err { err if err.is_cancel() => {} CompactionError::ShuttingDown => (), - // Offload failures don't trip the circuit breaker, since they're cheap to retry and - // shouldn't block compaction. - CompactionError::Offload(_) => {} - CompactionError::CollectKeySpaceError(err) => { - // CollectKeySpaceError::Cancelled and PageRead::Cancelled are handled in `err.is_cancel` branch. - self.compaction_circuit_breaker - .lock() - .unwrap() - .fail(&CIRCUIT_BREAKERS_BROKEN, err); - } CompactionError::Other(err) => { self.compaction_circuit_breaker .lock() .unwrap() .fail(&CIRCUIT_BREAKERS_BROKEN, err); } - CompactionError::AlreadyRunning(_) => {} } } @@ -3413,7 +3407,7 @@ impl TenantShard { } // Update the feature resolver with the latest tenant-spcific data. 
- self.feature_resolver.update_cached_tenant_properties(self); + self.feature_resolver.refresh_properties_and_flags(self); } pub fn timeline_has_no_attached_children(&self, timeline_id: TimelineId) -> bool { @@ -4502,10 +4496,10 @@ impl TenantShard { gc_block: Default::default(), l0_flush_global_state, basebackup_cache, - feature_resolver: TenantFeatureResolver::new( + feature_resolver: Arc::new(TenantFeatureResolver::new( feature_resolver, tenant_shard_id.tenant_id, - ), + )), } } @@ -6017,12 +6011,11 @@ pub(crate) mod harness { } #[instrument(skip_all, fields(tenant_id=%self.tenant_shard_id.tenant_id, shard_id=%self.tenant_shard_id.shard_slug()))] - pub(crate) async fn do_try_load( + pub(crate) async fn do_try_load_with_redo( &self, + walredo_mgr: Arc, ctx: &RequestContext, ) -> anyhow::Result> { - let walredo_mgr = Arc::new(WalRedoManager::from(TestRedoManager)); - let (basebackup_cache, _) = BasebackupCache::new(Utf8PathBuf::new(), None); let tenant = Arc::new(TenantShard::new( @@ -6060,6 +6053,14 @@ pub(crate) mod harness { Ok(tenant) } + pub(crate) async fn do_try_load( + &self, + ctx: &RequestContext, + ) -> anyhow::Result> { + let walredo_mgr = Arc::new(WalRedoManager::from(TestRedoManager)); + self.do_try_load_with_redo(walredo_mgr, ctx).await + } + pub fn timeline_path(&self, timeline_id: &TimelineId) -> Utf8PathBuf { self.conf.timeline_path(&self.tenant_shard_id, timeline_id) } diff --git a/pageserver/src/tenant/debug.rs b/pageserver/src/tenant/debug.rs new file mode 100644 index 0000000000..604f7f265e --- /dev/null +++ b/pageserver/src/tenant/debug.rs @@ -0,0 +1,366 @@ +use std::{ops::Range, str::FromStr, sync::Arc}; + +use crate::walredo::RedoAttemptType; +use base64::{Engine as _, engine::general_purpose::STANDARD}; +use bytes::{Bytes, BytesMut}; +use camino::Utf8PathBuf; +use clap::Parser; +use itertools::Itertools; +use pageserver_api::{ + key::Key, + keyspace::KeySpace, + shard::{ShardIdentity, ShardStripeSize}, +}; +use 
postgres_ffi::PgMajorVersion; +use postgres_ffi::{BLCKSZ, page_is_new, page_set_lsn}; +use tracing::Instrument; +use utils::{ + generation::Generation, + id::{TenantId, TimelineId}, + lsn::Lsn, + shard::{ShardCount, ShardIndex, ShardNumber}, +}; +use wal_decoder::models::record::NeonWalRecord; + +use crate::{ + context::{DownloadBehavior, RequestContext}, + task_mgr::TaskKind, + tenant::storage_layer::ValueReconstructState, + walredo::harness::RedoHarness, +}; + +use super::{ + WalRedoManager, WalredoManagerId, + harness::TenantHarness, + remote_timeline_client::LayerFileMetadata, + storage_layer::{AsLayerDesc, IoConcurrency, Layer, LayerName, ValuesReconstructState}, +}; + +fn process_page_image(next_record_lsn: Lsn, is_fpw: bool, img_bytes: Bytes) -> Bytes { + // To match the logic in libs/wal_decoder/src/serialized_batch.rs + let mut new_image: BytesMut = img_bytes.into(); + if is_fpw && !page_is_new(&new_image) { + page_set_lsn(&mut new_image, next_record_lsn); + } + assert_eq!(new_image.len(), BLCKSZ as usize); + new_image.freeze() +} + +async fn redo_wals(input: &str, key: Key) -> anyhow::Result<()> { + let tenant_id = TenantId::generate(); + let timeline_id = TimelineId::generate(); + let redo_harness = RedoHarness::new()?; + let span = redo_harness.span(); + let tenant_conf = pageserver_api::models::TenantConfig { + ..Default::default() + }; + + let ctx = RequestContext::new(TaskKind::DebugTool, DownloadBehavior::Error); + let tenant = TenantHarness::create_custom( + "search_key", + tenant_conf, + tenant_id, + ShardIdentity::unsharded(), + Generation::new(1), + ) + .await? 
+ .do_try_load_with_redo( + Arc::new(WalRedoManager::Prod( + WalredoManagerId::next(), + redo_harness.manager, + )), + &ctx, + ) + .await + .unwrap(); + let timeline = tenant + .create_test_timeline(timeline_id, Lsn(0x10), PgMajorVersion::PG16, &ctx) + .await?; + let contents = tokio::fs::read_to_string(input) + .await + .map_err(|e| anyhow::Error::msg(format!("Failed to read input file {input}: {e}"))) + .unwrap(); + let lines = contents.lines(); + let mut last_wal_lsn: Option = None; + let state = { + let mut state = ValueReconstructState::default(); + let mut is_fpw = false; + let mut is_first_line = true; + for line in lines { + if is_first_line { + is_first_line = false; + if line.trim() == "FPW" { + is_fpw = true; + } + continue; // Skip the first line. + } + // Each input line is in the "," format. + let (lsn_str, payload_b64) = line + .split_once(',') + .expect("Invalid input format: expected ','"); + + // Parse the LSN and decode the payload. + let lsn = Lsn::from_str(lsn_str.trim()).expect("Invalid LSN format"); + let bytes = Bytes::from( + STANDARD + .decode(payload_b64.trim()) + .expect("Invalid base64 payload"), + ); + + // The first line is considered the base image, the rest are WAL records. 
+ if state.img.is_none() { + state.img = Some((lsn, process_page_image(lsn, is_fpw, bytes))); + } else { + let wal_record = NeonWalRecord::Postgres { + will_init: false, + rec: bytes, + }; + state.records.push((lsn, wal_record)); + last_wal_lsn.replace(lsn); + } + } + state + }; + + assert!(state.img.is_some(), "No base image found"); + assert!(!state.records.is_empty(), "No WAL records found"); + let result = timeline + .reconstruct_value(key, last_wal_lsn.unwrap(), state, RedoAttemptType::ReadPage) + .instrument(span.clone()) + .await?; + + eprintln!("final image: {:?}", STANDARD.encode(result)); + + Ok(()) +} + +async fn search_key( + tenant_id: TenantId, + timeline_id: TimelineId, + dir: String, + key: Key, + lsn: Lsn, +) -> anyhow::Result<()> { + let shard_index = ShardIndex { + shard_number: ShardNumber(0), + shard_count: ShardCount(4), + }; + + let redo_harness = RedoHarness::new()?; + let span = redo_harness.span(); + let tenant_conf = pageserver_api::models::TenantConfig { + ..Default::default() + }; + let ctx = RequestContext::new(TaskKind::DebugTool, DownloadBehavior::Error); + let tenant = TenantHarness::create_custom( + "search_key", + tenant_conf, + tenant_id, + ShardIdentity::new( + shard_index.shard_number, + shard_index.shard_count, + ShardStripeSize(32768), + ) + .unwrap(), + Generation::new(1), + ) + .await? 
+ .do_try_load_with_redo( + Arc::new(WalRedoManager::Prod( + WalredoManagerId::next(), + redo_harness.manager, + )), + &ctx, + ) + .await + .unwrap(); + + let timeline = tenant + .create_test_timeline(timeline_id, Lsn(0x10), PgMajorVersion::PG16, &ctx) + .await?; + + let mut delta_layers: Vec = Vec::new(); + let mut img_layer: Option = Option::None; + let mut dir = tokio::fs::read_dir(dir).await?; + loop { + let entry = dir.next_entry().await?; + if entry.is_none() || !entry.as_ref().unwrap().file_type().await?.is_file() { + break; + } + let path = Utf8PathBuf::from_path_buf(entry.unwrap().path()).unwrap(); + let layer_name = match LayerName::from_str(path.file_name().unwrap()) { + Ok(name) => name, + Err(_) => { + eprintln!("Skipped invalid layer: {path}"); + continue; + } + }; + let layer = Layer::for_resident( + tenant.conf, + &timeline, + path.clone(), + layer_name, + LayerFileMetadata::new( + tokio::fs::metadata(path.clone()).await?.len(), + Generation::new(1), + shard_index, + ), + ); + if layer.layer_desc().is_delta() { + delta_layers.push(layer.into()); + } else if img_layer.is_none() { + img_layer = Some(layer.into()); + } else { + anyhow::bail!("Found multiple image layers"); + } + } + // sort delta layers based on the descending order of LSN + delta_layers.sort_by(|a, b| { + b.layer_desc() + .get_lsn_range() + .start + .cmp(&a.layer_desc().get_lsn_range().start) + }); + + let mut state = ValuesReconstructState::new(IoConcurrency::Sequential); + + let key_space = KeySpace::single(Range { + start: key, + end: key.next(), + }); + let lsn_range = Range { + start: img_layer + .as_ref() + .map_or(Lsn(0x00), |img| img.layer_desc().image_layer_lsn()), + end: lsn, + }; + for delta_layer in delta_layers.iter() { + delta_layer + .get_values_reconstruct_data(key_space.clone(), lsn_range.clone(), &mut state, &ctx) + .await?; + } + + img_layer + .as_ref() + .unwrap() + .get_values_reconstruct_data(key_space.clone(), lsn_range.clone(), &mut state, &ctx) + .await?; + + 
for (_key, result) in std::mem::take(&mut state.keys) { + let state = result.collect_pending_ios().await?; + if state.img.is_some() { + eprintln!( + "image: {}: {:x?}", + state.img.as_ref().unwrap().0, + STANDARD.encode(state.img.as_ref().unwrap().1.clone()) + ); + } + for delta in state.records.iter() { + match &delta.1 { + NeonWalRecord::Postgres { will_init, rec } => { + eprintln!( + "delta: {}: will_init: {}, {:x?}", + delta.0, + will_init, + STANDARD.encode(rec) + ); + } + _ => { + eprintln!("delta: {}: {:x?}", delta.0, delta.1); + } + } + } + + let result = timeline + .reconstruct_value(key, lsn_range.end, state, RedoAttemptType::ReadPage) + .instrument(span.clone()) + .await?; + eprintln!("final image: {lsn} : {result:?}"); + } + + Ok(()) +} + +/// Redo all WALs against the base image in the input file. Return the base64 encoded final image. +/// Each line in the input file must be in the form "," where: +/// * `` is a PostgreSQL LSN in hexadecimal notation, e.g. `0/16ABCDE`. +/// * `` is the base64‐encoded page image (first line) or WAL record (subsequent lines). +/// +/// The first line provides the base image of a page. The LSN is the LSN of "next record" following +/// the record containing the FPI. For example, if the FPI was extracted from a WAL record occuping +/// [0/1, 0/200) in the WAL stream, the LSN appearing along side the page image here should be 0/200. +/// +/// The subsequent lines are WAL records, ordered from the oldest to the newest. The LSN is the +/// record LSN of the WAL record, not the "next record" LSN. For example, if the WAL record here +/// occupies [0/1, 0/200) in the WAL stream, the LSN appearing along side the WAL record here should +/// be 0/1. 
+#[derive(Parser)] +struct RedoWalsCmd { + #[clap(long)] + input: String, + #[clap(long)] + key: String, +} + +#[tokio::test] +async fn test_redo_wals() -> anyhow::Result<()> { + let args = std::env::args().collect_vec(); + let pos = args + .iter() + .position(|arg| arg == "--") + .unwrap_or(args.len()); + let slice = &args[pos..args.len()]; + let cmd = match RedoWalsCmd::try_parse_from(slice) { + Ok(cmd) => cmd, + Err(err) => { + eprintln!("{err}"); + return Ok(()); + } + }; + + let key = Key::from_hex(&cmd.key).unwrap(); + redo_wals(&cmd.input, key).await?; + + Ok(()) +} + +/// Search for a page at the given LSN in all layers of the data_dir. +/// Return the base64-encoded image and all WAL records, as well as the final reconstructed image. +#[derive(Parser)] +struct SearchKeyCmd { + #[clap(long)] + tenant_id: String, + #[clap(long)] + timeline_id: String, + #[clap(long)] + data_dir: String, + #[clap(long)] + key: String, + #[clap(long)] + lsn: String, +} + +#[tokio::test] +async fn test_search_key() -> anyhow::Result<()> { + let args = std::env::args().collect_vec(); + let pos = args + .iter() + .position(|arg| arg == "--") + .unwrap_or(args.len()); + let slice = &args[pos..args.len()]; + let cmd = match SearchKeyCmd::try_parse_from(slice) { + Ok(cmd) => cmd, + Err(err) => { + eprintln!("{err}"); + return Ok(()); + } + }; + + let tenant_id = TenantId::from_str(&cmd.tenant_id).unwrap(); + let timeline_id = TimelineId::from_str(&cmd.timeline_id).unwrap(); + let key = Key::from_hex(&cmd.key).unwrap(); + let lsn = Lsn::from_str(&cmd.lsn).unwrap(); + search_key(tenant_id, timeline_id, cmd.data_dir, key, lsn).await?; + + Ok(()) +} diff --git a/pageserver/src/tenant/mgr.rs b/pageserver/src/tenant/mgr.rs index be18b40862..15853d3614 100644 --- a/pageserver/src/tenant/mgr.rs +++ b/pageserver/src/tenant/mgr.rs @@ -43,7 +43,7 @@ use crate::controller_upcall_client::{ }; use crate::deletion_queue::DeletionQueueClient; use crate::http::routes::ACTIVE_TENANT_TIMEOUT; -use 
crate::metrics::{TENANT, TENANT_MANAGER as METRICS}; +use crate::metrics::{LOCAL_DATA_LOSS_SUSPECTED, TENANT, TENANT_MANAGER as METRICS}; use crate::task_mgr::{BACKGROUND_RUNTIME, TaskKind}; use crate::tenant::config::{ AttachedLocationConfig, AttachmentMode, LocationConf, LocationMode, SecondaryLocationConfig, @@ -538,6 +538,21 @@ pub async fn init_tenant_mgr( // Determine which tenants are to be secondary or attached, and in which generation let tenant_modes = init_load_generations(conf, &tenant_configs, resources, cancel).await?; + // Hadron local SSD check: Raise an alert if our local filesystem does not contain any tenants but the re-attach request returned tenants. + // This can happen if the PS suffered a Kubernetes node failure resulting in loss of all local data, but recovered quickly on another node + // so the Storage Controller has not had the time to move tenants out. + let data_loss_suspected = if let Some(tenant_modes) = &tenant_modes { + tenant_configs.is_empty() && !tenant_modes.is_empty() + } else { + false + }; + if data_loss_suspected { + tracing::error!( + "Local data loss suspected: no tenants found on local filesystem, but re-attach request returned tenants" + ); + } + LOCAL_DATA_LOSS_SUSPECTED.set(if data_loss_suspected { 1 } else { 0 }); + tracing::info!( "Attaching {} tenants at startup, warming up {} at a time", tenant_configs.len(), diff --git a/pageserver/src/tenant/remote_timeline_client/upload.rs b/pageserver/src/tenant/remote_timeline_client/upload.rs index ffb4717d9f..f2fbf656a6 100644 --- a/pageserver/src/tenant/remote_timeline_client/upload.rs +++ b/pageserver/src/tenant/remote_timeline_client/upload.rs @@ -141,11 +141,29 @@ pub(super) async fn upload_timeline_layer<'a>( let fs_size = usize::try_from(fs_size) .with_context(|| format!("convert {local_path:?} size {fs_size} usize"))?; - + /* BEGIN_HADRON */ + let mut metadata = None; + match storage { + // Pass the file path as a storage metadata to minimize changes to neon. 
+ // Otherwise, we need to change the upload interface. + GenericRemoteStorage::AzureBlob(s) => { + let block_size_mb = s.put_block_size_mb.unwrap_or(0); + if block_size_mb > 0 && fs_size > block_size_mb * 1024 * 1024 { + metadata = Some(remote_storage::StorageMetadata::from([( + "databricks_azure_put_block", + local_path.as_str(), + )])); + } + } + GenericRemoteStorage::LocalFs(_) => {} + GenericRemoteStorage::AwsS3(_) => {} + GenericRemoteStorage::Unreliable(_) => {} + }; + /* END_HADRON */ let reader = tokio_util::io::ReaderStream::with_capacity(source_file, super::BUFFER_SIZE); storage - .upload(reader, fs_size, remote_path, None, cancel) + .upload(reader, fs_size, remote_path, metadata, cancel) .await .with_context(|| format!("upload layer from local path '{local_path}'")) } diff --git a/pageserver/src/tenant/tasks.rs b/pageserver/src/tenant/tasks.rs index 954dd38bb4..bcece5589a 100644 --- a/pageserver/src/tenant/tasks.rs +++ b/pageserver/src/tenant/tasks.rs @@ -34,6 +34,21 @@ use crate::virtual_file::owned_buffers_io::write::FlushTaskError; /// We use 3/4 Tokio threads, to avoid blocking all threads in case we do any CPU-heavy work. static CONCURRENT_BACKGROUND_TASKS: Lazy = Lazy::new(|| { let total_threads = TOKIO_WORKER_THREADS.get(); + + /*BEGIN_HADRON*/ + // ideally we should run at least one compaction task per tenant in order to (1) maximize + // compaction throughput (2) avoid head-of-line blocking of large compactions. However doing + // that may create too many compaction tasks with lots of memory overheads. So we limit the + // number of compaction tasks based on the available CPU core count. + // Need to revisit. 
+ // let tasks_per_thread = std::env::var("BG_TASKS_PER_THREAD") + // .ok() + // .and_then(|s| s.parse().ok()) + // .unwrap_or(4); + // let permits = usize::max(1, total_threads * tasks_per_thread); + // // assert!(permits < total_threads, "need threads for other work"); + /*END_HADRON*/ + let permits = max(1, (total_threads * 3).checked_div(4).unwrap_or(0)); assert_ne!(permits, 0, "we will not be adding in permits later"); assert!(permits < total_threads, "need threads for other work"); @@ -303,9 +318,6 @@ pub(crate) fn log_compaction_error( let level = match err { e if e.is_cancel() => return, ShuttingDown => return, - Offload(_) => Level::ERROR, - AlreadyRunning(_) => Level::ERROR, - CollectKeySpaceError(_) => Level::ERROR, _ if task_cancelled => Level::INFO, Other(err) => { let root_cause = err.root_cause(); @@ -315,7 +327,7 @@ pub(crate) fn log_compaction_error( .is_some_and(|e| e.is_stopping()); let timeline = root_cause .downcast_ref::() - .is_some_and(|e| e.is_stopping()); + .is_some_and(|e| e.is_cancel()); let buffered_writer_flush_task_canelled = root_cause .downcast_ref::() .is_some_and(|e| e.is_cancel()); diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index aeced98859..6088f40669 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -40,7 +40,6 @@ use layer_manager::{ Shutdown, }; -use offload::OffloadError; use once_cell::sync::Lazy; use pageserver_api::config::tenant_conf_defaults::DEFAULT_PITR_INTERVAL; use pageserver_api::key::{ @@ -119,7 +118,6 @@ use crate::pgdatadir_mapping::{ MAX_AUX_FILE_V2_DELTAS, MetricsUpdate, }; use crate::task_mgr::TaskKind; -use crate::tenant::config::AttachmentMode; use crate::tenant::gc_result::GcResult; use crate::tenant::layer_map::LayerMap; use crate::tenant::metadata::TimelineMetadata; @@ -202,7 +200,7 @@ pub struct TimelineResources { pub l0_compaction_trigger: Arc, pub l0_flush_global_state: l0_flush::L0FlushGlobalState, pub basebackup_cache: 
Arc, - pub feature_resolver: TenantFeatureResolver, + pub feature_resolver: Arc, } pub struct Timeline { @@ -450,7 +448,7 @@ pub struct Timeline { /// A channel to send async requests to prepare a basebackup for the basebackup cache. basebackup_cache: Arc, - feature_resolver: TenantFeatureResolver, + feature_resolver: Arc, } pub(crate) enum PreviousHeatmap { @@ -587,6 +585,28 @@ pub(crate) enum PageReconstructError { MissingKey(Box), } +impl PageReconstructError { + pub(crate) fn is_cancel(&self) -> bool { + match self { + PageReconstructError::Other(_) => false, + PageReconstructError::AncestorLsnTimeout(e) => e.is_cancel(), + PageReconstructError::Cancelled => true, + PageReconstructError::WalRedo(_) => false, + PageReconstructError::MissingKey(_) => false, + } + } + #[allow(dead_code)] // we use the is_cancel + into_anyhow pattern in quite a few places, this one will follow soon enough + pub(crate) fn into_anyhow(self) -> anyhow::Error { + match self { + PageReconstructError::Other(e) => e, + PageReconstructError::AncestorLsnTimeout(e) => e.into_anyhow(), + PageReconstructError::Cancelled => anyhow::Error::new(self), + PageReconstructError::WalRedo(e) => e, + PageReconstructError::MissingKey(_) => anyhow::Error::new(self), + } + } +} + impl From for PageReconstructError { fn from(value: anyhow::Error) -> Self { // with walingest.rs many PageReconstructError are wrapped in as anyhow::Error @@ -740,17 +760,6 @@ impl std::fmt::Display for MissingKeyError { } } -impl PageReconstructError { - /// Returns true if this error indicates a tenant/timeline shutdown alike situation - pub(crate) fn is_stopping(&self) -> bool { - use PageReconstructError::*; - match self { - Cancelled => true, - Other(_) | AncestorLsnTimeout(_) | WalRedo(_) | MissingKey(_) => false, - } - } -} - #[derive(thiserror::Error, Debug)] pub(crate) enum CreateImageLayersError { #[error("timeline shutting down")] @@ -953,13 +962,35 @@ pub enum WaitLsnError { Timeout(String), } +impl WaitLsnError { + 
pub(crate) fn is_cancel(&self) -> bool { + match self { + WaitLsnError::Shutdown => true, + WaitLsnError::BadState(timeline_state) => match timeline_state { + TimelineState::Loading => false, + TimelineState::Active => false, + TimelineState::Stopping => true, + TimelineState::Broken { .. } => false, + }, + WaitLsnError::Timeout(_) => false, + } + } + pub(crate) fn into_anyhow(self) -> anyhow::Error { + match self { + WaitLsnError::Shutdown => anyhow::Error::new(self), + WaitLsnError::BadState(_) => anyhow::Error::new(self), + WaitLsnError::Timeout(_) => anyhow::Error::new(self), + } + } +} + impl From for tonic::Status { fn from(err: WaitLsnError) -> Self { use tonic::Code; - let code = match &err { - WaitLsnError::Timeout(_) => Code::Internal, - WaitLsnError::BadState(_) => Code::Internal, - WaitLsnError::Shutdown => Code::Unavailable, + let code = if err.is_cancel() { + Code::Unavailable + } else { + Code::Internal }; tonic::Status::new(code, err.to_string()) } @@ -1086,6 +1117,26 @@ enum ImageLayerCreationOutcome { Skip, } +enum RepartitionError { + Other(anyhow::Error), + CollectKeyspace(CollectKeySpaceError), +} + +impl RepartitionError { + fn is_cancel(&self) -> bool { + match self { + RepartitionError::Other(_) => false, + RepartitionError::CollectKeyspace(e) => e.is_cancel(), + } + } + fn into_anyhow(self) -> anyhow::Error { + match self { + RepartitionError::Other(e) => e, + RepartitionError::CollectKeyspace(e) => e.into_anyhow(), + } + } +} + /// Public interface functions impl Timeline { /// Get the LSN where this branch was created @@ -1772,30 +1823,31 @@ impl Timeline { existing_lease.clone() } Entry::Vacant(vacant) => { - // Reject already GC-ed LSN if we are in AttachedSingle and - // not blocked by the lsn lease deadline. + // Never allow a lease to be requested for an LSN below the applied GC cutoff. The data could have been deleted. 
+ let latest_gc_cutoff_lsn = self.get_applied_gc_cutoff_lsn(); + if lsn < *latest_gc_cutoff_lsn { + bail!( + "tried to request an lsn lease for an lsn below the latest gc cutoff. requested at {} gc cutoff {}", + lsn, + *latest_gc_cutoff_lsn + ); + } + + // We allow create lease for those below the planned gc cutoff if we are still within the grace period + // of GC blocking. let validate = { let conf = self.tenant_conf.load(); - conf.location.attach_mode == AttachmentMode::Single - && !conf.is_gc_blocked_by_lsn_lease_deadline() + !conf.is_gc_blocked_by_lsn_lease_deadline() }; - if init || validate { - let latest_gc_cutoff_lsn = self.get_applied_gc_cutoff_lsn(); - if lsn < *latest_gc_cutoff_lsn { - bail!( - "tried to request an lsn lease for an lsn below the latest gc cutoff. requested at {} gc cutoff {}", - lsn, - *latest_gc_cutoff_lsn - ); - } - if lsn < planned_cutoff { - bail!( - "tried to request an lsn lease for an lsn below the planned gc cutoff. requested at {} planned gc cutoff {}", - lsn, - planned_cutoff - ); - } + // Do not allow initial lease creation to be below the planned gc cutoff. The client (compute_ctl) determines + // whether it is a initial lease creation or a renewal. + if (init || validate) && lsn < planned_cutoff { + bail!( + "tried to request an lsn lease for an lsn below the planned gc cutoff. requested at {} planned gc cutoff {}", + lsn, + planned_cutoff + ); } let dt: DateTime = valid_until.into(); @@ -2068,19 +2120,9 @@ impl Timeline { Err(CompactionError::ShuttingDown) => { // Covered by the `Err(e) if e.is_cancel()` branch. } - Err(CompactionError::AlreadyRunning(_)) => { - // Covered by the `Err(e) if e.is_cancel()` branch. - } Err(CompactionError::Other(_)) => { self.compaction_failed.store(true, AtomicOrdering::Relaxed) } - Err(CompactionError::CollectKeySpaceError(_)) => { - // Cancelled errors are covered by the `Err(e) if e.is_cancel()` branch. 
- self.compaction_failed.store(true, AtomicOrdering::Relaxed) - } - // Don't change the current value on offload failure or shutdown. We don't want to - // abruptly stall nor resume L0 flushes in these cases. - Err(CompactionError::Offload(_)) => {} }; result @@ -3129,7 +3171,7 @@ impl Timeline { basebackup_cache: resources.basebackup_cache, - feature_resolver: resources.feature_resolver, + feature_resolver: resources.feature_resolver.clone(), }; result.repartition_threshold = @@ -4970,7 +5012,7 @@ impl Timeline { ctx, ) .await - .map_err(|e| FlushLayerError::from_anyhow(self, e.into()))?; + .map_err(|e| FlushLayerError::from_anyhow(self, e.into_anyhow()))?; if self.cancel.is_cancelled() { return Err(FlushLayerError::Cancelled); @@ -5220,18 +5262,18 @@ impl Timeline { partition_size: u64, flags: EnumSet, ctx: &RequestContext, - ) -> Result<((KeyPartitioning, SparseKeyPartitioning), Lsn), CompactionError> { + ) -> Result<((KeyPartitioning, SparseKeyPartitioning), Lsn), RepartitionError> { let Ok(mut guard) = self.partitioning.try_write_guard() else { // NB: there are two callers, one is the compaction task, of which there is only one per struct Tenant and hence Timeline. // The other is the initdb optimization in flush_frozen_layer, used by `boostrap_timeline`, which runs before `.activate()` // and hence before the compaction task starts. 
- return Err(CompactionError::Other(anyhow!( + return Err(RepartitionError::Other(anyhow!( "repartition() called concurrently" ))); }; let ((dense_partition, sparse_partition), partition_lsn) = &*guard.read(); if lsn < *partition_lsn { - return Err(CompactionError::Other(anyhow!( + return Err(RepartitionError::Other(anyhow!( "repartition() called with LSN going backwards, this should not happen" ))); } @@ -5252,7 +5294,10 @@ impl Timeline { )); } - let (dense_ks, sparse_ks) = self.collect_keyspace(lsn, ctx).await?; + let (dense_ks, sparse_ks) = self + .collect_keyspace(lsn, ctx) + .await + .map_err(RepartitionError::CollectKeyspace)?; let dense_partitioning = dense_ks.partition( &self.shard_identity, partition_size, @@ -6017,52 +6062,21 @@ impl Drop for Timeline { pub(crate) enum CompactionError { #[error("The timeline or pageserver is shutting down")] ShuttingDown, - /// Compaction tried to offload a timeline and failed - #[error("Failed to offload timeline: {0}")] - Offload(OffloadError), - /// Compaction cannot be done right now; page reconstruction and so on. - #[error("Failed to collect keyspace: {0}")] - CollectKeySpaceError(#[from] CollectKeySpaceError), #[error(transparent)] Other(anyhow::Error), - #[error("Compaction already running: {0}")] - AlreadyRunning(&'static str), } impl CompactionError { /// Errors that can be ignored, i.e., cancel and shutdown. pub fn is_cancel(&self) -> bool { - matches!( - self, + matches!(self, Self::ShuttingDown) + } + + pub fn from_collect_keyspace(err: CollectKeySpaceError) -> Self { + if err.is_cancel() { Self::ShuttingDown - | Self::AlreadyRunning(_) - | Self::CollectKeySpaceError(CollectKeySpaceError::Cancelled) - | Self::CollectKeySpaceError(CollectKeySpaceError::PageRead( - PageReconstructError::Cancelled - )) - | Self::Offload(OffloadError::Cancelled) - ) - } - - /// Critical errors that indicate data corruption. 
- pub fn is_critical(&self) -> bool { - matches!( - self, - Self::CollectKeySpaceError( - CollectKeySpaceError::Decode(_) - | CollectKeySpaceError::PageRead( - PageReconstructError::MissingKey(_) | PageReconstructError::WalRedo(_), - ) - ) - ) - } -} - -impl From for CompactionError { - fn from(e: OffloadError) -> Self { - match e { - OffloadError::Cancelled => Self::ShuttingDown, - _ => Self::Offload(e), + } else { + Self::Other(err.into_anyhow()) } } } @@ -6765,7 +6779,7 @@ impl Timeline { } /// Reconstruct a value, using the given base image and WAL records in 'data'. - async fn reconstruct_value( + pub(crate) async fn reconstruct_value( &self, key: Key, request_lsn: Lsn, diff --git a/pageserver/src/tenant/timeline/compaction.rs b/pageserver/src/tenant/timeline/compaction.rs index ac3930fb71..c263df1eb2 100644 --- a/pageserver/src/tenant/timeline/compaction.rs +++ b/pageserver/src/tenant/timeline/compaction.rs @@ -16,7 +16,8 @@ use super::{ Timeline, }; -use crate::tenant::timeline::DeltaEntry; +use crate::pgdatadir_mapping::CollectKeySpaceError; +use crate::tenant::timeline::{DeltaEntry, RepartitionError}; use crate::walredo::RedoAttemptType; use anyhow::{Context, anyhow}; use bytes::Bytes; @@ -64,7 +65,7 @@ use crate::tenant::timeline::{ DeltaLayerWriter, ImageLayerCreationOutcome, ImageLayerWriter, IoConcurrency, Layer, ResidentLayer, drop_layer_manager_rlock, }; -use crate::tenant::{DeltaLayer, MaybeOffloaded}; +use crate::tenant::{DeltaLayer, MaybeOffloaded, PageReconstructError}; use crate::virtual_file::{MaybeFatalIo, VirtualFile}; /// Maximum number of deltas before generating an image layer in bottom-most compaction. 
@@ -572,7 +573,7 @@ impl GcCompactionQueue { match res { Ok(res) => Ok(res), Err(CompactionError::ShuttingDown) => Err(CompactionError::ShuttingDown), - Err(_) => { + Err(CompactionError::Other(_)) => { // There are some cases where traditional gc might collect some layer // files causing gc-compaction cannot read the full history of the key. // This needs to be resolved in the long-term by improving the compaction @@ -591,9 +592,9 @@ impl GcCompactionQueue { timeline: &Arc, ) -> Result { let Ok(_one_op_at_a_time_guard) = self.consumer_lock.try_lock() else { - return Err(CompactionError::AlreadyRunning( - "cannot run gc-compaction because another gc-compaction is running. This should not happen because we only call this function from the gc-compaction queue.", - )); + return Err(CompactionError::Other(anyhow::anyhow!( + "cannot run gc-compaction because another gc-compaction is running. This should not happen because we only call this function from the gc-compaction queue." + ))); }; let has_pending_tasks; let mut yield_for_l0 = false; @@ -1417,22 +1418,33 @@ impl Timeline { } // Suppress errors when cancelled. - Err(_) if self.cancel.is_cancelled() => {} + // + // Log other errors but continue. Failure to repartition is normal, if the timeline was just created + // as an empty timeline. Also in unit tests, when we use the timeline as a simple + // key-value store, ignoring the datadir layout. Log the error but continue. + // + // TODO: + // 1. shouldn't we return early here if we observe cancellation + // 2. Experiment: can we stop checking self.cancel here? + Err(_) if self.cancel.is_cancelled() => {} // TODO: try how we fare removing this branch Err(err) if err.is_cancel() => {} - - // Alert on critical errors that indicate data corruption. 
- Err(err) if err.is_critical() => { + Err(RepartitionError::CollectKeyspace( + e @ CollectKeySpaceError::Decode(_) + | e @ CollectKeySpaceError::PageRead( + PageReconstructError::MissingKey(_) | PageReconstructError::WalRedo(_), + ), + )) => { + // Alert on critical errors that indicate data corruption. critical_timeline!( self.tenant_shard_id, self.timeline_id, - "could not compact, repartitioning keyspace failed: {err:?}" + "could not compact, repartitioning keyspace failed: {e:?}" ); } - - // Log other errors. No partitioning? This is normal, if the timeline was just created - // as an empty timeline. Also in unit tests, when we use the timeline as a simple - // key-value store, ignoring the datadir layout. Log the error but continue. - Err(err) => error!("could not compact, repartitioning keyspace failed: {err:?}"), + Err(e) => error!( + "could not compact, repartitioning keyspace failed: {:?}", + e.into_anyhow() + ), }; let partition_count = self.partitioning.read().0.0.parts.len(); @@ -2518,7 +2530,10 @@ impl Timeline { return Err(CompactionError::ShuttingDown); } - let (dense_ks, _sparse_ks) = self.collect_keyspace(end_lsn, ctx).await?; + let (dense_ks, _sparse_ks) = self + .collect_keyspace(end_lsn, ctx) + .await + .map_err(CompactionError::from_collect_keyspace)?; // TODO(chi): ignore sparse_keyspace for now, compact it in the future. let mut adaptor = TimelineAdaptor::new(self, (end_lsn, dense_ks)); diff --git a/pageserver/src/tenant/timeline/handle.rs b/pageserver/src/tenant/timeline/handle.rs index 2dbff20ab2..33c97287c0 100644 --- a/pageserver/src/tenant/timeline/handle.rs +++ b/pageserver/src/tenant/timeline/handle.rs @@ -212,8 +212,12 @@ //! to the parent shard during a shard split. Eventually, the shard split task will //! shut down the parent => case (1). 
-use std::collections::{HashMap, hash_map}; -use std::sync::{Arc, Mutex, Weak}; +use std::collections::HashMap; +use std::collections::hash_map; +use std::sync::Arc; +use std::sync::Mutex; +use std::sync::Weak; +use std::time::Duration; use pageserver_api::shard::ShardIdentity; use tracing::{instrument, trace}; @@ -333,6 +337,44 @@ enum RoutingResult { } impl Cache { + /* BEGIN_HADRON */ + /// A wrapper of do_get to resolve the tenant shard for a get page request. + #[instrument(level = "trace", skip_all)] + pub(crate) async fn get( + &mut self, + timeline_id: TimelineId, + shard_selector: ShardSelector, + tenant_manager: &T::TenantManager, + ) -> Result, GetError> { + const GET_MAX_RETRIES: usize = 10; + const RETRY_BACKOFF: Duration = Duration::from_millis(100); + let mut attempt = 0; + loop { + attempt += 1; + match self + .do_get(timeline_id, shard_selector, tenant_manager) + .await + { + Ok(handle) => return Ok(handle), + Err(e) => { + // Retry on tenant manager error to handle tenant split more gracefully + if attempt < GET_MAX_RETRIES { + tracing::warn!( + "Fail to resolve tenant shard in attempt {}: {:?}. Retrying...", + attempt, + e + ); + tokio::time::sleep(RETRY_BACKOFF).await; + continue; + } else { + return Err(e); + } + } + } + } + } + /* END_HADRON */ + /// See module-level comment for details. /// /// Does NOT check for the shutdown state of [`Types::Timeline`]. @@ -341,7 +383,7 @@ impl Cache { /// and if so, return an error that causes the page service to /// close the connection. 
#[instrument(level = "trace", skip_all)] - pub(crate) async fn get( + async fn do_get( &mut self, timeline_id: TimelineId, shard_selector: ShardSelector, @@ -879,6 +921,7 @@ mod tests { .await .err() .expect("documented behavior: can't get new handle after shutdown"); + assert_eq!(cache.map.len(), 1, "next access cleans up the cache"); cache diff --git a/pageserver/src/tenant/timeline/offload.rs b/pageserver/src/tenant/timeline/offload.rs index 9464f034c7..e9cf2e9aa7 100644 --- a/pageserver/src/tenant/timeline/offload.rs +++ b/pageserver/src/tenant/timeline/offload.rs @@ -17,8 +17,6 @@ pub(crate) enum OffloadError { Cancelled, #[error("Timeline is not archived")] NotArchived, - #[error(transparent)] - RemoteStorage(anyhow::Error), #[error("Offload or deletion already in progress")] AlreadyInProgress, #[error("Unexpected offload error: {0}")] @@ -29,7 +27,7 @@ impl From for OffloadError { fn from(e: TenantManifestError) -> Self { match e { TenantManifestError::Cancelled => Self::Cancelled, - TenantManifestError::RemoteStorage(e) => Self::RemoteStorage(e), + TenantManifestError::RemoteStorage(e) => Self::Other(e), } } } diff --git a/pageserver/src/tenant/timeline/walreceiver/connection_manager.rs b/pageserver/src/tenant/timeline/walreceiver/connection_manager.rs index 9b151d2449..aba94244a3 100644 --- a/pageserver/src/tenant/timeline/walreceiver/connection_manager.rs +++ b/pageserver/src/tenant/timeline/walreceiver/connection_manager.rs @@ -182,12 +182,19 @@ pub(super) async fn connection_manager_loop_step( } }, + // If we've not received any updates from the broker from a while, are waiting for WAL + // and have no safekeeper connection or connection candidates, then it might be that + // the broker subscription is wedged. Drop the currrent subscription and re-subscribe + // with the goal of unblocking it. _ = broker_reset_interval.tick() => { - if wait_lsn_status.borrow().is_some() { - tracing::warn!("No broker updates received for a while, but waiting for WAL. 
Re-setting stream ...") - } + let awaiting_lsn = wait_lsn_status.borrow().is_some(); + let no_candidates = connection_manager_state.wal_stream_candidates.is_empty(); + let no_connection = connection_manager_state.wal_connection.is_none(); - broker_subscription = subscribe_for_timeline_updates(broker_client, id, cancel).await?; + if awaiting_lsn && no_candidates && no_connection { + tracing::warn!("No broker updates received for a while, but waiting for WAL. Re-setting stream ..."); + broker_subscription = subscribe_for_timeline_updates(broker_client, id, cancel).await?; + } }, new_event = async { diff --git a/pageserver/src/utilization.rs b/pageserver/src/utilization.rs index 29d1a31aaf..ccfad7a391 100644 --- a/pageserver/src/utilization.rs +++ b/pageserver/src/utilization.rs @@ -45,9 +45,10 @@ pub(crate) fn regenerate( let (disk_wanted_bytes, shard_count) = tenant_manager.calculate_utilization()?; // Fetch the fraction of disk space which may be used - let disk_usable_pct = match conf.disk_usage_based_eviction.clone() { - Some(e) => e.max_usage_pct, - None => Percent::new(100).unwrap(), + let disk_usable_pct = if conf.disk_usage_based_eviction.enabled { + conf.disk_usage_based_eviction.max_usage_pct + } else { + Percent::new(100).unwrap() }; // Express a static value for how many shards we may schedule on one node diff --git a/pageserver/src/walredo.rs b/pageserver/src/walredo.rs index b17b5a15f9..c6d3cafe9a 100644 --- a/pageserver/src/walredo.rs +++ b/pageserver/src/walredo.rs @@ -566,22 +566,55 @@ impl PostgresRedoManager { } } +#[cfg(test)] +pub(crate) mod harness { + use super::PostgresRedoManager; + use crate::config::PageServerConf; + use utils::{id::TenantId, shard::TenantShardId}; + + pub struct RedoHarness { + // underscored because unused, except for removal at drop + _repo_dir: camino_tempfile::Utf8TempDir, + pub manager: PostgresRedoManager, + tenant_shard_id: TenantShardId, + } + + impl RedoHarness { + pub fn new() -> anyhow::Result { + 
crate::tenant::harness::setup_logging(); + + let repo_dir = camino_tempfile::tempdir()?; + let conf = PageServerConf::dummy_conf(repo_dir.path().to_path_buf()); + let conf = Box::leak(Box::new(conf)); + let tenant_shard_id = TenantShardId::unsharded(TenantId::generate()); + + let manager = PostgresRedoManager::new(conf, tenant_shard_id); + + Ok(RedoHarness { + _repo_dir: repo_dir, + manager, + tenant_shard_id, + }) + } + pub fn span(&self) -> tracing::Span { + tracing::info_span!("RedoHarness", tenant_id=%self.tenant_shard_id.tenant_id, shard_id=%self.tenant_shard_id.shard_slug()) + } + } +} + #[cfg(test)] mod tests { use std::str::FromStr; use bytes::Bytes; use pageserver_api::key::Key; - use pageserver_api::shard::TenantShardId; use postgres_ffi::PgMajorVersion; use tracing::Instrument; - use utils::id::TenantId; use utils::lsn::Lsn; use wal_decoder::models::record::NeonWalRecord; - use super::PostgresRedoManager; - use crate::config::PageServerConf; use crate::walredo::RedoAttemptType; + use crate::walredo::harness::RedoHarness; #[tokio::test] async fn test_ping() { @@ -692,33 +725,4 @@ mod tests { ) ] } - - struct RedoHarness { - // underscored because unused, except for removal at drop - _repo_dir: camino_tempfile::Utf8TempDir, - manager: PostgresRedoManager, - tenant_shard_id: TenantShardId, - } - - impl RedoHarness { - fn new() -> anyhow::Result { - crate::tenant::harness::setup_logging(); - - let repo_dir = camino_tempfile::tempdir()?; - let conf = PageServerConf::dummy_conf(repo_dir.path().to_path_buf()); - let conf = Box::leak(Box::new(conf)); - let tenant_shard_id = TenantShardId::unsharded(TenantId::generate()); - - let manager = PostgresRedoManager::new(conf, tenant_shard_id); - - Ok(RedoHarness { - _repo_dir: repo_dir, - manager, - tenant_shard_id, - }) - } - fn span(&self) -> tracing::Span { - tracing::info_span!("RedoHarness", tenant_id=%self.tenant_shard_id.tenant_id, shard_id=%self.tenant_shard_id.shard_slug()) - } - } } diff --git 
a/pgxn/neon/communicator.c b/pgxn/neon/communicator.c index 7c84be7d15..bd53855eab 100644 --- a/pgxn/neon/communicator.c +++ b/pgxn/neon/communicator.c @@ -65,6 +65,7 @@ #include "port/pg_iovec.h" #include "postmaster/interrupt.h" #include "replication/walsender.h" +#include "storage/ipc.h" #include "utils/timeout.h" #include "bitmap.h" @@ -412,6 +413,47 @@ compact_prefetch_buffers(void) return false; } +/* + * Check that prefetch response matches the slot + */ +static void +check_getpage_response(PrefetchRequest* slot, NeonResponse* resp) +{ + if (resp->tag != T_NeonGetPageResponse && resp->tag != T_NeonErrorResponse) + { + neon_shard_log(slot->shard_no, PANIC, "Unexpected prefetch response %d, ring_receive=%ld, ring_flush=%ld, ring_unused=%ld", + resp->tag, MyPState->ring_receive, MyPState->ring_flush, MyPState->ring_unused); + } + if (neon_protocol_version >= 3) + { + NRelFileInfo rinfo = BufTagGetNRelFileInfo(slot->buftag); + if (resp->tag == T_NeonGetPageResponse) + { + NeonGetPageResponse * getpage_resp = (NeonGetPageResponse *)resp; + if (resp->reqid != slot->reqid || + resp->lsn != slot->request_lsns.request_lsn || + resp->not_modified_since != slot->request_lsns.not_modified_since || + !RelFileInfoEquals(getpage_resp->req.rinfo, rinfo) || + getpage_resp->req.forknum != slot->buftag.forkNum || + getpage_resp->req.blkno != slot->buftag.blockNum) + { + NEON_PANIC_CONNECTION_STATE(slot->shard_no, PANIC, + "Receive unexpected getpage response {reqid=%lx,lsn=%X/%08X, since=%X/%08X, rel=%u/%u/%u.%u, block=%u} to get page request {reqid=%lx,lsn=%X/%08X, since=%X/%08X, rel=%u/%u/%u.%u, block=%u}", + resp->reqid, LSN_FORMAT_ARGS(resp->lsn), LSN_FORMAT_ARGS(resp->not_modified_since), RelFileInfoFmt(getpage_resp->req.rinfo), getpage_resp->req.forknum, getpage_resp->req.blkno, + slot->reqid, LSN_FORMAT_ARGS(slot->request_lsns.request_lsn), LSN_FORMAT_ARGS(slot->request_lsns.not_modified_since), RelFileInfoFmt(rinfo), slot->buftag.forkNum, slot->buftag.blockNum); + } + 
} + else if (resp->reqid != slot->reqid || + resp->lsn != slot->request_lsns.request_lsn || + resp->not_modified_since != slot->request_lsns.not_modified_since) + { + elog(WARNING, NEON_TAG "Error message {reqid=%lx,lsn=%X/%08X, since=%X/%08X} doesn't match exists request {reqid=%lx,lsn=%X/%08X, since=%X/%08X}", + resp->reqid, LSN_FORMAT_ARGS(resp->lsn), LSN_FORMAT_ARGS(resp->not_modified_since), + slot->reqid, LSN_FORMAT_ARGS(slot->request_lsns.request_lsn), LSN_FORMAT_ARGS(slot->request_lsns.not_modified_since)); + } + } +} + /* * If there might be responses still in the TCP buffer, then we should try to * use those, to reduce any TCP backpressure on the OS/PS side. @@ -446,15 +488,18 @@ communicator_prefetch_pump_state(void) if (response == NULL) break; + check_getpage_response(slot, response); + /* The slot should still be valid */ if (slot->status != PRFS_REQUESTED || slot->response != NULL || slot->my_ring_index != MyPState->ring_receive) - neon_shard_log(slot->shard_no, ERROR, + { + neon_shard_log(slot->shard_no, PANIC, "Incorrect prefetch slot state after receive: status=%d response=%p my=%lu receive=%lu", slot->status, slot->response, (long) slot->my_ring_index, (long) MyPState->ring_receive); - + } /* update prefetch state */ MyPState->n_responses_buffered += 1; MyPState->n_requests_inflight -= 1; @@ -593,6 +638,21 @@ readahead_buffer_resize(int newsize, void *extra) } +/* + * Callback to be called on backend exit to ensure correct state of compute-PS communication + * in case of backend cancel + */ +static void +prefetch_on_exit(int code, Datum arg) +{ + if (code != 0) /* do disconnect only on abnormal backend termination */ + { + shardno_t shard_no = DatumGetInt32(arg); + prefetch_on_ps_disconnect(); + page_server->disconnect(shard_no); + } +} + /* * Make sure that there are no responses still in the buffer. 
@@ -605,6 +665,11 @@ consume_prefetch_responses(void) { if (MyPState->ring_receive < MyPState->ring_unused) prefetch_wait_for(MyPState->ring_unused - 1); + /* + * We know for sure we're not working on any prefetch pages after + * this. + */ + END_PREFETCH_RECEIVE_WORK(); } static void @@ -722,10 +787,12 @@ prefetch_read(PrefetchRequest *slot) if (slot->status != PRFS_REQUESTED || slot->response != NULL || slot->my_ring_index != MyPState->ring_receive) - neon_shard_log(slot->shard_no, ERROR, + { + neon_shard_log(slot->shard_no, PANIC, "Incorrect prefetch read: status=%d response=%p my=%lu receive=%lu", slot->status, slot->response, (long)slot->my_ring_index, (long)MyPState->ring_receive); + } /* * Copy the request info so that if an error happens and the prefetch @@ -741,14 +808,18 @@ prefetch_read(PrefetchRequest *slot) MemoryContextSwitchTo(old); if (response) { + check_getpage_response(slot, response); + /* The slot should still be valid */ if (slot->status != PRFS_REQUESTED || slot->response != NULL || slot->my_ring_index != MyPState->ring_receive) - neon_shard_log(shard_no, ERROR, + { + neon_shard_log(shard_no, PANIC, "Incorrect prefetch slot state after receive: status=%d response=%p my=%lu receive=%lu", slot->status, slot->response, (long) slot->my_ring_index, (long) MyPState->ring_receive); + } /* update prefetch state */ MyPState->n_responses_buffered += 1; @@ -820,11 +891,10 @@ communicator_prefetch_receive(BufferTag tag) void prefetch_on_ps_disconnect(void) { - bool save_readpage_reentrant_guard = readpage_reentrant_guard; MyPState->ring_flush = MyPState->ring_unused; - /* Prohibit callig of prefetch_pump_state */ - START_PREFETCH_RECEIVE_WORK(); + /* Nothing should cancel disconnect: we should not leave connection in opaque state */ + HOLD_INTERRUPTS(); while (MyPState->ring_receive < MyPState->ring_unused) { @@ -854,9 +924,6 @@ prefetch_on_ps_disconnect(void) MyNeonCounters->getpage_prefetch_discards_total += 1; } - /* Restore guard */ - 
readpage_reentrant_guard = save_readpage_reentrant_guard; - /* * We can have gone into retry due to network error, so update stats with * the latest available @@ -865,6 +932,8 @@ prefetch_on_ps_disconnect(void) MyPState->n_requests_inflight; MyNeonCounters->getpage_prefetches_buffered = MyPState->n_responses_buffered; + + RESUME_INTERRUPTS(); } /* @@ -1027,16 +1096,11 @@ communicator_prefetch_lookupv(NRelFileInfo rinfo, ForkNumber forknum, BlockNumbe /* * Ignore errors */ - if (slot->response->tag != T_NeonGetPageResponse) + if (slot->response->tag == T_NeonErrorResponse) { - if (slot->response->tag != T_NeonErrorResponse) - { - NEON_PANIC_CONNECTION_STATE(slot->shard_no, PANIC, - "Expected GetPage (0x%02x) or Error (0x%02x) response to GetPageRequest, but got 0x%02x", - T_NeonGetPageResponse, T_NeonErrorResponse, slot->response->tag); - } continue; } + Assert(slot->response->tag == T_NeonGetPageResponse); /* checked by check_getpage_response when response was assigned to the slot */ memcpy(buffers[i], ((NeonGetPageResponse*)slot->response)->page, BLCKSZ); @@ -1351,7 +1415,7 @@ equal_requests(NeonRequest* a, NeonRequest* b) static NeonResponse * page_server_request(void const *req) { - NeonResponse *resp; + NeonResponse *resp = NULL; BufferTag tag = {0}; shardno_t shard_no; @@ -1371,7 +1435,7 @@ page_server_request(void const *req) tag.blockNum = ((NeonGetPageRequest *) req)->blkno; break; default: - neon_log(ERROR, "Unexpected request tag: %d", messageTag(req)); + neon_log(PANIC, "Unexpected request tag: %d", messageTag(req)); } shard_no = get_shard_number(&tag); @@ -1384,9 +1448,12 @@ page_server_request(void const *req) shard_no = 0; } - do + consume_prefetch_responses(); + + PG_TRY(); { - PG_TRY(); + before_shmem_exit(prefetch_on_exit, Int32GetDatum(shard_no)); + do { while (!page_server->send(shard_no, (NeonRequest *) req) || !page_server->flush(shard_no)) @@ -1394,30 +1461,24 @@ page_server_request(void const *req) /* do nothing */ } 
MyNeonCounters->pageserver_open_requests++; - consume_prefetch_responses(); resp = page_server->receive(shard_no); MyNeonCounters->pageserver_open_requests--; - } - PG_CATCH(); - { - /* - * Cancellation in this code needs to be handled better at some - * point, but this currently seems fine for now. - */ - page_server->disconnect(shard_no); - MyNeonCounters->pageserver_open_requests = 0; + } while (resp == NULL); + cancel_before_shmem_exit(prefetch_on_exit, Int32GetDatum(shard_no)); + } + PG_CATCH(); + { + cancel_before_shmem_exit(prefetch_on_exit, Int32GetDatum(shard_no)); + /* Nothing should cancel disconnect: we should not leave connection in opaque state */ + HOLD_INTERRUPTS(); + page_server->disconnect(shard_no); + MyNeonCounters->pageserver_open_requests = 0; + RESUME_INTERRUPTS(); - /* - * We know for sure we're not working on any prefetch pages after - * this. - */ - END_PREFETCH_RECEIVE_WORK(); + PG_RE_THROW(); + } + PG_END_TRY(); - PG_RE_THROW(); - } - PG_END_TRY(); - - } while (resp == NULL); return resp; } @@ -1502,7 +1563,7 @@ nm_pack_request(NeonRequest *msg) case T_NeonDbSizeResponse: case T_NeonGetSlruSegmentResponse: default: - neon_log(ERROR, "unexpected neon message tag 0x%02x", msg->tag); + neon_log(PANIC, "unexpected neon message tag 0x%02x", msg->tag); break; } return s; @@ -1654,7 +1715,7 @@ nm_unpack_response(StringInfo s) case T_NeonDbSizeRequest: case T_NeonGetSlruSegmentRequest: default: - neon_log(ERROR, "unexpected neon message tag 0x%02x", tag); + neon_log(PANIC, "unexpected neon message tag 0x%02x", tag); break; } @@ -1983,7 +2044,7 @@ communicator_exists(NRelFileInfo rinfo, ForkNumber forkNum, neon_request_lsns *r !RelFileInfoEquals(exists_resp->req.rinfo, request.rinfo) || exists_resp->req.forknum != request.forknum) { - NEON_PANIC_CONNECTION_STATE(-1, PANIC, + NEON_PANIC_CONNECTION_STATE(0, PANIC, "Unexpect response {reqid=%lx,lsn=%X/%08X, since=%X/%08X, rel=%u/%u/%u.%u} to exits request {reqid=%lx,lsn=%X/%08X, since=%X/%08X, 
rel=%u/%u/%u.%u}", resp->reqid, LSN_FORMAT_ARGS(resp->lsn), LSN_FORMAT_ARGS(resp->not_modified_since), RelFileInfoFmt(exists_resp->req.rinfo), exists_resp->req.forknum, request.hdr.reqid, LSN_FORMAT_ARGS(request.hdr.lsn), LSN_FORMAT_ARGS(request.hdr.not_modified_since), RelFileInfoFmt(request.rinfo), request.forknum); @@ -2014,7 +2075,7 @@ communicator_exists(NRelFileInfo rinfo, ForkNumber forkNum, neon_request_lsns *r break; default: - NEON_PANIC_CONNECTION_STATE(-1, PANIC, + NEON_PANIC_CONNECTION_STATE(0, PANIC, "Expected Exists (0x%02x) or Error (0x%02x) response to ExistsRequest, but got 0x%02x", T_NeonExistsResponse, T_NeonErrorResponse, resp->tag); } @@ -2158,6 +2219,7 @@ Retry: Assert(memcmp(&hashkey.buftag, &slot->buftag, sizeof(BufferTag)) == 0); Assert(hashkey.buftag.blockNum == base_blockno + i); + /* We already checked that response match request when storing it in slot */ resp = slot->response; switch (resp->tag) @@ -2165,21 +2227,6 @@ Retry: case T_NeonGetPageResponse: { NeonGetPageResponse* getpage_resp = (NeonGetPageResponse *) resp; - if (neon_protocol_version >= 3) - { - if (resp->reqid != slot->reqid || - resp->lsn != slot->request_lsns.request_lsn || - resp->not_modified_since != slot->request_lsns.not_modified_since || - !RelFileInfoEquals(getpage_resp->req.rinfo, rinfo) || - getpage_resp->req.forknum != forkNum || - getpage_resp->req.blkno != base_blockno + i) - { - NEON_PANIC_CONNECTION_STATE(-1, PANIC, - "Unexpect response {reqid=%lx,lsn=%X/%08X, since=%X/%08X, rel=%u/%u/%u.%u, block=%u} to get page request {reqid=%lx,lsn=%X/%08X, since=%X/%08X, rel=%u/%u/%u.%u, block=%u}", - resp->reqid, LSN_FORMAT_ARGS(resp->lsn), LSN_FORMAT_ARGS(resp->not_modified_since), RelFileInfoFmt(getpage_resp->req.rinfo), getpage_resp->req.forknum, getpage_resp->req.blkno, - slot->reqid, LSN_FORMAT_ARGS(slot->request_lsns.request_lsn), LSN_FORMAT_ARGS(slot->request_lsns.not_modified_since), RelFileInfoFmt(rinfo), forkNum, base_blockno + i); - } - } memcpy(buffer, 
getpage_resp->page, BLCKSZ); /* @@ -2192,17 +2239,6 @@ Retry: break; } case T_NeonErrorResponse: - if (neon_protocol_version >= 3) - { - if (resp->reqid != slot->reqid || - resp->lsn != slot->request_lsns.request_lsn || - resp->not_modified_since != slot->request_lsns.not_modified_since) - { - elog(WARNING, NEON_TAG "Error message {reqid=%lx,lsn=%X/%08X, since=%X/%08X} doesn't match get relsize request {reqid=%lx,lsn=%X/%08X, since=%X/%08X}", - resp->reqid, LSN_FORMAT_ARGS(resp->lsn), LSN_FORMAT_ARGS(resp->not_modified_since), - slot->reqid, LSN_FORMAT_ARGS(slot->request_lsns.request_lsn), LSN_FORMAT_ARGS(slot->request_lsns.not_modified_since)); - } - } ereport(ERROR, (errcode(ERRCODE_IO_ERROR), errmsg(NEON_TAG "[shard %d, reqid %lx] could not read block %u in rel %u/%u/%u.%u from page server at lsn %X/%08X", @@ -2257,7 +2293,7 @@ communicator_nblocks(NRelFileInfo rinfo, ForkNumber forknum, neon_request_lsns * !RelFileInfoEquals(relsize_resp->req.rinfo, request.rinfo) || relsize_resp->req.forknum != forknum) { - NEON_PANIC_CONNECTION_STATE(-1, PANIC, + NEON_PANIC_CONNECTION_STATE(0, PANIC, "Unexpect response {reqid=%lx,lsn=%X/%08X, since=%X/%08X, rel=%u/%u/%u.%u} to get relsize request {reqid=%lx,lsn=%X/%08X, since=%X/%08X, rel=%u/%u/%u.%u}", resp->reqid, LSN_FORMAT_ARGS(resp->lsn), LSN_FORMAT_ARGS(resp->not_modified_since), RelFileInfoFmt(relsize_resp->req.rinfo), relsize_resp->req.forknum, request.hdr.reqid, LSN_FORMAT_ARGS(request.hdr.lsn), LSN_FORMAT_ARGS(request.hdr.not_modified_since), RelFileInfoFmt(request.rinfo), forknum); @@ -2288,7 +2324,7 @@ communicator_nblocks(NRelFileInfo rinfo, ForkNumber forknum, neon_request_lsns * break; default: - NEON_PANIC_CONNECTION_STATE(-1, PANIC, + NEON_PANIC_CONNECTION_STATE(0, PANIC, "Expected Nblocks (0x%02x) or Error (0x%02x) response to NblocksRequest, but got 0x%02x", T_NeonNblocksResponse, T_NeonErrorResponse, resp->tag); } @@ -2327,7 +2363,7 @@ communicator_dbsize(Oid dbNode, neon_request_lsns *request_lsns) if 
(!equal_requests(resp, &request.hdr) || dbsize_resp->req.dbNode != dbNode) { - NEON_PANIC_CONNECTION_STATE(-1, PANIC, + NEON_PANIC_CONNECTION_STATE(0, PANIC, "Unexpect response {reqid=%lx,lsn=%X/%08X, since=%X/%08X, dbNode=%u} to get DB size request {reqid=%lx,lsn=%X/%08X, since=%X/%08X, dbNode=%u}", resp->reqid, LSN_FORMAT_ARGS(resp->lsn), LSN_FORMAT_ARGS(resp->not_modified_since), dbsize_resp->req.dbNode, request.hdr.reqid, LSN_FORMAT_ARGS(request.hdr.lsn), LSN_FORMAT_ARGS(request.hdr.not_modified_since), dbNode); @@ -2356,7 +2392,7 @@ communicator_dbsize(Oid dbNode, neon_request_lsns *request_lsns) break; default: - NEON_PANIC_CONNECTION_STATE(-1, PANIC, + NEON_PANIC_CONNECTION_STATE(0, PANIC, "Expected DbSize (0x%02x) or Error (0x%02x) response to DbSizeRequest, but got 0x%02x", T_NeonDbSizeResponse, T_NeonErrorResponse, resp->tag); } @@ -2372,7 +2408,7 @@ communicator_read_slru_segment(SlruKind kind, int64 segno, neon_request_lsns *re { int n_blocks; shardno_t shard_no = 0; /* All SLRUs are at shard 0 */ - NeonResponse *resp; + NeonResponse *resp = NULL; NeonGetSlruSegmentRequest request; request = (NeonGetSlruSegmentRequest) { @@ -2383,14 +2419,29 @@ communicator_read_slru_segment(SlruKind kind, int64 segno, neon_request_lsns *re .segno = segno }; - do + consume_prefetch_responses(); + + PG_TRY(); { - while (!page_server->send(shard_no, &request.hdr) || !page_server->flush(shard_no)); + before_shmem_exit(prefetch_on_exit, Int32GetDatum(shard_no)); + do + { + while (!page_server->send(shard_no, &request.hdr) || !page_server->flush(shard_no)); + resp = page_server->receive(shard_no); + } while (resp == NULL); + cancel_before_shmem_exit(prefetch_on_exit, Int32GetDatum(shard_no)); + } + PG_CATCH(); + { + cancel_before_shmem_exit(prefetch_on_exit, Int32GetDatum(shard_no)); + /* Nothing should cancel disconnect: we should not leave connection in opaque state */ + HOLD_INTERRUPTS(); + page_server->disconnect(shard_no); + RESUME_INTERRUPTS(); - 
consume_prefetch_responses(); - - resp = page_server->receive(shard_no); - } while (resp == NULL); + PG_RE_THROW(); + } + PG_END_TRY(); switch (resp->tag) { @@ -2403,7 +2454,7 @@ communicator_read_slru_segment(SlruKind kind, int64 segno, neon_request_lsns *re slru_resp->req.kind != kind || slru_resp->req.segno != segno) { - NEON_PANIC_CONNECTION_STATE(-1, PANIC, + NEON_PANIC_CONNECTION_STATE(0, PANIC, "Unexpect response {reqid=%lx,lsn=%X/%08X, since=%X/%08X, kind=%u, segno=%u} to get SLRU segment request {reqid=%lx,lsn=%X/%08X, since=%X/%08X, kind=%u, segno=%lluu}", resp->reqid, LSN_FORMAT_ARGS(resp->lsn), LSN_FORMAT_ARGS(resp->not_modified_since), slru_resp->req.kind, slru_resp->req.segno, request.hdr.reqid, LSN_FORMAT_ARGS(request.hdr.lsn), LSN_FORMAT_ARGS(request.hdr.not_modified_since), kind, (unsigned long long) segno); @@ -2435,7 +2486,7 @@ communicator_read_slru_segment(SlruKind kind, int64 segno, neon_request_lsns *re break; default: - NEON_PANIC_CONNECTION_STATE(-1, PANIC, + NEON_PANIC_CONNECTION_STATE(0, PANIC, "Expected GetSlruSegment (0x%02x) or Error (0x%02x) response to GetSlruSegmentRequest, but got 0x%02x", T_NeonGetSlruSegmentResponse, T_NeonErrorResponse, resp->tag); } diff --git a/proxy/src/metrics.rs b/proxy/src/metrics.rs index 7a21e4ecee..9d1a3d4358 100644 --- a/proxy/src/metrics.rs +++ b/proxy/src/metrics.rs @@ -10,7 +10,7 @@ use measured::{ Counter, CounterVec, FixedCardinalityLabel, Gauge, Histogram, HistogramVec, LabelGroup, MetricGroup, }; -use metrics::{CounterPairAssoc, CounterPairVec, HyperLogLog, HyperLogLogVec}; +use metrics::{CounterPairAssoc, CounterPairVec, HyperLogLogVec}; use tokio::time::{self, Instant}; use crate::control_plane::messages::ColdStartInfo; @@ -36,7 +36,6 @@ impl Metrics { metrics.proxy.redis_errors_total.init_all_dense(); metrics.proxy.redis_events_count.init_all_dense(); metrics.proxy.retries_metric.init_all_dense(); - metrics.proxy.invalid_endpoints_total.init_all_dense(); 
metrics.proxy.connection_failures_total.init_all_dense(); SELF.set(metrics) @@ -80,11 +79,6 @@ pub struct ProxyMetrics { )] pub console_request_latency: HistogramVec, - /// Time it takes to acquire a token to call console plane. - // largest bucket = 3^16 * 0.05ms = 2.15s - #[metric(metadata = Thresholds::exponential_buckets(0.00005, 3.0))] - pub control_plane_token_acquire_seconds: Histogram<16>, - /// Size of the HTTP request body lengths. // smallest bucket = 16 bytes // largest bucket = 4^12 * 16 bytes = 256MB @@ -98,19 +92,10 @@ pub struct ProxyMetrics { /// Number of opened connections to a database. pub http_pool_opened_connections: Gauge, - /// Number of cache hits/misses for allowed ips. - pub allowed_ips_cache_misses: CounterVec>, - /// Number of allowed ips #[metric(metadata = Thresholds::with_buckets([0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 10.0, 20.0, 50.0, 100.0]))] pub allowed_ips_number: Histogram<10>, - /// Number of cache hits/misses for VPC endpoint IDs. - pub vpc_endpoint_id_cache_stats: CounterVec>, - - /// Number of cache hits/misses for access blocker flags. - pub access_blocker_flags_cache_stats: CounterVec>, - /// Number of allowed VPC endpoints IDs #[metric(metadata = Thresholds::with_buckets([0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 10.0, 20.0, 50.0, 100.0]))] pub allowed_vpc_endpoint_ids: Histogram<10>, @@ -139,21 +124,12 @@ pub struct ProxyMetrics { /// Number of TLS handshake failures pub tls_handshake_failures: Counter, - /// Number of connection requests affected by authentication rate limits - pub requests_auth_rate_limits_total: Counter, - /// HLL approximate cardinality of endpoints that are connecting pub connecting_endpoints: HyperLogLogVec, 32>, /// Number of endpoints affected by errors of a given classification pub endpoints_affected_by_errors: HyperLogLogVec, 32>, - /// Number of endpoints affected by authentication rate limits - pub endpoints_auth_rate_limits: HyperLogLog<32>, - - /// Number of invalid endpoints (per protocol, per rejected). 
- pub invalid_endpoints_total: CounterVec, - /// Number of retries (per outcome, per retry_type). #[metric(metadata = Thresholds::with_buckets([0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0]))] pub retries_metric: HistogramVec, diff --git a/safekeeper/src/send_interpreted_wal.rs b/safekeeper/src/send_interpreted_wal.rs index 3797ac39d1..72a436e25f 100644 --- a/safekeeper/src/send_interpreted_wal.rs +++ b/safekeeper/src/send_interpreted_wal.rs @@ -561,6 +561,20 @@ impl InterpretedWalReader { // Update internal and external state, then reset the WAL stream // if required. let senders = self.shard_senders.entry(shard_id).or_default(); + + // Clean up any shard senders that have dropped out before adding the new + // one. This avoids a build up of dead senders. + senders.retain(|sender| { + let closed = sender.tx.is_closed(); + + if closed { + let sender_id = ShardSenderId::new(shard_id, sender.sender_id); + tracing::info!("Removed shard sender {}", sender_id); + } + + !closed + }); + let new_sender_id = match senders.last() { Some(sender) => sender.sender_id.next(), None => SenderId::first() diff --git a/storage_controller/src/service.rs b/storage_controller/src/service.rs index 9360225396..ed6643d641 100644 --- a/storage_controller/src/service.rs +++ b/storage_controller/src/service.rs @@ -1677,7 +1677,21 @@ impl Service { .collect::>>()?; let safekeepers: HashMap = safekeepers.into_iter().map(|n| (n.get_id(), n)).collect(); - tracing::info!("Loaded {} safekeepers from database.", safekeepers.len()); + let count_policy = |policy| { + safekeepers + .iter() + .filter(|sk| sk.1.scheduling_policy() == policy) + .count() + }; + let active_sk_count = count_policy(SkSchedulingPolicy::Active); + let activating_sk_count = count_policy(SkSchedulingPolicy::Activating); + let pause_sk_count = count_policy(SkSchedulingPolicy::Pause); + let decom_sk_count = count_policy(SkSchedulingPolicy::Decomissioned); + tracing::info!( + "Loaded {} safekeepers from database. 
Active {active_sk_count}, activating {activating_sk_count}, \ paused {pause_sk_count}, decomissioned {decom_sk_count}.", safekeepers.len() + ); metrics::METRICS_REGISTRY .metrics_group .storage_controller_safekeeper_nodes @@ -1969,6 +1983,14 @@ impl Service { } }); + // Check that there are enough safekeepers configured so that we can create new timelines + let test_sk_res = this.safekeepers_for_new_timeline().await; + tracing::info!( + timeline_safekeeper_count = config.timeline_safekeeper_count, + timelines_onto_safekeepers = config.timelines_onto_safekeepers, + "viability test result (test timeline creation on safekeepers): {test_sk_res:?}", + ); + Ok(this) } @@ -7208,6 +7230,12 @@ impl Service { let mut locked = self.inner.write().unwrap(); let (nodes, tenants, scheduler) = locked.parts_mut(); + // Calculate a schedule context here to avoid borrow checker issues. + let mut schedule_context = ScheduleContext::default(); + for (_, shard) in tenants.range(TenantShardId::tenant_range(tid.tenant_id)) { + schedule_context.avoid(&shard.intent.all_pageservers()); + } + let tenant_shard = match tenants.get_mut(&tid) { Some(tenant_shard) => tenant_shard, None => { @@ -7233,9 +7261,6 @@ impl Service { } if tenant_shard.deref_node(node_id) { - // TODO(ephemeralsad): we should process all shards in a tenant at once, so - // we can avoid settling the tenant unevenly. 
- let mut schedule_context = ScheduleContext::new(ScheduleMode::Normal); if let Err(e) = tenant_shard.schedule(scheduler, &mut schedule_context) { tracing::error!( "Refusing to delete node, shard {} can't be rescheduled: {e}", diff --git a/test_runner/fixtures/endpoint/http.py b/test_runner/fixtures/endpoint/http.py index 294c52321b..1d278095ce 100644 --- a/test_runner/fixtures/endpoint/http.py +++ b/test_runner/fixtures/endpoint/http.py @@ -2,11 +2,12 @@ from __future__ import annotations import urllib.parse from enum import StrEnum -from typing import TYPE_CHECKING, final +from typing import TYPE_CHECKING, Any, final import requests from requests.adapters import HTTPAdapter from requests.auth import AuthBase +from requests.exceptions import ReadTimeout from typing_extensions import override from fixtures.log_helper import log @@ -102,6 +103,18 @@ class EndpointHttpClient(requests.Session): wait_until(offloaded) + def promote(self, safekeepers_lsn: dict[str, Any], disconnect: bool = False): + url = f"http://localhost:{self.external_port}/promote" + if disconnect: + try: # send first request to start promote and disconnect + self.post(url, data=safekeepers_lsn, timeout=0.001) + except ReadTimeout: + pass # wait on second request which returns on promotion finish + res = self.post(url, data=safekeepers_lsn) + res.raise_for_status() + json: dict[str, str] = res.json() + return json + def database_schema(self, database: str): res = self.get( f"http://localhost:{self.external_port}/database_schema?database={urllib.parse.quote(database, safe='')}", diff --git a/test_runner/fixtures/metrics.py b/test_runner/fixtures/metrics.py index 1dd4fe8316..6e600b5a86 100644 --- a/test_runner/fixtures/metrics.py +++ b/test_runner/fixtures/metrics.py @@ -159,6 +159,9 @@ PAGESERVER_GLOBAL_METRICS: tuple[str, ...] = ( ) PAGESERVER_PER_TENANT_METRICS: tuple[str, ...] 
= ( + # BEGIN_HADRON + "pageserver_active_storage_operations_count", + # END_HADRON "pageserver_current_logical_size", "pageserver_resident_physical_size", "pageserver_io_operations_bytes_total", diff --git a/test_runner/fixtures/pageserver/allowed_errors.py b/test_runner/fixtures/pageserver/allowed_errors.py index 6a715c4b93..0e4dd571c0 100755 --- a/test_runner/fixtures/pageserver/allowed_errors.py +++ b/test_runner/fixtures/pageserver/allowed_errors.py @@ -111,6 +111,14 @@ DEFAULT_PAGESERVER_ALLOWED_ERRORS = ( ".*stalling layer flushes for compaction backpressure.*", ".*layer roll waiting for flush due to compaction backpressure.*", ".*BatchSpanProcessor.*", + # Can happen in tests that purposely wipe pageserver "local disk" data. + ".*Local data loss suspected.*", + # Too many frozen layers error is normal during intensive benchmarks + ".*too many frozen layers.*", + # Transient errors when resolving tenant shards by page service + ".*Fail to resolve tenant shard in attempt.*", + # Expected warnings when pageserver has not refreshed GC info yet + ".*pitr LSN/interval not found, skipping force image creation LSN calculation.*", ".*No broker updates received for a while.*", *( [ diff --git a/test_runner/fixtures/pageserver/http.py b/test_runner/fixtures/pageserver/http.py index d9037f2d08..79cfba8da6 100644 --- a/test_runner/fixtures/pageserver/http.py +++ b/test_runner/fixtures/pageserver/http.py @@ -1247,3 +1247,10 @@ class PageserverHttpClient(requests.Session, MetricsGetter): ) self.verbose_error(res) return res.json() + + def force_refresh_feature_flag(self, tenant_id: TenantId | TenantShardId): + res = self.post( + f"http://localhost:{self.port}/v1/tenant/{tenant_id}/force_refresh_feature_flag", + ) + self.verbose_error(res) + return res.json() diff --git a/test_runner/performance/pageserver/pagebench/test_pageserver_max_throughput_getpage_at_latest_lsn.py b/test_runner/performance/pageserver/pagebench/test_pageserver_max_throughput_getpage_at_latest_lsn.py 
index bf998a2a0a..8e7055ef78 100644 --- a/test_runner/performance/pageserver/pagebench/test_pageserver_max_throughput_getpage_at_latest_lsn.py +++ b/test_runner/performance/pageserver/pagebench/test_pageserver_max_throughput_getpage_at_latest_lsn.py @@ -71,7 +71,13 @@ def test_pageserver_characterize_latencies_with_1_client_and_throughput_with_man n_clients: int, ): setup_and_run_pagebench_benchmark( - neon_env_builder, zenbenchmark, pg_bin, n_tenants, pgbench_scale, duration, n_clients + neon_env_builder, + zenbenchmark, + pg_bin, + n_tenants, + pgbench_scale, + duration, + n_clients, ) @@ -86,7 +92,8 @@ def setup_and_run_pagebench_benchmark( ): def record(metric, **kwargs): zenbenchmark.record( - metric_name=f"pageserver_max_throughput_getpage_at_latest_lsn.{metric}", **kwargs + metric_name=f"pageserver_max_throughput_getpage_at_latest_lsn.{metric}", + **kwargs, ) params: dict[str, tuple[Any, dict[str, Any]]] = {} @@ -104,7 +111,7 @@ def setup_and_run_pagebench_benchmark( # configure cache sizes like in prod page_cache_size = 16384 max_file_descriptors = 500000 - neon_env_builder.pageserver_config_override = f"page_cache_size={page_cache_size}; max_file_descriptors={max_file_descriptors}; disk_usage_based_eviction={{max_usage_pct=99, min_avail_bytes=0, period = '999y'}}" + neon_env_builder.pageserver_config_override = f"page_cache_size={page_cache_size}; max_file_descriptors={max_file_descriptors}; disk_usage_based_eviction={{enabled = false}}" tracing_config = PageserverTracingConfig( sampling_ratio=(0, 1000), @@ -120,7 +127,10 @@ def setup_and_run_pagebench_benchmark( page_cache_size * 8192, {"unit": "byte"}, ), - "pageserver_config_override.max_file_descriptors": (max_file_descriptors, {"unit": ""}), + "pageserver_config_override.max_file_descriptors": ( + max_file_descriptors, + {"unit": ""}, + ), "pageserver_config_override.sampling_ratio": (ratio, {"unit": ""}), } ) diff --git a/test_runner/regress/test_feature_flag.py 
b/test_runner/regress/test_feature_flag.py index 2712d13dcc..c6c192b6f1 100644 --- a/test_runner/regress/test_feature_flag.py +++ b/test_runner/regress/test_feature_flag.py @@ -49,3 +49,12 @@ def test_feature_flag(neon_env_builder: NeonEnvBuilder): env.initial_tenant, "test-feature-flag" )["result"] ) + + env.pageserver.http_client().force_refresh_feature_flag(env.initial_tenant) + + # Check if the properties exist + result = env.pageserver.http_client().evaluate_feature_flag_multivariate( + env.initial_tenant, "test-feature-flag" + ) + assert "tenant_remote_size_mb" in result["properties"] + assert "tenant_id" in result["properties"] diff --git a/test_runner/regress/test_replica_promotes.py b/test_runner/regress/test_replica_promotes.py index 4486901bae..1f26269f40 100644 --- a/test_runner/regress/test_replica_promotes.py +++ b/test_runner/regress/test_replica_promotes.py @@ -1,29 +1,51 @@ """ -File with secondary->primary promotion testing. - -This far, only contains a test that we don't break and that the data is persisted. 
+Secondary -> primary promotion testing """ +from enum import StrEnum from typing import cast import psycopg2 +import pytest from fixtures.common_types import Lsn from fixtures.log_helper import log from fixtures.neon_fixtures import Endpoint, NeonEnv, wait_replica_caughtup -from fixtures.pg_version import PgVersion +from fixtures.utils import USE_LFC +from psycopg2.extensions import cursor as Cursor from pytest import raises def stop_and_check_lsn(ep: Endpoint, expected_lsn: Lsn | None): ep.stop(mode="immediate-terminate") lsn = ep.terminate_flush_lsn - if expected_lsn is not None: + assert (lsn is not None) == (expected_lsn is not None), f"{lsn=}, {expected_lsn=}" + if lsn is not None: assert lsn >= expected_lsn, f"{expected_lsn=} < {lsn=}" - else: - assert lsn == expected_lsn, f"{expected_lsn=} != {lsn=}" -def test_replica_promotes(neon_simple_env: NeonEnv, pg_version: PgVersion): +def get_lsn_triple(cur: Cursor) -> tuple[str, str, str]: + cur.execute( + """ + SELECT pg_current_wal_insert_lsn(), + pg_current_wal_lsn(), + pg_current_wal_flush_lsn() + """ + ) + return cast("tuple[str, str, str]", cur.fetchone()) + + +class PromoteMethod(StrEnum): + COMPUTE_CTL = "compute-ctl" + POSTGRES = "postgres" + + +METHOD_OPTIONS = [e for e in PromoteMethod] +METHOD_IDS = [e.value for e in PromoteMethod] + + +@pytest.mark.skipif(not USE_LFC, reason="LFC is disabled, skipping") +@pytest.mark.parametrize("method", METHOD_OPTIONS, ids=METHOD_IDS) +def test_replica_promote(neon_simple_env: NeonEnv, method: PromoteMethod): """ Test that a replica safely promotes, and can commit data updates which show up when the primary boots up after the promoted secondary endpoint @@ -38,29 +60,26 @@ def test_replica_promotes(neon_simple_env: NeonEnv, pg_version: PgVersion): with primary.connect() as primary_conn: primary_cur = primary_conn.cursor() + primary_cur.execute("create extension neon") primary_cur.execute( "create table t(pk bigint GENERATED ALWAYS AS IDENTITY, payload integer)" ) 
primary_cur.execute("INSERT INTO t(payload) SELECT generate_series(1, 100)") - primary_cur.execute( - """ - SELECT pg_current_wal_insert_lsn(), - pg_current_wal_lsn(), - pg_current_wal_flush_lsn() - """ - ) - lsn_triple = cast("tuple[str, str, str]", primary_cur.fetchone()) + + lsn_triple = get_lsn_triple(primary_cur) log.info(f"Primary: Current LSN after workload is {lsn_triple}") expected_primary_lsn: Lsn = Lsn(lsn_triple[2]) primary_cur.execute("show neon.safekeepers") safekeepers = primary_cur.fetchall()[0][0] - wait_replica_caughtup(primary, secondary) + if method == PromoteMethod.COMPUTE_CTL: + primary.http_client().offload_lfc() + else: + wait_replica_caughtup(primary, secondary) with secondary.connect() as secondary_conn: secondary_cur = secondary_conn.cursor() secondary_cur.execute("select count(*) from t") - assert secondary_cur.fetchone() == (100,) with raises(psycopg2.Error): @@ -71,28 +90,30 @@ def test_replica_promotes(neon_simple_env: NeonEnv, pg_version: PgVersion): secondary_cur.execute("select count(*) from t") assert secondary_cur.fetchone() == (100,) + primary_endpoint_id = primary.endpoint_id stop_and_check_lsn(primary, expected_primary_lsn) # Reconnect to the secondary to make sure we get a read-write connection promo_conn = secondary.connect() promo_cur = promo_conn.cursor() - promo_cur.execute(f"alter system set neon.safekeepers='{safekeepers}'") - promo_cur.execute("select pg_reload_conf()") + if method == PromoteMethod.COMPUTE_CTL: + client = secondary.http_client() + client.prewarm_lfc(primary_endpoint_id) + # control plane knows safekeepers, simulate it by querying primary + assert (lsn := primary.terminate_flush_lsn) + safekeepers_lsn = {"safekeepers": safekeepers, "wal_flush_lsn": lsn} + assert client.promote(safekeepers_lsn)["status"] == "completed" + else: + promo_cur.execute(f"alter system set neon.safekeepers='{safekeepers}'") + promo_cur.execute("select pg_reload_conf()") + promo_cur.execute("SELECT * FROM pg_promote()") + assert 
promo_cur.fetchone() == (True,) - promo_cur.execute("SELECT * FROM pg_promote()") - assert promo_cur.fetchone() == (True,) - promo_cur.execute( - """ - SELECT pg_current_wal_insert_lsn(), - pg_current_wal_lsn(), - pg_current_wal_flush_lsn() - """ - ) - log.info(f"Secondary: LSN after promotion is {promo_cur.fetchone()}") + lsn_triple = get_lsn_triple(promo_cur) + log.info(f"Secondary: LSN after promotion is {lsn_triple}") # Reconnect to the secondary to make sure we get a read-write connection - with secondary.connect() as new_primary_conn: - new_primary_cur = new_primary_conn.cursor() + with secondary.connect() as conn, conn.cursor() as new_primary_cur: new_primary_cur.execute("select count(*) from t") assert new_primary_cur.fetchone() == (100,) @@ -101,43 +122,34 @@ def test_replica_promotes(neon_simple_env: NeonEnv, pg_version: PgVersion): ) assert new_primary_cur.fetchall() == [(it,) for it in range(101, 201)] - new_primary_cur = new_primary_conn.cursor() + new_primary_cur = conn.cursor() new_primary_cur.execute("select payload from t") assert new_primary_cur.fetchall() == [(it,) for it in range(1, 201)] new_primary_cur.execute("select count(*) from t") assert new_primary_cur.fetchone() == (200,) - new_primary_cur.execute( - """ - SELECT pg_current_wal_insert_lsn(), - pg_current_wal_lsn(), - pg_current_wal_flush_lsn() - """ - ) - log.info(f"Secondary: LSN after workload is {new_primary_cur.fetchone()}") - with secondary.connect() as second_viewpoint_conn: - new_primary_cur = second_viewpoint_conn.cursor() + lsn_triple = get_lsn_triple(new_primary_cur) + log.info(f"Secondary: LSN after workload is {lsn_triple}") + expected_promoted_lsn = Lsn(lsn_triple[2]) + + with secondary.connect() as conn, conn.cursor() as new_primary_cur: new_primary_cur.execute("select payload from t") assert new_primary_cur.fetchall() == [(it,) for it in range(1, 201)] - # wait_for_last_flush_lsn(env, secondary, env.initial_tenant, env.initial_timeline) - - # secondaries don't sync 
safekeepers on finish so LSN will be None - stop_and_check_lsn(secondary, None) + if method == PromoteMethod.COMPUTE_CTL: + # compute_ctl's /promote switches replica type to Primary so it syncs + # safekeepers on finish + stop_and_check_lsn(secondary, expected_promoted_lsn) + else: + # on testing postgres, we don't update replica type, secondaries don't + # sync so lsn should be None + stop_and_check_lsn(secondary, None) primary = env.endpoints.create_start(branch_name="main", endpoint_id="primary2") - with primary.connect() as new_primary: - new_primary_cur = new_primary.cursor() - new_primary_cur.execute( - """ - SELECT pg_current_wal_insert_lsn(), - pg_current_wal_lsn(), - pg_current_wal_flush_lsn() - """ - ) - lsn_triple = cast("tuple[str, str, str]", new_primary_cur.fetchone()) + with primary.connect() as new_primary, new_primary.cursor() as new_primary_cur: + lsn_triple = get_lsn_triple(new_primary_cur) expected_primary_lsn = Lsn(lsn_triple[2]) log.info(f"New primary: Boot LSN is {lsn_triple}") @@ -146,5 +158,39 @@ def test_replica_promotes(neon_simple_env: NeonEnv, pg_version: PgVersion): new_primary_cur.execute("INSERT INTO t (payload) SELECT generate_series(201, 300)") new_primary_cur.execute("select count(*) from t") assert new_primary_cur.fetchone() == (300,) - stop_and_check_lsn(primary, expected_primary_lsn) + + +@pytest.mark.skipif(not USE_LFC, reason="LFC is disabled, skipping") +def test_replica_promote_handler_disconnects(neon_simple_env: NeonEnv): + """ + Test that if a handler disconnects from /promote route of compute_ctl, promotion still happens + once, and no error is thrown + """ + env: NeonEnv = neon_simple_env + primary: Endpoint = env.endpoints.create_start(branch_name="main", endpoint_id="primary") + secondary: Endpoint = env.endpoints.new_replica_start(origin=primary, endpoint_id="secondary") + + with primary.connect() as conn, conn.cursor() as cur: + cur.execute("create extension neon") + cur.execute("create table t(pk bigint GENERATED 
ALWAYS AS IDENTITY, payload integer)") + cur.execute("INSERT INTO t(payload) SELECT generate_series(1, 100)") + cur.execute("show neon.safekeepers") + safekeepers = cur.fetchall()[0][0] + + primary.http_client().offload_lfc() + primary_endpoint_id = primary.endpoint_id + primary.stop(mode="immediate-terminate") + assert (lsn := primary.terminate_flush_lsn) + + client = secondary.http_client() + client.prewarm_lfc(primary_endpoint_id) + safekeepers_lsn = {"safekeepers": safekeepers, "wal_flush_lsn": lsn} + assert client.promote(safekeepers_lsn, disconnect=True)["status"] == "completed" + + with secondary.connect() as conn, conn.cursor() as cur: + cur.execute("select count(*) from t") + assert cur.fetchone() == (100,) + cur.execute("INSERT INTO t (payload) SELECT generate_series(101, 200) RETURNING payload") + cur.execute("select count(*) from t") + assert cur.fetchone() == (200,) diff --git a/test_runner/regress/test_sharding.py b/test_runner/regress/test_sharding.py index 93c621f564..8ff767eca4 100644 --- a/test_runner/regress/test_sharding.py +++ b/test_runner/regress/test_sharding.py @@ -1,8 +1,11 @@ from __future__ import annotations import os +import random +import threading import time from collections import defaultdict +from threading import Event from typing import TYPE_CHECKING, Any import pytest @@ -1505,6 +1508,171 @@ def test_sharding_split_failures( env.storage_controller.consistency_check() +@pytest.mark.skip(reason="The backpressure change has not been merged yet.") +def test_back_pressure_during_split(neon_env_builder: NeonEnvBuilder): + """ + Test backpressure can ignore new shards during tenant split so that if we abort the split, + PG can continue without being blocked. 
+ """ + DBNAME = "regression" + + init_shard_count = 4 + neon_env_builder.num_pageservers = init_shard_count + stripe_size = 32 + + env = neon_env_builder.init_start( + initial_tenant_shard_count=init_shard_count, initial_tenant_shard_stripe_size=stripe_size + ) + + env.storage_controller.allowed_errors.extend( + [ + # All split failures log a warning when then enqueue the abort operation + ".*Enqueuing background abort.*", + # Tolerate any error lots that mention a failpoint + ".*failpoint.*", + ] + ) + + endpoint = env.endpoints.create( + "main", + config_lines=[ + "max_replication_write_lag = 1MB", + "databricks.max_wal_mb_per_second = 1", + "neon.max_cluster_size = 10GB", + ], + ) + endpoint.respec(skip_pg_catalog_updates=False) # Needed for databricks_system to get created. + endpoint.start() + + endpoint.safe_psql(f"CREATE DATABASE {DBNAME}") + + endpoint.safe_psql("CREATE TABLE usertable ( YCSB_KEY INT, FIELD0 TEXT);") + write_done = Event() + + def write_data(write_done): + while not write_done.is_set(): + endpoint.safe_psql( + "INSERT INTO usertable SELECT random(), repeat('a', 1000);", log_query=False + ) + log.info("write_data thread exiting") + + writer_thread = threading.Thread(target=write_data, args=(write_done,)) + writer_thread.start() + + env.storage_controller.configure_failpoints(("shard-split-pre-complete", "return(1)")) + # split the tenant + with pytest.raises(StorageControllerApiException): + env.storage_controller.tenant_shard_split(env.initial_tenant, shard_count=16) + + write_done.set() + writer_thread.join() + + # writing more data to page servers after split is aborted + for _i in range(5000): + endpoint.safe_psql( + "INSERT INTO usertable SELECT random(), repeat('a', 1000);", log_query=False + ) + + # wait until write lag becomes 0 + def check_write_lag_is_zero(): + res = endpoint.safe_psql( + """ + SELECT + pg_wal_lsn_diff(pg_current_wal_flush_lsn(), received_lsn) as received_lsn_lag + FROM neon.backpressure_lsns(); + """, + 
dbname="databricks_system", + log_query=False, + ) + log.info(f"received_lsn_lag = {res[0][0]}") + assert res[0][0] == 0 + + wait_until(check_write_lag_is_zero) + endpoint.stop_and_destroy() + + +# BEGIN_HADRON +def test_shard_resolve_during_split_abort(neon_env_builder: NeonEnvBuilder): + """ + Tests that page service is able to resolve the correct shard during tenant split without causing query errors + """ + DBNAME = "regression" + WORKER_THREADS = 16 + ROW_COUNT = 10000 + + init_shard_count = 4 + neon_env_builder.num_pageservers = 1 + stripe_size = 16 + + env = neon_env_builder.init_start( + initial_tenant_shard_count=init_shard_count, initial_tenant_shard_stripe_size=stripe_size + ) + + env.storage_controller.allowed_errors.extend( + [ + # All split failures log a warning when then enqueue the abort operation + ".*Enqueuing background abort.*", + # Tolerate any error lots that mention a failpoint + ".*failpoint.*", + ] + ) + + endpoint = env.endpoints.create("main") + endpoint.respec(skip_pg_catalog_updates=False) # Needed for databricks_system to get created. + endpoint.start() + + endpoint.safe_psql(f"CREATE DATABASE {DBNAME}") + + # generate 10MB of data + endpoint.safe_psql( + f"CREATE TABLE usertable AS SELECT s AS KEY, repeat('a', 1000) as VALUE from generate_series(1, {ROW_COUNT}) s;" + ) + read_done = Event() + + def read_data(read_done): + i = 0 + while not read_done.is_set() or i < 10: + endpoint.safe_psql( + f"SELECT * FROM usertable where KEY = {random.randint(1, ROW_COUNT)}", + log_query=False, + ) + i += 1 + log.info(f"read_data thread exiting. 
Executed {i} queries.") + + reader_threads = [] + for _i in range(WORKER_THREADS): + reader_thread = threading.Thread(target=read_data, args=(read_done,)) + reader_thread.start() + reader_threads.append(reader_thread) + + env.storage_controller.configure_failpoints(("shard-split-pre-complete", "return(1)")) + # split the tenant + with pytest.raises(StorageControllerApiException): + env.storage_controller.tenant_shard_split(env.initial_tenant, shard_count=16) + + # wait until abort is done + def check_tenant_status(): + active_count = 0 + for i in range(init_shard_count): + status = env.pageserver.http_client().tenant_status( + TenantShardId(env.initial_tenant, i, init_shard_count) + ) + if status["state"]["slug"] == "Active": + active_count += 1 + assert active_count == 4 + + wait_until(check_tenant_status) + + read_done.set() + for thread in reader_threads: + thread.join() + + endpoint.stop() + + +# END_HADRON + + def test_sharding_backpressure(neon_env_builder: NeonEnvBuilder): """ Check a scenario when one of the shards is much slower than others. diff --git a/test_runner/regress/test_subscriber_branching.py b/test_runner/regress/test_subscriber_branching.py index 83bebc19be..63772f7cd4 100644 --- a/test_runner/regress/test_subscriber_branching.py +++ b/test_runner/regress/test_subscriber_branching.py @@ -332,7 +332,7 @@ def test_multiple_subscription_branching(neon_simple_env: NeonEnv): last_insert_lsn = query_scalar(cursor, "select pg_current_wal_insert_lsn();") - def start_publisher_workload(table_num: int, duration: int): + def start_publisher_workload(i: int, duration: int): start = time.time() with endpoint.cursor(dbname="publisher_db") as cur: while time.time() - start < duration: