Mirror of https://github.com/neondatabase/neon.git
Merge branch 'main' into amasterov/random-ops-add

# Conflicts:
#	test_runner/random_ops/test_random_ops.py
18 .github/workflows/build_and_test.yml (vendored)
@@ -87,6 +87,24 @@ jobs:
    uses: ./.github/workflows/build-build-tools-image.yml
    secrets: inherit

  lint-openapi-spec:
    runs-on: ubuntu-22.04
    needs: [ meta, check-permissions ]
    # We do need to run this in `.*-rc-pr` because of hotfixes.
    if: ${{ contains(fromJSON('["pr", "push-main", "storage-rc-pr", "proxy-rc-pr", "compute-rc-pr"]'), needs.meta.outputs.run-kind) }}
    steps:
      - name: Harden the runner (Audit all outbound calls)
        uses: step-security/harden-runner@4d991eb9b905ef189e4c376166672c3f2f230481 # v2.11.0
        with:
          egress-policy: audit
      - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
      - uses: docker/login-action@74a5d142397b4f367a81961eba4e8cd7edddf772 # v3.4.0
        with:
          registry: ghcr.io
          username: ${{ github.actor }}
          password: ${{ secrets.GITHUB_TOKEN }}
      - run: make lint-openapi-spec

  check-codestyle-python:
    needs: [ meta, check-permissions, build-build-tools-image ]
    # No need to run on `main` because we run this in the merge queue. We do need to run this in `.*-rc-pr` because of hotfixes.
22 Cargo.lock (generated)
@@ -1348,6 +1348,7 @@ dependencies = [
 "p256 0.13.2",
 "pageserver_page_api",
 "postgres",
 "postgres-types",
 "postgres_initdb",
 "postgres_versioninfo",
 "regex",
@@ -4339,6 +4340,7 @@ dependencies = [
 "arc-swap",
 "async-compression",
 "async-stream",
 "base64 0.22.1",
 "bincode",
 "bit_field",
 "byteorder",
@@ -4492,6 +4494,24 @@ dependencies = [
 "workspace_hack",
]

[[package]]
name = "pageserver_client_grpc"
version = "0.1.0"
dependencies = [
 "anyhow",
 "bytes",
 "compute_api",
 "futures",
 "pageserver_api",
 "pageserver_page_api",
 "tokio",
 "tokio-stream",
 "tonic 0.13.1",
 "tracing",
 "utils",
 "workspace_hack",
]

[[package]]
name = "pageserver_compaction"
version = "0.1.0"
@@ -5684,6 +5704,8 @@ dependencies = [
 "azure_identity",
 "azure_storage",
 "azure_storage_blobs",
 "base64 0.22.1",
 "byteorder",
 "bytes",
 "camino",
 "camino-tempfile",

@@ -8,6 +8,7 @@ members = [
    "pageserver/compaction",
    "pageserver/ctl",
    "pageserver/client",
    "pageserver/client_grpc",
    "pageserver/pagebench",
    "pageserver/page_api",
    "proxy",
9 Makefile
@@ -220,6 +220,15 @@ neon-pgindent: postgres-v17-pg-bsd-indent neon-pg-ext-v17
setup-pre-commit-hook:
	ln -s -f $(ROOT_PROJECT_DIR)/pre-commit.py .git/hooks/pre-commit

.PHONY: lint-openapi-spec
lint-openapi-spec:
	# operation-2xx-response: pageserver timeline delete returns 404 on success
	find . -iname "openapi_spec.y*ml" -exec \
	docker run --rm -v ${PWD}:/spec ghcr.io/redocly/cli:1.34.4 \
	--skip-rule=operation-operationId --skip-rule=operation-summary --extends=minimal \
	--skip-rule=no-server-example.com --skip-rule=operation-2xx-response \
	lint {} \+

# Targets for building PostgreSQL are defined in postgres.mk.
#
# But if the caller has indicated that PostgreSQL is already
@@ -1,9 +1,12 @@
disallowed-methods = [
    "tokio::task::block_in_place",

    # Allow this for now, to deny it later once we stop using Handle::block_on completely
    # "tokio::runtime::Handle::block_on",
    # use tokio_epoll_uring_ext instead
    "tokio_epoll_uring::thread_local_system",

    # tokio-epoll-uring:
    # - allow-invalid because the method doesn't exist on macOS
    { path = "tokio_epoll_uring::thread_local_system", replacement = "tokio_epoll_uring_ext module inside pageserver crate", allow-invalid = true }
]

disallowed-macros = [
@@ -66,7 +66,7 @@ url.workspace = true
uuid.workspace = true
walkdir.workspace = true
x509-cert.workspace = true

postgres-types.workspace = true
postgres_versioninfo.workspace = true
postgres_initdb.workspace = true
compute_api.workspace = true
@@ -3,7 +3,7 @@ use chrono::{DateTime, Utc};
use compute_api::privilege::Privilege;
use compute_api::responses::{
    ComputeConfig, ComputeCtlConfig, ComputeMetrics, ComputeStatus, LfcOffloadState,
    LfcPrewarmState, TlsConfig,
    LfcPrewarmState, PromoteState, TlsConfig,
};
use compute_api::spec::{
    ComputeAudit, ComputeFeature, ComputeMode, ComputeSpec, ExtVersion, PageserverProtocol, PgIdent,
@@ -29,8 +29,7 @@ use std::sync::atomic::{AtomicU32, AtomicU64, Ordering};
use std::sync::{Arc, Condvar, Mutex, RwLock};
use std::time::{Duration, Instant};
use std::{env, fs};
use tokio::task::JoinHandle;
use tokio::{spawn, time};
use tokio::{spawn, sync::watch, task::JoinHandle, time};
use tracing::{Instrument, debug, error, info, instrument, warn};
use url::Url;
use utils::id::{TenantId, TimelineId};
@@ -175,6 +174,7 @@ pub struct ComputeState {
    /// WAL flush LSN that is set after terminating Postgres and syncing safekeepers if
    /// mode == ComputeMode::Primary. None otherwise
    pub terminate_flush_lsn: Option<Lsn>,
    pub promote_state: Option<watch::Receiver<PromoteState>>,

    pub metrics: ComputeMetrics,
}
@@ -192,6 +192,7 @@ impl ComputeState {
            lfc_prewarm_state: LfcPrewarmState::default(),
            lfc_offload_state: LfcOffloadState::default(),
            terminate_flush_lsn: None,
            promote_state: None,
        }
    }

@@ -1057,7 +1058,7 @@ impl ComputeNode {
        };

        let (reader, connected) = tokio::runtime::Handle::current().block_on(async move {
            let mut client = page_api::Client::new(
            let mut client = page_api::Client::connect(
                shard0_connstr,
                spec.tenant_id,
                spec.timeline_id,
@@ -2433,19 +2434,11 @@ LIMIT 100",
        // If the value is -1, we never suspend, so set the value to the default collection interval.
        // If the value is 0, it means default, so we just continue to use the default.
        if spec.suspend_timeout_seconds == -1 || spec.suspend_timeout_seconds == 0 {
            info!(
                "[NEON_EXT_INT_UPD] Spec Timeout: {}, New Timeout: {}",
                spec.suspend_timeout_seconds, DEFAULT_INSTALLED_EXTENSIONS_COLLECTION_INTERVAL
            );
            self.params.installed_extensions_collection_interval.store(
                DEFAULT_INSTALLED_EXTENSIONS_COLLECTION_INTERVAL,
                std::sync::atomic::Ordering::SeqCst,
            );
        } else {
            info!(
                "[NEON_EXT_INT_UPD] Spec Timeout: {}",
                spec.suspend_timeout_seconds
            );
            self.params.installed_extensions_collection_interval.store(
                spec.suspend_timeout_seconds as u64,
                std::sync::atomic::Ordering::SeqCst,
132 compute_tools/src/compute_promote.rs (new file)
@@ -0,0 +1,132 @@
use crate::compute::ComputeNode;
use anyhow::{Context, Result, bail};
use compute_api::{
    responses::{LfcPrewarmState, PromoteState, SafekeepersLsn},
    spec::ComputeMode,
};
use std::{sync::Arc, time::Duration};
use tokio::time::sleep;
use utils::lsn::Lsn;

impl ComputeNode {
    /// Returns only when promote fails or succeeds. If a network error occurs
    /// and http client disconnects, this does not stop promotion, and subsequent
    /// calls block until promote finishes.
    /// Called by control plane on secondary after primary endpoint is terminated
    pub async fn promote(self: &Arc<Self>, safekeepers_lsn: SafekeepersLsn) -> PromoteState {
        let cloned = self.clone();
        let start_promotion = || {
            let (tx, rx) = tokio::sync::watch::channel(PromoteState::NotPromoted);
            tokio::spawn(async move {
                tx.send(match cloned.promote_impl(safekeepers_lsn).await {
                    Ok(_) => PromoteState::Completed,
                    Err(err) => {
                        tracing::error!(%err, "promoting");
                        PromoteState::Failed {
                            error: err.to_string(),
                        }
                    }
                })
            });
            rx
        };

        let mut task;
        // self.state is unlocked after block ends so we lock it in promote_impl
        // and task.changed() is reached
        {
            task = self
                .state
                .lock()
                .unwrap()
                .promote_state
                .get_or_insert_with(start_promotion)
                .clone()
        }
        task.changed().await.expect("promote sender dropped");
        task.borrow().clone()
    }

    // Why do we have to supply safekeepers?
    // For secondary we use primary_connection_conninfo so safekeepers field is empty
    async fn promote_impl(&self, safekeepers_lsn: SafekeepersLsn) -> Result<()> {
        {
            let state = self.state.lock().unwrap();
            let mode = &state.pspec.as_ref().unwrap().spec.mode;
            if *mode != ComputeMode::Replica {
                bail!("{} is not replica", mode.to_type_str());
            }

            // we don't need to query Postgres so not self.lfc_prewarm_state()
            match &state.lfc_prewarm_state {
                LfcPrewarmState::NotPrewarmed | LfcPrewarmState::Prewarming => {
                    bail!("prewarm not requested or pending")
                }
                LfcPrewarmState::Failed { error } => {
                    tracing::warn!(%error, "replica prewarm failed")
                }
                _ => {}
            }
        }

        let client = ComputeNode::get_maintenance_client(&self.tokio_conn_conf)
            .await
            .context("connecting to postgres")?;

        let primary_lsn = safekeepers_lsn.wal_flush_lsn;
        let mut last_wal_replay_lsn: Lsn = Lsn::INVALID;
        const RETRIES: i32 = 20;
        for i in 0..=RETRIES {
            let row = client
                .query_one("SELECT pg_last_wal_replay_lsn()", &[])
                .await
                .context("getting last replay lsn")?;
            let lsn: u64 = row.get::<usize, postgres_types::PgLsn>(0).into();
            last_wal_replay_lsn = lsn.into();
            if last_wal_replay_lsn >= primary_lsn {
                break;
            }
            tracing::info!("Try {i}, replica lsn {last_wal_replay_lsn}, primary lsn {primary_lsn}");
            sleep(Duration::from_secs(1)).await;
        }
        if last_wal_replay_lsn < primary_lsn {
            bail!("didn't catch up with primary in {RETRIES} retries");
        }

        // using $1 doesn't work with ALTER SYSTEM SET
        let safekeepers_sql = format!(
            "ALTER SYSTEM SET neon.safekeepers='{}'",
            safekeepers_lsn.safekeepers
        );
        client
            .query(&safekeepers_sql, &[])
            .await
            .context("setting safekeepers")?;
        client
            .query("SELECT pg_reload_conf()", &[])
            .await
            .context("reloading postgres config")?;
        let row = client
            .query_one("SELECT * FROM pg_promote()", &[])
            .await
            .context("pg_promote")?;
        if !row.get::<usize, bool>(0) {
            bail!("pg_promote() returned false");
        }

        let client = ComputeNode::get_maintenance_client(&self.tokio_conn_conf)
            .await
            .context("connecting to postgres")?;
        let row = client
            .query_one("SHOW transaction_read_only", &[])
            .await
            .context("getting transaction_read_only")?;
        if row.get::<usize, &str>(0) == "on" {
            bail!("replica in read only mode after promotion");
        }

        let mut state = self.state.lock().unwrap();
        state.pspec.as_mut().unwrap().spec.mode = ComputeMode::Primary;
        Ok(())
    }
}
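Note: `promote()` above is built around a "start the work once, let every caller await the same result" pattern using `tokio::sync::watch`: the first caller installs a receiver under the state lock and spawns `promote_impl`, while concurrent and later callers just clone that receiver and wait. The sketch below is a standalone illustration of that pattern only, not code from this commit; the `Job`/`run_once` names are made up for the example, and it assumes `tokio` with the `sync`, `rt-multi-thread`, and `macros` features.

```rust
use std::sync::{Arc, Mutex};
use tokio::sync::watch;

/// Result broadcast to every caller, mirroring the shape of `PromoteState`.
#[allow(dead_code)]
#[derive(Clone, Debug, PartialEq)]
enum State {
    NotStarted,
    Completed,
    Failed { error: String },
}

/// Hypothetical job that should run at most once, with all callers
/// observing the same outcome (as `ComputeNode::promote` does).
struct Job {
    state: Mutex<Option<watch::Receiver<State>>>,
}

impl Job {
    fn new() -> Arc<Self> {
        Arc::new(Self { state: Mutex::new(None) })
    }

    async fn run_once(self: &Arc<Self>) -> State {
        let mut rx = {
            // Hold the lock only long enough to install (or fetch) the receiver.
            let mut guard = self.state.lock().unwrap();
            guard
                .get_or_insert_with(|| {
                    let (tx, rx) = watch::channel(State::NotStarted);
                    tokio::spawn(async move {
                        // The real code would run `promote_impl` here.
                        let result = State::Completed;
                        // Ignore the error: all receivers may already be gone.
                        let _ = tx.send(result);
                    });
                    rx
                })
                .clone()
        };
        // Wait until the spawned task publishes a terminal state, then read it.
        rx.changed().await.expect("sender dropped");
        rx.borrow().clone()
    }
}

#[tokio::main]
async fn main() {
    let job = Job::new();
    // Two concurrent callers observe the same single execution.
    let (a, b) = tokio::join!(job.run_once(), job.run_once());
    assert_eq!(a, State::Completed);
    assert_eq!(b, State::Completed);
}
```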
@@ -83,6 +83,87 @@ paths:
|
||||
schema:
|
||||
$ref: "#/components/schemas/DbsAndRoles"
|
||||
|
||||
/promote:
|
||||
post:
|
||||
tags:
|
||||
- Promotion
|
||||
summary: Promote secondary replica to primary
|
||||
description: ""
|
||||
operationId: promoteReplica
|
||||
requestBody:
|
||||
description: Promote requests data
|
||||
required: true
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/SafekeepersLsn"
|
||||
responses:
|
||||
200:
|
||||
description: Promote succeeded or wasn't started
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/PromoteState"
|
||||
500:
|
||||
description: Promote failed
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/PromoteState"
|
||||
|
||||
/lfc/prewarm:
|
||||
post:
|
||||
summary: Request LFC Prewarm
|
||||
parameters:
|
||||
- name: from_endpoint
|
||||
in: query
|
||||
schema:
|
||||
type: string
|
||||
description: ""
|
||||
operationId: lfcPrewarm
|
||||
responses:
|
||||
202:
|
||||
description: LFC prewarm started
|
||||
429:
|
||||
description: LFC prewarm ongoing
|
||||
get:
|
||||
tags:
|
||||
- Prewarm
|
||||
summary: Get LFC prewarm state
|
||||
description: ""
|
||||
operationId: getLfcPrewarmState
|
||||
responses:
|
||||
200:
|
||||
description: Prewarm state
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/LfcPrewarmState"
|
||||
|
||||
/lfc/offload:
|
||||
post:
|
||||
summary: Request LFC offload
|
||||
description: ""
|
||||
operationId: lfcOffload
|
||||
responses:
|
||||
202:
|
||||
description: LFC offload started
|
||||
429:
|
||||
description: LFC offload ongoing
|
||||
get:
|
||||
tags:
|
||||
- Prewarm
|
||||
summary: Get LFC offloading state
|
||||
description: ""
|
||||
operationId: getLfcOffloadState
|
||||
responses:
|
||||
200:
|
||||
description: Offload state
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/LfcOffloadState"
|
||||
|
||||
/database_schema:
|
||||
get:
|
||||
tags:
|
||||
@@ -335,15 +416,6 @@ components:
|
||||
total_startup_ms:
|
||||
type: integer
|
||||
|
||||
Info:
|
||||
type: object
|
||||
description: Information about VM/Pod.
|
||||
required:
|
||||
- num_cpus
|
||||
properties:
|
||||
num_cpus:
|
||||
type: integer
|
||||
|
||||
DbsAndRoles:
|
||||
type: object
|
||||
description: Databases and Roles
|
||||
@@ -497,25 +569,69 @@ components:
|
||||
type: string
|
||||
example: "1.0.0"
|
||||
|
||||
InstalledExtensions:
|
||||
SafekeepersLsn:
|
||||
type: object
|
||||
required:
|
||||
- safekeepers
|
||||
- wal_flush_lsn
|
||||
properties:
|
||||
extensions:
|
||||
description: Contains list of installed extensions.
|
||||
type: array
|
||||
items:
|
||||
type: object
|
||||
properties:
|
||||
extname:
|
||||
type: string
|
||||
version:
|
||||
type: string
|
||||
items:
|
||||
type: string
|
||||
n_databases:
|
||||
type: integer
|
||||
owned_by_superuser:
|
||||
type: integer
|
||||
safekeepers:
|
||||
description: Primary replica safekeepers
|
||||
type: string
|
||||
wal_flush_lsn:
|
||||
description: Primary last WAL flush LSN
|
||||
type: string
|
||||
|
||||
LfcPrewarmState:
|
||||
type: object
|
||||
required:
|
||||
- status
|
||||
- total
|
||||
- prewarmed
|
||||
- skipped
|
||||
properties:
|
||||
status:
|
||||
description: Lfc prewarm status
|
||||
enum: [not_prewarmed, prewarming, completed, failed]
|
||||
type: string
|
||||
error:
|
||||
description: Lfc prewarm error, if any
|
||||
type: string
|
||||
total:
|
||||
description: Total pages processed
|
||||
type: integer
|
||||
prewarmed:
|
||||
description: Total pages prewarmed
|
||||
type: integer
|
||||
skipped:
|
||||
description: Pages processed but not prewarmed
|
||||
type: integer
|
||||
|
||||
LfcOffloadState:
|
||||
type: object
|
||||
required:
|
||||
- status
|
||||
properties:
|
||||
status:
|
||||
description: Lfc offload status
|
||||
enum: [not_offloaded, offloading, completed, failed]
|
||||
type: string
|
||||
error:
|
||||
description: Lfc offload error, if any
|
||||
type: string
|
||||
|
||||
PromoteState:
|
||||
type: object
|
||||
required:
|
||||
- status
|
||||
properties:
|
||||
status:
|
||||
description: Promote result
|
||||
enum: [not_promoted, completed, failed]
|
||||
type: string
|
||||
error:
|
||||
description: Promote error, if any
|
||||
type: string
|
||||
|
||||
SetRoleGrantsRequest:
|
||||
type: object
|
||||
|
||||
@@ -14,6 +14,7 @@ pub(in crate::http) mod insights;
|
||||
pub(in crate::http) mod lfc;
|
||||
pub(in crate::http) mod metrics;
|
||||
pub(in crate::http) mod metrics_json;
|
||||
pub(in crate::http) mod promote;
|
||||
pub(in crate::http) mod status;
|
||||
pub(in crate::http) mod terminate;
|
||||
|
||||
|
||||
14
compute_tools/src/http/routes/promote.rs
Normal file
14
compute_tools/src/http/routes/promote.rs
Normal file
@@ -0,0 +1,14 @@
|
||||
use crate::http::JsonResponse;
|
||||
use axum::Form;
|
||||
use http::StatusCode;
|
||||
|
||||
pub(in crate::http) async fn promote(
|
||||
compute: axum::extract::State<std::sync::Arc<crate::compute::ComputeNode>>,
|
||||
Form(safekeepers_lsn): Form<compute_api::responses::SafekeepersLsn>,
|
||||
) -> axum::response::Response {
|
||||
let state = compute.promote(safekeepers_lsn).await;
|
||||
if let compute_api::responses::PromoteState::Failed { error } = state {
|
||||
return JsonResponse::error(StatusCode::INTERNAL_SERVER_ERROR, error);
|
||||
}
|
||||
JsonResponse::success(StatusCode::OK, state)
|
||||
}
|
||||
@@ -23,7 +23,7 @@ use super::{
|
||||
middleware::authorize::Authorize,
|
||||
routes::{
|
||||
check_writability, configure, database_schema, dbs_and_roles, extension_server, extensions,
|
||||
grants, insights, lfc, metrics, metrics_json, status, terminate,
|
||||
grants, insights, lfc, metrics, metrics_json, promote, status, terminate,
|
||||
},
|
||||
};
|
||||
use crate::compute::ComputeNode;
|
||||
@@ -87,6 +87,7 @@ impl From<&Server> for Router<Arc<ComputeNode>> {
|
||||
let authenticated_router = Router::<Arc<ComputeNode>>::new()
|
||||
.route("/lfc/prewarm", get(lfc::prewarm_state).post(lfc::prewarm))
|
||||
.route("/lfc/offload", get(lfc::offload_state).post(lfc::offload))
|
||||
.route("/promote", post(promote::promote))
|
||||
.route("/check_writability", post(check_writability::is_writable))
|
||||
.route("/configure", post(configure::configure))
|
||||
.route("/database_schema", get(database_schema::get_schema_dump))
|
||||
|
||||
@@ -12,6 +12,7 @@ pub mod logger;
|
||||
pub mod catalog;
|
||||
pub mod compute;
|
||||
pub mod compute_prewarm;
|
||||
pub mod compute_promote;
|
||||
pub mod disk_quota;
|
||||
pub mod extension_server;
|
||||
pub mod installed_extensions;
|
||||
|
||||
@@ -192,7 +192,7 @@ fn acquire_lsn_lease_grpc(
|
||||
lsn: Lsn,
|
||||
) -> Result<Option<SystemTime>> {
|
||||
tokio::runtime::Handle::current().block_on(async move {
|
||||
let mut client = page_api::Client::new(
|
||||
let mut client = page_api::Client::connect(
|
||||
connstring.to_string(),
|
||||
tenant_shard_id.tenant_id,
|
||||
timeline_id,
|
||||
|
||||
@@ -6,7 +6,7 @@ BEGIN
|
||||
admin_option AS admin
|
||||
INTO monitor
|
||||
FROM pg_auth_members
|
||||
WHERE roleid = 'pg_monitor'::regrole
|
||||
WHERE roleid = 'neon_superuser'::regrole
|
||||
AND member = 'pg_monitor'::regrole;
|
||||
|
||||
IF NOT monitor.member THEN
|
||||
|
||||
@@ -13,6 +13,8 @@ use utils::backoff::retry;
|
||||
pub fn app(state: Arc<Storage>) -> Router<()> {
|
||||
use axum::routing::{delete as _delete, get as _get};
|
||||
let delete_prefix = _delete(delete_prefix);
|
||||
// NB: On any changes do not forget to update the OpenAPI spec
|
||||
// in /endpoint_storage/src/openapi_spec.yml.
|
||||
Router::new()
|
||||
.route(
|
||||
"/{tenant_id}/{timeline_id}/{endpoint_id}/{*path}",
|
||||
|
||||
146
endpoint_storage/src/openapi_spec.yml
Normal file
146
endpoint_storage/src/openapi_spec.yml
Normal file
@@ -0,0 +1,146 @@
|
||||
openapi: "3.0.2"
|
||||
info:
|
||||
title: Endpoint Storage API
|
||||
description: Endpoint Storage API
|
||||
version: "1.0"
|
||||
license:
|
||||
name: "Apache"
|
||||
url: https://github.com/neondatabase/neon/blob/main/LICENSE
|
||||
servers:
|
||||
- url: ""
|
||||
paths:
|
||||
/status:
|
||||
description: Healthcheck endpoint
|
||||
get:
|
||||
description: Healthcheck
|
||||
security: []
|
||||
responses:
|
||||
"200":
|
||||
description: OK
|
||||
|
||||
/{tenant_id}/{timeline_id}/{endpoint_id}/{key}:
|
||||
parameters:
|
||||
- name: tenant_id
|
||||
in: path
|
||||
required: true
|
||||
schema:
|
||||
type: string
|
||||
- name: timeline_id
|
||||
in: path
|
||||
required: true
|
||||
schema:
|
||||
type: string
|
||||
- name: endpoint_id
|
||||
in: path
|
||||
required: true
|
||||
schema:
|
||||
type: string
|
||||
- name: key
|
||||
in: path
|
||||
required: true
|
||||
schema:
|
||||
type: string
|
||||
get:
|
||||
description: Get file from blob storage
|
||||
responses:
|
||||
"200":
|
||||
description: "File stream from blob storage"
|
||||
content:
|
||||
application/octet-stream:
|
||||
schema:
|
||||
type: string
|
||||
format: binary
|
||||
"400":
|
||||
description: File was not found
|
||||
"403":
|
||||
description: JWT does not authorize request to this route
|
||||
put:
|
||||
description: Insert file into blob storage. If file exists, override it
|
||||
requestBody:
|
||||
content:
|
||||
application/octet-stream:
|
||||
schema:
|
||||
type: string
|
||||
format: binary
|
||||
responses:
|
||||
"200":
|
||||
description: File was inserted successfully
|
||||
"403":
|
||||
description: JWT does not authorize request to this route
|
||||
delete:
|
||||
description: Delete file from blob storage
|
||||
responses:
|
||||
"200":
|
||||
description: File was successfully deleted or not found
|
||||
"403":
|
||||
description: JWT does not authorize request to this route
|
||||
|
||||
/{tenant_id}/{timeline_id}/{endpoint_id}:
|
||||
parameters:
|
||||
- name: tenant_id
|
||||
in: path
|
||||
required: true
|
||||
schema:
|
||||
type: string
|
||||
- name: timeline_id
|
||||
in: path
|
||||
required: true
|
||||
schema:
|
||||
type: string
|
||||
- name: endpoint_id
|
||||
in: path
|
||||
required: true
|
||||
schema:
|
||||
type: string
|
||||
delete:
|
||||
description: Delete endpoint data from blob storage
|
||||
responses:
|
||||
"200":
|
||||
description: Endpoint data was deleted
|
||||
"403":
|
||||
description: JWT does not authorize request to this route
|
||||
|
||||
/{tenant_id}/{timeline_id}:
|
||||
parameters:
|
||||
- name: tenant_id
|
||||
in: path
|
||||
required: true
|
||||
schema:
|
||||
type: string
|
||||
- name: timeline_id
|
||||
in: path
|
||||
required: true
|
||||
schema:
|
||||
type: string
|
||||
delete:
|
||||
description: Delete timeline data from blob storage
|
||||
responses:
|
||||
"200":
|
||||
description: Timeline data was deleted
|
||||
"403":
|
||||
description: JWT does not authorize request to this route
|
||||
|
||||
/{tenant_id}:
|
||||
parameters:
|
||||
- name: tenant_id
|
||||
in: path
|
||||
required: true
|
||||
schema:
|
||||
type: string
|
||||
delete:
|
||||
description: Delete tenant data from blob storage
|
||||
responses:
|
||||
"200":
|
||||
description: Tenant data was deleted
|
||||
"403":
|
||||
description: JWT does not authorize request to this route
|
||||
|
||||
components:
|
||||
securitySchemes:
|
||||
JWT:
|
||||
type: http
|
||||
scheme: bearer
|
||||
bearerFormat: JWT
|
||||
|
||||
security:
|
||||
- JWT: []
|
||||
@@ -46,7 +46,7 @@ pub struct ExtensionInstallResponse {
    pub version: ExtVersion,
}

#[derive(Serialize, Default, Debug, Clone)]
#[derive(Serialize, Default, Debug, Clone, PartialEq)]
#[serde(tag = "status", rename_all = "snake_case")]
pub enum LfcPrewarmState {
    #[default]
@@ -58,6 +58,17 @@ pub enum LfcPrewarmState {
    },
}

impl Display for LfcPrewarmState {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        match self {
            LfcPrewarmState::NotPrewarmed => f.write_str("NotPrewarmed"),
            LfcPrewarmState::Prewarming => f.write_str("Prewarming"),
            LfcPrewarmState::Completed => f.write_str("Completed"),
            LfcPrewarmState::Failed { error } => write!(f, "Error({error})"),
        }
    }
}

#[derive(Serialize, Default, Debug, Clone, PartialEq)]
#[serde(tag = "status", rename_all = "snake_case")]
pub enum LfcOffloadState {
@@ -70,6 +81,23 @@ pub enum LfcOffloadState {
    },
}

#[derive(Serialize, Debug, Clone, PartialEq)]
#[serde(tag = "status", rename_all = "snake_case")]
/// Response of /promote
pub enum PromoteState {
    NotPromoted,
    Completed,
    Failed { error: String },
}

#[derive(Deserialize, Serialize, Default, Debug, Clone)]
#[serde(rename_all = "snake_case")]
/// Result of /safekeepers_lsn
pub struct SafekeepersLsn {
    pub safekeepers: String,
    pub wal_flush_lsn: utils::lsn::Lsn,
}

/// Response of the /status API
#[derive(Serialize, Debug, Deserialize)]
#[serde(rename_all = "snake_case")]
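For reference, the `#[serde(tag = "status", rename_all = "snake_case")]` attributes above determine the JSON bodies that `/promote` and the LFC endpoints return. The snippet below is a minimal, self-contained sketch (not part of the commit) using a local copy of the `PromoteState` shape; it assumes `serde` with the `derive` feature and `serde_json` are available.

```rust
use serde::Serialize;

// Local copy of the shape used by `PromoteState` above, for illustration only.
#[derive(Serialize)]
#[serde(tag = "status", rename_all = "snake_case")]
enum PromoteState {
    NotPromoted,
    Completed,
    Failed { error: String },
}

fn main() {
    // Unit variants serialize to just the tag field.
    assert_eq!(
        serde_json::to_string(&PromoteState::Completed).unwrap(),
        r#"{"status":"completed"}"#
    );
    // Struct variants put their fields next to the tag.
    assert_eq!(
        serde_json::to_string(&PromoteState::Failed {
            error: "pg_promote() returned false".into()
        })
        .unwrap(),
        r#"{"status":"failed","error":"pg_promote() returned false"}"#
    );
    // `NotPromoted` follows the same snake_case rule: {"status":"not_promoted"}.
    println!("{}", serde_json::to_string(&PromoteState::NotPromoted).unwrap());
}
```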
@@ -442,7 +442,7 @@ pub struct JwksSettings {
|
||||
}
|
||||
|
||||
/// Protocol used to connect to a Pageserver. Parsed from the connstring scheme.
|
||||
#[derive(Clone, Copy, Debug, Default)]
|
||||
#[derive(Clone, Copy, Debug, Default, PartialEq, Eq)]
|
||||
pub enum PageserverProtocol {
|
||||
/// The original protocol based on libpq and COPY. Uses postgresql:// or postgres:// scheme.
|
||||
#[default]
|
||||
|
||||
@@ -41,17 +41,35 @@ pub fn get_query_param<'a>(
|
||||
Some(q) => q,
|
||||
None => return Ok(None),
|
||||
};
|
||||
let mut values = url::form_urlencoded::parse(query.as_bytes())
|
||||
let values = url::form_urlencoded::parse(query.as_bytes())
|
||||
.filter_map(|(k, v)| if k == param_name { Some(v) } else { None })
|
||||
// we call .next() twice below. If it's None the first time, .fuse() ensures it's None afterwards
|
||||
.fuse();
|
||||
|
||||
let value1 = values.next();
|
||||
if values.next().is_some() {
|
||||
return Err(ApiError::BadRequest(anyhow!(
|
||||
"param {param_name} specified more than once"
|
||||
)));
|
||||
}
|
||||
// Work around an issue with Alloy's pyroscope scrape where the "seconds"
|
||||
// parameter is added several times. https://github.com/grafana/alloy/issues/3026
|
||||
// TODO: revert after Alloy is fixed.
|
||||
let value1 = values
|
||||
.map(Ok)
|
||||
.reduce(|acc, i| {
|
||||
match acc {
|
||||
Err(_) => acc,
|
||||
|
||||
// It's okay to have duplicates as along as they have the same value.
|
||||
Ok(ref a) if a == &i.unwrap() => acc,
|
||||
|
||||
_ => Err(ApiError::BadRequest(anyhow!(
|
||||
"param {param_name} specified more than once"
|
||||
))),
|
||||
}
|
||||
})
|
||||
.transpose()?;
|
||||
// if values.next().is_some() {
|
||||
// return Err(ApiError::BadRequest(anyhow!(
|
||||
// "param {param_name} specified more than once"
|
||||
// )));
|
||||
// }
|
||||
|
||||
Ok(value1)
|
||||
}
|
||||
|
||||
@@ -92,3 +110,39 @@ pub async fn ensure_no_body(request: &mut Request<Body>) -> Result<(), ApiError>
|
||||
None => Ok(()),
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
|
||||
#[test]
|
||||
fn test_get_query_param_duplicate() {
|
||||
let req = Request::builder()
|
||||
.uri("http://localhost:12345/testuri?testparam=1")
|
||||
.body(hyper::Body::empty())
|
||||
.unwrap();
|
||||
let value = get_query_param(&req, "testparam").unwrap();
|
||||
assert_eq!(value.unwrap(), "1");
|
||||
|
||||
let req = Request::builder()
|
||||
.uri("http://localhost:12345/testuri?testparam=1&testparam=1")
|
||||
.body(hyper::Body::empty())
|
||||
.unwrap();
|
||||
let value = get_query_param(&req, "testparam").unwrap();
|
||||
assert_eq!(value.unwrap(), "1");
|
||||
|
||||
let req = Request::builder()
|
||||
.uri("http://localhost:12345/testuri")
|
||||
.body(hyper::Body::empty())
|
||||
.unwrap();
|
||||
let value = get_query_param(&req, "testparam").unwrap();
|
||||
assert!(value.is_none());
|
||||
|
||||
let req = Request::builder()
|
||||
.uri("http://localhost:12345/testuri?testparam=1&testparam=2&testparam=3")
|
||||
.body(hyper::Body::empty())
|
||||
.unwrap();
|
||||
let value = get_query_param(&req, "testparam");
|
||||
assert!(value.is_err());
|
||||
}
|
||||
}
|
||||
|
||||
@@ -5,6 +5,7 @@ mod tests;
|
||||
|
||||
use const_format::formatcp;
|
||||
use posthog_client_lite::PostHogClientConfig;
|
||||
use utils::serde_percent::Percent;
|
||||
pub const DEFAULT_PG_LISTEN_PORT: u16 = 64000;
|
||||
pub const DEFAULT_PG_LISTEN_ADDR: &str = formatcp!("127.0.0.1:{DEFAULT_PG_LISTEN_PORT}");
|
||||
pub const DEFAULT_HTTP_LISTEN_PORT: u16 = 9898;
|
||||
@@ -223,7 +224,7 @@ pub struct ConfigToml {
|
||||
pub metric_collection_bucket: Option<RemoteStorageConfig>,
|
||||
#[serde(with = "humantime_serde")]
|
||||
pub synthetic_size_calculation_interval: Duration,
|
||||
pub disk_usage_based_eviction: Option<DiskUsageEvictionTaskConfig>,
|
||||
pub disk_usage_based_eviction: DiskUsageEvictionTaskConfig,
|
||||
pub test_remote_failures: u64,
|
||||
pub ondemand_download_behavior_treat_error_as_warn: bool,
|
||||
#[serde(with = "humantime_serde")]
|
||||
@@ -273,6 +274,7 @@ pub struct ConfigToml {
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, PartialEq, Eq, serde::Serialize, serde::Deserialize)]
|
||||
#[serde(default)]
|
||||
pub struct DiskUsageEvictionTaskConfig {
|
||||
pub max_usage_pct: utils::serde_percent::Percent,
|
||||
pub min_avail_bytes: u64,
|
||||
@@ -283,6 +285,21 @@ pub struct DiskUsageEvictionTaskConfig {
|
||||
/// Select sorting for evicted layers
|
||||
#[serde(default)]
|
||||
pub eviction_order: EvictionOrder,
|
||||
pub enabled: bool,
|
||||
}
|
||||
|
||||
impl Default for DiskUsageEvictionTaskConfig {
|
||||
fn default() -> Self {
|
||||
Self {
|
||||
max_usage_pct: Percent::new(80).unwrap(),
|
||||
min_avail_bytes: 2_000_000_000,
|
||||
period: Duration::from_secs(60),
|
||||
#[cfg(feature = "testing")]
|
||||
mock_statvfs: None,
|
||||
eviction_order: EvictionOrder::default(),
|
||||
enabled: true,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, PartialEq, Eq, serde::Serialize, serde::Deserialize)]
|
||||
@@ -738,7 +755,7 @@ impl Default for ConfigToml {
|
||||
|
||||
metric_collection_bucket: (None),
|
||||
|
||||
disk_usage_based_eviction: (None),
|
||||
disk_usage_based_eviction: DiskUsageEvictionTaskConfig::default(),
|
||||
|
||||
test_remote_failures: (0),
|
||||
|
||||
|
||||
@@ -384,7 +384,7 @@ pub struct SafekeepersInfo {
|
||||
pub safekeepers: Vec<SafekeeperInfo>,
|
||||
}
|
||||
|
||||
#[derive(Serialize, Deserialize, Clone)]
|
||||
#[derive(Serialize, Deserialize, Clone, Debug)]
|
||||
pub struct SafekeeperInfo {
|
||||
pub id: NodeId,
|
||||
pub hostname: String,
|
||||
|
||||
@@ -332,7 +332,11 @@ fn hash_combine(mut a: u32, mut b: u32) -> u32 {
|
||||
///
|
||||
/// The mapping of key to shard is not stable across changes to ShardCount: this is intentional
|
||||
/// and will be handled at higher levels when shards are split.
|
||||
fn key_to_shard_number(count: ShardCount, stripe_size: ShardStripeSize, key: &Key) -> ShardNumber {
|
||||
pub fn key_to_shard_number(
|
||||
count: ShardCount,
|
||||
stripe_size: ShardStripeSize,
|
||||
key: &Key,
|
||||
) -> ShardNumber {
|
||||
// Fast path for un-sharded tenants or broadcast keys
|
||||
if count < ShardCount(2) || key_is_shard0(key) {
|
||||
return ShardNumber(0);
|
||||
|
||||
@@ -13,6 +13,7 @@ aws-smithy-async.workspace = true
|
||||
aws-smithy-types.workspace = true
|
||||
aws-config.workspace = true
|
||||
aws-sdk-s3.workspace = true
|
||||
base64.workspace = true
|
||||
bytes.workspace = true
|
||||
camino = { workspace = true, features = ["serde1"] }
|
||||
humantime-serde.workspace = true
|
||||
@@ -41,6 +42,8 @@ http-body-util.workspace = true
|
||||
itertools.workspace = true
|
||||
sync_wrapper = { workspace = true, features = ["futures"] }
|
||||
|
||||
byteorder = "1.4"
|
||||
|
||||
[dev-dependencies]
|
||||
camino-tempfile.workspace = true
|
||||
test-context.workspace = true
|
||||
|
||||
@@ -14,17 +14,25 @@ use anyhow::{Context, Result, anyhow};
|
||||
use azure_core::request_options::{IfMatchCondition, MaxResults, Metadata, Range};
|
||||
use azure_core::{Continuable, HttpClient, RetryOptions, TransportOptions};
|
||||
use azure_storage::StorageCredentials;
|
||||
use azure_storage_blobs::blob::operations::GetBlobBuilder;
|
||||
use azure_storage_blobs::blob::BlobBlockType;
|
||||
use azure_storage_blobs::blob::BlockList;
|
||||
use azure_storage_blobs::blob::{Blob, CopyStatus};
|
||||
use azure_storage_blobs::container::operations::ListBlobsBuilder;
|
||||
use azure_storage_blobs::prelude::{ClientBuilder, ContainerClient};
|
||||
use azure_storage_blobs::prelude::ClientBuilder;
|
||||
use azure_storage_blobs::{blob::operations::GetBlobBuilder, prelude::ContainerClient};
|
||||
use base64::{Engine as _, engine::general_purpose::URL_SAFE};
|
||||
use byteorder::{BigEndian, ByteOrder};
|
||||
use bytes::Bytes;
|
||||
use camino::Utf8Path;
|
||||
use futures::FutureExt;
|
||||
use futures::future::Either;
|
||||
use futures::stream::Stream;
|
||||
use futures_util::{StreamExt, TryStreamExt};
|
||||
use http_types::{StatusCode, Url};
|
||||
use scopeguard::ScopeGuard;
|
||||
use tokio::fs::File;
|
||||
use tokio::io::AsyncReadExt;
|
||||
use tokio::io::AsyncSeekExt;
|
||||
use tokio_util::sync::CancellationToken;
|
||||
use tracing::debug;
|
||||
use utils::backoff;
|
||||
@@ -51,6 +59,9 @@ pub struct AzureBlobStorage {
|
||||
|
||||
// Alternative timeout used for metadata objects which are expected to be small
|
||||
pub small_timeout: Duration,
|
||||
/* BEGIN_HADRON */
|
||||
pub put_block_size_mb: Option<usize>,
|
||||
/* END_HADRON */
|
||||
}
|
||||
|
||||
impl AzureBlobStorage {
|
||||
@@ -107,6 +118,9 @@ impl AzureBlobStorage {
|
||||
concurrency_limiter: ConcurrencyLimiter::new(azure_config.concurrency_limit.get()),
|
||||
timeout,
|
||||
small_timeout,
|
||||
/* BEGIN_HADRON */
|
||||
put_block_size_mb: azure_config.put_block_size_mb,
|
||||
/* END_HADRON */
|
||||
})
|
||||
}
|
||||
|
||||
@@ -583,31 +597,137 @@ impl RemoteStorage for AzureBlobStorage {
|
||||
|
||||
let started_at = start_measuring_requests(kind);
|
||||
|
||||
let op = async {
|
||||
let mut metadata_map = metadata.unwrap_or([].into());
|
||||
let timeline_file_path = metadata_map.0.remove("databricks_azure_put_block");
|
||||
|
||||
/* BEGIN_HADRON */
|
||||
let op = async move {
|
||||
let blob_client = self.client.blob_client(self.relative_path_to_name(to));
|
||||
let put_block_size = self.put_block_size_mb.unwrap_or(0) * 1024 * 1024;
|
||||
if timeline_file_path.is_none() || put_block_size == 0 {
|
||||
// Use put_block_blob directly.
|
||||
let from: Pin<
|
||||
Box<dyn Stream<Item = std::io::Result<Bytes>> + Send + Sync + 'static>,
|
||||
> = Box::pin(from);
|
||||
let from = NonSeekableStream::new(from, data_size_bytes);
|
||||
let body = azure_core::Body::SeekableStream(Box::new(from));
|
||||
|
||||
let from: Pin<Box<dyn Stream<Item = std::io::Result<Bytes>> + Send + Sync + 'static>> =
|
||||
Box::pin(from);
|
||||
let mut builder = blob_client.put_block_blob(body);
|
||||
if !metadata_map.0.is_empty() {
|
||||
builder = builder.metadata(to_azure_metadata(metadata_map));
|
||||
}
|
||||
let fut = builder.into_future();
|
||||
let fut = tokio::time::timeout(self.timeout, fut);
|
||||
let result = fut.await;
|
||||
match result {
|
||||
Ok(Ok(_response)) => return Ok(()),
|
||||
Ok(Err(azure)) => return Err(azure.into()),
|
||||
Err(_timeout) => return Err(TimeoutOrCancel::Timeout.into()),
|
||||
};
|
||||
}
|
||||
// Upload chunks concurrently using Put Block.
|
||||
// Each PutBlock uploads put_block_size bytes of the file.
|
||||
let mut upload_futures: Vec<tokio::task::JoinHandle<Result<(), azure_core::Error>>> =
|
||||
vec![];
|
||||
let mut block_list = BlockList::default();
|
||||
let mut start_bytes = 0u64;
|
||||
let mut remaining_bytes = data_size_bytes;
|
||||
let mut block_list_count = 0;
|
||||
|
||||
let from = NonSeekableStream::new(from, data_size_bytes);
|
||||
while remaining_bytes > 0 {
|
||||
let block_size = std::cmp::min(remaining_bytes, put_block_size);
|
||||
let end_bytes = start_bytes + block_size as u64;
|
||||
let block_id = block_list_count;
|
||||
let timeout = self.timeout;
|
||||
let blob_client = blob_client.clone();
|
||||
let timeline_file = timeline_file_path.clone().unwrap().clone();
|
||||
|
||||
let body = azure_core::Body::SeekableStream(Box::new(from));
|
||||
let mut encoded_block_id = [0u8; 8];
|
||||
BigEndian::write_u64(&mut encoded_block_id, block_id);
|
||||
URL_SAFE.encode(encoded_block_id);
|
||||
|
||||
let mut builder = blob_client.put_block_blob(body);
|
||||
// Put one block.
|
||||
let part_fut = async move {
|
||||
let mut file = File::open(Utf8Path::new(&timeline_file.clone())).await?;
|
||||
file.seek(io::SeekFrom::Start(start_bytes)).await?;
|
||||
let limited_reader = file.take(block_size as u64);
|
||||
let file_chunk_stream =
|
||||
tokio_util::io::ReaderStream::with_capacity(limited_reader, 1024 * 1024);
|
||||
let file_chunk_stream_pin: Pin<
|
||||
Box<dyn Stream<Item = std::io::Result<Bytes>> + Send + Sync + 'static>,
|
||||
> = Box::pin(file_chunk_stream);
|
||||
let stream_wrapper = NonSeekableStream::new(file_chunk_stream_pin, block_size);
|
||||
let body = azure_core::Body::SeekableStream(Box::new(stream_wrapper));
|
||||
// Azure put block takes URL-encoded block ids and all blocks must have the same byte length.
|
||||
// https://learn.microsoft.com/en-us/rest/api/storageservices/put-block?tabs=microsoft-entra-id#uri-parameters
|
||||
let builder = blob_client.put_block(encoded_block_id.to_vec(), body);
|
||||
let fut = builder.into_future();
|
||||
let fut = tokio::time::timeout(timeout, fut);
|
||||
let result = fut.await;
|
||||
tracing::debug!(
|
||||
"azure put block id-{} size {} start {} end {} file {} response {:#?}",
|
||||
block_id,
|
||||
block_size,
|
||||
start_bytes,
|
||||
end_bytes,
|
||||
timeline_file,
|
||||
result
|
||||
);
|
||||
match result {
|
||||
Ok(Ok(_response)) => Ok(()),
|
||||
Ok(Err(azure)) => Err(azure),
|
||||
Err(_timeout) => Err(azure_core::Error::new(
|
||||
azure_core::error::ErrorKind::Io,
|
||||
std::io::Error::new(
|
||||
std::io::ErrorKind::TimedOut,
|
||||
"Operation timed out",
|
||||
),
|
||||
)),
|
||||
}
|
||||
};
|
||||
upload_futures.push(tokio::spawn(part_fut));
|
||||
|
||||
if let Some(metadata) = metadata {
|
||||
builder = builder.metadata(to_azure_metadata(metadata));
|
||||
block_list_count += 1;
|
||||
remaining_bytes -= block_size;
|
||||
start_bytes += block_size as u64;
|
||||
|
||||
block_list
|
||||
.blocks
|
||||
.push(BlobBlockType::Uncommitted(encoded_block_id.to_vec().into()));
|
||||
}
|
||||
|
||||
tracing::debug!(
|
||||
"azure put blocks {} total MB: {} chunk size MB: {}",
|
||||
block_list_count,
|
||||
data_size_bytes / 1024 / 1024,
|
||||
put_block_size / 1024 / 1024
|
||||
);
|
||||
// Wait for all blocks to be uploaded.
|
||||
let upload_results = futures::future::try_join_all(upload_futures).await;
|
||||
if upload_results.is_err() {
|
||||
return Err(anyhow::anyhow!(format!(
|
||||
"Failed to upload all blocks {:#?}",
|
||||
upload_results.unwrap_err()
|
||||
)));
|
||||
}
|
||||
|
||||
// Commit the blocks.
|
||||
let mut builder = blob_client.put_block_list(block_list);
|
||||
if !metadata_map.0.is_empty() {
|
||||
builder = builder.metadata(to_azure_metadata(metadata_map));
|
||||
}
|
||||
let fut = builder.into_future();
|
||||
let fut = tokio::time::timeout(self.timeout, fut);
|
||||
let result = fut.await;
|
||||
tracing::debug!("azure put block list response {:#?}", result);
|
||||
|
||||
match fut.await {
|
||||
match result {
|
||||
Ok(Ok(_response)) => Ok(()),
|
||||
Ok(Err(azure)) => Err(azure.into()),
|
||||
Err(_timeout) => Err(TimeoutOrCancel::Timeout.into()),
|
||||
}
|
||||
};
|
||||
/* END_HADRON */
|
||||
|
||||
let res = tokio::select! {
|
||||
res = op => res,
|
||||
@@ -622,7 +742,6 @@ impl RemoteStorage for AzureBlobStorage {
|
||||
crate::metrics::BUCKET_METRICS
|
||||
.req_seconds
|
||||
.observe_elapsed(kind, outcome, started_at);
|
||||
|
||||
res
|
||||
}
|
||||
|
||||
|
||||
@@ -195,8 +195,19 @@ pub struct AzureConfig {
|
||||
pub max_keys_per_list_response: Option<i32>,
|
||||
#[serde(default = "default_azure_conn_pool_size")]
|
||||
pub conn_pool_size: usize,
|
||||
/* BEGIN_HADRON */
|
||||
#[serde(default = "default_azure_put_block_size_mb")]
|
||||
pub put_block_size_mb: Option<usize>,
|
||||
/* END_HADRON */
|
||||
}
|
||||
|
||||
/* BEGIN_HADRON */
|
||||
fn default_azure_put_block_size_mb() -> Option<usize> {
|
||||
// Disable parallel upload by default.
|
||||
Some(0)
|
||||
}
|
||||
/* END_HADRON */
|
||||
|
||||
fn default_remote_storage_azure_concurrency_limit() -> NonZeroUsize {
|
||||
NonZeroUsize::new(DEFAULT_REMOTE_STORAGE_AZURE_CONCURRENCY_LIMIT).unwrap()
|
||||
}
|
||||
@@ -213,6 +224,9 @@ impl Debug for AzureConfig {
|
||||
"max_keys_per_list_response",
|
||||
&self.max_keys_per_list_response,
|
||||
)
|
||||
/* BEGIN_HADRON */
|
||||
.field("put_block_size_mb", &self.put_block_size_mb)
|
||||
/* END_HADRON */
|
||||
.finish()
|
||||
}
|
||||
}
|
||||
@@ -352,6 +366,7 @@ timeout = '5s'";
|
||||
upload_storage_class = 'INTELLIGENT_TIERING'
|
||||
timeout = '7s'
|
||||
conn_pool_size = 8
|
||||
put_block_size_mb = 1024
|
||||
";
|
||||
|
||||
let config = parse(toml).unwrap();
|
||||
@@ -367,6 +382,9 @@ timeout = '5s'";
|
||||
concurrency_limit: default_remote_storage_azure_concurrency_limit(),
|
||||
max_keys_per_list_response: DEFAULT_MAX_KEYS_PER_LIST_RESPONSE,
|
||||
conn_pool_size: 8,
|
||||
/* BEGIN_HADRON */
|
||||
put_block_size_mb: Some(1024),
|
||||
/* END_HADRON */
|
||||
}),
|
||||
timeout: Duration::from_secs(7),
|
||||
small_timeout: RemoteStorageConfig::DEFAULT_SMALL_TIMEOUT
|
||||
|
||||
@@ -165,10 +165,42 @@ pub(crate) async fn upload_remote_data(
|
||||
|
||||
let (data, data_len) =
|
||||
upload_stream(format!("remote blob data {i}").into_bytes().into());
|
||||
|
||||
/* BEGIN_HADRON */
|
||||
let mut metadata = None;
|
||||
if matches!(&*task_client, GenericRemoteStorage::AzureBlob(_)) {
|
||||
let file_path = "/tmp/dbx_upload_tmp_file.txt";
|
||||
{
|
||||
// Open the file in append mode
|
||||
let mut file = std::fs::OpenOptions::new()
|
||||
.append(true)
|
||||
.create(true) // Create the file if it doesn't exist
|
||||
.open(file_path)?;
|
||||
// Append some bytes to the file
|
||||
std::io::Write::write_all(
|
||||
&mut file,
|
||||
&format!("remote blob data {i}").into_bytes(),
|
||||
)?;
|
||||
file.sync_all()?;
|
||||
}
|
||||
metadata = Some(remote_storage::StorageMetadata::from([(
|
||||
"databricks_azure_put_block",
|
||||
file_path,
|
||||
)]));
|
||||
}
|
||||
/* END_HADRON */
|
||||
|
||||
task_client
|
||||
.upload(data, data_len, &blob_path, None, &cancel)
|
||||
.upload(data, data_len, &blob_path, metadata, &cancel)
|
||||
.await?;
|
||||
|
||||
// TODO: Check upload is using the put_block upload.
|
||||
// We cannot consume data here since data is moved inside the upload.
|
||||
// let total_bytes = data.fold(0, |acc, chunk| async move {
|
||||
// acc + chunk.map(|bytes| bytes.len()).unwrap_or(0)
|
||||
// }).await;
|
||||
// assert_eq!(total_bytes, data_len);
|
||||
|
||||
Ok::<_, anyhow::Error>((blob_prefix, blob_path))
|
||||
});
|
||||
}
|
||||
|
||||
@@ -219,6 +219,9 @@ async fn create_azure_client(
|
||||
concurrency_limit: NonZeroUsize::new(100).unwrap(),
|
||||
max_keys_per_list_response,
|
||||
conn_pool_size: 8,
|
||||
/* BEGIN_HADRON */
|
||||
put_block_size_mb: Some(1),
|
||||
/* END_HADRON */
|
||||
}),
|
||||
timeout: RemoteStorageConfig::DEFAULT_TIMEOUT,
|
||||
small_timeout: RemoteStorageConfig::DEFAULT_SMALL_TIMEOUT,
|
||||
|
||||
@@ -171,6 +171,12 @@ impl std::fmt::Display for ShardNumber {
|
||||
}
|
||||
}
|
||||
|
||||
impl std::fmt::Display for ShardCount {
|
||||
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||
self.0.fmt(f)
|
||||
}
|
||||
}
|
||||
|
||||
impl std::fmt::Display for ShardSlug<'_> {
|
||||
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||
write!(
|
||||
|
||||
@@ -112,6 +112,7 @@ twox-hash.workspace = true
|
||||
procfs.workspace = true
|
||||
|
||||
[dev-dependencies]
|
||||
base64.workspace = true
|
||||
criterion.workspace = true
|
||||
hex-literal.workspace = true
|
||||
tokio = { workspace = true, features = ["process", "sync", "fs", "rt", "io-util", "time", "test-util"] }
|
||||
|
||||
19
pageserver/client_grpc/Cargo.toml
Normal file
19
pageserver/client_grpc/Cargo.toml
Normal file
@@ -0,0 +1,19 @@
|
||||
[package]
|
||||
name = "pageserver_client_grpc"
|
||||
version = "0.1.0"
|
||||
edition.workspace = true
|
||||
license.workspace = true
|
||||
|
||||
[dependencies]
|
||||
anyhow.workspace = true
|
||||
bytes.workspace = true
|
||||
compute_api.workspace = true
|
||||
futures.workspace = true
|
||||
pageserver_api.workspace = true
|
||||
pageserver_page_api.workspace = true
|
||||
tokio.workspace = true
|
||||
tokio-stream.workspace = true
|
||||
tonic.workspace = true
|
||||
tracing.workspace = true
|
||||
utils.workspace = true
|
||||
workspace_hack.workspace = true
|
||||
403
pageserver/client_grpc/src/client.rs
Normal file
403
pageserver/client_grpc/src/client.rs
Normal file
@@ -0,0 +1,403 @@
|
||||
use std::collections::HashMap;
|
||||
use std::num::NonZero;
|
||||
use std::sync::Arc;
|
||||
|
||||
use anyhow::anyhow;
|
||||
use futures::stream::FuturesUnordered;
|
||||
use futures::{FutureExt as _, StreamExt as _};
|
||||
use tracing::instrument;
|
||||
|
||||
use crate::pool::{ChannelPool, ClientGuard, ClientPool, StreamGuard, StreamPool};
|
||||
use crate::retry::Retry;
|
||||
use crate::split::GetPageSplitter;
|
||||
use compute_api::spec::PageserverProtocol;
|
||||
use pageserver_api::shard::ShardStripeSize;
|
||||
use pageserver_page_api as page_api;
|
||||
use utils::id::{TenantId, TimelineId};
|
||||
use utils::shard::{ShardCount, ShardIndex, ShardNumber};
|
||||
|
||||
/// Max number of concurrent clients per channel (i.e. TCP connection). New channels will be spun up
|
||||
/// when full.
|
||||
///
|
||||
/// TODO: tune all of these constants, and consider making them configurable.
|
||||
/// TODO: consider separate limits for unary and streaming clients, so we don't fill up channels
|
||||
/// with only streams.
|
||||
const MAX_CLIENTS_PER_CHANNEL: NonZero<usize> = NonZero::new(16).unwrap();
|
||||
|
||||
/// Max number of concurrent unary request clients per shard.
|
||||
const MAX_UNARY_CLIENTS: NonZero<usize> = NonZero::new(64).unwrap();
|
||||
|
||||
/// Max number of concurrent GetPage streams per shard. The max number of concurrent GetPage
|
||||
/// requests is given by `MAX_STREAMS * MAX_STREAM_QUEUE_DEPTH`.
|
||||
const MAX_STREAMS: NonZero<usize> = NonZero::new(64).unwrap();
|
||||
|
||||
/// Max number of pipelined requests per stream.
|
||||
const MAX_STREAM_QUEUE_DEPTH: NonZero<usize> = NonZero::new(2).unwrap();
|
||||
|
||||
/// Max number of concurrent bulk GetPage streams per shard, used e.g. for prefetches. Because these
|
||||
/// are more throughput-oriented, we have a smaller limit but higher queue depth.
|
||||
const MAX_BULK_STREAMS: NonZero<usize> = NonZero::new(16).unwrap();
|
||||
|
||||
/// Max number of pipelined requests per bulk stream. These are more throughput-oriented and thus
|
||||
/// get a larger queue depth.
|
||||
const MAX_BULK_STREAM_QUEUE_DEPTH: NonZero<usize> = NonZero::new(4).unwrap();
|
||||
|
||||
/// A rich Pageserver gRPC client for a single tenant timeline. This client is more capable than the
|
||||
/// basic `page_api::Client` gRPC client, and supports:
|
||||
///
|
||||
/// * Sharded tenants across multiple Pageservers.
|
||||
/// * Pooling of connections, clients, and streams for efficient resource use.
|
||||
/// * Concurrent use by many callers.
|
||||
/// * Internal handling of GetPage bidirectional streams, with pipelining and error handling.
|
||||
/// * Automatic retries.
|
||||
/// * Observability.
|
||||
///
|
||||
/// TODO: this client does not support base backups or LSN leases, as these are only used by
|
||||
/// compute_ctl. Consider adding this, but LSN leases need concurrent requests on all shards.
|
||||
pub struct PageserverClient {
|
||||
// TODO: support swapping out the shard map, e.g. via an ArcSwap.
|
||||
shards: Shards,
|
||||
retry: Retry,
|
||||
}
|
||||
|
||||
impl PageserverClient {
|
||||
/// Creates a new Pageserver client for a given tenant and timeline. Uses the Pageservers given
|
||||
/// in the shard map, which must be complete and must use gRPC URLs.
|
||||
pub fn new(
|
||||
tenant_id: TenantId,
|
||||
timeline_id: TimelineId,
|
||||
shard_map: HashMap<ShardIndex, String>,
|
||||
stripe_size: ShardStripeSize,
|
||||
auth_token: Option<String>,
|
||||
) -> anyhow::Result<Self> {
|
||||
let shards = Shards::new(tenant_id, timeline_id, shard_map, stripe_size, auth_token)?;
|
||||
Ok(Self {
|
||||
shards,
|
||||
retry: Retry,
|
||||
})
|
||||
}
|
||||
|
||||
/// Returns whether a relation exists.
|
||||
#[instrument(skip_all, fields(rel=%req.rel, lsn=%req.read_lsn))]
|
||||
pub async fn check_rel_exists(
|
||||
&self,
|
||||
req: page_api::CheckRelExistsRequest,
|
||||
) -> tonic::Result<page_api::CheckRelExistsResponse> {
|
||||
self.retry
|
||||
.with(async || {
|
||||
// Relation metadata is only available on shard 0.
|
||||
let mut client = self.shards.get_zero().client().await?;
|
||||
client.check_rel_exists(req).await
|
||||
})
|
||||
.await
|
||||
}
|
||||
|
||||
/// Returns the total size of a database, as # of bytes.
|
||||
#[instrument(skip_all, fields(db_oid=%req.db_oid, lsn=%req.read_lsn))]
|
||||
pub async fn get_db_size(
|
||||
&self,
|
||||
req: page_api::GetDbSizeRequest,
|
||||
) -> tonic::Result<page_api::GetDbSizeResponse> {
|
||||
self.retry
|
||||
.with(async || {
|
||||
// Relation metadata is only available on shard 0.
|
||||
let mut client = self.shards.get_zero().client().await?;
|
||||
client.get_db_size(req).await
|
||||
})
|
||||
.await
|
||||
}
|
||||
|
||||
/// Fetches pages. The `request_id` must be unique across all in-flight requests. Automatically
|
||||
/// splits requests that straddle shard boundaries, and assembles the responses.
|
||||
///
|
||||
/// Unlike `page_api::Client`, this automatically converts `status_code` into `tonic::Status`
|
||||
/// errors. All responses will have `GetPageStatusCode::Ok`.
|
||||
#[instrument(skip_all, fields(
|
||||
req_id = %req.request_id,
|
||||
class = %req.request_class,
|
||||
rel = %req.rel,
|
||||
blkno = %req.block_numbers[0],
|
||||
blks = %req.block_numbers.len(),
|
||||
lsn = %req.read_lsn,
|
||||
))]
|
||||
pub async fn get_page(
|
||||
&self,
|
||||
req: page_api::GetPageRequest,
|
||||
) -> tonic::Result<page_api::GetPageResponse> {
|
||||
// Make sure we have at least one page.
|
||||
if req.block_numbers.is_empty() {
|
||||
return Err(tonic::Status::invalid_argument("no block number"));
|
||||
}
|
||||
|
||||
// Fast path: request is for a single shard.
|
||||
if let Some(shard_id) =
|
||||
GetPageSplitter::is_single_shard(&req, self.shards.count, self.shards.stripe_size)
|
||||
{
|
||||
return self.get_page_for_shard(shard_id, req).await;
|
||||
}
|
||||
|
||||
// Request spans multiple shards. Split it, dispatch concurrent per-shard requests, and
|
||||
// reassemble the responses.
|
||||
//
|
||||
// TODO: when we support shard map updates, we need to detect when it changes and re-split
|
||||
// the request on errors.
|
||||
let mut splitter = GetPageSplitter::split(req, self.shards.count, self.shards.stripe_size);
|
||||
|
||||
let mut shard_requests: FuturesUnordered<_> = splitter
|
||||
.drain_requests()
|
||||
.map(|(shard_id, shard_req)| {
|
||||
// NB: each request will retry internally.
|
||||
self.get_page_for_shard(shard_id, shard_req)
|
||||
.map(move |result| result.map(|resp| (shard_id, resp)))
|
||||
})
|
||||
.collect();
|
||||
|
||||
while let Some((shard_id, shard_response)) = shard_requests.next().await.transpose()? {
|
||||
splitter.add_response(shard_id, shard_response)?;
|
||||
}
|
||||
|
||||
splitter.assemble_response()
|
||||
}
|
||||
|
||||
/// Fetches pages that belong to the given shard.
|
||||
#[instrument(skip_all, fields(shard = %shard_id))]
|
||||
async fn get_page_for_shard(
|
||||
&self,
|
||||
shard_id: ShardIndex,
|
||||
req: page_api::GetPageRequest,
|
||||
) -> tonic::Result<page_api::GetPageResponse> {
|
||||
let resp = self
|
||||
.retry
|
||||
.with(async || {
|
||||
let stream = self
|
||||
.shards
|
||||
.get(shard_id)?
|
||||
.stream(req.request_class.is_bulk())
|
||||
.await;
|
||||
let resp = stream.send(req.clone()).await?;
|
||||
|
||||
// Convert per-request errors into a tonic::Status.
|
||||
if resp.status_code != page_api::GetPageStatusCode::Ok {
|
||||
return Err(tonic::Status::new(
|
||||
resp.status_code.into(),
|
||||
resp.reason.unwrap_or_else(|| String::from("unknown error")),
|
||||
));
|
||||
}
|
||||
|
||||
Ok(resp)
|
||||
})
|
||||
.await?;
|
||||
|
||||
// Make sure we got the right number of pages.
|
||||
// NB: check outside of the retry loop, since we don't want to retry this.
|
||||
let (expected, actual) = (req.block_numbers.len(), resp.page_images.len());
|
||||
if expected != actual {
|
||||
return Err(tonic::Status::internal(format!(
|
||||
"expected {expected} pages for shard {shard_id}, got {actual}",
|
||||
)));
|
||||
}
|
||||
|
||||
Ok(resp)
|
||||
}
|
||||
|
||||
/// Returns the size of a relation, as # of blocks.
|
||||
#[instrument(skip_all, fields(rel=%req.rel, lsn=%req.read_lsn))]
|
||||
pub async fn get_rel_size(
|
||||
&self,
|
||||
req: page_api::GetRelSizeRequest,
|
||||
) -> tonic::Result<page_api::GetRelSizeResponse> {
|
||||
self.retry
|
||||
.with(async || {
|
||||
// Relation metadata is only available on shard 0.
|
||||
let mut client = self.shards.get_zero().client().await?;
|
||||
client.get_rel_size(req).await
|
||||
})
|
||||
.await
|
||||
}
|
||||
|
||||
/// Fetches an SLRU segment.
|
||||
#[instrument(skip_all, fields(kind=%req.kind, segno=%req.segno, lsn=%req.read_lsn))]
|
||||
pub async fn get_slru_segment(
|
||||
&self,
|
||||
req: page_api::GetSlruSegmentRequest,
|
||||
) -> tonic::Result<page_api::GetSlruSegmentResponse> {
|
||||
self.retry
|
||||
.with(async || {
|
||||
// SLRU segments are only available on shard 0.
|
||||
let mut client = self.shards.get_zero().client().await?;
|
||||
client.get_slru_segment(req).await
|
||||
})
|
||||
.await
|
||||
}
|
||||
}
|
||||
|
||||
/// Tracks the tenant's shards.
|
||||
struct Shards {
|
||||
/// The shard count.
|
||||
///
|
||||
/// NB: this is 0 for unsharded tenants, following `ShardIndex::unsharded()` convention.
|
||||
count: ShardCount,
|
||||
/// The stripe size. Only used for sharded tenants.
|
||||
stripe_size: ShardStripeSize,
|
||||
/// Shards by shard index.
|
||||
///
|
||||
/// NB: unsharded tenants use count 0, like `ShardIndex::unsharded()`.
|
||||
///
|
||||
/// INVARIANT: every shard 0..count is present.
|
||||
/// INVARIANT: shard 0 is always present.
|
||||
map: HashMap<ShardIndex, Shard>,
|
||||
}
|
||||
|
||||
impl Shards {
|
||||
/// Creates a new set of shards based on a shard map.
|
||||
fn new(
|
||||
tenant_id: TenantId,
|
||||
timeline_id: TimelineId,
|
||||
shard_map: HashMap<ShardIndex, String>,
|
||||
stripe_size: ShardStripeSize,
|
||||
auth_token: Option<String>,
|
||||
) -> anyhow::Result<Self> {
|
||||
let count = match shard_map.len() {
|
||||
0 => return Err(anyhow!("no shards provided")),
|
||||
1 => ShardCount::new(0), // NB: unsharded tenants use 0, like `ShardIndex::unsharded()`
|
||||
n if n > u8::MAX as usize => return Err(anyhow!("too many shards: {n}")),
|
||||
n => ShardCount::new(n as u8),
|
||||
};
|
||||
|
||||
let mut map = HashMap::new();
|
||||
for (shard_id, url) in shard_map {
|
||||
// The shard index must match the computed shard count, even for unsharded tenants.
|
||||
if shard_id.shard_count != count {
|
||||
return Err(anyhow!("invalid shard index {shard_id}, expected {count}"));
|
||||
}
|
||||
// The shard index' number and count must be consistent.
|
||||
if !shard_id.is_unsharded() && shard_id.shard_number.0 >= shard_id.shard_count.0 {
|
||||
return Err(anyhow!("invalid shard index {shard_id}"));
|
||||
}
|
||||
// The above conditions guarantee that we have all shards 0..count: len() matches count,
|
||||
// shard number < count, and numbers are unique (via hashmap).
|
||||
let shard = Shard::new(url, tenant_id, timeline_id, shard_id, auth_token.clone())?;
|
||||
map.insert(shard_id, shard);
|
||||
}
|
||||
|
||||
Ok(Self {
|
||||
count,
|
||||
stripe_size,
|
||||
map,
|
||||
})
|
||||
}
|
||||
|
||||
/// Looks up the given shard.
|
||||
#[allow(clippy::result_large_err)] // TODO: check perf impact
|
||||
fn get(&self, shard_id: ShardIndex) -> tonic::Result<&Shard> {
|
||||
self.map
|
||||
.get(&shard_id)
|
||||
.ok_or_else(|| tonic::Status::not_found(format!("unknown shard {shard_id}")))
|
||||
}
|
||||
|
||||
/// Returns shard 0.
|
||||
fn get_zero(&self) -> &Shard {
|
||||
self.get(ShardIndex::new(ShardNumber(0), self.count))
|
||||
.expect("always present")
|
||||
}
|
||||
}
|
||||
|
||||
/// A single shard. Uses dedicated resource pools with the following structure:
|
||||
///
|
||||
/// * Channel pool: unbounded.
|
||||
/// * Unary client pool: MAX_UNARY_CLIENTS.
|
||||
/// * Stream client pool: unbounded.
|
||||
/// * Stream pool: MAX_STREAMS and MAX_STREAM_QUEUE_DEPTH.
|
||||
/// * Bulk channel pool: unbounded.
|
||||
/// * Bulk client pool: unbounded.
|
||||
/// * Bulk stream pool: MAX_BULK_STREAMS and MAX_BULK_STREAM_QUEUE_DEPTH.
|
||||
struct Shard {
|
||||
/// Unary gRPC client pool.
|
||||
client_pool: Arc<ClientPool>,
|
||||
/// GetPage stream pool.
|
||||
stream_pool: Arc<StreamPool>,
|
||||
/// GetPage stream pool for bulk requests, e.g. prefetches.
|
||||
bulk_stream_pool: Arc<StreamPool>,
|
||||
}
|
||||
|
||||
impl Shard {
|
||||
/// Creates a new shard. It has its own dedicated resource pools.
|
||||
fn new(
|
||||
url: String,
|
||||
tenant_id: TenantId,
|
||||
timeline_id: TimelineId,
|
||||
shard_id: ShardIndex,
|
||||
auth_token: Option<String>,
|
||||
) -> anyhow::Result<Self> {
|
||||
// Sanity-check that the URL uses gRPC.
|
||||
if PageserverProtocol::from_connstring(&url)? != PageserverProtocol::Grpc {
|
||||
return Err(anyhow!("invalid shard URL {url}: must use gRPC"));
|
||||
}
|
||||
|
||||
// Common channel pool for unary and stream requests. Bounded by client/stream pools.
|
||||
let channel_pool = ChannelPool::new(url.clone(), MAX_CLIENTS_PER_CHANNEL)?;
|
||||
|
||||
// Client pool for unary requests.
|
||||
let client_pool = ClientPool::new(
|
||||
channel_pool.clone(),
|
||||
tenant_id,
|
||||
timeline_id,
|
||||
shard_id,
|
||||
auth_token.clone(),
|
||||
Some(MAX_UNARY_CLIENTS),
|
||||
);
|
||||
|
||||
// GetPage stream pool. Uses a dedicated client pool to avoid starving out unary clients,
|
||||
// but shares a channel pool with it (as it's unbounded).
|
||||
let stream_pool = StreamPool::new(
|
||||
ClientPool::new(
|
||||
channel_pool.clone(),
|
||||
tenant_id,
|
||||
timeline_id,
|
||||
shard_id,
|
||||
auth_token.clone(),
|
||||
None, // unbounded, limited by stream pool
|
||||
),
|
||||
Some(MAX_STREAMS),
|
||||
MAX_STREAM_QUEUE_DEPTH,
|
||||
);
|
||||
|
||||
// Bulk GetPage stream pool, e.g. for prefetches. Uses dedicated channel/client/stream pools
|
||||
// to avoid head-of-line blocking of latency-sensitive requests.
|
||||
let bulk_stream_pool = StreamPool::new(
|
||||
ClientPool::new(
|
||||
ChannelPool::new(url, MAX_CLIENTS_PER_CHANNEL)?,
|
||||
tenant_id,
|
||||
timeline_id,
|
||||
shard_id,
|
||||
auth_token,
|
||||
None, // unbounded, limited by stream pool
|
||||
),
|
||||
Some(MAX_BULK_STREAMS),
|
||||
MAX_BULK_STREAM_QUEUE_DEPTH,
|
||||
);
|
||||
|
||||
Ok(Self {
|
||||
client_pool,
|
||||
stream_pool,
|
||||
bulk_stream_pool,
|
||||
})
|
||||
}
|
||||
|
||||
/// Returns a pooled client for this shard.
|
||||
async fn client(&self) -> tonic::Result<ClientGuard> {
|
||||
self.client_pool
|
||||
.get()
|
||||
.await
|
||||
.map_err(|err| tonic::Status::internal(format!("failed to get client: {err}")))
|
||||
}
|
||||
|
||||
/// Returns a pooled stream for this shard. If `bulk` is `true`, uses the dedicated bulk stream
|
||||
/// pool (e.g. for prefetches).
|
||||
async fn stream(&self, bulk: bool) -> StreamGuard {
|
||||
match bulk {
|
||||
false => self.stream_pool.get().await,
|
||||
true => self.bulk_stream_pool.get().await,
|
||||
}
|
||||
}
|
||||
}
|
||||
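For orientation, a rough caller-side sketch of the pooled client (illustrative only: the
`PageserverClient` constructor shown here is assumed to mirror the `Shards::new` arguments above,
and `shard_map`, `stripe_size`, and `req` are placeholders):

    // Build a client for one timeline from a shard map (one gRPC URL per shard index).
    let client = PageserverClient::new(tenant_id, timeline_id, shard_map, stripe_size, auth_token)?;

    // `req` is a page_api::GetRelSizeRequest; the call is retried internally and routed to shard 0.
    let size = client.get_rel_size(req).await?;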
6
pageserver/client_grpc/src/lib.rs
Normal file
@@ -0,0 +1,6 @@
mod client;
mod pool;
mod retry;
mod split;

pub use client::PageserverClient;
598
pageserver/client_grpc/src/pool.rs
Normal file
@@ -0,0 +1,598 @@
//! This module provides various Pageserver gRPC client resource pools.
//!
//! These pools are designed to reuse gRPC resources (connections, clients, and streams) across
//! multiple concurrent callers (i.e. Postgres backends). This avoids the resource cost and latency
//! of creating dedicated TCP connections and server tasks for every Postgres backend.
//!
//! Each resource has its own, nested pool. The pools are custom-built for the properties of each
//! resource -- they are different enough that a generic pool isn't suitable.
//!
//! * ChannelPool: manages gRPC channels (TCP connections) to a single Pageserver. Multiple clients
//! can acquire and use the same channel concurrently (via HTTP/2 stream multiplexing), up to a
//! per-channel client limit. Channels may be closed when they are no longer used by any clients.
//!
//! * ClientPool: manages gRPC clients for a single tenant shard. Each client acquires a (shared)
//! channel from the ChannelPool for the client's lifetime. A client can only be acquired by a
//! single caller at a time, and is returned to the pool when dropped. Idle clients may be removed
//! from the pool after some time, to free up the channel.
//!
//! * StreamPool: manages bidirectional gRPC GetPage streams. Each stream acquires a client from the
//! ClientPool for the stream's lifetime. Internal streams are not exposed to callers; instead, it
//! returns a guard that can be used to send a single request, to properly enforce queue depth and
//! route responses. Internally, the pool will reuse or spin up a suitable stream for the request,
//! possibly pipelining multiple requests from multiple callers on the same stream (up to some
//! queue depth). Idle streams may be removed from the pool after a while to free up the client.
//!
//! Each channel corresponds to one TCP connection. Each client unary request and each stream
//! corresponds to one HTTP/2 stream and server task.
//!
//! TODO: error handling (including custom error types).
//! TODO: observability.

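//! A minimal construction sketch (illustrative only: the pool limits and the `url`, `tenant_id`,
//! `timeline_id`, `shard_id`, and `auth_token` values are placeholders, not defaults):
//!
//! ```ignore
//! let channels = ChannelPool::new(url, NonZero::new(16).unwrap())?;
//! let clients = ClientPool::new(channels, tenant_id, timeline_id, shard_id, auth_token, None);
//! let streams = StreamPool::new(clients, NonZero::new(64), NonZero::new(4).unwrap());
//! ```
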
use std::collections::{BTreeMap, HashMap};
|
||||
use std::num::NonZero;
|
||||
use std::ops::{Deref, DerefMut};
|
||||
use std::sync::atomic::{AtomicUsize, Ordering};
|
||||
use std::sync::{Arc, Mutex, Weak};
|
||||
|
||||
use futures::StreamExt as _;
|
||||
use tokio::sync::mpsc::{Receiver, Sender};
|
||||
use tokio::sync::{OwnedSemaphorePermit, Semaphore, mpsc, oneshot};
|
||||
use tonic::transport::{Channel, Endpoint};
|
||||
use tracing::{error, warn};
|
||||
|
||||
use pageserver_page_api as page_api;
|
||||
use utils::id::{TenantId, TimelineId};
|
||||
use utils::shard::ShardIndex;
|
||||
|
||||
/// A gRPC channel pool, for a single Pageserver. A channel is shared by many clients (via HTTP/2
|
||||
/// stream multiplexing), up to `clients_per_channel` -- a new channel will be spun up beyond this.
|
||||
/// The pool does not limit the number of channels, and instead relies on `ClientPool` or
|
||||
/// `StreamPool` to limit the number of concurrent clients.
|
||||
///
|
||||
/// The pool is always wrapped in an outer `Arc`, to allow long-lived guards across tasks/threads.
|
||||
///
|
||||
/// TODO: reap idle channels.
|
||||
/// TODO: consider prewarming a set of channels, to avoid initial connection latency.
|
||||
/// TODO: consider adding a circuit breaker for errors and fail fast.
|
||||
pub struct ChannelPool {
|
||||
/// Pageserver endpoint to connect to.
|
||||
endpoint: Endpoint,
|
||||
/// Max number of clients per channel. Beyond this, a new channel will be created.
|
||||
max_clients_per_channel: NonZero<usize>,
|
||||
/// Open channels.
|
||||
channels: Mutex<BTreeMap<ChannelID, ChannelEntry>>,
|
||||
/// Channel ID generator.
|
||||
next_channel_id: AtomicUsize,
|
||||
}
|
||||
|
||||
type ChannelID = usize;
|
||||
|
||||
struct ChannelEntry {
|
||||
/// The gRPC channel (i.e. TCP connection). Shared by multiple clients.
|
||||
channel: Channel,
|
||||
/// Number of clients using this channel.
|
||||
clients: usize,
|
||||
}
|
||||
|
||||
impl ChannelPool {
|
||||
/// Creates a new channel pool for the given Pageserver endpoint.
|
||||
pub fn new<E>(endpoint: E, max_clients_per_channel: NonZero<usize>) -> anyhow::Result<Arc<Self>>
|
||||
where
|
||||
E: TryInto<Endpoint> + Send + Sync + 'static,
|
||||
<E as TryInto<Endpoint>>::Error: std::error::Error + Send + Sync,
|
||||
{
|
||||
Ok(Arc::new(Self {
|
||||
endpoint: endpoint.try_into()?,
|
||||
max_clients_per_channel,
|
||||
channels: Mutex::default(),
|
||||
next_channel_id: AtomicUsize::default(),
|
||||
}))
|
||||
}
|
||||
|
||||
/// Acquires a gRPC channel for a client. Multiple clients may acquire the same channel.
|
||||
///
|
||||
/// This never blocks (except for mutex acquisition). The channel is connected lazily on first
|
||||
/// use, and the `ChannelPool` does not have a channel limit. Channels will be re-established
|
||||
/// automatically on failure (TODO: verify).
|
||||
///
|
||||
/// Callers should not clone the returned channel, and must hold onto the returned guard as long
|
||||
/// as the channel is in use. It is unfortunately not possible to enforce this: the Protobuf
|
||||
/// client requires an owned `Channel` and we don't have access to the channel's internal
|
||||
/// refcount.
|
||||
///
|
||||
/// This is not performance-sensitive. It is only called when creating a new client, and clients
|
||||
/// are pooled and reused by `ClientPool`. The total number of channels will also be small. O(n)
|
||||
/// performance is therefore okay.
|
||||
pub fn get(self: &Arc<Self>) -> ChannelGuard {
|
||||
let mut channels = self.channels.lock().unwrap();
|
||||
|
||||
// Try to find an existing channel with available capacity. We check entries in BTreeMap
|
||||
// order, to fill up the lower-ordered channels first. The ClientPool also prefers clients
|
||||
// with lower-ordered channel IDs first. This will cluster clients in lower-ordered
|
||||
// channels, and free up higher-ordered channels such that they can be reaped.
|
||||
for (&id, entry) in channels.iter_mut() {
|
||||
assert!(
|
||||
entry.clients <= self.max_clients_per_channel.get(),
|
||||
"channel overflow"
|
||||
);
|
||||
if entry.clients < self.max_clients_per_channel.get() {
|
||||
entry.clients += 1;
|
||||
return ChannelGuard {
|
||||
pool: Arc::downgrade(self),
|
||||
id,
|
||||
channel: Some(entry.channel.clone()),
|
||||
};
|
||||
}
|
||||
}
|
||||
|
||||
// Create a new channel. We connect lazily on first use, such that we don't block here and
|
||||
// other clients can join onto the same channel while it's connecting.
|
||||
let channel = self.endpoint.connect_lazy();
|
||||
|
||||
let id = self.next_channel_id.fetch_add(1, Ordering::Relaxed);
|
||||
let entry = ChannelEntry {
|
||||
channel: channel.clone(),
|
||||
clients: 1, // account for the guard below
|
||||
};
|
||||
channels.insert(id, entry);
|
||||
|
||||
ChannelGuard {
|
||||
pool: Arc::downgrade(self),
|
||||
id,
|
||||
channel: Some(channel),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Tracks a channel acquired from the pool. The owned inner channel can be obtained with `take()`,
|
||||
/// since the gRPC client requires an owned `Channel`.
|
||||
pub struct ChannelGuard {
|
||||
pool: Weak<ChannelPool>,
|
||||
id: ChannelID,
|
||||
channel: Option<Channel>,
|
||||
}
|
||||
|
||||
impl ChannelGuard {
|
||||
/// Returns the inner owned channel. Panics if called more than once. The caller must hold onto
|
||||
/// the guard as long as the channel is in use, and should not clone it.
|
||||
pub fn take(&mut self) -> Channel {
|
||||
self.channel.take().expect("channel already taken")
|
||||
}
|
||||
}
|
||||
|
||||
/// Returns the channel to the pool.
|
||||
impl Drop for ChannelGuard {
|
||||
fn drop(&mut self) {
|
||||
let Some(pool) = self.pool.upgrade() else {
|
||||
return; // pool was dropped
|
||||
};
|
||||
let mut channels = pool.channels.lock().unwrap();
|
||||
let entry = channels.get_mut(&self.id).expect("unknown channel");
|
||||
assert!(entry.clients > 0, "channel underflow");
|
||||
entry.clients -= 1;
|
||||
}
|
||||
}
|
||||
|
||||
/// A pool of gRPC clients for a single tenant shard. Each client acquires a channel from the inner
|
||||
/// `ChannelPool`. A client is only given out to a single caller at a time. The pool limits the total
|
||||
/// number of concurrent clients to `max_clients` via semaphore.
|
||||
///
|
||||
/// The pool is always wrapped in an outer `Arc`, to allow long-lived guards across tasks/threads.
|
||||
///
|
||||
/// TODO: reap idle clients.
|
||||
pub struct ClientPool {
|
||||
/// Tenant ID.
|
||||
tenant_id: TenantId,
|
||||
/// Timeline ID.
|
||||
timeline_id: TimelineId,
|
||||
/// Shard ID.
|
||||
shard_id: ShardIndex,
|
||||
/// Authentication token, if any.
|
||||
auth_token: Option<String>,
|
||||
/// Channel pool to acquire channels from.
|
||||
channel_pool: Arc<ChannelPool>,
|
||||
/// Limits the max number of concurrent clients for this pool. None if the pool is unbounded.
|
||||
limiter: Option<Arc<Semaphore>>,
|
||||
/// Idle pooled clients. Acquired clients are removed from here and returned on drop.
|
||||
///
|
||||
/// The first client in the map will be acquired next. The map is sorted by client ID, which in
|
||||
/// turn is sorted by its channel ID, such that we prefer acquiring idle clients from
|
||||
/// lower-ordered channels. This allows us to free up and reap higher-numbered channels as idle
|
||||
/// clients are reaped.
|
||||
idle: Mutex<BTreeMap<ClientID, ClientEntry>>,
|
||||
/// Unique client ID generator.
|
||||
next_client_id: AtomicUsize,
|
||||
}
|
||||
|
||||
type ClientID = (ChannelID, usize);
|
||||
|
||||
struct ClientEntry {
|
||||
/// The pooled gRPC client.
|
||||
client: page_api::Client,
|
||||
/// The channel guard for the channel used by the client.
|
||||
channel_guard: ChannelGuard,
|
||||
}
|
||||
|
||||
impl ClientPool {
|
||||
/// Creates a new client pool for the given tenant shard. Channels are acquired from the given
|
||||
/// `ChannelPool`, which must point to a Pageserver that hosts the tenant shard. Allows up to
|
||||
/// `max_clients` concurrent clients, or unbounded if None.
|
||||
pub fn new(
|
||||
channel_pool: Arc<ChannelPool>,
|
||||
tenant_id: TenantId,
|
||||
timeline_id: TimelineId,
|
||||
shard_id: ShardIndex,
|
||||
auth_token: Option<String>,
|
||||
max_clients: Option<NonZero<usize>>,
|
||||
) -> Arc<Self> {
|
||||
Arc::new(Self {
|
||||
tenant_id,
|
||||
timeline_id,
|
||||
shard_id,
|
||||
auth_token,
|
||||
channel_pool,
|
||||
idle: Mutex::default(),
|
||||
limiter: max_clients.map(|max| Arc::new(Semaphore::new(max.get()))),
|
||||
next_client_id: AtomicUsize::default(),
|
||||
})
|
||||
}
|
||||
|
||||
/// Gets a client from the pool, or creates a new one if necessary. Connections are established
|
||||
/// lazily and do not block, but this call can block if the pool is at `max_clients`. The client
|
||||
/// is returned to the pool when the guard is dropped.
|
||||
///
|
||||
/// This is moderately performance-sensitive. It is called for every unary request, but these
|
||||
/// establish a new gRPC stream per request so they're already expensive. GetPage requests use
|
||||
/// the `StreamPool` instead.
|
||||
pub async fn get(self: &Arc<Self>) -> anyhow::Result<ClientGuard> {
|
||||
// Acquire a permit if the pool is bounded.
|
||||
let mut permit = None;
|
||||
if let Some(limiter) = self.limiter.clone() {
|
||||
permit = Some(limiter.acquire_owned().await.expect("never closed"));
|
||||
}
|
||||
|
||||
// Fast path: acquire an idle client from the pool.
|
||||
if let Some((id, entry)) = self.idle.lock().unwrap().pop_first() {
|
||||
return Ok(ClientGuard {
|
||||
pool: Arc::downgrade(self),
|
||||
id,
|
||||
client: Some(entry.client),
|
||||
channel_guard: Some(entry.channel_guard),
|
||||
permit,
|
||||
});
|
||||
}
|
||||
|
||||
// Slow path: construct a new client.
|
||||
let mut channel_guard = self.channel_pool.get();
|
||||
let client = page_api::Client::new(
|
||||
channel_guard.take(),
|
||||
self.tenant_id,
|
||||
self.timeline_id,
|
||||
self.shard_id,
|
||||
self.auth_token.clone(),
|
||||
None,
|
||||
)?;
|
||||
|
||||
Ok(ClientGuard {
|
||||
pool: Arc::downgrade(self),
|
||||
id: (
|
||||
channel_guard.id,
|
||||
self.next_client_id.fetch_add(1, Ordering::Relaxed),
|
||||
),
|
||||
client: Some(client),
|
||||
channel_guard: Some(channel_guard),
|
||||
permit,
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
/// A client acquired from the pool. The inner client can be accessed via Deref. The client is
|
||||
/// returned to the pool when dropped.
|
||||
pub struct ClientGuard {
|
||||
pool: Weak<ClientPool>,
|
||||
id: ClientID,
|
||||
client: Option<page_api::Client>, // Some until dropped
|
||||
channel_guard: Option<ChannelGuard>, // Some until dropped
|
||||
permit: Option<OwnedSemaphorePermit>, // None if pool is unbounded
|
||||
}
|
||||
|
||||
impl Deref for ClientGuard {
|
||||
type Target = page_api::Client;
|
||||
|
||||
fn deref(&self) -> &Self::Target {
|
||||
self.client.as_ref().expect("not dropped")
|
||||
}
|
||||
}
|
||||
|
||||
impl DerefMut for ClientGuard {
|
||||
fn deref_mut(&mut self) -> &mut Self::Target {
|
||||
self.client.as_mut().expect("not dropped")
|
||||
}
|
||||
}
|
||||
|
||||
/// Returns the client to the pool.
|
||||
impl Drop for ClientGuard {
|
||||
fn drop(&mut self) {
|
||||
let Some(pool) = self.pool.upgrade() else {
|
||||
return; // pool was dropped
|
||||
};
|
||||
let entry = ClientEntry {
|
||||
client: self.client.take().expect("dropped once"),
|
||||
channel_guard: self.channel_guard.take().expect("dropped once"),
|
||||
};
|
||||
pool.idle.lock().unwrap().insert(self.id, entry);
|
||||
|
||||
_ = self.permit; // returned on drop, referenced for visibility
|
||||
}
|
||||
}
|
||||
|
||||
/// A pool of bidirectional gRPC streams. Currently only used for GetPage streams. Each stream
|
||||
/// acquires a client from the inner `ClientPool` for the stream's lifetime.
|
||||
///
|
||||
/// Individual streams are not exposed to callers -- instead, the returned guard can be used to send
|
||||
/// a single request and await the response. Internally, requests are multiplexed across streams and
|
||||
/// channels. This allows proper queue depth enforcement and response routing.
|
||||
///
|
||||
/// TODO: reap idle streams.
|
||||
/// TODO: consider making this generic over request and response types; not currently needed.
|
||||
pub struct StreamPool {
|
||||
/// The client pool to acquire clients from. Must be unbounded.
|
||||
client_pool: Arc<ClientPool>,
|
||||
/// All pooled streams.
|
||||
///
|
||||
/// Incoming requests will be sent over an existing stream with available capacity. If all
|
||||
/// streams are full, a new one is spun up and added to the pool (up to `max_streams`). Each
|
||||
/// stream has an associated Tokio task that processes requests and responses.
|
||||
streams: Arc<Mutex<HashMap<StreamID, StreamEntry>>>,
|
||||
/// The max number of concurrent streams, or None if unbounded.
|
||||
max_streams: Option<NonZero<usize>>,
|
||||
/// The max number of concurrent requests per stream.
|
||||
max_queue_depth: NonZero<usize>,
|
||||
/// Limits the max number of concurrent requests, given by `max_streams * max_queue_depth`.
|
||||
/// None if the pool is unbounded.
|
||||
limiter: Option<Arc<Semaphore>>,
|
||||
/// Stream ID generator.
|
||||
next_stream_id: AtomicUsize,
|
||||
}
|
||||
|
||||
type StreamID = usize;
|
||||
type RequestSender = Sender<(page_api::GetPageRequest, ResponseSender)>;
|
||||
type RequestReceiver = Receiver<(page_api::GetPageRequest, ResponseSender)>;
|
||||
type ResponseSender = oneshot::Sender<tonic::Result<page_api::GetPageResponse>>;
|
||||
|
||||
struct StreamEntry {
|
||||
/// Sends caller requests to the stream task. The stream task exits when this is dropped.
|
||||
sender: RequestSender,
|
||||
/// Number of in-flight requests on this stream. This is an atomic to allow decrementing it on
|
||||
/// completion without acquiring the `StreamPool::streams` lock.
|
||||
queue_depth: Arc<AtomicUsize>,
|
||||
}
|
||||
|
||||
impl StreamPool {
|
||||
/// Creates a new stream pool, using the given client pool. It will send up to `max_queue_depth`
|
||||
/// concurrent requests on each stream, and use up to `max_streams` concurrent streams.
|
||||
///
|
||||
/// The client pool must be unbounded. The stream pool will enforce its own limits, and because
|
||||
/// streams are long-lived they can cause persistent starvation if they exhaust the client pool.
|
||||
/// The stream pool should generally have its own dedicated client pool (but it can share a
|
||||
/// channel pool with others since these are always unbounded).
|
||||
pub fn new(
|
||||
client_pool: Arc<ClientPool>,
|
||||
max_streams: Option<NonZero<usize>>,
|
||||
max_queue_depth: NonZero<usize>,
|
||||
) -> Arc<Self> {
|
||||
assert!(client_pool.limiter.is_none(), "bounded client pool");
|
||||
Arc::new(Self {
|
||||
client_pool,
|
||||
streams: Arc::default(),
|
||||
limiter: max_streams.map(|max_streams| {
|
||||
Arc::new(Semaphore::new(max_streams.get() * max_queue_depth.get()))
|
||||
}),
|
||||
max_streams,
|
||||
max_queue_depth,
|
||||
next_stream_id: AtomicUsize::default(),
|
||||
})
|
||||
}
|
||||
|
||||
/// Acquires an available stream from the pool, or spins up a new stream async if all streams
|
||||
/// are full. Returns a guard that can be used to send a single request on the stream and await
|
||||
/// the response, with queue depth quota already acquired. Blocks if the pool is at capacity
|
||||
/// (i.e. `CLIENT_LIMIT * STREAM_QUEUE_DEPTH` requests in flight).
|
||||
///
|
||||
/// This is very performance-sensitive, as it is on the GetPage hot path.
|
||||
///
|
||||
/// TODO: this must do something more sophisticated for performance. We want:
|
||||
///
|
||||
/// * Cheap, concurrent access in the common case where we can use a pooled stream.
|
||||
/// * Quick acquisition of pooled streams with available capacity.
|
||||
/// * Prefer streams that belong to lower-numbered channels, to reap idle channels.
|
||||
/// * Prefer filling up existing streams' queue depth before spinning up new streams.
|
||||
/// * Don't hold a lock while spinning up new streams.
|
||||
/// * Allow concurrent clients to join onto streams while they're spun up.
|
||||
/// * Allow spinning up multiple streams concurrently, but don't overshoot limits.
|
||||
///
|
||||
/// For now, we just do something simple and functional, but very inefficient (linear scan).
|
||||
pub async fn get(&self) -> StreamGuard {
|
||||
// Acquire a permit if the pool is bounded.
|
||||
let mut permit = None;
|
||||
if let Some(limiter) = self.limiter.clone() {
|
||||
permit = Some(limiter.acquire_owned().await.expect("never closed"));
|
||||
}
|
||||
let mut streams = self.streams.lock().unwrap();
|
||||
|
||||
// Look for a pooled stream with available capacity.
|
||||
for entry in streams.values() {
|
||||
assert!(
|
||||
entry.queue_depth.load(Ordering::Relaxed) <= self.max_queue_depth.get(),
|
||||
"stream queue overflow"
|
||||
);
|
||||
if entry
|
||||
.queue_depth
|
||||
.fetch_update(Ordering::SeqCst, Ordering::SeqCst, |queue_depth| {
|
||||
// Increment the queue depth via compare-and-swap.
|
||||
// TODO: review ordering.
|
||||
(queue_depth < self.max_queue_depth.get()).then_some(queue_depth + 1)
|
||||
})
|
||||
.is_ok()
|
||||
{
|
||||
return StreamGuard {
|
||||
sender: entry.sender.clone(),
|
||||
queue_depth: entry.queue_depth.clone(),
|
||||
permit,
|
||||
};
|
||||
}
|
||||
}
|
||||
|
||||
// No available stream, spin up a new one. We install the stream entry in the pool first and
|
||||
// return the guard, while spinning up the stream task async. This allows other callers to
|
||||
// join onto this stream and also create additional streams concurrently if this fills up.
|
||||
let id = self.next_stream_id.fetch_add(1, Ordering::Relaxed);
|
||||
let queue_depth = Arc::new(AtomicUsize::new(1)); // reserve quota for this caller
|
||||
let (req_tx, req_rx) = mpsc::channel(self.max_queue_depth.get());
|
||||
let entry = StreamEntry {
|
||||
sender: req_tx.clone(),
|
||||
queue_depth: queue_depth.clone(),
|
||||
};
|
||||
streams.insert(id, entry);
|
||||
|
||||
if let Some(max_streams) = self.max_streams {
|
||||
assert!(streams.len() <= max_streams.get(), "stream overflow");
|
||||
};
|
||||
|
||||
let client_pool = self.client_pool.clone();
|
||||
let streams = self.streams.clone();
|
||||
|
||||
tokio::spawn(async move {
|
||||
if let Err(err) = Self::run_stream(client_pool, req_rx).await {
|
||||
error!("stream failed: {err}");
|
||||
}
|
||||
// Remove stream from pool on exit.
|
||||
let entry = streams.lock().unwrap().remove(&id);
|
||||
assert!(entry.is_some(), "unknown stream ID: {id}");
|
||||
});
|
||||
|
||||
StreamGuard {
|
||||
sender: req_tx,
|
||||
queue_depth,
|
||||
permit,
|
||||
}
|
||||
}
|
||||
|
||||
/// Runs a stream task. This acquires a client from the `ClientPool` and establishes a
|
||||
/// bidirectional GetPage stream, then forwards requests and responses between callers and the
|
||||
/// stream. It does not track or enforce queue depths -- that's done by `get()` since it must be
|
||||
/// atomic with pool stream acquisition.
|
||||
///
|
||||
/// The task exits when the request channel is closed, or on a stream error. The caller is
|
||||
/// responsible for removing the stream from the pool on exit.
|
||||
async fn run_stream(
|
||||
client_pool: Arc<ClientPool>,
|
||||
mut caller_rx: RequestReceiver,
|
||||
) -> anyhow::Result<()> {
|
||||
// Acquire a client from the pool and create a stream.
|
||||
let mut client = client_pool.get().await?;
|
||||
|
||||
// NB: use an unbounded channel such that the stream send never blocks. Otherwise, we could
|
||||
// theoretically deadlock if both the client and server block on sends (since we're not
|
||||
// reading responses while sending). This is unlikely to happen due to gRPC/TCP buffers and
|
||||
// low queue depths, but it was seen to happen with the libpq protocol so better safe than
|
||||
// sorry. It should never buffer more than the queue depth anyway, but using an unbounded
|
||||
// channel guarantees that it will never block.
|
||||
let (req_tx, req_rx) = mpsc::unbounded_channel();
|
||||
let req_stream = tokio_stream::wrappers::UnboundedReceiverStream::new(req_rx);
|
||||
let mut resp_stream = client.get_pages(req_stream).await?;
|
||||
|
||||
// Track caller response channels by request ID. If the task returns early, these response
|
||||
// channels will be dropped and the waiting callers will receive an error.
|
||||
let mut callers = HashMap::new();
|
||||
|
||||
// Process requests and responses.
|
||||
loop {
|
||||
tokio::select! {
|
||||
// Receive requests from callers and send them to the stream.
|
||||
req = caller_rx.recv() => {
|
||||
// Shut down if request channel is closed.
|
||||
let Some((req, resp_tx)) = req else {
|
||||
return Ok(());
|
||||
};
|
||||
|
||||
// Store the response channel by request ID.
|
||||
if callers.contains_key(&req.request_id) {
|
||||
// Error on request ID duplicates. Ignore callers that went away.
|
||||
_ = resp_tx.send(Err(tonic::Status::invalid_argument(
|
||||
format!("duplicate request ID: {}", req.request_id),
|
||||
)));
|
||||
continue;
|
||||
}
|
||||
callers.insert(req.request_id, resp_tx);
|
||||
|
||||
// Send the request on the stream. Bail out if the stream is closed.
|
||||
req_tx.send(req).map_err(|_| {
|
||||
tonic::Status::unavailable("stream closed")
|
||||
})?;
|
||||
}
|
||||
|
||||
// Receive responses from the stream and send them to callers.
|
||||
resp = resp_stream.next() => {
|
||||
// Shut down if the stream is closed, and bail out on stream errors.
|
||||
let Some(resp) = resp.transpose()? else {
|
||||
return Ok(())
|
||||
};
|
||||
|
||||
// Send the response to the caller. Ignore errors if the caller went away.
|
||||
let Some(resp_tx) = callers.remove(&resp.request_id) else {
|
||||
warn!("received response for unknown request ID: {}", resp.request_id);
|
||||
continue;
|
||||
};
|
||||
_ = resp_tx.send(Ok(resp));
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// A pooled stream reference. Can be used to send a single request, to properly enforce queue
|
||||
/// depth. Queue depth is already reserved and will be returned on drop.
|
||||
pub struct StreamGuard {
|
||||
sender: RequestSender,
|
||||
queue_depth: Arc<AtomicUsize>,
|
||||
permit: Option<OwnedSemaphorePermit>, // None if pool is unbounded
|
||||
}
|
||||
|
||||
impl StreamGuard {
|
||||
/// Sends a request on the stream and awaits the response. Consumes the guard, since it's only
|
||||
/// valid for a single request (to enforce queue depth). This also drops the guard on return and
|
||||
/// returns the queue depth quota to the pool.
|
||||
///
|
||||
/// The `GetPageRequest::request_id` must be unique across in-flight requests.
|
||||
///
|
||||
/// NB: errors are often returned as `GetPageResponse::status_code` instead of `tonic::Status`
|
||||
/// to avoid tearing down the stream for per-request errors. Callers must check this.
|
||||
pub async fn send(
|
||||
self,
|
||||
req: page_api::GetPageRequest,
|
||||
) -> tonic::Result<page_api::GetPageResponse> {
|
||||
let (resp_tx, resp_rx) = oneshot::channel();
|
||||
|
||||
self.sender
|
||||
.send((req, resp_tx))
|
||||
.await
|
||||
.map_err(|_| tonic::Status::unavailable("stream closed"))?;
|
||||
|
||||
resp_rx
|
||||
.await
|
||||
.map_err(|_| tonic::Status::unavailable("stream closed"))?
|
||||
}
|
||||
}
|
||||
|
||||
impl Drop for StreamGuard {
|
||||
fn drop(&mut self) {
|
||||
// Release the queue depth reservation on drop. This can prematurely decrement it if dropped
|
||||
// before the response is received, but that's okay.
|
||||
let prev_queue_depth = self.queue_depth.fetch_sub(1, Ordering::SeqCst);
|
||||
assert!(prev_queue_depth > 0, "stream queue underflow");
|
||||
|
||||
_ = self.permit; // returned on drop, referenced for visibility
|
||||
}
|
||||
}
|
||||
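A caller-side sketch of the stream pool, mirroring the GetPage path in client.rs (illustrative
only: `stream_pool` and `req` are placeholders for a pooled `StreamPool` and a
`page_api::GetPageRequest`):

    // Acquiring a guard reserves queue-depth quota; the guard is good for exactly one request.
    let stream = stream_pool.get().await;
    let resp = stream.send(req).await?;

    // Per-request failures come back as a status code, so the shared stream isn't torn down.
    if resp.status_code != page_api::GetPageStatusCode::Ok {
        // map resp.status_code / resp.reason into an error here
    }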
151
pageserver/client_grpc/src/retry.rs
Normal file
@@ -0,0 +1,151 @@
use std::time::Duration;

use tokio::time::Instant;
use tracing::{error, info, warn};

use utils::backoff::exponential_backoff_duration;

/// A retry handler for Pageserver gRPC requests.
///
/// This is used instead of backoff::retry for better control and observability.
pub struct Retry;

impl Retry {
/// The per-request timeout.
// TODO: tune these, and/or make them configurable. Should we retry forever?
const REQUEST_TIMEOUT: Duration = Duration::from_secs(10);
/// The total timeout across all attempts.
const TOTAL_TIMEOUT: Duration = Duration::from_secs(60);
/// The initial backoff duration.
const BASE_BACKOFF: Duration = Duration::from_millis(10);
/// The maximum backoff duration.
const MAX_BACKOFF: Duration = Duration::from_secs(10);
/// If true, log successful requests. For debugging.
const LOG_SUCCESS: bool = false;

/// Runs the given async closure with timeouts and retries (exponential backoff). Logs errors,
|
||||
/// using the current tracing span for context.
|
||||
///
|
||||
/// Only certain gRPC status codes are retried, see [`Self::should_retry`]. For default
|
||||
/// timeouts, see [`Self::REQUEST_TIMEOUT`] and [`Self::TOTAL_TIMEOUT`].
|
||||
pub async fn with<T, F, O>(&self, mut f: F) -> tonic::Result<T>
|
||||
where
|
||||
F: FnMut() -> O,
|
||||
O: Future<Output = tonic::Result<T>>,
|
||||
{
|
||||
let started = Instant::now();
|
||||
let deadline = started + Self::TOTAL_TIMEOUT;
|
||||
let mut last_error = None;
|
||||
let mut retries = 0;
|
||||
loop {
|
||||
// Set up a future to wait for the backoff (if any) and run the request with a timeout.
|
||||
let backoff_and_try = async {
|
||||
// NB: sleep() always sleeps 1ms, even when given a 0 argument. See:
|
||||
// https://github.com/tokio-rs/tokio/issues/6866
|
||||
if let Some(backoff) = Self::backoff_duration(retries) {
|
||||
tokio::time::sleep(backoff).await;
|
||||
}
|
||||
|
||||
let request_started = Instant::now();
|
||||
tokio::time::timeout(Self::REQUEST_TIMEOUT, f())
|
||||
.await
|
||||
.map_err(|_| {
|
||||
tonic::Status::deadline_exceeded(format!(
|
||||
"request timed out after {:.3}s",
|
||||
request_started.elapsed().as_secs_f64()
|
||||
))
|
||||
})?
|
||||
};
|
||||
|
||||
// Wait for the backoff and request, or bail out if the total timeout is exceeded.
|
||||
let result = tokio::select! {
|
||||
result = backoff_and_try => result,
|
||||
|
||||
_ = tokio::time::sleep_until(deadline) => {
|
||||
let last_error = last_error.unwrap_or_else(|| {
|
||||
tonic::Status::deadline_exceeded(format!(
|
||||
"request timed out after {:.3}s",
|
||||
started.elapsed().as_secs_f64()
|
||||
))
|
||||
});
|
||||
error!(
|
||||
"giving up after {:.3}s and {retries} retries, last error {:?}: {}",
|
||||
started.elapsed().as_secs_f64(), last_error.code(), last_error.message(),
|
||||
);
|
||||
return Err(last_error);
|
||||
}
|
||||
};
|
||||
|
||||
match result {
|
||||
// Success, return the result.
|
||||
Ok(result) => {
|
||||
if retries > 0 || Self::LOG_SUCCESS {
|
||||
info!(
|
||||
"request succeeded after {retries} retries in {:.3}s",
|
||||
started.elapsed().as_secs_f64(),
|
||||
);
|
||||
}
|
||||
|
||||
return Ok(result);
|
||||
}
|
||||
|
||||
// Error, retry or bail out.
|
||||
Err(status) => {
|
||||
let (code, message) = (status.code(), status.message());
|
||||
let attempt = retries + 1;
|
||||
|
||||
if !Self::should_retry(code) {
|
||||
// NB: include the attempt here too. This isn't necessarily the first
|
||||
// attempt, because the error may change between attempts.
|
||||
error!(
|
||||
"request failed with {code:?}: {message}, not retrying (attempt {attempt})"
|
||||
);
|
||||
return Err(status);
|
||||
}
|
||||
|
||||
warn!("request failed with {code:?}: {message}, retrying (attempt {attempt})");
|
||||
|
||||
retries += 1;
|
||||
last_error = Some(status);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Returns the backoff duration for the given retry attempt, or None for no backoff.
|
||||
fn backoff_duration(retry: usize) -> Option<Duration> {
|
||||
let backoff = exponential_backoff_duration(
|
||||
retry as u32,
|
||||
Self::BASE_BACKOFF.as_secs_f64(),
|
||||
Self::MAX_BACKOFF.as_secs_f64(),
|
||||
);
|
||||
(!backoff.is_zero()).then_some(backoff)
|
||||
}

/// Returns true if the given status code should be retried.
fn should_retry(code: tonic::Code) -> bool {
match code {
tonic::Code::Ok => panic!("unexpected Ok status code"),

// These codes are transient, so retry them.
tonic::Code::Aborted => true,
tonic::Code::Cancelled => true,
tonic::Code::DeadlineExceeded => true, // maybe transient slowness
tonic::Code::Internal => true, // maybe transient failure?
tonic::Code::ResourceExhausted => true,
tonic::Code::Unavailable => true,

// The following codes will likely continue to fail, so don't retry.
tonic::Code::AlreadyExists => false,
tonic::Code::DataLoss => false,
tonic::Code::FailedPrecondition => false,
tonic::Code::InvalidArgument => false,
tonic::Code::NotFound => false,
tonic::Code::OutOfRange => false,
tonic::Code::PermissionDenied => false,
tonic::Code::Unauthenticated => false,
tonic::Code::Unimplemented => false,
tonic::Code::Unknown => false,
}
}
}
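A usage sketch, mirroring how client.rs wraps unary calls (illustrative only: `shard` and `req`
are placeholders for a pooled `Shard` handle and its request):

    let resp = Retry
        .with(async || {
            // Acquire a pooled client inside the closure so every attempt gets a usable client.
            let mut client = shard.client().await?;
            client.get_rel_size(req).await
        })
        .await?;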
172
pageserver/client_grpc/src/split.rs
Normal file
@@ -0,0 +1,172 @@
use std::collections::HashMap;

use bytes::Bytes;

use pageserver_api::key::rel_block_to_key;
use pageserver_api::shard::{ShardStripeSize, key_to_shard_number};
use pageserver_page_api as page_api;
use utils::shard::{ShardCount, ShardIndex};

/// Splits GetPageRequests that straddle shard boundaries and assembles the responses.
/// TODO: add tests for this.
pub struct GetPageSplitter {
/// The original request ID. Used for all shard requests.
request_id: page_api::RequestID,
/// Split requests by shard index.
requests: HashMap<ShardIndex, page_api::GetPageRequest>,
/// Maps the offset in `GetPageRequest::block_numbers` to the owning shard. Used to assemble
/// the response pages in the same order as the original request.
block_shards: Vec<ShardIndex>,
/// Page responses by shard index. Will be assembled into a single response.
responses: HashMap<ShardIndex, Vec<Bytes>>,
}

impl GetPageSplitter {
|
||||
/// Checks if the given request only touches a single shard, and returns the shard ID. This is
|
||||
/// the common case, so we check first in order to avoid unnecessary allocations and overhead.
|
||||
/// The caller must ensure that the request has at least one block number, or this will panic.
|
||||
pub fn is_single_shard(
|
||||
req: &page_api::GetPageRequest,
|
||||
count: ShardCount,
|
||||
stripe_size: ShardStripeSize,
|
||||
) -> Option<ShardIndex> {
|
||||
// Fast path: unsharded tenant.
|
||||
if count.is_unsharded() {
|
||||
return Some(ShardIndex::unsharded());
|
||||
}
|
||||
|
||||
// Find the base shard index for the first page, and compare with the rest.
|
||||
let key = rel_block_to_key(req.rel, *req.block_numbers.first().expect("no pages"));
|
||||
let shard_number = key_to_shard_number(count, stripe_size, &key);
|
||||
|
||||
req.block_numbers
|
||||
.iter()
|
||||
.skip(1) // computed above
|
||||
.all(|&blkno| {
|
||||
let key = rel_block_to_key(req.rel, blkno);
|
||||
key_to_shard_number(count, stripe_size, &key) == shard_number
|
||||
})
|
||||
.then_some(ShardIndex::new(shard_number, count))
|
||||
}
|
||||
|
||||
/// Splits the given request.
|
||||
pub fn split(
|
||||
req: page_api::GetPageRequest,
|
||||
count: ShardCount,
|
||||
stripe_size: ShardStripeSize,
|
||||
) -> Self {
|
||||
// The caller should make sure we don't split requests unnecessarily.
|
||||
debug_assert!(
|
||||
Self::is_single_shard(&req, count, stripe_size).is_none(),
|
||||
"unnecessary request split"
|
||||
);
|
||||
|
||||
// Split the requests by shard index.
|
||||
let mut requests = HashMap::with_capacity(2); // common case
|
||||
let mut block_shards = Vec::with_capacity(req.block_numbers.len());
|
||||
for blkno in req.block_numbers {
|
||||
let key = rel_block_to_key(req.rel, blkno);
|
||||
let shard_number = key_to_shard_number(count, stripe_size, &key);
|
||||
let shard_id = ShardIndex::new(shard_number, count);
|
||||
|
||||
let shard_req = requests
|
||||
.entry(shard_id)
|
||||
.or_insert_with(|| page_api::GetPageRequest {
|
||||
request_id: req.request_id,
|
||||
request_class: req.request_class,
|
||||
rel: req.rel,
|
||||
read_lsn: req.read_lsn,
|
||||
block_numbers: Vec::new(),
|
||||
});
|
||||
shard_req.block_numbers.push(blkno);
|
||||
block_shards.push(shard_id);
|
||||
}
|
||||
|
||||
Self {
|
||||
request_id: req.request_id,
|
||||
responses: HashMap::with_capacity(requests.len()),
|
||||
requests,
|
||||
block_shards,
|
||||
}
|
||||
}
|
||||
|
||||
/// Drains the per-shard requests, moving them out of the hashmap to avoid extra allocations.
|
||||
pub fn drain_requests(
|
||||
&mut self,
|
||||
) -> impl Iterator<Item = (ShardIndex, page_api::GetPageRequest)> {
|
||||
self.requests.drain()
|
||||
}
|
||||
|
||||
/// Adds a response from the given shard.
|
||||
#[allow(clippy::result_large_err)]
|
||||
pub fn add_response(
|
||||
&mut self,
|
||||
shard_id: ShardIndex,
|
||||
response: page_api::GetPageResponse,
|
||||
) -> tonic::Result<()> {
|
||||
// The caller should already have converted status codes into tonic::Status.
|
||||
assert_eq!(response.status_code, page_api::GetPageStatusCode::Ok);
|
||||
|
||||
// Make sure the response matches the request ID.
|
||||
if response.request_id != self.request_id {
|
||||
return Err(tonic::Status::internal(format!(
|
||||
"response ID {} does not match request ID {}",
|
||||
response.request_id, self.request_id
|
||||
)));
|
||||
}
|
||||
|
||||
// Add the response data to the map.
|
||||
let old = self.responses.insert(shard_id, response.page_images);
|
||||
|
||||
if old.is_some() {
|
||||
return Err(tonic::Status::internal(format!(
|
||||
"duplicate response for shard {shard_id}",
|
||||
)));
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Assembles the shard responses into a single response. Responses must be present for all
|
||||
/// relevant shards, and the total number of pages must match the original request.
|
||||
#[allow(clippy::result_large_err)]
|
||||
pub fn assemble_response(self) -> tonic::Result<page_api::GetPageResponse> {
|
||||
let mut response = page_api::GetPageResponse {
|
||||
request_id: self.request_id,
|
||||
status_code: page_api::GetPageStatusCode::Ok,
|
||||
reason: None,
|
||||
page_images: Vec::with_capacity(self.block_shards.len()),
|
||||
};
|
||||
|
||||
// Set up per-shard page iterators we can pull from.
|
||||
let mut shard_responses = HashMap::with_capacity(self.responses.len());
|
||||
for (shard_id, responses) in self.responses {
|
||||
shard_responses.insert(shard_id, responses.into_iter());
|
||||
}
|
||||
|
||||
// Reassemble the responses in the same order as the original request.
|
||||
for shard_id in &self.block_shards {
|
||||
let page = shard_responses
|
||||
.get_mut(shard_id)
|
||||
.ok_or_else(|| {
|
||||
tonic::Status::internal(format!("missing response for shard {shard_id}"))
|
||||
})?
|
||||
.next()
|
||||
.ok_or_else(|| {
|
||||
tonic::Status::internal(format!("missing page from shard {shard_id}"))
|
||||
})?;
|
||||
response.page_images.push(page);
|
||||
}
|
||||
|
||||
// Make sure there are no additional pages.
|
||||
for (shard_id, mut pages) in shard_responses {
|
||||
if pages.next().is_some() {
|
||||
return Err(tonic::Status::internal(format!(
|
||||
"extra pages returned from shard {shard_id}"
|
||||
)));
|
||||
}
|
||||
}
|
||||
|
||||
Ok(response)
|
||||
}
|
||||
}
|
||||
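A sketch of the intended request flow (illustrative only: `send_to_shard` is a placeholder for the
per-shard GetPage call, and the real client issues the split requests concurrently):

    // Fast path: most requests touch a single shard and need no splitting.
    if let Some(shard_id) = GetPageSplitter::is_single_shard(&req, count, stripe_size) {
        return send_to_shard(shard_id, req).await;
    }

    // Slow path: split per shard, fan out, and reassemble pages in the original block order.
    let mut splitter = GetPageSplitter::split(req, count, stripe_size);
    let requests: Vec<_> = splitter.drain_requests().collect();
    for (shard_id, shard_req) in requests {
        let resp = send_to_shard(shard_id, shard_req).await?;
        splitter.add_response(shard_id, resp)?;
    }
    let resp = splitter.assemble_response()?;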
@@ -1,10 +1,101 @@
|
||||
use std::str::FromStr;
|
||||
|
||||
use anyhow::Context;
|
||||
use camino::Utf8PathBuf;
|
||||
use pageserver::tenant::IndexPart;
|
||||
use pageserver::tenant::{
|
||||
IndexPart,
|
||||
layer_map::{LayerMap, SearchResult},
|
||||
remote_timeline_client::remote_layer_path,
|
||||
storage_layer::{PersistentLayerDesc, ReadableLayerWeak},
|
||||
};
|
||||
use pageserver_api::key::Key;
|
||||
use utils::{
|
||||
id::{TenantId, TimelineId},
|
||||
lsn::Lsn,
|
||||
shard::TenantShardId,
|
||||
};
|
||||
|
||||
#[derive(clap::Subcommand)]
|
||||
pub(crate) enum IndexPartCmd {
|
||||
Dump { path: Utf8PathBuf },
|
||||
Dump {
|
||||
path: Utf8PathBuf,
|
||||
},
|
||||
/// Find all layers that need to be searched to construct the given page at the given LSN.
|
||||
Search {
|
||||
#[arg(long)]
|
||||
tenant_id: String,
|
||||
#[arg(long)]
|
||||
timeline_id: String,
|
||||
#[arg(long)]
|
||||
path: Utf8PathBuf,
|
||||
#[arg(long)]
|
||||
key: String,
|
||||
#[arg(long)]
|
||||
lsn: String,
|
||||
},
|
||||
}
|
||||
|
||||
async fn search_layers(
|
||||
tenant_id: &str,
|
||||
timeline_id: &str,
|
||||
path: &Utf8PathBuf,
|
||||
key: &str,
|
||||
lsn: &str,
|
||||
) -> anyhow::Result<()> {
|
||||
let tenant_id = TenantId::from_str(tenant_id).unwrap();
|
||||
let tenant_shard_id = TenantShardId::unsharded(tenant_id);
|
||||
let timeline_id = TimelineId::from_str(timeline_id).unwrap();
|
||||
let index_json = {
|
||||
let bytes = tokio::fs::read(path).await?;
|
||||
IndexPart::from_json_bytes(&bytes).unwrap()
|
||||
};
|
||||
let mut layer_map = LayerMap::default();
|
||||
{
|
||||
let mut updates = layer_map.batch_update();
|
||||
for (key, value) in index_json.layer_metadata.iter() {
|
||||
updates.insert_historic(PersistentLayerDesc::from_filename(
|
||||
tenant_shard_id,
|
||||
timeline_id,
|
||||
key.clone(),
|
||||
value.file_size,
|
||||
));
|
||||
}
|
||||
}
|
||||
let key = Key::from_hex(key)?;
|
||||
|
||||
let lsn = Lsn::from_str(lsn).unwrap();
|
||||
let mut end_lsn = lsn;
|
||||
loop {
|
||||
let result = layer_map.search(key, end_lsn);
|
||||
match result {
|
||||
Some(SearchResult { layer, lsn_floor }) => {
|
||||
let disk_layer = match layer {
|
||||
ReadableLayerWeak::PersistentLayer(layer) => layer,
|
||||
ReadableLayerWeak::InMemoryLayer(_) => {
|
||||
anyhow::bail!("unexpected in-memory layer")
|
||||
}
|
||||
};
|
||||
|
||||
let metadata = index_json
|
||||
.layer_metadata
|
||||
.get(&disk_layer.layer_name())
|
||||
.unwrap();
|
||||
println!(
|
||||
"{}",
|
||||
remote_layer_path(
|
||||
&tenant_id,
|
||||
&timeline_id,
|
||||
metadata.shard,
|
||||
&disk_layer.layer_name(),
|
||||
metadata.generation
|
||||
)
|
||||
);
|
||||
end_lsn = lsn_floor;
|
||||
}
|
||||
None => break,
|
||||
}
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub(crate) async fn main(cmd: &IndexPartCmd) -> anyhow::Result<()> {
|
||||
@@ -16,5 +107,12 @@ pub(crate) async fn main(cmd: &IndexPartCmd) -> anyhow::Result<()> {
|
||||
println!("{output}");
|
||||
Ok(())
|
||||
}
|
||||
IndexPartCmd::Search {
|
||||
tenant_id,
|
||||
timeline_id,
|
||||
path,
|
||||
key,
|
||||
lsn,
|
||||
} => search_layers(tenant_id, timeline_id, path, key, lsn).await,
|
||||
}
|
||||
}
|
||||
|
||||
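Assuming the subcommand is wired up under the usual `pagectl index-part` entry point (the binary
and parent command names are not shown in this hunk), an invocation would look roughly like:

    pagectl index-part search \
        --tenant-id <TENANT_ID> \
        --timeline-id <TIMELINE_ID> \
        --path /path/to/index_part.json \
        --key <KEY_HEX> \
        --lsn <LSN>

This prints the remote path of every layer the search visits, following `lsn_floor` downwards
until no layer covers the key.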
@@ -1,23 +1,151 @@
|
||||
use anyhow::Result;
|
||||
use anyhow::Context as _;
|
||||
use futures::{Stream, StreamExt as _, TryStreamExt as _};
|
||||
use tokio::io::AsyncRead;
|
||||
use tokio_util::io::StreamReader;
|
||||
use tonic::codec::CompressionEncoding;
|
||||
use tonic::metadata::AsciiMetadataValue;
|
||||
use tonic::metadata::errors::InvalidMetadataValue;
|
||||
use tonic::transport::Channel;
|
||||
use tonic::{Request, Streaming};
|
||||
use tonic::service::Interceptor;
|
||||
use tonic::service::interceptor::InterceptedService;
|
||||
use tonic::transport::{Channel, Endpoint};
|
||||
|
||||
use utils::id::TenantId;
|
||||
use utils::id::TimelineId;
|
||||
use utils::id::{TenantId, TimelineId};
|
||||
use utils::shard::ShardIndex;
|
||||
|
||||
use crate::model;
|
||||
use crate::model::*;
|
||||
use crate::proto;
|
||||
|
||||
///
|
||||
/// AuthInterceptor adds tenant, timeline, and auth header to the channel. These
|
||||
/// headers are required at the pageserver.
|
||||
///
|
||||
/// A basic Pageserver gRPC client, for a single tenant shard. This API uses native Rust domain
|
||||
/// types from `model` rather than generated Protobuf types.
|
||||
pub struct Client {
|
||||
inner: proto::PageServiceClient<InterceptedService<Channel, AuthInterceptor>>,
|
||||
}
|
||||
|
||||
impl Client {
|
||||
/// Connects to the given gRPC endpoint.
|
||||
pub async fn connect<E>(
|
||||
endpoint: E,
|
||||
tenant_id: TenantId,
|
||||
timeline_id: TimelineId,
|
||||
shard_id: ShardIndex,
|
||||
auth_token: Option<String>,
|
||||
compression: Option<CompressionEncoding>,
|
||||
) -> anyhow::Result<Self>
|
||||
where
|
||||
E: TryInto<Endpoint> + Send + Sync + 'static,
|
||||
<E as TryInto<Endpoint>>::Error: std::error::Error + Send + Sync,
|
||||
{
|
||||
let endpoint: Endpoint = endpoint.try_into().context("invalid endpoint")?;
|
||||
let channel = endpoint.connect().await?;
|
||||
Self::new(
|
||||
channel,
|
||||
tenant_id,
|
||||
timeline_id,
|
||||
shard_id,
|
||||
auth_token,
|
||||
compression,
|
||||
)
|
||||
}
|
||||
|
||||
/// Creates a new client using the given gRPC channel.
|
||||
pub fn new(
|
||||
channel: Channel,
|
||||
tenant_id: TenantId,
|
||||
timeline_id: TimelineId,
|
||||
shard_id: ShardIndex,
|
||||
auth_token: Option<String>,
|
||||
compression: Option<CompressionEncoding>,
|
||||
) -> anyhow::Result<Self> {
|
||||
let auth = AuthInterceptor::new(tenant_id, timeline_id, shard_id, auth_token)?;
|
||||
let mut inner = proto::PageServiceClient::with_interceptor(channel, auth);
|
||||
|
||||
if let Some(compression) = compression {
|
||||
// TODO: benchmark this (including network latency).
|
||||
inner = inner
|
||||
.accept_compressed(compression)
|
||||
.send_compressed(compression);
|
||||
}
|
||||
|
||||
Ok(Self { inner })
|
||||
}
|
||||
|
||||
/// Returns whether a relation exists.
|
||||
pub async fn check_rel_exists(
|
||||
&mut self,
|
||||
req: CheckRelExistsRequest,
|
||||
) -> tonic::Result<CheckRelExistsResponse> {
|
||||
let req = proto::CheckRelExistsRequest::from(req);
|
||||
let resp = self.inner.check_rel_exists(req).await?.into_inner();
|
||||
Ok(resp.into())
|
||||
}
|
||||
|
||||
/// Fetches a base backup.
|
||||
pub async fn get_base_backup(
|
||||
&mut self,
|
||||
req: GetBaseBackupRequest,
|
||||
) -> tonic::Result<impl AsyncRead + use<>> {
|
||||
let req = proto::GetBaseBackupRequest::from(req);
|
||||
let chunks = self.inner.get_base_backup(req).await?.into_inner();
|
||||
Ok(StreamReader::new(
|
||||
chunks
|
||||
.map_ok(|resp| resp.chunk)
|
||||
.map_err(std::io::Error::other),
|
||||
))
|
||||
}
|
||||
|
||||
/// Returns the total size of a database, as # of bytes.
|
||||
pub async fn get_db_size(&mut self, req: GetDbSizeRequest) -> tonic::Result<GetDbSizeResponse> {
|
||||
let req = proto::GetDbSizeRequest::from(req);
|
||||
let resp = self.inner.get_db_size(req).await?.into_inner();
|
||||
Ok(resp.into())
|
||||
}
|
||||
|
||||
/// Fetches pages.
|
||||
///
|
||||
/// This is implemented as a bidirectional streaming RPC for performance. Per-request errors are
|
||||
/// typically returned as status_code instead of errors, to avoid tearing down the entire stream
|
||||
/// via a tonic::Status error.
|
||||
pub async fn get_pages(
|
||||
&mut self,
|
||||
reqs: impl Stream<Item = GetPageRequest> + Send + 'static,
|
||||
) -> tonic::Result<impl Stream<Item = tonic::Result<GetPageResponse>> + Send + 'static> {
|
||||
let reqs = reqs.map(proto::GetPageRequest::from);
|
||||
let resps = self.inner.get_pages(reqs).await?.into_inner();
|
||||
Ok(resps.map_ok(GetPageResponse::from))
|
||||
}
|
||||
|
||||
/// Returns the size of a relation, as # of blocks.
|
||||
pub async fn get_rel_size(
|
||||
&mut self,
|
||||
req: GetRelSizeRequest,
|
||||
) -> tonic::Result<GetRelSizeResponse> {
|
||||
let req = proto::GetRelSizeRequest::from(req);
|
||||
let resp = self.inner.get_rel_size(req).await?.into_inner();
|
||||
Ok(resp.into())
|
||||
}
|
||||
|
||||
/// Fetches an SLRU segment.
|
||||
pub async fn get_slru_segment(
|
||||
&mut self,
|
||||
req: GetSlruSegmentRequest,
|
||||
) -> tonic::Result<GetSlruSegmentResponse> {
|
||||
let req = proto::GetSlruSegmentRequest::from(req);
|
||||
let resp = self.inner.get_slru_segment(req).await?.into_inner();
|
||||
Ok(resp.try_into()?)
|
||||
}
|
||||
|
||||
/// Acquires or extends a lease on the given LSN. This guarantees that the Pageserver won't
|
||||
/// garbage collect the LSN until the lease expires. Must be acquired on all relevant shards.
|
||||
///
|
||||
/// Returns the lease expiration time, or a FailedPrecondition status if the lease could not be
|
||||
/// acquired because the LSN has already been garbage collected.
|
||||
pub async fn lease_lsn(&mut self, req: LeaseLsnRequest) -> tonic::Result<LeaseLsnResponse> {
|
||||
let req = proto::LeaseLsnRequest::from(req);
|
||||
let resp = self.inner.lease_lsn(req).await?.into_inner();
|
||||
Ok(resp.try_into()?)
|
||||
}
|
||||
}
|
||||
|
||||
/// Adds authentication metadata to gRPC requests.
|
||||
#[derive(Clone)]
|
||||
struct AuthInterceptor {
|
||||
tenant_id: AsciiMetadataValue,
|
||||
@@ -30,174 +158,29 @@ impl AuthInterceptor {
|
||||
fn new(
|
||||
tenant_id: TenantId,
|
||||
timeline_id: TimelineId,
|
||||
auth_token: Option<String>,
|
||||
shard_id: ShardIndex,
|
||||
) -> Result<Self, InvalidMetadataValue> {
|
||||
let tenant_ascii: AsciiMetadataValue = tenant_id.to_string().try_into()?;
|
||||
let timeline_ascii: AsciiMetadataValue = timeline_id.to_string().try_into()?;
|
||||
let shard_ascii: AsciiMetadataValue = shard_id.to_string().try_into()?;
|
||||
|
||||
let auth_header: Option<AsciiMetadataValue> = match auth_token {
|
||||
Some(token) => Some(format!("Bearer {token}").try_into()?),
|
||||
None => None,
|
||||
};
|
||||
|
||||
auth_token: Option<String>,
|
||||
) -> anyhow::Result<Self> {
|
||||
Ok(Self {
|
||||
tenant_id: tenant_ascii,
|
||||
shard_id: shard_ascii,
|
||||
timeline_id: timeline_ascii,
|
||||
auth_header,
|
||||
tenant_id: tenant_id.to_string().try_into()?,
|
||||
timeline_id: timeline_id.to_string().try_into()?,
|
||||
shard_id: shard_id.to_string().try_into()?,
|
||||
auth_header: auth_token
|
||||
.map(|token| format!("Bearer {token}").try_into())
|
||||
.transpose()?,
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
impl tonic::service::Interceptor for AuthInterceptor {
|
||||
fn call(&mut self, mut req: tonic::Request<()>) -> Result<tonic::Request<()>, tonic::Status> {
|
||||
req.metadata_mut()
|
||||
.insert("neon-tenant-id", self.tenant_id.clone());
|
||||
req.metadata_mut()
|
||||
.insert("neon-shard-id", self.shard_id.clone());
|
||||
req.metadata_mut()
|
||||
.insert("neon-timeline-id", self.timeline_id.clone());
|
||||
if let Some(auth_header) = &self.auth_header {
|
||||
req.metadata_mut()
|
||||
.insert("authorization", auth_header.clone());
|
||||
impl Interceptor for AuthInterceptor {
|
||||
fn call(&mut self, mut req: tonic::Request<()>) -> tonic::Result<tonic::Request<()>> {
|
||||
let metadata = req.metadata_mut();
|
||||
metadata.insert("neon-tenant-id", self.tenant_id.clone());
|
||||
metadata.insert("neon-timeline-id", self.timeline_id.clone());
|
||||
metadata.insert("neon-shard-id", self.shard_id.clone());
|
||||
if let Some(ref auth_header) = self.auth_header {
|
||||
metadata.insert("authorization", auth_header.clone());
|
||||
}
|
||||
Ok(req)
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Clone)]
|
||||
pub struct Client {
|
||||
client: proto::PageServiceClient<
|
||||
tonic::service::interceptor::InterceptedService<Channel, AuthInterceptor>,
|
||||
>,
|
||||
}
|
||||
|
||||
impl Client {
|
||||
pub async fn new<T: TryInto<tonic::transport::Endpoint> + Send + Sync + 'static>(
|
||||
into_endpoint: T,
|
||||
tenant_id: TenantId,
|
||||
timeline_id: TimelineId,
|
||||
shard_id: ShardIndex,
|
||||
auth_header: Option<String>,
|
||||
compression: Option<tonic::codec::CompressionEncoding>,
|
||||
) -> anyhow::Result<Self> {
|
||||
let endpoint: tonic::transport::Endpoint = into_endpoint
|
||||
.try_into()
|
||||
.map_err(|_e| anyhow::anyhow!("failed to convert endpoint"))?;
|
||||
let channel = endpoint.connect().await?;
|
||||
let auth = AuthInterceptor::new(tenant_id, timeline_id, auth_header, shard_id)
|
||||
.map_err(|e| anyhow::anyhow!(e.to_string()))?;
|
||||
let mut client = proto::PageServiceClient::with_interceptor(channel, auth);
|
||||
|
||||
if let Some(compression) = compression {
|
||||
// TODO: benchmark this (including network latency).
|
||||
client = client
|
||||
.accept_compressed(compression)
|
||||
.send_compressed(compression);
|
||||
}
|
||||
|
||||
Ok(Self { client })
|
||||
}
|
||||
|
||||
/// Returns whether a relation exists.
|
||||
pub async fn check_rel_exists(
|
||||
&mut self,
|
||||
req: model::CheckRelExistsRequest,
|
||||
) -> Result<model::CheckRelExistsResponse, tonic::Status> {
|
||||
let proto_req = proto::CheckRelExistsRequest::from(req);
|
||||
|
||||
let response = self.client.check_rel_exists(proto_req).await?;
|
||||
|
||||
let proto_resp = response.into_inner();
|
||||
Ok(proto_resp.into())
|
||||
}
|
||||
|
||||
/// Fetches a base backup.
|
||||
pub async fn get_base_backup(
|
||||
&mut self,
|
||||
req: model::GetBaseBackupRequest,
|
||||
) -> Result<impl AsyncRead + use<>, tonic::Status> {
|
||||
let req = proto::GetBaseBackupRequest::from(req);
|
||||
let chunks = self.client.get_base_backup(req).await?.into_inner();
|
||||
let reader = StreamReader::new(
|
||||
chunks
|
||||
.map_ok(|resp| resp.chunk)
|
||||
.map_err(std::io::Error::other),
|
||||
);
|
||||
Ok(reader)
|
||||
}
|
||||
|
||||
/// Returns the total size of a database, as # of bytes.
|
||||
pub async fn get_db_size(
|
||||
&mut self,
|
||||
req: model::GetDbSizeRequest,
|
||||
) -> Result<u64, tonic::Status> {
|
||||
let proto_req = proto::GetDbSizeRequest::from(req);
|
||||
|
||||
let response = self.client.get_db_size(proto_req).await?;
|
||||
Ok(response.into_inner().into())
|
||||
}
|
||||
|
||||
/// Fetches pages.
|
||||
///
|
||||
/// This is implemented as a bidirectional streaming RPC for performance.
/// Per-request errors are often returned as a status code on the response instead of
/// tonic::Status errors, to avoid tearing down the entire stream.
|
||||
pub async fn get_pages<ReqSt>(
|
||||
&mut self,
|
||||
inbound: ReqSt,
|
||||
) -> Result<
|
||||
impl Stream<Item = Result<model::GetPageResponse, tonic::Status>> + Send + 'static,
|
||||
tonic::Status,
|
||||
>
|
||||
where
|
||||
ReqSt: Stream<Item = model::GetPageRequest> + Send + 'static,
|
||||
{
|
||||
let outbound_proto = inbound.map(|domain_req| domain_req.into());
|
||||
|
||||
let req_new = Request::new(outbound_proto);
|
||||
|
||||
let response_stream: Streaming<proto::GetPageResponse> =
|
||||
self.client.get_pages(req_new).await?.into_inner();
|
||||
|
||||
let domain_stream = response_stream.map_ok(model::GetPageResponse::from);
|
||||
|
||||
Ok(domain_stream)
|
||||
}
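A hedged usage sketch of the streaming call above: requests are fed in as any Stream of model::GetPageRequest, and per-request failures surface as status codes on the responses rather than tearing down the stream. How the requests are produced, and the response field layout, are assumptions:

    use futures::StreamExt;
    use tokio_stream::wrappers::ReceiverStream;

    // `reqs_rx` is a tokio::sync::mpsc::Receiver<model::GetPageRequest> filled elsewhere.
    let pages = client.get_pages(ReceiverStream::new(reqs_rx)).await?;
    tokio::pin!(pages);
    while let Some(resp) = pages.next().await {
        let resp = resp?; // a tonic::Status here terminates the whole stream
        // Check the per-request status code on `resp` before using the page data
        // (the exact fields of model::GetPageResponse are not shown in this diff).
    }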
|
||||
|
||||
/// Returns the size of a relation, as # of blocks.
|
||||
pub async fn get_rel_size(
|
||||
&mut self,
|
||||
req: model::GetRelSizeRequest,
|
||||
) -> Result<model::GetRelSizeResponse, tonic::Status> {
|
||||
let proto_req = proto::GetRelSizeRequest::from(req);
|
||||
let response = self.client.get_rel_size(proto_req).await?;
|
||||
let proto_resp = response.into_inner();
|
||||
Ok(proto_resp.into())
|
||||
}
|
||||
|
||||
/// Fetches an SLRU segment.
|
||||
pub async fn get_slru_segment(
|
||||
&mut self,
|
||||
req: model::GetSlruSegmentRequest,
|
||||
) -> Result<model::GetSlruSegmentResponse, tonic::Status> {
|
||||
let proto_req = proto::GetSlruSegmentRequest::from(req);
|
||||
let response = self.client.get_slru_segment(proto_req).await?;
|
||||
Ok(response.into_inner().try_into()?)
|
||||
}
|
||||
|
||||
/// Acquires or extends a lease on the given LSN. This guarantees that the Pageserver won't
|
||||
/// garbage collect the LSN until the lease expires. Must be acquired on all relevant shards.
|
||||
///
|
||||
/// Returns the lease expiration time, or a FailedPrecondition status if the lease could not be
|
||||
/// acquired because the LSN has already been garbage collected.
|
||||
pub async fn lease_lsn(
|
||||
&mut self,
|
||||
req: model::LeaseLsnRequest,
|
||||
) -> Result<model::LeaseLsnResponse, tonic::Status> {
|
||||
let req = proto::LeaseLsnRequest::from(req);
|
||||
Ok(self.client.lease_lsn(req).await?.into_inner().try_into()?)
|
||||
}
|
||||
}
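As the lease_lsn doc comment notes, asking for a lease on an already garbage-collected LSN comes back as a FailedPrecondition status rather than a transport error. A hedged sketch of handling that at a call site (request construction omitted):

    match client.lease_lsn(req).await {
        Ok(expiration) => { /* lease is valid until `expiration`; renew before it lapses */ }
        Err(status) if status.code() == tonic::Code::FailedPrecondition => {
            // The LSN was already GC-ed; the caller must retry with a newer LSN.
        }
        Err(status) => return Err(status.into()),
    }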
|
||||
|
||||
@@ -384,7 +384,7 @@ impl From<GetPageRequest> for proto::GetPageRequest {
|
||||
pub type RequestID = u64;
|
||||
|
||||
/// A GetPage request class.
|
||||
#[derive(Clone, Copy, Debug)]
|
||||
#[derive(Clone, Copy, Debug, strum_macros::Display)]
|
||||
pub enum GetPageClass {
|
||||
/// Unknown class. For backwards compatibility: used when an older client version sends a class
|
||||
/// that a newer server version has removed.
|
||||
@@ -397,6 +397,19 @@ pub enum GetPageClass {
|
||||
Background,
|
||||
}
|
||||
|
||||
impl GetPageClass {
|
||||
/// Returns true if this is considered a bulk request (i.e. throughput-oriented rather than
/// latency-sensitive).
|
||||
pub fn is_bulk(&self) -> bool {
|
||||
match self {
|
||||
Self::Unknown => false,
|
||||
Self::Normal => false,
|
||||
Self::Prefetch => true,
|
||||
Self::Background => true,
|
||||
}
|
||||
}
|
||||
}
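A small illustrative use of this classification, e.g. for routing work; the queue names are hypothetical:

    // Sketch: keep bulk (prefetch/background) traffic off the latency-sensitive path.
    let queue = if req_class.is_bulk() { &bulk_queue } else { &interactive_queue };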
|
||||
|
||||
impl From<proto::GetPageClass> for GetPageClass {
|
||||
fn from(pb: proto::GetPageClass) -> Self {
|
||||
match pb {
|
||||
@@ -602,6 +615,21 @@ impl TryFrom<tonic::Code> for GetPageStatusCode {
|
||||
}
|
||||
}
|
||||
|
||||
impl From<GetPageStatusCode> for tonic::Code {
|
||||
fn from(status_code: GetPageStatusCode) -> Self {
|
||||
use tonic::Code;
|
||||
|
||||
match status_code {
|
||||
GetPageStatusCode::Unknown => Code::Unknown,
|
||||
GetPageStatusCode::Ok => Code::Ok,
|
||||
GetPageStatusCode::NotFound => Code::NotFound,
|
||||
GetPageStatusCode::InvalidRequest => Code::InvalidArgument,
|
||||
GetPageStatusCode::InternalError => Code::Internal,
|
||||
GetPageStatusCode::SlowDown => Code::ResourceExhausted,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Fetches the size of a relation at a given LSN, as # of blocks. Only valid on shard 0, other
|
||||
// shards will error.
|
||||
#[derive(Clone, Copy, Debug)]
|
||||
|
||||
@@ -326,7 +326,7 @@ impl GrpcClient {
|
||||
ttid: TenantTimelineId,
|
||||
compression: bool,
|
||||
) -> anyhow::Result<Self> {
|
||||
let inner = page_api::Client::new(
|
||||
let inner = page_api::Client::connect(
|
||||
connstring.to_string(),
|
||||
ttid.tenant_id,
|
||||
ttid.timeline_id,
|
||||
|
||||
@@ -625,7 +625,7 @@ impl GrpcClient {
|
||||
ttid: TenantTimelineId,
|
||||
compression: bool,
|
||||
) -> anyhow::Result<Self> {
|
||||
let mut client = page_api::Client::new(
|
||||
let mut client = page_api::Client::connect(
|
||||
connstring.to_string(),
|
||||
ttid.tenant_id,
|
||||
ttid.timeline_id,
|
||||
|
||||
@@ -28,7 +28,6 @@ use reqwest::Url;
|
||||
use storage_broker::Uri;
|
||||
use utils::id::{NodeId, TimelineId};
|
||||
use utils::logging::{LogFormat, SecretString};
|
||||
use utils::serde_percent::Percent;
|
||||
|
||||
use crate::tenant::storage_layer::inmemory_layer::IndexEntry;
|
||||
use crate::tenant::{TENANTS_SEGMENT_NAME, TIMELINES_SEGMENT_NAME};
|
||||
@@ -146,7 +145,7 @@ pub struct PageServerConf {
|
||||
pub metric_collection_bucket: Option<RemoteStorageConfig>,
|
||||
pub synthetic_size_calculation_interval: Duration,
|
||||
|
||||
pub disk_usage_based_eviction: Option<DiskUsageEvictionTaskConfig>,
|
||||
pub disk_usage_based_eviction: DiskUsageEvictionTaskConfig,
|
||||
|
||||
pub test_remote_failures: u64,
|
||||
|
||||
@@ -460,16 +459,7 @@ impl PageServerConf {
|
||||
metric_collection_endpoint,
|
||||
metric_collection_bucket,
|
||||
synthetic_size_calculation_interval,
|
||||
disk_usage_based_eviction: Some(disk_usage_based_eviction.unwrap_or(
|
||||
DiskUsageEvictionTaskConfig {
|
||||
max_usage_pct: Percent::new(80).unwrap(),
|
||||
min_avail_bytes: 2_000_000_000,
|
||||
period: Duration::from_secs(60),
|
||||
#[cfg(feature = "testing")]
|
||||
mock_statvfs: None,
|
||||
eviction_order: Default::default(),
|
||||
},
|
||||
)),
|
||||
disk_usage_based_eviction,
|
||||
test_remote_failures,
|
||||
ondemand_download_behavior_treat_error_as_warn,
|
||||
background_task_maximum_delay,
|
||||
@@ -719,8 +709,9 @@ mod tests {
|
||||
use std::time::Duration;
|
||||
|
||||
use camino::Utf8PathBuf;
|
||||
use pageserver_api::config::{DiskUsageEvictionTaskConfig, EvictionOrder};
|
||||
use rstest::rstest;
|
||||
use utils::id::NodeId;
|
||||
use utils::{id::NodeId, serde_percent::Percent};
|
||||
|
||||
use super::PageServerConf;
|
||||
|
||||
@@ -820,19 +811,69 @@ mod tests {
|
||||
.expect("parse_and_validate");
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_config_disk_usage_based_eviction_is_valid() {
|
||||
let input = r#"
|
||||
#[rstest]
|
||||
#[
|
||||
case::omit_the_whole_config(
|
||||
DiskUsageEvictionTaskConfig {
|
||||
max_usage_pct: Percent::new(80).unwrap(),
|
||||
min_avail_bytes: 2_000_000_000,
|
||||
period: Duration::from_secs(60),
|
||||
eviction_order: Default::default(),
|
||||
#[cfg(feature = "testing")]
|
||||
mock_statvfs: None,
|
||||
enabled: true,
|
||||
},
|
||||
r#"
|
||||
control_plane_api = "http://localhost:6666"
|
||||
"#;
|
||||
"#,
|
||||
)]
|
||||
#[
|
||||
case::omit_enabled_field(
|
||||
DiskUsageEvictionTaskConfig {
|
||||
max_usage_pct: Percent::new(80).unwrap(),
|
||||
min_avail_bytes: 1_000_000_000,
|
||||
period: Duration::from_secs(60),
|
||||
eviction_order: EvictionOrder::RelativeAccessed {
|
||||
highest_layer_count_loses_first: true,
|
||||
},
|
||||
#[cfg(feature = "testing")]
|
||||
mock_statvfs: None,
|
||||
enabled: true,
|
||||
},
|
||||
r#"
|
||||
control_plane_api = "http://localhost:6666"
|
||||
disk_usage_based_eviction = { max_usage_pct = 80, min_avail_bytes = 1000000000, period = "60s" }
|
||||
"#,
|
||||
)]
|
||||
#[case::disabled(
|
||||
DiskUsageEvictionTaskConfig {
|
||||
max_usage_pct: Percent::new(80).unwrap(),
|
||||
min_avail_bytes: 2_000_000_000,
|
||||
period: Duration::from_secs(60),
|
||||
eviction_order: EvictionOrder::RelativeAccessed {
|
||||
highest_layer_count_loses_first: true,
|
||||
},
|
||||
#[cfg(feature = "testing")]
|
||||
mock_statvfs: None,
|
||||
enabled: false,
|
||||
},
|
||||
r#"
|
||||
control_plane_api = "http://localhost:6666"
|
||||
disk_usage_based_eviction = { enabled = false }
|
||||
"#
|
||||
)]
|
||||
fn test_config_disk_usage_based_eviction_is_valid(
|
||||
#[case] expected_disk_usage_based_eviction: DiskUsageEvictionTaskConfig,
|
||||
#[case] input: &str,
|
||||
) {
|
||||
let config_toml = toml_edit::de::from_str::<pageserver_api::config::ConfigToml>(input)
|
||||
.expect("disk_usage_based_eviction is valid");
|
||||
let workdir = Utf8PathBuf::from("/nonexistent");
|
||||
let config = PageServerConf::parse_and_validate(NodeId(0), config_toml, &workdir).unwrap();
|
||||
let disk_usage_based_eviction = config.disk_usage_based_eviction.unwrap();
|
||||
assert_eq!(disk_usage_based_eviction.max_usage_pct.get(), 80);
|
||||
assert_eq!(disk_usage_based_eviction.min_avail_bytes, 2_000_000_000);
|
||||
assert_eq!(disk_usage_based_eviction.period, Duration::from_secs(60));
|
||||
assert_eq!(disk_usage_based_eviction.eviction_order, Default::default());
|
||||
let disk_usage_based_eviction = config.disk_usage_based_eviction;
|
||||
assert_eq!(
|
||||
expected_disk_usage_based_eviction,
|
||||
disk_usage_based_eviction
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -171,7 +171,8 @@ pub fn launch_disk_usage_global_eviction_task(
|
||||
tenant_manager: Arc<TenantManager>,
|
||||
background_jobs_barrier: completion::Barrier,
|
||||
) -> Option<DiskUsageEvictionTask> {
|
||||
let Some(task_config) = &conf.disk_usage_based_eviction else {
|
||||
let task_config = &conf.disk_usage_based_eviction;
|
||||
if !task_config.enabled {
|
||||
info!("disk usage based eviction task not configured");
|
||||
return None;
|
||||
};
|
||||
@@ -458,6 +459,9 @@ pub(crate) async fn disk_usage_eviction_task_iteration_impl<U: Usage>(
|
||||
match next {
|
||||
Ok(Ok(file_size)) => {
|
||||
METRICS.layers_evicted.inc();
|
||||
/*BEGIN_HADRON */
|
||||
METRICS.bytes_evicted.inc_by(file_size);
|
||||
/*END_HADRON */
|
||||
usage_assumed.add_available_bytes(file_size);
|
||||
}
|
||||
Ok(Err((
|
||||
@@ -1265,6 +1269,7 @@ mod filesystem_level_usage {
|
||||
#[cfg(feature = "testing")]
|
||||
mock_statvfs: None,
|
||||
eviction_order: pageserver_api::config::EvictionOrder::default(),
|
||||
enabled: true,
|
||||
},
|
||||
total_bytes: 100_000,
|
||||
avail_bytes: 0,
|
||||
|
||||
@@ -1,4 +1,8 @@
|
||||
use std::{collections::HashMap, sync::Arc, time::Duration};
|
||||
use std::{
|
||||
collections::HashMap,
|
||||
sync::{Arc, atomic::AtomicBool},
|
||||
time::Duration,
|
||||
};
|
||||
|
||||
use arc_swap::ArcSwap;
|
||||
use pageserver_api::config::NodeMetadata;
|
||||
@@ -355,11 +359,17 @@ impl PerTenantProperties {
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Clone)]
|
||||
pub struct TenantFeatureResolver {
|
||||
inner: FeatureResolver,
|
||||
tenant_id: TenantId,
|
||||
cached_tenant_properties: Arc<ArcSwap<HashMap<String, PostHogFlagFilterPropertyValue>>>,
|
||||
cached_tenant_properties: ArcSwap<HashMap<String, PostHogFlagFilterPropertyValue>>,
|
||||
|
||||
// Add feature flag on the critical path below.
|
||||
//
|
||||
// If a feature flag is used on the critical path, we update it in the tenant housekeeping loop instead of
// resolving it directly by calling `evaluate_multivariate` or `evaluate_boolean`. Remember to update the flag in the
// housekeeping loop. Callers should read this atomic flag directly instead of using the evaluate functions.
|
||||
pub feature_test_remote_size_flag: AtomicBool,
|
||||
}
|
||||
|
||||
impl TenantFeatureResolver {
|
||||
@@ -367,7 +377,8 @@ impl TenantFeatureResolver {
|
||||
Self {
|
||||
inner,
|
||||
tenant_id,
|
||||
cached_tenant_properties: Arc::new(ArcSwap::new(Arc::new(HashMap::new()))),
|
||||
cached_tenant_properties: ArcSwap::new(Arc::new(HashMap::new())),
|
||||
feature_test_remote_size_flag: AtomicBool::new(false),
|
||||
}
|
||||
}
|
||||
|
||||
@@ -396,12 +407,14 @@ impl TenantFeatureResolver {
|
||||
self.inner.is_feature_flag_boolean(flag_key)
|
||||
}
|
||||
|
||||
pub fn update_cached_tenant_properties(&self, tenant_shard: &TenantShard) {
|
||||
let mut remote_size_mb = None;
|
||||
/// Refresh the cached properties and flags on the critical path.
|
||||
pub fn refresh_properties_and_flags(&self, tenant_shard: &TenantShard) {
|
||||
let mut remote_size_mb = Some(0.0);
|
||||
for timeline in tenant_shard.list_timelines() {
|
||||
let size = timeline.metrics.resident_physical_size_get();
|
||||
if size == 0 {
|
||||
remote_size_mb = None;
|
||||
break;
|
||||
}
|
||||
if let Some(ref mut remote_size_mb) = remote_size_mb {
|
||||
*remote_size_mb += size as f64 / 1024.0 / 1024.0;
|
||||
@@ -410,5 +423,12 @@ impl TenantFeatureResolver {
|
||||
self.cached_tenant_properties.store(Arc::new(
|
||||
PerTenantProperties { remote_size_mb }.into_posthog_properties(),
|
||||
));
|
||||
|
||||
// BEGIN: Update the feature flag on the critical path.
|
||||
self.feature_test_remote_size_flag.store(
|
||||
self.evaluate_boolean("test-remote-size-flag").is_ok(),
|
||||
std::sync::atomic::Ordering::Relaxed,
|
||||
);
|
||||
// END: Update the feature flag on the critical path.
|
||||
}
|
||||
}
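Per the comment on feature_test_remote_size_flag, hot paths read the pre-computed atomic flag rather than evaluating the flag on the spot. A minimal sketch of the intended read pattern (the call-site function is hypothetical):

    use std::sync::atomic::Ordering;

    fn on_critical_path(resolver: &TenantFeatureResolver) {
        // Refreshed periodically by refresh_properties_and_flags() in the housekeeping loop.
        if resolver.feature_test_remote_size_flag.load(Ordering::Relaxed) {
            // flag-gated behavior
        }
    }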
|
||||
|
||||
@@ -116,26 +116,6 @@ paths:
|
||||
schema:
|
||||
type: string
|
||||
|
||||
/v1/tenant/{tenant_id}/timeline:
|
||||
parameters:
|
||||
- name: tenant_id
|
||||
in: path
|
||||
required: true
|
||||
schema:
|
||||
type: string
|
||||
get:
|
||||
description: Get timelines for tenant
|
||||
responses:
|
||||
"200":
|
||||
description: TimelineInfo
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
type: array
|
||||
items:
|
||||
$ref: "#/components/schemas/TimelineInfo"
|
||||
|
||||
|
||||
/v1/tenant/{tenant_id}/timeline/{timeline_id}:
|
||||
parameters:
|
||||
- name: tenant_id
|
||||
@@ -618,7 +598,7 @@ paths:
|
||||
schema:
|
||||
$ref: "#/components/schemas/SecondaryProgress"
|
||||
|
||||
/v1/tenant/{tenant_id}/timeline/:
|
||||
/v1/tenant/{tenant_id}/timeline:
|
||||
parameters:
|
||||
- name: tenant_id
|
||||
in: path
|
||||
@@ -685,6 +665,17 @@ paths:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/Error"
|
||||
get:
|
||||
description: Get timelines for tenant
|
||||
responses:
|
||||
"200":
|
||||
description: TimelineInfo
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
type: array
|
||||
items:
|
||||
$ref: "#/components/schemas/TimelineInfo"
|
||||
|
||||
/v1/tenant/{tenant_shard_id}/timeline/{timeline_id}/detach_ancestor:
|
||||
parameters:
|
||||
@@ -767,7 +758,7 @@ paths:
|
||||
$ref: "#/components/schemas/ServiceUnavailableError"
|
||||
|
||||
|
||||
/v1/tenant/:
|
||||
/v1/tenant:
|
||||
get:
|
||||
description: Get tenants list
|
||||
responses:
|
||||
@@ -847,7 +838,7 @@ paths:
|
||||
items:
|
||||
$ref: "#/components/schemas/TenantInfo"
|
||||
|
||||
/v1/tenant/{tenant_id}/config/:
|
||||
/v1/tenant/{tenant_id}/config:
|
||||
parameters:
|
||||
- name: tenant_id
|
||||
in: path
|
||||
|
||||
@@ -61,6 +61,7 @@ use crate::context;
|
||||
use crate::context::{DownloadBehavior, RequestContext, RequestContextBuilder};
|
||||
use crate::deletion_queue::DeletionQueueClient;
|
||||
use crate::feature_resolver::FeatureResolver;
|
||||
use crate::metrics::LOCAL_DATA_LOSS_SUSPECTED;
|
||||
use crate::pgdatadir_mapping::LsnForTimestamp;
|
||||
use crate::task_mgr::TaskKind;
|
||||
use crate::tenant::config::LocationConf;
|
||||
@@ -2501,10 +2502,7 @@ async fn timeline_checkpoint_handler(
|
||||
.map_err(|e|
|
||||
match e {
|
||||
CompactionError::ShuttingDown => ApiError::ShuttingDown,
|
||||
CompactionError::Offload(e) => ApiError::InternalServerError(anyhow::anyhow!(e)),
|
||||
CompactionError::CollectKeySpaceError(e) => ApiError::InternalServerError(anyhow::anyhow!(e)),
|
||||
CompactionError::Other(e) => ApiError::InternalServerError(e),
|
||||
CompactionError::AlreadyRunning(_) => ApiError::InternalServerError(anyhow::anyhow!(e)),
|
||||
}
|
||||
)?;
|
||||
}
|
||||
@@ -3630,6 +3628,17 @@ async fn activate_post_import_handler(
|
||||
.await
|
||||
}
|
||||
|
||||
// [Hadron] Reset gauge metrics that are used to raise alerts. We need this API as a stop-gap measure to reset alerts
// after we manually rectify situations such as local SSD data loss. We will eventually automate this.
|
||||
async fn hadron_reset_alert_gauges(
|
||||
request: Request<Body>,
|
||||
_cancel: CancellationToken,
|
||||
) -> Result<Response<Body>, ApiError> {
|
||||
check_permission(&request, None)?;
|
||||
LOCAL_DATA_LOSS_SUSPECTED.set(0);
|
||||
json_response(StatusCode::OK, ())
|
||||
}
|
||||
|
||||
/// Read the end of a tar archive.
|
||||
///
|
||||
/// A tar archive normally ends with two consecutive blocks of zeros, 512 bytes each.
|
||||
@@ -3682,6 +3691,23 @@ async fn read_tar_eof(mut reader: (impl tokio::io::AsyncRead + Unpin)) -> anyhow
|
||||
Ok(())
|
||||
}
|
||||
|
||||
async fn force_refresh_feature_flag(
|
||||
request: Request<Body>,
|
||||
_cancel: CancellationToken,
|
||||
) -> Result<Response<Body>, ApiError> {
|
||||
let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?;
|
||||
check_permission(&request, Some(tenant_shard_id.tenant_id))?;
|
||||
|
||||
let state = get_state(&request);
|
||||
let tenant = state
|
||||
.tenant_manager
|
||||
.get_attached_tenant_shard(tenant_shard_id)?;
|
||||
tenant
|
||||
.feature_resolver
|
||||
.refresh_properties_and_flags(&tenant);
|
||||
json_response(StatusCode::OK, ())
|
||||
}
|
||||
|
||||
async fn tenant_evaluate_feature_flag(
|
||||
request: Request<Body>,
|
||||
_cancel: CancellationToken,
|
||||
@@ -3698,7 +3724,7 @@ async fn tenant_evaluate_feature_flag(
|
||||
let tenant = state
|
||||
.tenant_manager
|
||||
.get_attached_tenant_shard(tenant_shard_id)?;
|
||||
// TODO: the properties we get here might be stale right after it is collected. But such races are rare (updated every 10s)
|
||||
// TODO: the properties we get here might be stale right after it is collected. But such races are rare (updated every 10s)
|
||||
// and we don't need to worry about it for now.
|
||||
let properties = tenant.feature_resolver.collect_properties();
|
||||
if as_type.as_deref() == Some("boolean") {
|
||||
@@ -4147,6 +4173,9 @@ pub fn make_router(
|
||||
.get("/v1/tenant/:tenant_shard_id/feature_flag/:flag_key", |r| {
|
||||
api_handler(r, tenant_evaluate_feature_flag)
|
||||
})
|
||||
.post("/v1/tenant/:tenant_shard_id/force_refresh_feature_flag", |r| {
|
||||
api_handler(r, force_refresh_feature_flag)
|
||||
})
|
||||
.put("/v1/feature_flag/:flag_key", |r| {
|
||||
testing_api_handler("force override feature flag - put", r, force_override_feature_flag_for_testing_put)
|
||||
})
|
||||
@@ -4156,5 +4185,8 @@ pub fn make_router(
|
||||
.post("/v1/feature_flag_spec", |r| {
|
||||
api_handler(r, update_feature_flag_spec)
|
||||
})
|
||||
.post("/hadron-internal/reset_alert_gauges", |r| {
|
||||
api_handler(r, hadron_reset_alert_gauges)
|
||||
})
|
||||
.any(handler_404))
|
||||
}
|
||||
|
||||
@@ -1,3 +1,4 @@
|
||||
use std::cell::Cell;
|
||||
use std::collections::HashMap;
|
||||
use std::num::NonZeroUsize;
|
||||
use std::os::fd::RawFd;
|
||||
@@ -102,7 +103,18 @@ pub(crate) static STORAGE_TIME_COUNT_PER_TIMELINE: Lazy<IntCounterVec> = Lazy::n
|
||||
.expect("failed to define a metric")
|
||||
});
|
||||
|
||||
// Buckets for background operation duration in seconds, like compaction, GC, size calculation.
|
||||
/* BEGIN_HADRON */
|
||||
pub(crate) static STORAGE_ACTIVE_COUNT_PER_TIMELINE: Lazy<IntGaugeVec> = Lazy::new(|| {
|
||||
register_int_gauge_vec!(
|
||||
"pageserver_active_storage_operations_count",
|
||||
"Count of active storage operations with operation, tenant and timeline dimensions",
|
||||
&["operation", "tenant_id", "shard_id", "timeline_id"],
|
||||
)
|
||||
.expect("failed to define a metric")
|
||||
});
|
||||
/*END_HADRON */
|
||||
|
||||
// Buckets for background operations like compaction, GC, size calculation
|
||||
const STORAGE_OP_BUCKETS: &[f64] = &[0.010, 0.100, 1.0, 10.0, 100.0, 1000.0];
|
||||
|
||||
pub(crate) static STORAGE_TIME_GLOBAL: Lazy<HistogramVec> = Lazy::new(|| {
|
||||
@@ -2810,6 +2822,31 @@ pub(crate) static WALRECEIVER_CANDIDATES_ADDED: Lazy<IntCounter> =
|
||||
pub(crate) static WALRECEIVER_CANDIDATES_REMOVED: Lazy<IntCounter> =
|
||||
Lazy::new(|| WALRECEIVER_CANDIDATES_EVENTS.with_label_values(&["remove"]));
|
||||
|
||||
pub(crate) static LOCAL_DATA_LOSS_SUSPECTED: Lazy<IntGauge> = Lazy::new(|| {
|
||||
register_int_gauge!(
|
||||
"pageserver_local_data_loss_suspected",
|
||||
"Non-zero value indicates that pageserver local data loss is suspected (and highly likely)."
|
||||
)
|
||||
.expect("failed to define a metric")
|
||||
});
|
||||
|
||||
// Counter keeping track of misrouted PageStream requests. Spelling out PageStream requests here to distinguish
|
||||
// it from other types of requests (SK WAL replication, HTTP requests, etc.). PageStream requests are used by
|
||||
// Postgres compute to fetch data from pageservers.
|
||||
// A misrouted PageStream request is registered if the pageserver cannot find the tenant identified in the
|
||||
// request, or if the pageserver is not the "primary" serving the tenant shard. These errors almost always identify
|
||||
// issues with compute configuration, caused by either the compute node itself being stuck in the wrong
|
||||
// configuration or Storage Controller reconciliation bugs. Misrouted requests are expected during tenant migration
|
||||
// and/or during recovery following a pageserver failure, but persistently high rates of misrouted requests
|
||||
// are indicative of bugs (and unavailability).
|
||||
pub(crate) static MISROUTED_PAGESTREAM_REQUESTS: Lazy<IntCounter> = Lazy::new(|| {
|
||||
register_int_counter!(
|
||||
"pageserver_misrouted_pagestream_requests_total",
|
||||
"Number of pageserver pagestream requests that were routed to the wrong pageserver"
|
||||
)
|
||||
.expect("failed to define a metric")
|
||||
});
|
||||
|
||||
// Metrics collected on WAL redo operations
|
||||
//
|
||||
// We collect the time spent in actual WAL redo ('redo'), and time waiting
|
||||
@@ -3048,13 +3085,19 @@ pub(crate) static WAL_REDO_PROCESS_COUNTERS: Lazy<WalRedoProcessCounters> =
|
||||
pub(crate) struct StorageTimeMetricsTimer {
|
||||
metrics: StorageTimeMetrics,
|
||||
start: Instant,
|
||||
stopped: Cell<bool>,
|
||||
}
|
||||
|
||||
impl StorageTimeMetricsTimer {
|
||||
fn new(metrics: StorageTimeMetrics) -> Self {
|
||||
/*BEGIN_HADRON */
|
||||
// record the active operation as the timer starts
|
||||
metrics.timeline_active_count.inc();
|
||||
/*END_HADRON */
|
||||
Self {
|
||||
metrics,
|
||||
start: Instant::now(),
|
||||
stopped: Cell::new(false),
|
||||
}
|
||||
}
|
||||
|
||||
@@ -3070,6 +3113,10 @@ impl StorageTimeMetricsTimer {
|
||||
self.metrics.timeline_sum.inc_by(seconds);
|
||||
self.metrics.timeline_count.inc();
|
||||
self.metrics.global_histogram.observe(seconds);
|
||||
/* BEGIN_HADRON*/
|
||||
self.stopped.set(true);
|
||||
self.metrics.timeline_active_count.dec();
|
||||
/*END_HADRON */
|
||||
duration
|
||||
}
|
||||
|
||||
@@ -3080,6 +3127,16 @@ impl StorageTimeMetricsTimer {
|
||||
}
|
||||
}
|
||||
|
||||
/*BEGIN_HADRON */
|
||||
impl Drop for StorageTimeMetricsTimer {
|
||||
fn drop(&mut self) {
|
||||
if !self.stopped.get() {
|
||||
self.metrics.timeline_active_count.dec();
|
||||
}
|
||||
}
|
||||
}
|
||||
/*END_HADRON */
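The Drop impl above turns the active-operations gauge into an RAII guard: it is incremented when the timer is created and decremented exactly once, whether the operation records its duration or bails out early. A sketch of the invariant; do_storage_operation is a hypothetical placeholder and the stop method name is assumed from the surrounding code:

    let timer = StorageTimeMetricsTimer::new(metrics.clone()); // gauge += 1
    match do_storage_operation() {
        Ok(_) => {
            timer.stop_and_record(); // records duration, marks `stopped`, gauge -= 1
        }
        Err(e) => {
            // Early exit: Drop sees `stopped == false` and still decrements the gauge.
            return Err(e);
        }
    }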
|
||||
|
||||
pub(crate) struct AlwaysRecordingStorageTimeMetricsTimer(Option<StorageTimeMetricsTimer>);
|
||||
|
||||
impl Drop for AlwaysRecordingStorageTimeMetricsTimer {
|
||||
@@ -3105,6 +3162,10 @@ pub(crate) struct StorageTimeMetrics {
|
||||
timeline_sum: Counter,
|
||||
/// Number of operations, per operation, tenant_id and timeline_id
|
||||
timeline_count: IntCounter,
|
||||
/*BEGIN_HADRON */
|
||||
/// Number of active operations per operation, tenant_id, and timeline_id
|
||||
timeline_active_count: IntGauge,
|
||||
/*END_HADRON */
|
||||
/// Global histogram having only the "operation" label.
|
||||
global_histogram: Histogram,
|
||||
}
|
||||
@@ -3124,6 +3185,11 @@ impl StorageTimeMetrics {
|
||||
let timeline_count = STORAGE_TIME_COUNT_PER_TIMELINE
|
||||
.get_metric_with_label_values(&[operation, tenant_id, shard_id, timeline_id])
|
||||
.unwrap();
|
||||
/*BEGIN_HADRON */
|
||||
let timeline_active_count = STORAGE_ACTIVE_COUNT_PER_TIMELINE
|
||||
.get_metric_with_label_values(&[operation, tenant_id, shard_id, timeline_id])
|
||||
.unwrap();
|
||||
/*END_HADRON */
|
||||
let global_histogram = STORAGE_TIME_GLOBAL
|
||||
.get_metric_with_label_values(&[operation])
|
||||
.unwrap();
|
||||
@@ -3131,6 +3197,7 @@ impl StorageTimeMetrics {
|
||||
StorageTimeMetrics {
|
||||
timeline_sum,
|
||||
timeline_count,
|
||||
timeline_active_count,
|
||||
global_histogram,
|
||||
}
|
||||
}
|
||||
@@ -3544,6 +3611,14 @@ impl TimelineMetrics {
|
||||
shard_id,
|
||||
timeline_id,
|
||||
]);
|
||||
/* BEGIN_HADRON */
|
||||
let _ = STORAGE_ACTIVE_COUNT_PER_TIMELINE.remove_label_values(&[
|
||||
op,
|
||||
tenant_id,
|
||||
shard_id,
|
||||
timeline_id,
|
||||
]);
|
||||
/*END_HADRON */
|
||||
}
|
||||
|
||||
for op in StorageIoSizeOperation::VARIANTS {
|
||||
@@ -4336,6 +4411,9 @@ pub(crate) mod disk_usage_based_eviction {
|
||||
pub(crate) layers_collected: IntCounter,
|
||||
pub(crate) layers_selected: IntCounter,
|
||||
pub(crate) layers_evicted: IntCounter,
|
||||
/*BEGIN_HADRON */
|
||||
pub(crate) bytes_evicted: IntCounter,
|
||||
/*END_HADRON */
|
||||
}
|
||||
|
||||
impl Default for Metrics {
|
||||
@@ -4372,12 +4450,21 @@ pub(crate) mod disk_usage_based_eviction {
|
||||
)
|
||||
.unwrap();
|
||||
|
||||
/*BEGIN_HADRON */
|
||||
let bytes_evicted = register_int_counter!(
|
||||
"pageserver_disk_usage_based_eviction_evicted_bytes_total",
|
||||
"Amount of bytes successfully evicted"
|
||||
)
|
||||
.unwrap();
|
||||
/*END_HADRON */
|
||||
|
||||
Self {
|
||||
tenant_collection_time,
|
||||
tenant_layer_count,
|
||||
layers_collected,
|
||||
layers_selected,
|
||||
layers_evicted,
|
||||
bytes_evicted,
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -4497,6 +4584,7 @@ pub fn preinitialize_metrics(
|
||||
&CIRCUIT_BREAKERS_UNBROKEN,
|
||||
&PAGE_SERVICE_SMGR_FLUSH_INPROGRESS_MICROS_GLOBAL,
|
||||
&WAIT_LSN_IN_PROGRESS_GLOBAL_MICROS,
|
||||
&MISROUTED_PAGESTREAM_REQUESTS,
|
||||
]
|
||||
.into_iter()
|
||||
.for_each(|c| {
|
||||
@@ -4534,6 +4622,7 @@ pub fn preinitialize_metrics(
|
||||
|
||||
// gauges
|
||||
WALRECEIVER_ACTIVE_MANAGERS.get();
|
||||
LOCAL_DATA_LOSS_SUSPECTED.get();
|
||||
|
||||
// histograms
|
||||
[
|
||||
|
||||
@@ -70,7 +70,7 @@ use crate::context::{
|
||||
};
|
||||
use crate::metrics::{
|
||||
self, COMPUTE_COMMANDS_COUNTERS, ComputeCommandKind, GetPageBatchBreakReason, LIVE_CONNECTIONS,
|
||||
SmgrOpTimer, TimelineMetrics,
|
||||
MISROUTED_PAGESTREAM_REQUESTS, SmgrOpTimer, TimelineMetrics,
|
||||
};
|
||||
use crate::pgdatadir_mapping::{LsnRange, Version};
|
||||
use crate::span::{
|
||||
@@ -91,7 +91,8 @@ use crate::{CancellableTask, PERF_TRACE_TARGET, timed_after_cancellation};
|
||||
/// is not yet in state [`TenantState::Active`].
|
||||
///
|
||||
/// NB: this is a different value than [`crate::http::routes::ACTIVE_TENANT_TIMEOUT`].
|
||||
const ACTIVE_TENANT_TIMEOUT: Duration = Duration::from_millis(30000);
|
||||
/// HADRON: reduced timeout and we will retry in Cache::get().
|
||||
const ACTIVE_TENANT_TIMEOUT: Duration = Duration::from_millis(5000);
|
||||
|
||||
/// Threshold at which to log slow GetPage requests.
|
||||
const LOG_SLOW_GETPAGE_THRESHOLD: Duration = Duration::from_secs(30);
|
||||
@@ -1128,6 +1129,7 @@ impl PageServerHandler {
|
||||
// Closing the connection by returning `PageStreamError::Reconnect` has the side effect of rate-limiting the above message, via
|
||||
// client's reconnect backoff, as well as hopefully prompting the client to load its updated configuration
|
||||
// and talk to a different pageserver.
|
||||
MISROUTED_PAGESTREAM_REQUESTS.inc();
|
||||
return respond_error!(
|
||||
span,
|
||||
PageStreamError::Reconnect(
|
||||
@@ -3351,6 +3353,8 @@ impl GrpcPageServiceHandler {
|
||||
/// NB: errors returned from here are intercepted in get_pages(), and may be converted to a
|
||||
/// GetPageResponse with an appropriate status code to avoid terminating the stream.
|
||||
///
|
||||
/// TODO: verify that the requested pages belong to this shard.
|
||||
///
|
||||
/// TODO: get_vectored() currently enforces a batch limit of 32. Postgres will typically send
|
||||
/// batches up to effective_io_concurrency = 100. Either we have to accept large batches, or
|
||||
/// split them up in the client or server.
|
||||
|
||||
@@ -141,6 +141,23 @@ pub(crate) enum CollectKeySpaceError {
|
||||
Cancelled,
|
||||
}
|
||||
|
||||
impl CollectKeySpaceError {
|
||||
pub(crate) fn is_cancel(&self) -> bool {
|
||||
match self {
|
||||
CollectKeySpaceError::Decode(_) => false,
|
||||
CollectKeySpaceError::PageRead(e) => e.is_cancel(),
|
||||
CollectKeySpaceError::Cancelled => true,
|
||||
}
|
||||
}
|
||||
pub(crate) fn into_anyhow(self) -> anyhow::Error {
|
||||
match self {
|
||||
CollectKeySpaceError::Decode(e) => anyhow::Error::new(e),
|
||||
CollectKeySpaceError::PageRead(e) => anyhow::Error::new(e),
|
||||
CollectKeySpaceError::Cancelled => anyhow::Error::new(self),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl From<PageReconstructError> for CollectKeySpaceError {
|
||||
fn from(err: PageReconstructError) -> Self {
|
||||
match err {
|
||||
|
||||
@@ -142,6 +142,9 @@ mod gc_block;
|
||||
mod gc_result;
|
||||
pub(crate) mod throttle;
|
||||
|
||||
#[cfg(test)]
|
||||
pub mod debug;
|
||||
|
||||
pub(crate) use timeline::{LogicalSizeCalculationCause, PageReconstructError, Timeline};
|
||||
|
||||
pub(crate) use crate::span::debug_assert_current_span_has_tenant_and_timeline_id;
|
||||
@@ -388,7 +391,7 @@ pub struct TenantShard {
|
||||
|
||||
l0_flush_global_state: L0FlushGlobalState,
|
||||
|
||||
pub(crate) feature_resolver: TenantFeatureResolver,
|
||||
pub(crate) feature_resolver: Arc<TenantFeatureResolver>,
|
||||
}
|
||||
impl std::fmt::Debug for TenantShard {
|
||||
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||
@@ -3288,7 +3291,9 @@ impl TenantShard {
|
||||
// Ignore this, we likely raced with unarchival.
|
||||
OffloadError::NotArchived => Ok(()),
|
||||
OffloadError::AlreadyInProgress => Ok(()),
|
||||
err => Err(err),
|
||||
OffloadError::Cancelled => Err(CompactionError::ShuttingDown),
|
||||
// don't break the anyhow chain
|
||||
OffloadError::Other(err) => Err(CompactionError::Other(err)),
|
||||
})?;
|
||||
}
|
||||
|
||||
@@ -3319,23 +3324,12 @@ impl TenantShard {
|
||||
match err {
|
||||
err if err.is_cancel() => {}
|
||||
CompactionError::ShuttingDown => (),
|
||||
// Offload failures don't trip the circuit breaker, since they're cheap to retry and
|
||||
// shouldn't block compaction.
|
||||
CompactionError::Offload(_) => {}
|
||||
CompactionError::CollectKeySpaceError(err) => {
|
||||
// CollectKeySpaceError::Cancelled and PageRead::Cancelled are handled in `err.is_cancel` branch.
|
||||
self.compaction_circuit_breaker
|
||||
.lock()
|
||||
.unwrap()
|
||||
.fail(&CIRCUIT_BREAKERS_BROKEN, err);
|
||||
}
|
||||
CompactionError::Other(err) => {
|
||||
self.compaction_circuit_breaker
|
||||
.lock()
|
||||
.unwrap()
|
||||
.fail(&CIRCUIT_BREAKERS_BROKEN, err);
|
||||
}
|
||||
CompactionError::AlreadyRunning(_) => {}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -3413,7 +3407,7 @@ impl TenantShard {
|
||||
}
|
||||
|
||||
// Update the feature resolver with the latest tenant-specific data.
|
||||
self.feature_resolver.update_cached_tenant_properties(self);
|
||||
self.feature_resolver.refresh_properties_and_flags(self);
|
||||
}
|
||||
|
||||
pub fn timeline_has_no_attached_children(&self, timeline_id: TimelineId) -> bool {
|
||||
@@ -4502,10 +4496,10 @@ impl TenantShard {
|
||||
gc_block: Default::default(),
|
||||
l0_flush_global_state,
|
||||
basebackup_cache,
|
||||
feature_resolver: TenantFeatureResolver::new(
|
||||
feature_resolver: Arc::new(TenantFeatureResolver::new(
|
||||
feature_resolver,
|
||||
tenant_shard_id.tenant_id,
|
||||
),
|
||||
)),
|
||||
}
|
||||
}
|
||||
|
||||
@@ -6017,12 +6011,11 @@ pub(crate) mod harness {
|
||||
}
|
||||
|
||||
#[instrument(skip_all, fields(tenant_id=%self.tenant_shard_id.tenant_id, shard_id=%self.tenant_shard_id.shard_slug()))]
|
||||
pub(crate) async fn do_try_load(
|
||||
pub(crate) async fn do_try_load_with_redo(
|
||||
&self,
|
||||
walredo_mgr: Arc<WalRedoManager>,
|
||||
ctx: &RequestContext,
|
||||
) -> anyhow::Result<Arc<TenantShard>> {
|
||||
let walredo_mgr = Arc::new(WalRedoManager::from(TestRedoManager));
|
||||
|
||||
let (basebackup_cache, _) = BasebackupCache::new(Utf8PathBuf::new(), None);
|
||||
|
||||
let tenant = Arc::new(TenantShard::new(
|
||||
@@ -6060,6 +6053,14 @@ pub(crate) mod harness {
|
||||
Ok(tenant)
|
||||
}
|
||||
|
||||
pub(crate) async fn do_try_load(
|
||||
&self,
|
||||
ctx: &RequestContext,
|
||||
) -> anyhow::Result<Arc<TenantShard>> {
|
||||
let walredo_mgr = Arc::new(WalRedoManager::from(TestRedoManager));
|
||||
self.do_try_load_with_redo(walredo_mgr, ctx).await
|
||||
}
|
||||
|
||||
pub fn timeline_path(&self, timeline_id: &TimelineId) -> Utf8PathBuf {
|
||||
self.conf.timeline_path(&self.tenant_shard_id, timeline_id)
|
||||
}
|
||||
|
||||
366
pageserver/src/tenant/debug.rs
Normal file
@@ -0,0 +1,366 @@
|
||||
use std::{ops::Range, str::FromStr, sync::Arc};
|
||||
|
||||
use crate::walredo::RedoAttemptType;
|
||||
use base64::{Engine as _, engine::general_purpose::STANDARD};
|
||||
use bytes::{Bytes, BytesMut};
|
||||
use camino::Utf8PathBuf;
|
||||
use clap::Parser;
|
||||
use itertools::Itertools;
|
||||
use pageserver_api::{
|
||||
key::Key,
|
||||
keyspace::KeySpace,
|
||||
shard::{ShardIdentity, ShardStripeSize},
|
||||
};
|
||||
use postgres_ffi::PgMajorVersion;
|
||||
use postgres_ffi::{BLCKSZ, page_is_new, page_set_lsn};
|
||||
use tracing::Instrument;
|
||||
use utils::{
|
||||
generation::Generation,
|
||||
id::{TenantId, TimelineId},
|
||||
lsn::Lsn,
|
||||
shard::{ShardCount, ShardIndex, ShardNumber},
|
||||
};
|
||||
use wal_decoder::models::record::NeonWalRecord;
|
||||
|
||||
use crate::{
|
||||
context::{DownloadBehavior, RequestContext},
|
||||
task_mgr::TaskKind,
|
||||
tenant::storage_layer::ValueReconstructState,
|
||||
walredo::harness::RedoHarness,
|
||||
};
|
||||
|
||||
use super::{
|
||||
WalRedoManager, WalredoManagerId,
|
||||
harness::TenantHarness,
|
||||
remote_timeline_client::LayerFileMetadata,
|
||||
storage_layer::{AsLayerDesc, IoConcurrency, Layer, LayerName, ValuesReconstructState},
|
||||
};
|
||||
|
||||
fn process_page_image(next_record_lsn: Lsn, is_fpw: bool, img_bytes: Bytes) -> Bytes {
|
||||
// To match the logic in libs/wal_decoder/src/serialized_batch.rs
|
||||
let mut new_image: BytesMut = img_bytes.into();
|
||||
if is_fpw && !page_is_new(&new_image) {
|
||||
page_set_lsn(&mut new_image, next_record_lsn);
|
||||
}
|
||||
assert_eq!(new_image.len(), BLCKSZ as usize);
|
||||
new_image.freeze()
|
||||
}
|
||||
|
||||
async fn redo_wals(input: &str, key: Key) -> anyhow::Result<()> {
|
||||
let tenant_id = TenantId::generate();
|
||||
let timeline_id = TimelineId::generate();
|
||||
let redo_harness = RedoHarness::new()?;
|
||||
let span = redo_harness.span();
|
||||
let tenant_conf = pageserver_api::models::TenantConfig {
|
||||
..Default::default()
|
||||
};
|
||||
|
||||
let ctx = RequestContext::new(TaskKind::DebugTool, DownloadBehavior::Error);
|
||||
let tenant = TenantHarness::create_custom(
|
||||
"search_key",
|
||||
tenant_conf,
|
||||
tenant_id,
|
||||
ShardIdentity::unsharded(),
|
||||
Generation::new(1),
|
||||
)
|
||||
.await?
|
||||
.do_try_load_with_redo(
|
||||
Arc::new(WalRedoManager::Prod(
|
||||
WalredoManagerId::next(),
|
||||
redo_harness.manager,
|
||||
)),
|
||||
&ctx,
|
||||
)
|
||||
.await
|
||||
.unwrap();
|
||||
let timeline = tenant
|
||||
.create_test_timeline(timeline_id, Lsn(0x10), PgMajorVersion::PG16, &ctx)
|
||||
.await?;
|
||||
let contents = tokio::fs::read_to_string(input)
|
||||
.await
|
||||
.map_err(|e| anyhow::Error::msg(format!("Failed to read input file {input}: {e}")))
|
||||
.unwrap();
|
||||
let lines = contents.lines();
|
||||
let mut last_wal_lsn: Option<Lsn> = None;
|
||||
let state = {
|
||||
let mut state = ValueReconstructState::default();
|
||||
let mut is_fpw = false;
|
||||
let mut is_first_line = true;
|
||||
for line in lines {
|
||||
if is_first_line {
|
||||
is_first_line = false;
|
||||
if line.trim() == "FPW" {
|
||||
is_fpw = true;
|
||||
}
|
||||
continue; // Skip the first line.
|
||||
}
|
||||
// Each input line is in the "<next_record_lsn>,<base64>" format.
|
||||
let (lsn_str, payload_b64) = line
|
||||
.split_once(',')
|
||||
.expect("Invalid input format: expected '<lsn>,<base64>'");
|
||||
|
||||
// Parse the LSN and decode the payload.
|
||||
let lsn = Lsn::from_str(lsn_str.trim()).expect("Invalid LSN format");
|
||||
let bytes = Bytes::from(
|
||||
STANDARD
|
||||
.decode(payload_b64.trim())
|
||||
.expect("Invalid base64 payload"),
|
||||
);
|
||||
|
||||
// The first line is considered the base image, the rest are WAL records.
|
||||
if state.img.is_none() {
|
||||
state.img = Some((lsn, process_page_image(lsn, is_fpw, bytes)));
|
||||
} else {
|
||||
let wal_record = NeonWalRecord::Postgres {
|
||||
will_init: false,
|
||||
rec: bytes,
|
||||
};
|
||||
state.records.push((lsn, wal_record));
|
||||
last_wal_lsn.replace(lsn);
|
||||
}
|
||||
}
|
||||
state
|
||||
};
|
||||
|
||||
assert!(state.img.is_some(), "No base image found");
|
||||
assert!(!state.records.is_empty(), "No WAL records found");
|
||||
let result = timeline
|
||||
.reconstruct_value(key, last_wal_lsn.unwrap(), state, RedoAttemptType::ReadPage)
|
||||
.instrument(span.clone())
|
||||
.await?;
|
||||
|
||||
eprintln!("final image: {:?}", STANDARD.encode(result));
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
async fn search_key(
|
||||
tenant_id: TenantId,
|
||||
timeline_id: TimelineId,
|
||||
dir: String,
|
||||
key: Key,
|
||||
lsn: Lsn,
|
||||
) -> anyhow::Result<()> {
|
||||
let shard_index = ShardIndex {
|
||||
shard_number: ShardNumber(0),
|
||||
shard_count: ShardCount(4),
|
||||
};
|
||||
|
||||
let redo_harness = RedoHarness::new()?;
|
||||
let span = redo_harness.span();
|
||||
let tenant_conf = pageserver_api::models::TenantConfig {
|
||||
..Default::default()
|
||||
};
|
||||
let ctx = RequestContext::new(TaskKind::DebugTool, DownloadBehavior::Error);
|
||||
let tenant = TenantHarness::create_custom(
|
||||
"search_key",
|
||||
tenant_conf,
|
||||
tenant_id,
|
||||
ShardIdentity::new(
|
||||
shard_index.shard_number,
|
||||
shard_index.shard_count,
|
||||
ShardStripeSize(32768),
|
||||
)
|
||||
.unwrap(),
|
||||
Generation::new(1),
|
||||
)
|
||||
.await?
|
||||
.do_try_load_with_redo(
|
||||
Arc::new(WalRedoManager::Prod(
|
||||
WalredoManagerId::next(),
|
||||
redo_harness.manager,
|
||||
)),
|
||||
&ctx,
|
||||
)
|
||||
.await
|
||||
.unwrap();
|
||||
|
||||
let timeline = tenant
|
||||
.create_test_timeline(timeline_id, Lsn(0x10), PgMajorVersion::PG16, &ctx)
|
||||
.await?;
|
||||
|
||||
let mut delta_layers: Vec<Layer> = Vec::new();
|
||||
let mut img_layer: Option<Layer> = Option::None;
|
||||
let mut dir = tokio::fs::read_dir(dir).await?;
|
||||
loop {
|
||||
let entry = dir.next_entry().await?;
|
||||
if entry.is_none() || !entry.as_ref().unwrap().file_type().await?.is_file() {
|
||||
break;
|
||||
}
|
||||
let path = Utf8PathBuf::from_path_buf(entry.unwrap().path()).unwrap();
|
||||
let layer_name = match LayerName::from_str(path.file_name().unwrap()) {
|
||||
Ok(name) => name,
|
||||
Err(_) => {
|
||||
eprintln!("Skipped invalid layer: {path}");
|
||||
continue;
|
||||
}
|
||||
};
|
||||
let layer = Layer::for_resident(
|
||||
tenant.conf,
|
||||
&timeline,
|
||||
path.clone(),
|
||||
layer_name,
|
||||
LayerFileMetadata::new(
|
||||
tokio::fs::metadata(path.clone()).await?.len(),
|
||||
Generation::new(1),
|
||||
shard_index,
|
||||
),
|
||||
);
|
||||
if layer.layer_desc().is_delta() {
|
||||
delta_layers.push(layer.into());
|
||||
} else if img_layer.is_none() {
|
||||
img_layer = Some(layer.into());
|
||||
} else {
|
||||
anyhow::bail!("Found multiple image layers");
|
||||
}
|
||||
}
|
||||
// sort delta layers in descending order of LSN
|
||||
delta_layers.sort_by(|a, b| {
|
||||
b.layer_desc()
|
||||
.get_lsn_range()
|
||||
.start
|
||||
.cmp(&a.layer_desc().get_lsn_range().start)
|
||||
});
|
||||
|
||||
let mut state = ValuesReconstructState::new(IoConcurrency::Sequential);
|
||||
|
||||
let key_space = KeySpace::single(Range {
|
||||
start: key,
|
||||
end: key.next(),
|
||||
});
|
||||
let lsn_range = Range {
|
||||
start: img_layer
|
||||
.as_ref()
|
||||
.map_or(Lsn(0x00), |img| img.layer_desc().image_layer_lsn()),
|
||||
end: lsn,
|
||||
};
|
||||
for delta_layer in delta_layers.iter() {
|
||||
delta_layer
|
||||
.get_values_reconstruct_data(key_space.clone(), lsn_range.clone(), &mut state, &ctx)
|
||||
.await?;
|
||||
}
|
||||
|
||||
img_layer
|
||||
.as_ref()
|
||||
.unwrap()
|
||||
.get_values_reconstruct_data(key_space.clone(), lsn_range.clone(), &mut state, &ctx)
|
||||
.await?;
|
||||
|
||||
for (_key, result) in std::mem::take(&mut state.keys) {
|
||||
let state = result.collect_pending_ios().await?;
|
||||
if state.img.is_some() {
|
||||
eprintln!(
|
||||
"image: {}: {:x?}",
|
||||
state.img.as_ref().unwrap().0,
|
||||
STANDARD.encode(state.img.as_ref().unwrap().1.clone())
|
||||
);
|
||||
}
|
||||
for delta in state.records.iter() {
|
||||
match &delta.1 {
|
||||
NeonWalRecord::Postgres { will_init, rec } => {
|
||||
eprintln!(
|
||||
"delta: {}: will_init: {}, {:x?}",
|
||||
delta.0,
|
||||
will_init,
|
||||
STANDARD.encode(rec)
|
||||
);
|
||||
}
|
||||
_ => {
|
||||
eprintln!("delta: {}: {:x?}", delta.0, delta.1);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
let result = timeline
|
||||
.reconstruct_value(key, lsn_range.end, state, RedoAttemptType::ReadPage)
|
||||
.instrument(span.clone())
|
||||
.await?;
|
||||
eprintln!("final image: {lsn} : {result:?}");
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Redo all WALs against the base image in the input file. Return the base64 encoded final image.
|
||||
/// Each line in the input file must be in the form "<lsn>,<base64>" where:
|
||||
/// * `<lsn>` is a PostgreSQL LSN in hexadecimal notation, e.g. `0/16ABCDE`.
|
||||
/// * `<base64>` is the base64‐encoded page image (first line) or WAL record (subsequent lines).
|
||||
///
|
||||
/// The first line provides the base image of a page. The LSN is the LSN of "next record" following
|
||||
/// the record containing the FPI. For example, if the FPI was extracted from a WAL record occupying
/// [0/1, 0/200) in the WAL stream, the LSN appearing alongside the page image here should be 0/200.
|
||||
///
|
||||
/// The subsequent lines are WAL records, ordered from the oldest to the newest. The LSN is the
|
||||
/// record LSN of the WAL record, not the "next record" LSN. For example, if the WAL record here
|
||||
/// occupies [0/1, 0/200) in the WAL stream, the LSN appearing alongside the WAL record here should
|
||||
/// be 0/1.
|
||||
#[derive(Parser)]
|
||||
struct RedoWalsCmd {
|
||||
#[clap(long)]
|
||||
input: String,
|
||||
#[clap(long)]
|
||||
key: String,
|
||||
}
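A hypothetical --input file illustrating the format described above; the LSNs are made up and the base64 payloads are elided placeholders:

    FPW
    0/16B0200,<base64 of the 8 KiB base page image>
    0/16B0250,<base64 of WAL record 1>
    0/16B0400,<base64 of WAL record 2>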
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_redo_wals() -> anyhow::Result<()> {
|
||||
let args = std::env::args().collect_vec();
|
||||
let pos = args
|
||||
.iter()
|
||||
.position(|arg| arg == "--")
|
||||
.unwrap_or(args.len());
|
||||
let slice = &args[pos..args.len()];
|
||||
let cmd = match RedoWalsCmd::try_parse_from(slice) {
|
||||
Ok(cmd) => cmd,
|
||||
Err(err) => {
|
||||
eprintln!("{err}");
|
||||
return Ok(());
|
||||
}
|
||||
};
|
||||
|
||||
let key = Key::from_hex(&cmd.key).unwrap();
|
||||
redo_wals(&cmd.input, key).await?;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Search for a page at the given LSN in all layers of the data_dir.
|
||||
/// Return the base64-encoded image and all WAL records, as well as the final reconstructed image.
|
||||
#[derive(Parser)]
|
||||
struct SearchKeyCmd {
|
||||
#[clap(long)]
|
||||
tenant_id: String,
|
||||
#[clap(long)]
|
||||
timeline_id: String,
|
||||
#[clap(long)]
|
||||
data_dir: String,
|
||||
#[clap(long)]
|
||||
key: String,
|
||||
#[clap(long)]
|
||||
lsn: String,
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_search_key() -> anyhow::Result<()> {
|
||||
let args = std::env::args().collect_vec();
|
||||
let pos = args
|
||||
.iter()
|
||||
.position(|arg| arg == "--")
|
||||
.unwrap_or(args.len());
|
||||
let slice = &args[pos..args.len()];
|
||||
let cmd = match SearchKeyCmd::try_parse_from(slice) {
|
||||
Ok(cmd) => cmd,
|
||||
Err(err) => {
|
||||
eprintln!("{err}");
|
||||
return Ok(());
|
||||
}
|
||||
};
|
||||
|
||||
let tenant_id = TenantId::from_str(&cmd.tenant_id).unwrap();
|
||||
let timeline_id = TimelineId::from_str(&cmd.timeline_id).unwrap();
|
||||
let key = Key::from_hex(&cmd.key).unwrap();
|
||||
let lsn = Lsn::from_str(&cmd.lsn).unwrap();
|
||||
search_key(tenant_id, timeline_id, cmd.data_dir, key, lsn).await?;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
@@ -43,7 +43,7 @@ use crate::controller_upcall_client::{
|
||||
};
|
||||
use crate::deletion_queue::DeletionQueueClient;
|
||||
use crate::http::routes::ACTIVE_TENANT_TIMEOUT;
|
||||
use crate::metrics::{TENANT, TENANT_MANAGER as METRICS};
|
||||
use crate::metrics::{LOCAL_DATA_LOSS_SUSPECTED, TENANT, TENANT_MANAGER as METRICS};
|
||||
use crate::task_mgr::{BACKGROUND_RUNTIME, TaskKind};
|
||||
use crate::tenant::config::{
|
||||
AttachedLocationConfig, AttachmentMode, LocationConf, LocationMode, SecondaryLocationConfig,
|
||||
@@ -538,6 +538,21 @@ pub async fn init_tenant_mgr(
|
||||
// Determine which tenants are to be secondary or attached, and in which generation
|
||||
let tenant_modes = init_load_generations(conf, &tenant_configs, resources, cancel).await?;
|
||||
|
||||
// Hadron local SSD check: Raise an alert if our local filesystem does not contain any tenants but the re-attach request returned tenants.
|
||||
// This can happen if the PS suffered a Kubernetes node failure resulting in loss of all local data, but recovered quickly on another node
|
||||
// so the Storage Controller has not had the time to move tenants out.
|
||||
let data_loss_suspected = if let Some(tenant_modes) = &tenant_modes {
|
||||
tenant_configs.is_empty() && !tenant_modes.is_empty()
|
||||
} else {
|
||||
false
|
||||
};
|
||||
if data_loss_suspected {
|
||||
tracing::error!(
|
||||
"Local data loss suspected: no tenants found on local filesystem, but re-attach request returned tenants"
|
||||
);
|
||||
}
|
||||
LOCAL_DATA_LOSS_SUSPECTED.set(if data_loss_suspected { 1 } else { 0 });
|
||||
|
||||
tracing::info!(
|
||||
"Attaching {} tenants at startup, warming up {} at a time",
|
||||
tenant_configs.len(),
|
||||
|
||||
@@ -141,11 +141,29 @@ pub(super) async fn upload_timeline_layer<'a>(
|
||||
|
||||
let fs_size = usize::try_from(fs_size)
|
||||
.with_context(|| format!("convert {local_path:?} size {fs_size} usize"))?;
|
||||
|
||||
/* BEGIN_HADRON */
|
||||
let mut metadata = None;
|
||||
match storage {
|
||||
// Pass the file path as storage metadata to minimize changes to neon.
|
||||
// Otherwise, we need to change the upload interface.
|
||||
GenericRemoteStorage::AzureBlob(s) => {
|
||||
let block_size_mb = s.put_block_size_mb.unwrap_or(0);
|
||||
if block_size_mb > 0 && fs_size > block_size_mb * 1024 * 1024 {
|
||||
metadata = Some(remote_storage::StorageMetadata::from([(
|
||||
"databricks_azure_put_block",
|
||||
local_path.as_str(),
|
||||
)]));
|
||||
}
|
||||
}
|
||||
GenericRemoteStorage::LocalFs(_) => {}
|
||||
GenericRemoteStorage::AwsS3(_) => {}
|
||||
GenericRemoteStorage::Unreliable(_) => {}
|
||||
};
|
||||
/* END_HADRON */
|
||||
let reader = tokio_util::io::ReaderStream::with_capacity(source_file, super::BUFFER_SIZE);
|
||||
|
||||
storage
|
||||
.upload(reader, fs_size, remote_path, None, cancel)
|
||||
.upload(reader, fs_size, remote_path, metadata, cancel)
|
||||
.await
|
||||
.with_context(|| format!("upload layer from local path '{local_path}'"))
|
||||
}
|
||||
|
||||
@@ -34,6 +34,21 @@ use crate::virtual_file::owned_buffers_io::write::FlushTaskError;
|
||||
/// We use 3/4 Tokio threads, to avoid blocking all threads in case we do any CPU-heavy work.
|
||||
static CONCURRENT_BACKGROUND_TASKS: Lazy<Semaphore> = Lazy::new(|| {
|
||||
let total_threads = TOKIO_WORKER_THREADS.get();
|
||||
|
||||
/*BEGIN_HADRON*/
|
||||
// ideally we should run at least one compaction task per tenant in order to (1) maximize
|
||||
// compaction throughput (2) avoid head-of-line blocking of large compactions. However doing
|
||||
// that may create too many compaction tasks with lots of memory overheads. So we limit the
|
||||
// number of compaction tasks based on the available CPU core count.
|
||||
// Need to revisit.
|
||||
// let tasks_per_thread = std::env::var("BG_TASKS_PER_THREAD")
|
||||
// .ok()
|
||||
// .and_then(|s| s.parse().ok())
|
||||
// .unwrap_or(4);
|
||||
// let permits = usize::max(1, total_threads * tasks_per_thread);
|
||||
// // assert!(permits < total_threads, "need threads for other work");
|
||||
/*END_HADRON*/
|
||||
|
||||
let permits = max(1, (total_threads * 3).checked_div(4).unwrap_or(0));
|
||||
assert_ne!(permits, 0, "we will not be adding in permits later");
|
||||
assert!(permits < total_threads, "need threads for other work");
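A worked example of the permit formula above, assuming a runtime with 8 Tokio worker threads (the thread count itself is an assumption):

    let total_threads = 8usize;
    let permits = std::cmp::max(1, (total_threads * 3).checked_div(4).unwrap_or(0));
    assert_eq!(permits, 6); // 3/4 of the workers; the remaining 2 threads stay free for other work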
|
||||
@@ -303,9 +318,6 @@ pub(crate) fn log_compaction_error(
|
||||
let level = match err {
|
||||
e if e.is_cancel() => return,
|
||||
ShuttingDown => return,
|
||||
Offload(_) => Level::ERROR,
|
||||
AlreadyRunning(_) => Level::ERROR,
|
||||
CollectKeySpaceError(_) => Level::ERROR,
|
||||
_ if task_cancelled => Level::INFO,
|
||||
Other(err) => {
|
||||
let root_cause = err.root_cause();
|
||||
@@ -315,7 +327,7 @@ pub(crate) fn log_compaction_error(
|
||||
.is_some_and(|e| e.is_stopping());
|
||||
let timeline = root_cause
|
||||
.downcast_ref::<PageReconstructError>()
|
||||
.is_some_and(|e| e.is_stopping());
|
||||
.is_some_and(|e| e.is_cancel());
|
||||
let buffered_writer_flush_task_canelled = root_cause
|
||||
.downcast_ref::<FlushTaskError>()
|
||||
.is_some_and(|e| e.is_cancel());
|
||||
|
||||
@@ -40,7 +40,6 @@ use layer_manager::{
|
||||
Shutdown,
|
||||
};
|
||||
|
||||
use offload::OffloadError;
|
||||
use once_cell::sync::Lazy;
|
||||
use pageserver_api::config::tenant_conf_defaults::DEFAULT_PITR_INTERVAL;
|
||||
use pageserver_api::key::{
|
||||
@@ -119,7 +118,6 @@ use crate::pgdatadir_mapping::{
|
||||
MAX_AUX_FILE_V2_DELTAS, MetricsUpdate,
|
||||
};
|
||||
use crate::task_mgr::TaskKind;
|
||||
use crate::tenant::config::AttachmentMode;
|
||||
use crate::tenant::gc_result::GcResult;
|
||||
use crate::tenant::layer_map::LayerMap;
|
||||
use crate::tenant::metadata::TimelineMetadata;
|
||||
@@ -202,7 +200,7 @@ pub struct TimelineResources {
|
||||
pub l0_compaction_trigger: Arc<Notify>,
|
||||
pub l0_flush_global_state: l0_flush::L0FlushGlobalState,
|
||||
pub basebackup_cache: Arc<BasebackupCache>,
|
||||
pub feature_resolver: TenantFeatureResolver,
|
||||
pub feature_resolver: Arc<TenantFeatureResolver>,
|
||||
}
|
||||
|
||||
pub struct Timeline {
|
||||
@@ -450,7 +448,7 @@ pub struct Timeline {
|
||||
/// A channel to send async requests to prepare a basebackup for the basebackup cache.
|
||||
basebackup_cache: Arc<BasebackupCache>,
|
||||
|
||||
feature_resolver: TenantFeatureResolver,
|
||||
feature_resolver: Arc<TenantFeatureResolver>,
|
||||
}
|
||||
|
||||
pub(crate) enum PreviousHeatmap {
|
||||
@@ -587,6 +585,28 @@ pub(crate) enum PageReconstructError {
|
||||
MissingKey(Box<MissingKeyError>),
|
||||
}
|
||||
|
||||
impl PageReconstructError {
|
||||
pub(crate) fn is_cancel(&self) -> bool {
|
||||
match self {
|
||||
PageReconstructError::Other(_) => false,
|
||||
PageReconstructError::AncestorLsnTimeout(e) => e.is_cancel(),
|
||||
PageReconstructError::Cancelled => true,
|
||||
PageReconstructError::WalRedo(_) => false,
|
||||
PageReconstructError::MissingKey(_) => false,
|
||||
}
|
||||
}
|
||||
#[allow(dead_code)] // we use the is_cancel + into_anyhow pattern in quite a few places, this one will follow soon enough
|
||||
pub(crate) fn into_anyhow(self) -> anyhow::Error {
|
||||
match self {
|
||||
PageReconstructError::Other(e) => e,
|
||||
PageReconstructError::AncestorLsnTimeout(e) => e.into_anyhow(),
|
||||
PageReconstructError::Cancelled => anyhow::Error::new(self),
|
||||
PageReconstructError::WalRedo(e) => e,
|
||||
PageReconstructError::MissingKey(_) => anyhow::Error::new(self),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl From<anyhow::Error> for PageReconstructError {
|
||||
fn from(value: anyhow::Error) -> Self {
|
||||
// with walingest.rs many PageReconstructError are wrapped in as anyhow::Error
|
||||
@@ -740,17 +760,6 @@ impl std::fmt::Display for MissingKeyError {
|
||||
}
|
||||
}
|
||||
|
||||
impl PageReconstructError {
|
||||
/// Returns true if this error indicates a tenant/timeline shutdown-like situation
|
||||
pub(crate) fn is_stopping(&self) -> bool {
|
||||
use PageReconstructError::*;
|
||||
match self {
|
||||
Cancelled => true,
|
||||
Other(_) | AncestorLsnTimeout(_) | WalRedo(_) | MissingKey(_) => false,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(thiserror::Error, Debug)]
|
||||
pub(crate) enum CreateImageLayersError {
|
||||
#[error("timeline shutting down")]
|
||||
@@ -953,13 +962,35 @@ pub enum WaitLsnError {
|
||||
Timeout(String),
|
||||
}
|
||||
|
||||
impl WaitLsnError {
|
||||
pub(crate) fn is_cancel(&self) -> bool {
|
||||
match self {
|
||||
WaitLsnError::Shutdown => true,
|
||||
WaitLsnError::BadState(timeline_state) => match timeline_state {
|
||||
TimelineState::Loading => false,
|
||||
TimelineState::Active => false,
|
||||
TimelineState::Stopping => true,
|
||||
TimelineState::Broken { .. } => false,
|
||||
},
|
||||
WaitLsnError::Timeout(_) => false,
|
||||
}
|
||||
}
|
||||
pub(crate) fn into_anyhow(self) -> anyhow::Error {
|
||||
match self {
|
||||
WaitLsnError::Shutdown => anyhow::Error::new(self),
|
||||
WaitLsnError::BadState(_) => anyhow::Error::new(self),
|
||||
WaitLsnError::Timeout(_) => anyhow::Error::new(self),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl From<WaitLsnError> for tonic::Status {
|
||||
fn from(err: WaitLsnError) -> Self {
|
||||
use tonic::Code;
|
||||
let code = match &err {
|
||||
WaitLsnError::Timeout(_) => Code::Internal,
|
||||
WaitLsnError::BadState(_) => Code::Internal,
|
||||
WaitLsnError::Shutdown => Code::Unavailable,
|
||||
let code = if err.is_cancel() {
|
||||
Code::Unavailable
|
||||
} else {
|
||||
Code::Internal
|
||||
};
|
||||
tonic::Status::new(code, err.to_string())
|
||||
}
@@ -1086,6 +1117,26 @@ enum ImageLayerCreationOutcome {
    Skip,
}

enum RepartitionError {
    Other(anyhow::Error),
    CollectKeyspace(CollectKeySpaceError),
}

impl RepartitionError {
    fn is_cancel(&self) -> bool {
        match self {
            RepartitionError::Other(_) => false,
            RepartitionError::CollectKeyspace(e) => e.is_cancel(),
        }
    }
    fn into_anyhow(self) -> anyhow::Error {
        match self {
            RepartitionError::Other(e) => e,
            RepartitionError::CollectKeyspace(e) => e.into_anyhow(),
        }
    }
}

/// Public interface functions
impl Timeline {
    /// Get the LSN where this branch was created
@@ -1772,30 +1823,31 @@ impl Timeline {
|
||||
existing_lease.clone()
|
||||
}
|
||||
Entry::Vacant(vacant) => {
|
||||
// Reject already GC-ed LSN if we are in AttachedSingle and
|
||||
// not blocked by the lsn lease deadline.
|
||||
// Never allow a lease to be requested for an LSN below the applied GC cutoff. The data could have been deleted.
|
||||
let latest_gc_cutoff_lsn = self.get_applied_gc_cutoff_lsn();
|
||||
if lsn < *latest_gc_cutoff_lsn {
|
||||
bail!(
|
||||
"tried to request an lsn lease for an lsn below the latest gc cutoff. requested at {} gc cutoff {}",
|
||||
lsn,
|
||||
*latest_gc_cutoff_lsn
|
||||
);
|
||||
}
|
||||
|
||||
                // We allow creating a lease for LSNs below the planned gc cutoff if we are still within the grace period
|
||||
// of GC blocking.
|
||||
let validate = {
|
||||
let conf = self.tenant_conf.load();
|
||||
conf.location.attach_mode == AttachmentMode::Single
|
||||
&& !conf.is_gc_blocked_by_lsn_lease_deadline()
|
||||
!conf.is_gc_blocked_by_lsn_lease_deadline()
|
||||
};
|
||||
|
||||
if init || validate {
|
||||
let latest_gc_cutoff_lsn = self.get_applied_gc_cutoff_lsn();
|
||||
if lsn < *latest_gc_cutoff_lsn {
|
||||
bail!(
|
||||
"tried to request an lsn lease for an lsn below the latest gc cutoff. requested at {} gc cutoff {}",
|
||||
lsn,
|
||||
*latest_gc_cutoff_lsn
|
||||
);
|
||||
}
|
||||
if lsn < planned_cutoff {
|
||||
bail!(
|
||||
"tried to request an lsn lease for an lsn below the planned gc cutoff. requested at {} planned gc cutoff {}",
|
||||
lsn,
|
||||
planned_cutoff
|
||||
);
|
||||
}
|
||||
// Do not allow initial lease creation to be below the planned gc cutoff. The client (compute_ctl) determines
|
||||
                // whether it is an initial lease creation or a renewal.
|
||||
if (init || validate) && lsn < planned_cutoff {
|
||||
bail!(
|
||||
"tried to request an lsn lease for an lsn below the planned gc cutoff. requested at {} planned gc cutoff {}",
|
||||
lsn,
|
||||
planned_cutoff
|
||||
);
|
||||
}
|
||||
|
||||
let dt: DateTime<Utc> = valid_until.into();
|
||||
@@ -2068,19 +2120,9 @@ impl Timeline {
|
||||
Err(CompactionError::ShuttingDown) => {
|
||||
// Covered by the `Err(e) if e.is_cancel()` branch.
|
||||
}
|
||||
Err(CompactionError::AlreadyRunning(_)) => {
|
||||
// Covered by the `Err(e) if e.is_cancel()` branch.
|
||||
}
|
||||
Err(CompactionError::Other(_)) => {
|
||||
self.compaction_failed.store(true, AtomicOrdering::Relaxed)
|
||||
}
|
||||
Err(CompactionError::CollectKeySpaceError(_)) => {
|
||||
// Cancelled errors are covered by the `Err(e) if e.is_cancel()` branch.
|
||||
self.compaction_failed.store(true, AtomicOrdering::Relaxed)
|
||||
}
|
||||
// Don't change the current value on offload failure or shutdown. We don't want to
|
||||
// abruptly stall nor resume L0 flushes in these cases.
|
||||
Err(CompactionError::Offload(_)) => {}
|
||||
};
|
||||
|
||||
result
|
||||
@@ -3129,7 +3171,7 @@ impl Timeline {
|
||||
|
||||
basebackup_cache: resources.basebackup_cache,
|
||||
|
||||
feature_resolver: resources.feature_resolver,
|
||||
feature_resolver: resources.feature_resolver.clone(),
|
||||
};
|
||||
|
||||
result.repartition_threshold =
|
||||
@@ -4970,7 +5012,7 @@ impl Timeline {
|
||||
ctx,
|
||||
)
|
||||
.await
|
||||
.map_err(|e| FlushLayerError::from_anyhow(self, e.into()))?;
|
||||
.map_err(|e| FlushLayerError::from_anyhow(self, e.into_anyhow()))?;
|
||||
|
||||
if self.cancel.is_cancelled() {
|
||||
return Err(FlushLayerError::Cancelled);
|
||||
@@ -5220,18 +5262,18 @@ impl Timeline {
|
||||
partition_size: u64,
|
||||
flags: EnumSet<CompactFlags>,
|
||||
ctx: &RequestContext,
|
||||
) -> Result<((KeyPartitioning, SparseKeyPartitioning), Lsn), CompactionError> {
|
||||
) -> Result<((KeyPartitioning, SparseKeyPartitioning), Lsn), RepartitionError> {
|
||||
let Ok(mut guard) = self.partitioning.try_write_guard() else {
|
||||
// NB: there are two callers, one is the compaction task, of which there is only one per struct Tenant and hence Timeline.
|
||||
            // The other is the initdb optimization in flush_frozen_layer, used by `bootstrap_timeline`, which runs before `.activate()`
|
||||
// and hence before the compaction task starts.
|
||||
return Err(CompactionError::Other(anyhow!(
|
||||
return Err(RepartitionError::Other(anyhow!(
|
||||
"repartition() called concurrently"
|
||||
)));
|
||||
};
|
||||
let ((dense_partition, sparse_partition), partition_lsn) = &*guard.read();
|
||||
if lsn < *partition_lsn {
|
||||
return Err(CompactionError::Other(anyhow!(
|
||||
return Err(RepartitionError::Other(anyhow!(
|
||||
"repartition() called with LSN going backwards, this should not happen"
|
||||
)));
|
||||
}
|
||||
@@ -5252,7 +5294,10 @@ impl Timeline {
|
||||
));
|
||||
}
|
||||
|
||||
let (dense_ks, sparse_ks) = self.collect_keyspace(lsn, ctx).await?;
|
||||
let (dense_ks, sparse_ks) = self
|
||||
.collect_keyspace(lsn, ctx)
|
||||
.await
|
||||
.map_err(RepartitionError::CollectKeyspace)?;
|
||||
let dense_partitioning = dense_ks.partition(
|
||||
&self.shard_identity,
|
||||
partition_size,
|
||||
@@ -6017,52 +6062,21 @@ impl Drop for Timeline {
pub(crate) enum CompactionError {
    #[error("The timeline or pageserver is shutting down")]
    ShuttingDown,
    /// Compaction tried to offload a timeline and failed
    #[error("Failed to offload timeline: {0}")]
    Offload(OffloadError),
    /// Compaction cannot be done right now; page reconstruction and so on.
    #[error("Failed to collect keyspace: {0}")]
    CollectKeySpaceError(#[from] CollectKeySpaceError),
    #[error(transparent)]
    Other(anyhow::Error),
    #[error("Compaction already running: {0}")]
    AlreadyRunning(&'static str),
}

impl CompactionError {
    /// Errors that can be ignored, i.e., cancel and shutdown.
    pub fn is_cancel(&self) -> bool {
        matches!(
            self,
        matches!(self, Self::ShuttingDown)
    }

    pub fn from_collect_keyspace(err: CollectKeySpaceError) -> Self {
        if err.is_cancel() {
            Self::ShuttingDown
            | Self::AlreadyRunning(_)
            | Self::CollectKeySpaceError(CollectKeySpaceError::Cancelled)
            | Self::CollectKeySpaceError(CollectKeySpaceError::PageRead(
                PageReconstructError::Cancelled
            ))
            | Self::Offload(OffloadError::Cancelled)
        )
    }

    /// Critical errors that indicate data corruption.
    pub fn is_critical(&self) -> bool {
        matches!(
            self,
            Self::CollectKeySpaceError(
                CollectKeySpaceError::Decode(_)
                | CollectKeySpaceError::PageRead(
                    PageReconstructError::MissingKey(_) | PageReconstructError::WalRedo(_),
                )
            )
        )
    }
}

impl From<OffloadError> for CompactionError {
    fn from(e: OffloadError) -> Self {
        match e {
            OffloadError::Cancelled => Self::ShuttingDown,
            _ => Self::Offload(e),
        } else {
            Self::Other(err.into_anyhow())
        }
    }
}
@@ -6765,7 +6779,7 @@ impl Timeline {
|
||||
}
|
||||
|
||||
/// Reconstruct a value, using the given base image and WAL records in 'data'.
|
||||
async fn reconstruct_value(
|
||||
pub(crate) async fn reconstruct_value(
|
||||
&self,
|
||||
key: Key,
|
||||
request_lsn: Lsn,
|
||||
|
||||
@@ -16,7 +16,8 @@ use super::{
|
||||
Timeline,
|
||||
};
|
||||
|
||||
use crate::tenant::timeline::DeltaEntry;
|
||||
use crate::pgdatadir_mapping::CollectKeySpaceError;
|
||||
use crate::tenant::timeline::{DeltaEntry, RepartitionError};
|
||||
use crate::walredo::RedoAttemptType;
|
||||
use anyhow::{Context, anyhow};
|
||||
use bytes::Bytes;
|
||||
@@ -64,7 +65,7 @@ use crate::tenant::timeline::{
|
||||
DeltaLayerWriter, ImageLayerCreationOutcome, ImageLayerWriter, IoConcurrency, Layer,
|
||||
ResidentLayer, drop_layer_manager_rlock,
|
||||
};
|
||||
use crate::tenant::{DeltaLayer, MaybeOffloaded};
|
||||
use crate::tenant::{DeltaLayer, MaybeOffloaded, PageReconstructError};
|
||||
use crate::virtual_file::{MaybeFatalIo, VirtualFile};
|
||||
|
||||
/// Maximum number of deltas before generating an image layer in bottom-most compaction.
|
||||
@@ -572,7 +573,7 @@ impl GcCompactionQueue {
|
||||
match res {
|
||||
Ok(res) => Ok(res),
|
||||
Err(CompactionError::ShuttingDown) => Err(CompactionError::ShuttingDown),
|
||||
Err(_) => {
|
||||
Err(CompactionError::Other(_)) => {
|
||||
// There are some cases where traditional gc might collect some layer
|
||||
                // files, so that gc-compaction cannot read the full history of the key.
|
||||
// This needs to be resolved in the long-term by improving the compaction
|
||||
@@ -591,9 +592,9 @@ impl GcCompactionQueue {
|
||||
timeline: &Arc<Timeline>,
|
||||
) -> Result<CompactionOutcome, CompactionError> {
|
||||
let Ok(_one_op_at_a_time_guard) = self.consumer_lock.try_lock() else {
|
||||
return Err(CompactionError::AlreadyRunning(
|
||||
"cannot run gc-compaction because another gc-compaction is running. This should not happen because we only call this function from the gc-compaction queue.",
|
||||
));
|
||||
return Err(CompactionError::Other(anyhow::anyhow!(
|
||||
"cannot run gc-compaction because another gc-compaction is running. This should not happen because we only call this function from the gc-compaction queue."
|
||||
)));
|
||||
};
|
||||
let has_pending_tasks;
|
||||
let mut yield_for_l0 = false;
|
||||
@@ -1417,22 +1418,33 @@ impl Timeline {
|
||||
}
|
||||
|
||||
// Suppress errors when cancelled.
|
||||
Err(_) if self.cancel.is_cancelled() => {}
|
||||
//
|
||||
// Log other errors but continue. Failure to repartition is normal, if the timeline was just created
|
||||
// as an empty timeline. Also in unit tests, when we use the timeline as a simple
|
||||
// key-value store, ignoring the datadir layout. Log the error but continue.
|
||||
//
|
||||
// TODO:
|
||||
// 1. shouldn't we return early here if we observe cancellation
|
||||
// 2. Experiment: can we stop checking self.cancel here?
|
||||
Err(_) if self.cancel.is_cancelled() => {} // TODO: try how we fare removing this branch
|
||||
Err(err) if err.is_cancel() => {}
|
||||
|
||||
// Alert on critical errors that indicate data corruption.
|
||||
Err(err) if err.is_critical() => {
|
||||
Err(RepartitionError::CollectKeyspace(
|
||||
e @ CollectKeySpaceError::Decode(_)
|
||||
| e @ CollectKeySpaceError::PageRead(
|
||||
PageReconstructError::MissingKey(_) | PageReconstructError::WalRedo(_),
|
||||
),
|
||||
)) => {
|
||||
// Alert on critical errors that indicate data corruption.
|
||||
critical_timeline!(
|
||||
self.tenant_shard_id,
|
||||
self.timeline_id,
|
||||
"could not compact, repartitioning keyspace failed: {err:?}"
|
||||
"could not compact, repartitioning keyspace failed: {e:?}"
|
||||
);
|
||||
}
|
||||
|
||||
// Log other errors. No partitioning? This is normal, if the timeline was just created
|
||||
// as an empty timeline. Also in unit tests, when we use the timeline as a simple
|
||||
// key-value store, ignoring the datadir layout. Log the error but continue.
|
||||
Err(err) => error!("could not compact, repartitioning keyspace failed: {err:?}"),
|
||||
Err(e) => error!(
|
||||
"could not compact, repartitioning keyspace failed: {:?}",
|
||||
e.into_anyhow()
|
||||
),
|
||||
};
|
||||
|
||||
let partition_count = self.partitioning.read().0.0.parts.len();
|
||||
@@ -2518,7 +2530,10 @@ impl Timeline {
|
||||
return Err(CompactionError::ShuttingDown);
|
||||
}
|
||||
|
||||
let (dense_ks, _sparse_ks) = self.collect_keyspace(end_lsn, ctx).await?;
|
||||
let (dense_ks, _sparse_ks) = self
|
||||
.collect_keyspace(end_lsn, ctx)
|
||||
.await
|
||||
.map_err(CompactionError::from_collect_keyspace)?;
|
||||
// TODO(chi): ignore sparse_keyspace for now, compact it in the future.
|
||||
let mut adaptor = TimelineAdaptor::new(self, (end_lsn, dense_ks));
|
||||
|
||||
|
||||
@@ -212,8 +212,12 @@
|
||||
//! to the parent shard during a shard split. Eventually, the shard split task will
|
||||
//! shut down the parent => case (1).
|
||||
|
||||
use std::collections::{HashMap, hash_map};
|
||||
use std::sync::{Arc, Mutex, Weak};
|
||||
use std::collections::HashMap;
|
||||
use std::collections::hash_map;
|
||||
use std::sync::Arc;
|
||||
use std::sync::Mutex;
|
||||
use std::sync::Weak;
|
||||
use std::time::Duration;
|
||||
|
||||
use pageserver_api::shard::ShardIdentity;
|
||||
use tracing::{instrument, trace};
|
||||
@@ -333,6 +337,44 @@ enum RoutingResult<T: Types> {
|
||||
}
|
||||
|
||||
impl<T: Types> Cache<T> {
|
||||
/* BEGIN_HADRON */
|
||||
/// A wrapper of do_get to resolve the tenant shard for a get page request.
|
||||
#[instrument(level = "trace", skip_all)]
|
||||
pub(crate) async fn get(
|
||||
&mut self,
|
||||
timeline_id: TimelineId,
|
||||
shard_selector: ShardSelector,
|
||||
tenant_manager: &T::TenantManager,
|
||||
) -> Result<Handle<T>, GetError<T>> {
|
||||
const GET_MAX_RETRIES: usize = 10;
|
||||
const RETRY_BACKOFF: Duration = Duration::from_millis(100);
|
||||
let mut attempt = 0;
|
||||
loop {
|
||||
attempt += 1;
|
||||
match self
|
||||
.do_get(timeline_id, shard_selector, tenant_manager)
|
||||
.await
|
||||
{
|
||||
Ok(handle) => return Ok(handle),
|
||||
Err(e) => {
|
||||
// Retry on tenant manager error to handle tenant split more gracefully
|
||||
if attempt < GET_MAX_RETRIES {
|
||||
tracing::warn!(
|
||||
"Fail to resolve tenant shard in attempt {}: {:?}. Retrying...",
|
||||
attempt,
|
||||
e
|
||||
);
|
||||
tokio::time::sleep(RETRY_BACKOFF).await;
|
||||
continue;
|
||||
} else {
|
||||
return Err(e);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
/* END_HADRON */
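
The BEGIN_HADRON/END_HADRON wrapper above is a plain bounded-retry loop around do_get. The same shape, reduced to a generic sketch; the retry helper and its names are illustrative, and only the retry count and backoff mirror the hunk:

    // Hypothetical helper showing the bounded-retry-with-fixed-backoff shape.
    use std::time::Duration;

    const MAX_RETRIES: usize = 10;
    const BACKOFF: Duration = Duration::from_millis(100);

    async fn retry<T, E, F, Fut>(mut op: F) -> Result<T, E>
    where
        F: FnMut() -> Fut,
        Fut: std::future::Future<Output = Result<T, E>>,
        E: std::fmt::Debug,
    {
        let mut attempt = 0;
        loop {
            attempt += 1;
            match op().await {
                Ok(v) => return Ok(v),
                // Transient failures (e.g. a tenant shard mid-split) get a fixed
                // pause before the next attempt; the last error is returned as-is.
                Err(e) if attempt < MAX_RETRIES => {
                    tracing::warn!("attempt {} failed: {:?}, retrying", attempt, e);
                    tokio::time::sleep(BACKOFF).await;
                }
                Err(e) => return Err(e),
            }
        }
    }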
|
||||
|
||||
/// See module-level comment for details.
|
||||
///
|
||||
/// Does NOT check for the shutdown state of [`Types::Timeline`].
|
||||
@@ -341,7 +383,7 @@ impl<T: Types> Cache<T> {
|
||||
/// and if so, return an error that causes the page service to
|
||||
/// close the connection.
|
||||
#[instrument(level = "trace", skip_all)]
|
||||
pub(crate) async fn get(
|
||||
async fn do_get(
|
||||
&mut self,
|
||||
timeline_id: TimelineId,
|
||||
shard_selector: ShardSelector,
|
||||
@@ -879,6 +921,7 @@ mod tests {
|
||||
.await
|
||||
.err()
|
||||
.expect("documented behavior: can't get new handle after shutdown");
|
||||
|
||||
assert_eq!(cache.map.len(), 1, "next access cleans up the cache");
|
||||
|
||||
cache
|
||||
|
||||
@@ -17,8 +17,6 @@ pub(crate) enum OffloadError {
|
||||
Cancelled,
|
||||
#[error("Timeline is not archived")]
|
||||
NotArchived,
|
||||
#[error(transparent)]
|
||||
RemoteStorage(anyhow::Error),
|
||||
#[error("Offload or deletion already in progress")]
|
||||
AlreadyInProgress,
|
||||
#[error("Unexpected offload error: {0}")]
|
||||
@@ -29,7 +27,7 @@ impl From<TenantManifestError> for OffloadError {
|
||||
fn from(e: TenantManifestError) -> Self {
|
||||
match e {
|
||||
TenantManifestError::Cancelled => Self::Cancelled,
|
||||
TenantManifestError::RemoteStorage(e) => Self::RemoteStorage(e),
|
||||
TenantManifestError::RemoteStorage(e) => Self::Other(e),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -182,12 +182,19 @@ pub(super) async fn connection_manager_loop_step(
|
||||
}
|
||||
},
|
||||
|
||||
            // If we've not received any updates from the broker for a while, are waiting for WAL
            // and have no safekeeper connection or connection candidates, then it might be that
            // the broker subscription is wedged. Drop the current subscription and re-subscribe
            // with the goal of unblocking it.
|
||||
_ = broker_reset_interval.tick() => {
|
||||
if wait_lsn_status.borrow().is_some() {
|
||||
tracing::warn!("No broker updates received for a while, but waiting for WAL. Re-setting stream ...")
|
||||
}
|
||||
let awaiting_lsn = wait_lsn_status.borrow().is_some();
|
||||
let no_candidates = connection_manager_state.wal_stream_candidates.is_empty();
|
||||
let no_connection = connection_manager_state.wal_connection.is_none();
|
||||
|
||||
broker_subscription = subscribe_for_timeline_updates(broker_client, id, cancel).await?;
|
||||
if awaiting_lsn && no_candidates && no_connection {
|
||||
tracing::warn!("No broker updates received for a while, but waiting for WAL. Re-setting stream ...");
|
||||
broker_subscription = subscribe_for_timeline_updates(broker_client, id, cancel).await?;
|
||||
}
|
||||
},
|
||||
|
||||
new_event = async {
|
||||
|
||||
@@ -45,9 +45,10 @@ pub(crate) fn regenerate(
|
||||
let (disk_wanted_bytes, shard_count) = tenant_manager.calculate_utilization()?;
|
||||
|
||||
// Fetch the fraction of disk space which may be used
|
||||
let disk_usable_pct = match conf.disk_usage_based_eviction.clone() {
|
||||
Some(e) => e.max_usage_pct,
|
||||
None => Percent::new(100).unwrap(),
|
||||
let disk_usable_pct = if conf.disk_usage_based_eviction.enabled {
|
||||
conf.disk_usage_based_eviction.max_usage_pct
|
||||
} else {
|
||||
Percent::new(100).unwrap()
|
||||
};
|
||||
|
||||
// Express a static value for how many shards we may schedule on one node
|
||||
|
||||
@@ -566,22 +566,55 @@ impl PostgresRedoManager {
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
pub(crate) mod harness {
|
||||
use super::PostgresRedoManager;
|
||||
use crate::config::PageServerConf;
|
||||
use utils::{id::TenantId, shard::TenantShardId};
|
||||
|
||||
pub struct RedoHarness {
|
||||
// underscored because unused, except for removal at drop
|
||||
_repo_dir: camino_tempfile::Utf8TempDir,
|
||||
pub manager: PostgresRedoManager,
|
||||
tenant_shard_id: TenantShardId,
|
||||
}
|
||||
|
||||
impl RedoHarness {
|
||||
pub fn new() -> anyhow::Result<Self> {
|
||||
crate::tenant::harness::setup_logging();
|
||||
|
||||
let repo_dir = camino_tempfile::tempdir()?;
|
||||
let conf = PageServerConf::dummy_conf(repo_dir.path().to_path_buf());
|
||||
let conf = Box::leak(Box::new(conf));
|
||||
let tenant_shard_id = TenantShardId::unsharded(TenantId::generate());
|
||||
|
||||
let manager = PostgresRedoManager::new(conf, tenant_shard_id);
|
||||
|
||||
Ok(RedoHarness {
|
||||
_repo_dir: repo_dir,
|
||||
manager,
|
||||
tenant_shard_id,
|
||||
})
|
||||
}
|
||||
pub fn span(&self) -> tracing::Span {
|
||||
tracing::info_span!("RedoHarness", tenant_id=%self.tenant_shard_id.tenant_id, shard_id=%self.tenant_shard_id.shard_slug())
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use std::str::FromStr;
|
||||
|
||||
use bytes::Bytes;
|
||||
use pageserver_api::key::Key;
|
||||
use pageserver_api::shard::TenantShardId;
|
||||
use postgres_ffi::PgMajorVersion;
|
||||
use tracing::Instrument;
|
||||
use utils::id::TenantId;
|
||||
use utils::lsn::Lsn;
|
||||
use wal_decoder::models::record::NeonWalRecord;
|
||||
|
||||
use super::PostgresRedoManager;
|
||||
use crate::config::PageServerConf;
|
||||
use crate::walredo::RedoAttemptType;
|
||||
use crate::walredo::harness::RedoHarness;
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_ping() {
|
||||
@@ -692,33 +725,4 @@ mod tests {
|
||||
)
|
||||
]
|
||||
}
|
||||
|
||||
struct RedoHarness {
|
||||
// underscored because unused, except for removal at drop
|
||||
_repo_dir: camino_tempfile::Utf8TempDir,
|
||||
manager: PostgresRedoManager,
|
||||
tenant_shard_id: TenantShardId,
|
||||
}
|
||||
|
||||
impl RedoHarness {
|
||||
fn new() -> anyhow::Result<Self> {
|
||||
crate::tenant::harness::setup_logging();
|
||||
|
||||
let repo_dir = camino_tempfile::tempdir()?;
|
||||
let conf = PageServerConf::dummy_conf(repo_dir.path().to_path_buf());
|
||||
let conf = Box::leak(Box::new(conf));
|
||||
let tenant_shard_id = TenantShardId::unsharded(TenantId::generate());
|
||||
|
||||
let manager = PostgresRedoManager::new(conf, tenant_shard_id);
|
||||
|
||||
Ok(RedoHarness {
|
||||
_repo_dir: repo_dir,
|
||||
manager,
|
||||
tenant_shard_id,
|
||||
})
|
||||
}
|
||||
fn span(&self) -> tracing::Span {
|
||||
tracing::info_span!("RedoHarness", tenant_id=%self.tenant_shard_id.tenant_id, shard_id=%self.tenant_shard_id.shard_slug())
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -65,6 +65,7 @@
|
||||
#include "port/pg_iovec.h"
|
||||
#include "postmaster/interrupt.h"
|
||||
#include "replication/walsender.h"
|
||||
#include "storage/ipc.h"
|
||||
#include "utils/timeout.h"
|
||||
|
||||
#include "bitmap.h"
|
||||
@@ -412,6 +413,47 @@ compact_prefetch_buffers(void)
|
||||
return false;
|
||||
}
|
||||
|
||||
/*
|
||||
* Check that prefetch response matches the slot
|
||||
*/
|
||||
static void
|
||||
check_getpage_response(PrefetchRequest* slot, NeonResponse* resp)
|
||||
{
|
||||
if (resp->tag != T_NeonGetPageResponse && resp->tag != T_NeonErrorResponse)
|
||||
{
|
||||
neon_shard_log(slot->shard_no, PANIC, "Unexpected prefetch response %d, ring_receive=%ld, ring_flush=%ld, ring_unused=%ld",
|
||||
resp->tag, MyPState->ring_receive, MyPState->ring_flush, MyPState->ring_unused);
|
||||
}
|
||||
if (neon_protocol_version >= 3)
|
||||
{
|
||||
NRelFileInfo rinfo = BufTagGetNRelFileInfo(slot->buftag);
|
||||
if (resp->tag == T_NeonGetPageResponse)
|
||||
{
|
||||
NeonGetPageResponse * getpage_resp = (NeonGetPageResponse *)resp;
|
||||
if (resp->reqid != slot->reqid ||
|
||||
resp->lsn != slot->request_lsns.request_lsn ||
|
||||
resp->not_modified_since != slot->request_lsns.not_modified_since ||
|
||||
!RelFileInfoEquals(getpage_resp->req.rinfo, rinfo) ||
|
||||
getpage_resp->req.forknum != slot->buftag.forkNum ||
|
||||
getpage_resp->req.blkno != slot->buftag.blockNum)
|
||||
{
|
||||
NEON_PANIC_CONNECTION_STATE(slot->shard_no, PANIC,
|
||||
"Receive unexpected getpage response {reqid=%lx,lsn=%X/%08X, since=%X/%08X, rel=%u/%u/%u.%u, block=%u} to get page request {reqid=%lx,lsn=%X/%08X, since=%X/%08X, rel=%u/%u/%u.%u, block=%u}",
|
||||
resp->reqid, LSN_FORMAT_ARGS(resp->lsn), LSN_FORMAT_ARGS(resp->not_modified_since), RelFileInfoFmt(getpage_resp->req.rinfo), getpage_resp->req.forknum, getpage_resp->req.blkno,
|
||||
slot->reqid, LSN_FORMAT_ARGS(slot->request_lsns.request_lsn), LSN_FORMAT_ARGS(slot->request_lsns.not_modified_since), RelFileInfoFmt(rinfo), slot->buftag.forkNum, slot->buftag.blockNum);
|
||||
}
|
||||
}
|
||||
else if (resp->reqid != slot->reqid ||
|
||||
resp->lsn != slot->request_lsns.request_lsn ||
|
||||
resp->not_modified_since != slot->request_lsns.not_modified_since)
|
||||
{
|
||||
elog(WARNING, NEON_TAG "Error message {reqid=%lx,lsn=%X/%08X, since=%X/%08X} doesn't match exists request {reqid=%lx,lsn=%X/%08X, since=%X/%08X}",
|
||||
resp->reqid, LSN_FORMAT_ARGS(resp->lsn), LSN_FORMAT_ARGS(resp->not_modified_since),
|
||||
slot->reqid, LSN_FORMAT_ARGS(slot->request_lsns.request_lsn), LSN_FORMAT_ARGS(slot->request_lsns.not_modified_since));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* If there might be responses still in the TCP buffer, then we should try to
|
||||
* use those, to reduce any TCP backpressure on the OS/PS side.
|
||||
@@ -446,15 +488,18 @@ communicator_prefetch_pump_state(void)
|
||||
if (response == NULL)
|
||||
break;
|
||||
|
||||
check_getpage_response(slot, response);
|
||||
|
||||
/* The slot should still be valid */
|
||||
if (slot->status != PRFS_REQUESTED ||
|
||||
slot->response != NULL ||
|
||||
slot->my_ring_index != MyPState->ring_receive)
|
||||
neon_shard_log(slot->shard_no, ERROR,
|
||||
{
|
||||
neon_shard_log(slot->shard_no, PANIC,
|
||||
"Incorrect prefetch slot state after receive: status=%d response=%p my=%lu receive=%lu",
|
||||
slot->status, slot->response,
|
||||
(long) slot->my_ring_index, (long) MyPState->ring_receive);
|
||||
|
||||
}
|
||||
/* update prefetch state */
|
||||
MyPState->n_responses_buffered += 1;
|
||||
MyPState->n_requests_inflight -= 1;
|
||||
@@ -593,6 +638,21 @@ readahead_buffer_resize(int newsize, void *extra)
|
||||
}
|
||||
|
||||
|
||||
/*
|
||||
* Callback to be called on backend exit to ensure correct state of compute-PS communication
|
||||
* in case of backend cancel
|
||||
*/
|
||||
static void
|
||||
prefetch_on_exit(int code, Datum arg)
|
||||
{
|
||||
if (code != 0) /* do disconnect only on abnormal backend termination */
|
||||
{
|
||||
shardno_t shard_no = DatumGetInt32(arg);
|
||||
prefetch_on_ps_disconnect();
|
||||
page_server->disconnect(shard_no);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
/*
|
||||
* Make sure that there are no responses still in the buffer.
|
||||
@@ -605,6 +665,11 @@ consume_prefetch_responses(void)
|
||||
{
|
||||
if (MyPState->ring_receive < MyPState->ring_unused)
|
||||
prefetch_wait_for(MyPState->ring_unused - 1);
|
||||
/*
|
||||
* We know for sure we're not working on any prefetch pages after
|
||||
* this.
|
||||
*/
|
||||
END_PREFETCH_RECEIVE_WORK();
|
||||
}
|
||||
|
||||
static void
|
||||
@@ -722,10 +787,12 @@ prefetch_read(PrefetchRequest *slot)
|
||||
if (slot->status != PRFS_REQUESTED ||
|
||||
slot->response != NULL ||
|
||||
slot->my_ring_index != MyPState->ring_receive)
|
||||
neon_shard_log(slot->shard_no, ERROR,
|
||||
{
|
||||
neon_shard_log(slot->shard_no, PANIC,
|
||||
"Incorrect prefetch read: status=%d response=%p my=%lu receive=%lu",
|
||||
slot->status, slot->response,
|
||||
(long)slot->my_ring_index, (long)MyPState->ring_receive);
|
||||
}
|
||||
|
||||
/*
|
||||
* Copy the request info so that if an error happens and the prefetch
|
||||
@@ -741,14 +808,18 @@ prefetch_read(PrefetchRequest *slot)
|
||||
MemoryContextSwitchTo(old);
|
||||
if (response)
|
||||
{
|
||||
check_getpage_response(slot, response);
|
||||
|
||||
/* The slot should still be valid */
|
||||
if (slot->status != PRFS_REQUESTED ||
|
||||
slot->response != NULL ||
|
||||
slot->my_ring_index != MyPState->ring_receive)
|
||||
neon_shard_log(shard_no, ERROR,
|
||||
{
|
||||
neon_shard_log(shard_no, PANIC,
|
||||
"Incorrect prefetch slot state after receive: status=%d response=%p my=%lu receive=%lu",
|
||||
slot->status, slot->response,
|
||||
(long) slot->my_ring_index, (long) MyPState->ring_receive);
|
||||
}
|
||||
|
||||
/* update prefetch state */
|
||||
MyPState->n_responses_buffered += 1;
|
||||
@@ -820,11 +891,10 @@ communicator_prefetch_receive(BufferTag tag)
|
||||
void
|
||||
prefetch_on_ps_disconnect(void)
|
||||
{
|
||||
bool save_readpage_reentrant_guard = readpage_reentrant_guard;
|
||||
MyPState->ring_flush = MyPState->ring_unused;
|
||||
|
||||
	/* Prohibit calling of prefetch_pump_state */
|
||||
START_PREFETCH_RECEIVE_WORK();
|
||||
/* Nothing should cancel disconnect: we should not leave connection in opaque state */
|
||||
HOLD_INTERRUPTS();
|
||||
|
||||
while (MyPState->ring_receive < MyPState->ring_unused)
|
||||
{
|
||||
@@ -854,9 +924,6 @@ prefetch_on_ps_disconnect(void)
|
||||
MyNeonCounters->getpage_prefetch_discards_total += 1;
|
||||
}
|
||||
|
||||
/* Restore guard */
|
||||
readpage_reentrant_guard = save_readpage_reentrant_guard;
|
||||
|
||||
/*
|
||||
* We can have gone into retry due to network error, so update stats with
|
||||
* the latest available
|
||||
@@ -865,6 +932,8 @@ prefetch_on_ps_disconnect(void)
|
||||
MyPState->n_requests_inflight;
|
||||
MyNeonCounters->getpage_prefetches_buffered =
|
||||
MyPState->n_responses_buffered;
|
||||
|
||||
RESUME_INTERRUPTS();
|
||||
}
|
||||
|
||||
/*
|
||||
@@ -1027,16 +1096,11 @@ communicator_prefetch_lookupv(NRelFileInfo rinfo, ForkNumber forknum, BlockNumbe
|
||||
/*
|
||||
* Ignore errors
|
||||
*/
|
||||
if (slot->response->tag != T_NeonGetPageResponse)
|
||||
if (slot->response->tag == T_NeonErrorResponse)
|
||||
{
|
||||
if (slot->response->tag != T_NeonErrorResponse)
|
||||
{
|
||||
NEON_PANIC_CONNECTION_STATE(slot->shard_no, PANIC,
|
||||
"Expected GetPage (0x%02x) or Error (0x%02x) response to GetPageRequest, but got 0x%02x",
|
||||
T_NeonGetPageResponse, T_NeonErrorResponse, slot->response->tag);
|
||||
}
|
||||
continue;
|
||||
}
|
||||
Assert(slot->response->tag == T_NeonGetPageResponse); /* checked by check_getpage_response when response was assigned to the slot */
|
||||
memcpy(buffers[i], ((NeonGetPageResponse*)slot->response)->page, BLCKSZ);
|
||||
|
||||
|
||||
@@ -1351,7 +1415,7 @@ equal_requests(NeonRequest* a, NeonRequest* b)
|
||||
static NeonResponse *
|
||||
page_server_request(void const *req)
|
||||
{
|
||||
NeonResponse *resp;
|
||||
NeonResponse *resp = NULL;
|
||||
BufferTag tag = {0};
|
||||
shardno_t shard_no;
|
||||
|
||||
@@ -1371,7 +1435,7 @@ page_server_request(void const *req)
|
||||
tag.blockNum = ((NeonGetPageRequest *) req)->blkno;
|
||||
break;
|
||||
default:
|
||||
neon_log(ERROR, "Unexpected request tag: %d", messageTag(req));
|
||||
neon_log(PANIC, "Unexpected request tag: %d", messageTag(req));
|
||||
}
|
||||
shard_no = get_shard_number(&tag);
|
||||
|
||||
@@ -1384,9 +1448,12 @@ page_server_request(void const *req)
|
||||
shard_no = 0;
|
||||
}
|
||||
|
||||
do
|
||||
consume_prefetch_responses();
|
||||
|
||||
PG_TRY();
|
||||
{
|
||||
PG_TRY();
|
||||
before_shmem_exit(prefetch_on_exit, Int32GetDatum(shard_no));
|
||||
do
|
||||
{
|
||||
while (!page_server->send(shard_no, (NeonRequest *) req)
|
||||
|| !page_server->flush(shard_no))
|
||||
@@ -1394,30 +1461,24 @@ page_server_request(void const *req)
|
||||
/* do nothing */
|
||||
}
|
||||
MyNeonCounters->pageserver_open_requests++;
|
||||
consume_prefetch_responses();
|
||||
resp = page_server->receive(shard_no);
|
||||
MyNeonCounters->pageserver_open_requests--;
|
||||
}
|
||||
PG_CATCH();
|
||||
{
|
||||
/*
|
||||
* Cancellation in this code needs to be handled better at some
|
||||
* point, but this currently seems fine for now.
|
||||
*/
|
||||
page_server->disconnect(shard_no);
|
||||
MyNeonCounters->pageserver_open_requests = 0;
|
||||
} while (resp == NULL);
|
||||
cancel_before_shmem_exit(prefetch_on_exit, Int32GetDatum(shard_no));
|
||||
}
|
||||
PG_CATCH();
|
||||
{
|
||||
cancel_before_shmem_exit(prefetch_on_exit, Int32GetDatum(shard_no));
|
||||
/* Nothing should cancel disconnect: we should not leave connection in opaque state */
|
||||
HOLD_INTERRUPTS();
|
||||
page_server->disconnect(shard_no);
|
||||
MyNeonCounters->pageserver_open_requests = 0;
|
||||
RESUME_INTERRUPTS();
|
||||
|
||||
/*
|
||||
* We know for sure we're not working on any prefetch pages after
|
||||
* this.
|
||||
*/
|
||||
END_PREFETCH_RECEIVE_WORK();
|
||||
PG_RE_THROW();
|
||||
}
|
||||
PG_END_TRY();
|
||||
|
||||
PG_RE_THROW();
|
||||
}
|
||||
PG_END_TRY();
|
||||
|
||||
} while (resp == NULL);
|
||||
|
||||
return resp;
|
||||
}
|
||||
@@ -1502,7 +1563,7 @@ nm_pack_request(NeonRequest *msg)
|
||||
case T_NeonDbSizeResponse:
|
||||
case T_NeonGetSlruSegmentResponse:
|
||||
default:
|
||||
neon_log(ERROR, "unexpected neon message tag 0x%02x", msg->tag);
|
||||
neon_log(PANIC, "unexpected neon message tag 0x%02x", msg->tag);
|
||||
break;
|
||||
}
|
||||
return s;
|
||||
@@ -1654,7 +1715,7 @@ nm_unpack_response(StringInfo s)
|
||||
case T_NeonDbSizeRequest:
|
||||
case T_NeonGetSlruSegmentRequest:
|
||||
default:
|
||||
neon_log(ERROR, "unexpected neon message tag 0x%02x", tag);
|
||||
neon_log(PANIC, "unexpected neon message tag 0x%02x", tag);
|
||||
break;
|
||||
}
|
||||
|
||||
@@ -1983,7 +2044,7 @@ communicator_exists(NRelFileInfo rinfo, ForkNumber forkNum, neon_request_lsns *r
|
||||
!RelFileInfoEquals(exists_resp->req.rinfo, request.rinfo) ||
|
||||
exists_resp->req.forknum != request.forknum)
|
||||
{
|
||||
NEON_PANIC_CONNECTION_STATE(-1, PANIC,
|
||||
NEON_PANIC_CONNECTION_STATE(0, PANIC,
|
||||
"Unexpect response {reqid=%lx,lsn=%X/%08X, since=%X/%08X, rel=%u/%u/%u.%u} to exits request {reqid=%lx,lsn=%X/%08X, since=%X/%08X, rel=%u/%u/%u.%u}",
|
||||
resp->reqid, LSN_FORMAT_ARGS(resp->lsn), LSN_FORMAT_ARGS(resp->not_modified_since), RelFileInfoFmt(exists_resp->req.rinfo), exists_resp->req.forknum,
|
||||
request.hdr.reqid, LSN_FORMAT_ARGS(request.hdr.lsn), LSN_FORMAT_ARGS(request.hdr.not_modified_since), RelFileInfoFmt(request.rinfo), request.forknum);
|
||||
@@ -2014,7 +2075,7 @@ communicator_exists(NRelFileInfo rinfo, ForkNumber forkNum, neon_request_lsns *r
|
||||
break;
|
||||
|
||||
default:
|
||||
NEON_PANIC_CONNECTION_STATE(-1, PANIC,
|
||||
NEON_PANIC_CONNECTION_STATE(0, PANIC,
|
||||
"Expected Exists (0x%02x) or Error (0x%02x) response to ExistsRequest, but got 0x%02x",
|
||||
T_NeonExistsResponse, T_NeonErrorResponse, resp->tag);
|
||||
}
|
||||
@@ -2158,6 +2219,7 @@ Retry:
|
||||
Assert(memcmp(&hashkey.buftag, &slot->buftag, sizeof(BufferTag)) == 0);
|
||||
Assert(hashkey.buftag.blockNum == base_blockno + i);
|
||||
|
||||
/* We already checked that response match request when storing it in slot */
|
||||
resp = slot->response;
|
||||
|
||||
switch (resp->tag)
|
||||
@@ -2165,21 +2227,6 @@ Retry:
|
||||
case T_NeonGetPageResponse:
|
||||
{
|
||||
NeonGetPageResponse* getpage_resp = (NeonGetPageResponse *) resp;
|
||||
if (neon_protocol_version >= 3)
|
||||
{
|
||||
if (resp->reqid != slot->reqid ||
|
||||
resp->lsn != slot->request_lsns.request_lsn ||
|
||||
resp->not_modified_since != slot->request_lsns.not_modified_since ||
|
||||
!RelFileInfoEquals(getpage_resp->req.rinfo, rinfo) ||
|
||||
getpage_resp->req.forknum != forkNum ||
|
||||
getpage_resp->req.blkno != base_blockno + i)
|
||||
{
|
||||
NEON_PANIC_CONNECTION_STATE(-1, PANIC,
|
||||
"Unexpect response {reqid=%lx,lsn=%X/%08X, since=%X/%08X, rel=%u/%u/%u.%u, block=%u} to get page request {reqid=%lx,lsn=%X/%08X, since=%X/%08X, rel=%u/%u/%u.%u, block=%u}",
|
||||
resp->reqid, LSN_FORMAT_ARGS(resp->lsn), LSN_FORMAT_ARGS(resp->not_modified_since), RelFileInfoFmt(getpage_resp->req.rinfo), getpage_resp->req.forknum, getpage_resp->req.blkno,
|
||||
slot->reqid, LSN_FORMAT_ARGS(slot->request_lsns.request_lsn), LSN_FORMAT_ARGS(slot->request_lsns.not_modified_since), RelFileInfoFmt(rinfo), forkNum, base_blockno + i);
|
||||
}
|
||||
}
|
||||
memcpy(buffer, getpage_resp->page, BLCKSZ);
|
||||
|
||||
/*
|
||||
@@ -2192,17 +2239,6 @@ Retry:
|
||||
break;
|
||||
}
|
||||
case T_NeonErrorResponse:
|
||||
if (neon_protocol_version >= 3)
|
||||
{
|
||||
if (resp->reqid != slot->reqid ||
|
||||
resp->lsn != slot->request_lsns.request_lsn ||
|
||||
resp->not_modified_since != slot->request_lsns.not_modified_since)
|
||||
{
|
||||
elog(WARNING, NEON_TAG "Error message {reqid=%lx,lsn=%X/%08X, since=%X/%08X} doesn't match get relsize request {reqid=%lx,lsn=%X/%08X, since=%X/%08X}",
|
||||
resp->reqid, LSN_FORMAT_ARGS(resp->lsn), LSN_FORMAT_ARGS(resp->not_modified_since),
|
||||
slot->reqid, LSN_FORMAT_ARGS(slot->request_lsns.request_lsn), LSN_FORMAT_ARGS(slot->request_lsns.not_modified_since));
|
||||
}
|
||||
}
|
||||
ereport(ERROR,
|
||||
(errcode(ERRCODE_IO_ERROR),
|
||||
errmsg(NEON_TAG "[shard %d, reqid %lx] could not read block %u in rel %u/%u/%u.%u from page server at lsn %X/%08X",
|
||||
@@ -2257,7 +2293,7 @@ communicator_nblocks(NRelFileInfo rinfo, ForkNumber forknum, neon_request_lsns *
|
||||
!RelFileInfoEquals(relsize_resp->req.rinfo, request.rinfo) ||
|
||||
relsize_resp->req.forknum != forknum)
|
||||
{
|
||||
NEON_PANIC_CONNECTION_STATE(-1, PANIC,
|
||||
NEON_PANIC_CONNECTION_STATE(0, PANIC,
|
||||
"Unexpect response {reqid=%lx,lsn=%X/%08X, since=%X/%08X, rel=%u/%u/%u.%u} to get relsize request {reqid=%lx,lsn=%X/%08X, since=%X/%08X, rel=%u/%u/%u.%u}",
|
||||
resp->reqid, LSN_FORMAT_ARGS(resp->lsn), LSN_FORMAT_ARGS(resp->not_modified_since), RelFileInfoFmt(relsize_resp->req.rinfo), relsize_resp->req.forknum,
|
||||
request.hdr.reqid, LSN_FORMAT_ARGS(request.hdr.lsn), LSN_FORMAT_ARGS(request.hdr.not_modified_since), RelFileInfoFmt(request.rinfo), forknum);
|
||||
@@ -2288,7 +2324,7 @@ communicator_nblocks(NRelFileInfo rinfo, ForkNumber forknum, neon_request_lsns *
|
||||
break;
|
||||
|
||||
default:
|
||||
NEON_PANIC_CONNECTION_STATE(-1, PANIC,
|
||||
NEON_PANIC_CONNECTION_STATE(0, PANIC,
|
||||
"Expected Nblocks (0x%02x) or Error (0x%02x) response to NblocksRequest, but got 0x%02x",
|
||||
T_NeonNblocksResponse, T_NeonErrorResponse, resp->tag);
|
||||
}
|
||||
@@ -2327,7 +2363,7 @@ communicator_dbsize(Oid dbNode, neon_request_lsns *request_lsns)
|
||||
if (!equal_requests(resp, &request.hdr) ||
|
||||
dbsize_resp->req.dbNode != dbNode)
|
||||
{
|
||||
NEON_PANIC_CONNECTION_STATE(-1, PANIC,
|
||||
NEON_PANIC_CONNECTION_STATE(0, PANIC,
|
||||
"Unexpect response {reqid=%lx,lsn=%X/%08X, since=%X/%08X, dbNode=%u} to get DB size request {reqid=%lx,lsn=%X/%08X, since=%X/%08X, dbNode=%u}",
|
||||
resp->reqid, LSN_FORMAT_ARGS(resp->lsn), LSN_FORMAT_ARGS(resp->not_modified_since), dbsize_resp->req.dbNode,
|
||||
request.hdr.reqid, LSN_FORMAT_ARGS(request.hdr.lsn), LSN_FORMAT_ARGS(request.hdr.not_modified_since), dbNode);
|
||||
@@ -2356,7 +2392,7 @@ communicator_dbsize(Oid dbNode, neon_request_lsns *request_lsns)
|
||||
break;
|
||||
|
||||
default:
|
||||
NEON_PANIC_CONNECTION_STATE(-1, PANIC,
|
||||
NEON_PANIC_CONNECTION_STATE(0, PANIC,
|
||||
"Expected DbSize (0x%02x) or Error (0x%02x) response to DbSizeRequest, but got 0x%02x",
|
||||
T_NeonDbSizeResponse, T_NeonErrorResponse, resp->tag);
|
||||
}
|
||||
@@ -2372,7 +2408,7 @@ communicator_read_slru_segment(SlruKind kind, int64 segno, neon_request_lsns *re
|
||||
{
|
||||
int n_blocks;
|
||||
shardno_t shard_no = 0; /* All SLRUs are at shard 0 */
|
||||
NeonResponse *resp;
|
||||
NeonResponse *resp = NULL;
|
||||
NeonGetSlruSegmentRequest request;
|
||||
|
||||
request = (NeonGetSlruSegmentRequest) {
|
||||
@@ -2383,14 +2419,29 @@ communicator_read_slru_segment(SlruKind kind, int64 segno, neon_request_lsns *re
|
||||
.segno = segno
|
||||
};
|
||||
|
||||
do
|
||||
consume_prefetch_responses();
|
||||
|
||||
PG_TRY();
|
||||
{
|
||||
while (!page_server->send(shard_no, &request.hdr) || !page_server->flush(shard_no));
|
||||
before_shmem_exit(prefetch_on_exit, Int32GetDatum(shard_no));
|
||||
do
|
||||
{
|
||||
while (!page_server->send(shard_no, &request.hdr) || !page_server->flush(shard_no));
|
||||
resp = page_server->receive(shard_no);
|
||||
} while (resp == NULL);
|
||||
cancel_before_shmem_exit(prefetch_on_exit, Int32GetDatum(shard_no));
|
||||
}
|
||||
PG_CATCH();
|
||||
{
|
||||
cancel_before_shmem_exit(prefetch_on_exit, Int32GetDatum(shard_no));
|
||||
/* Nothing should cancel disconnect: we should not leave connection in opaque state */
|
||||
HOLD_INTERRUPTS();
|
||||
page_server->disconnect(shard_no);
|
||||
RESUME_INTERRUPTS();
|
||||
|
||||
consume_prefetch_responses();
|
||||
|
||||
resp = page_server->receive(shard_no);
|
||||
} while (resp == NULL);
|
||||
PG_RE_THROW();
|
||||
}
|
||||
PG_END_TRY();
|
||||
|
||||
switch (resp->tag)
|
||||
{
|
||||
@@ -2403,7 +2454,7 @@ communicator_read_slru_segment(SlruKind kind, int64 segno, neon_request_lsns *re
|
||||
slru_resp->req.kind != kind ||
|
||||
slru_resp->req.segno != segno)
|
||||
{
|
||||
NEON_PANIC_CONNECTION_STATE(-1, PANIC,
|
||||
NEON_PANIC_CONNECTION_STATE(0, PANIC,
|
||||
"Unexpect response {reqid=%lx,lsn=%X/%08X, since=%X/%08X, kind=%u, segno=%u} to get SLRU segment request {reqid=%lx,lsn=%X/%08X, since=%X/%08X, kind=%u, segno=%lluu}",
|
||||
resp->reqid, LSN_FORMAT_ARGS(resp->lsn), LSN_FORMAT_ARGS(resp->not_modified_since), slru_resp->req.kind, slru_resp->req.segno,
|
||||
request.hdr.reqid, LSN_FORMAT_ARGS(request.hdr.lsn), LSN_FORMAT_ARGS(request.hdr.not_modified_since), kind, (unsigned long long) segno);
|
||||
@@ -2435,7 +2486,7 @@ communicator_read_slru_segment(SlruKind kind, int64 segno, neon_request_lsns *re
|
||||
break;
|
||||
|
||||
default:
|
||||
NEON_PANIC_CONNECTION_STATE(-1, PANIC,
|
||||
NEON_PANIC_CONNECTION_STATE(0, PANIC,
|
||||
"Expected GetSlruSegment (0x%02x) or Error (0x%02x) response to GetSlruSegmentRequest, but got 0x%02x",
|
||||
T_NeonGetSlruSegmentResponse, T_NeonErrorResponse, resp->tag);
|
||||
}
|
||||
|
||||
@@ -10,7 +10,7 @@ use measured::{
|
||||
Counter, CounterVec, FixedCardinalityLabel, Gauge, Histogram, HistogramVec, LabelGroup,
|
||||
MetricGroup,
|
||||
};
|
||||
use metrics::{CounterPairAssoc, CounterPairVec, HyperLogLog, HyperLogLogVec};
|
||||
use metrics::{CounterPairAssoc, CounterPairVec, HyperLogLogVec};
|
||||
use tokio::time::{self, Instant};
|
||||
|
||||
use crate::control_plane::messages::ColdStartInfo;
|
||||
@@ -36,7 +36,6 @@ impl Metrics {
|
||||
metrics.proxy.redis_errors_total.init_all_dense();
|
||||
metrics.proxy.redis_events_count.init_all_dense();
|
||||
metrics.proxy.retries_metric.init_all_dense();
|
||||
metrics.proxy.invalid_endpoints_total.init_all_dense();
|
||||
metrics.proxy.connection_failures_total.init_all_dense();
|
||||
|
||||
SELF.set(metrics)
|
||||
@@ -80,11 +79,6 @@ pub struct ProxyMetrics {
|
||||
)]
|
||||
pub console_request_latency: HistogramVec<ConsoleRequestSet, 16>,
|
||||
|
||||
/// Time it takes to acquire a token to call console plane.
|
||||
// largest bucket = 3^16 * 0.05ms = 2.15s
|
||||
#[metric(metadata = Thresholds::exponential_buckets(0.00005, 3.0))]
|
||||
pub control_plane_token_acquire_seconds: Histogram<16>,
|
||||
|
||||
/// Size of the HTTP request body lengths.
|
||||
// smallest bucket = 16 bytes
|
||||
// largest bucket = 4^12 * 16 bytes = 256MB
|
||||
@@ -98,19 +92,10 @@ pub struct ProxyMetrics {
|
||||
/// Number of opened connections to a database.
|
||||
pub http_pool_opened_connections: Gauge,
|
||||
|
||||
/// Number of cache hits/misses for allowed ips.
|
||||
pub allowed_ips_cache_misses: CounterVec<StaticLabelSet<CacheOutcome>>,
|
||||
|
||||
/// Number of allowed ips
|
||||
#[metric(metadata = Thresholds::with_buckets([0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 10.0, 20.0, 50.0, 100.0]))]
|
||||
pub allowed_ips_number: Histogram<10>,
|
||||
|
||||
/// Number of cache hits/misses for VPC endpoint IDs.
|
||||
pub vpc_endpoint_id_cache_stats: CounterVec<StaticLabelSet<CacheOutcome>>,
|
||||
|
||||
/// Number of cache hits/misses for access blocker flags.
|
||||
pub access_blocker_flags_cache_stats: CounterVec<StaticLabelSet<CacheOutcome>>,
|
||||
|
||||
/// Number of allowed VPC endpoints IDs
|
||||
#[metric(metadata = Thresholds::with_buckets([0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 10.0, 20.0, 50.0, 100.0]))]
|
||||
pub allowed_vpc_endpoint_ids: Histogram<10>,
|
||||
@@ -139,21 +124,12 @@ pub struct ProxyMetrics {
|
||||
/// Number of TLS handshake failures
|
||||
pub tls_handshake_failures: Counter,
|
||||
|
||||
/// Number of connection requests affected by authentication rate limits
|
||||
pub requests_auth_rate_limits_total: Counter,
|
||||
|
||||
/// HLL approximate cardinality of endpoints that are connecting
|
||||
pub connecting_endpoints: HyperLogLogVec<StaticLabelSet<Protocol>, 32>,
|
||||
|
||||
/// Number of endpoints affected by errors of a given classification
|
||||
pub endpoints_affected_by_errors: HyperLogLogVec<StaticLabelSet<crate::error::ErrorKind>, 32>,
|
||||
|
||||
/// Number of endpoints affected by authentication rate limits
|
||||
pub endpoints_auth_rate_limits: HyperLogLog<32>,
|
||||
|
||||
/// Number of invalid endpoints (per protocol, per rejected).
|
||||
pub invalid_endpoints_total: CounterVec<InvalidEndpointsSet>,
|
||||
|
||||
/// Number of retries (per outcome, per retry_type).
|
||||
#[metric(metadata = Thresholds::with_buckets([0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0]))]
|
||||
pub retries_metric: HistogramVec<RetriesMetricSet, 9>,
|
||||
|
||||
@@ -561,6 +561,20 @@ impl InterpretedWalReader {
|
||||
// Update internal and external state, then reset the WAL stream
|
||||
// if required.
|
||||
let senders = self.shard_senders.entry(shard_id).or_default();
|
||||
|
||||
// Clean up any shard senders that have dropped out before adding the new
|
||||
// one. This avoids a build up of dead senders.
|
||||
senders.retain(|sender| {
|
||||
let closed = sender.tx.is_closed();
|
||||
|
||||
if closed {
|
||||
let sender_id = ShardSenderId::new(shard_id, sender.sender_id);
|
||||
tracing::info!("Removed shard sender {}", sender_id);
|
||||
}
|
||||
|
||||
!closed
|
||||
});
|
||||
|
||||
let new_sender_id = match senders.last() {
|
||||
Some(sender) => sender.sender_id.next(),
|
||||
None => SenderId::first()
|
||||
|
||||
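The retain pass in the hunk above exists purely to drop subscribers whose receiving half is gone. The same idea in isolation, using tokio mpsc senders; the ShardSender struct here is a stand-in, not the type from this change:

    // Standalone sketch: prune senders whose receivers have been dropped.
    use tokio::sync::mpsc;

    struct ShardSender {
        tx: mpsc::Sender<u64>, // payload type is arbitrary for the sketch
    }

    fn prune_closed(senders: &mut Vec<ShardSender>) {
        // Sender::is_closed() turns true once every receiver has been dropped,
        // so this removes exactly the subscribers that went away.
        senders.retain(|sender| !sender.tx.is_closed());
    }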
@@ -1677,7 +1677,21 @@ impl Service {
|
||||
.collect::<anyhow::Result<Vec<_>>>()?;
|
||||
let safekeepers: HashMap<NodeId, Safekeeper> =
|
||||
safekeepers.into_iter().map(|n| (n.get_id(), n)).collect();
|
||||
tracing::info!("Loaded {} safekeepers from database.", safekeepers.len());
|
||||
let count_policy = |policy| {
|
||||
safekeepers
|
||||
.iter()
|
||||
.filter(|sk| sk.1.scheduling_policy() == policy)
|
||||
.count()
|
||||
};
|
||||
let active_sk_count = count_policy(SkSchedulingPolicy::Active);
|
||||
let activating_sk_count = count_policy(SkSchedulingPolicy::Activating);
|
||||
let pause_sk_count = count_policy(SkSchedulingPolicy::Pause);
|
||||
let decom_sk_count = count_policy(SkSchedulingPolicy::Decomissioned);
|
||||
tracing::info!(
|
||||
"Loaded {} safekeepers from database. Active {active_sk_count}, activating {activating_sk_count}, \
|
||||
paused {pause_sk_count}, decomissioned {decom_sk_count}.",
|
||||
safekeepers.len()
|
||||
);
|
||||
metrics::METRICS_REGISTRY
|
||||
.metrics_group
|
||||
.storage_controller_safekeeper_nodes
|
||||
@@ -1969,6 +1983,14 @@ impl Service {
|
||||
}
|
||||
});
|
||||
|
||||
// Check that there is enough safekeepers configured that we can create new timelines
|
||||
let test_sk_res = this.safekeepers_for_new_timeline().await;
|
||||
tracing::info!(
|
||||
timeline_safekeeper_count = config.timeline_safekeeper_count,
|
||||
timelines_onto_safekeepers = config.timelines_onto_safekeepers,
|
||||
"viability test result (test timeline creation on safekeepers): {test_sk_res:?}",
|
||||
);
|
||||
|
||||
Ok(this)
|
||||
}
|
||||
|
||||
@@ -7208,6 +7230,12 @@ impl Service {
|
||||
let mut locked = self.inner.write().unwrap();
|
||||
let (nodes, tenants, scheduler) = locked.parts_mut();
|
||||
|
||||
// Calculate a schedule context here to avoid borrow checker issues.
|
||||
let mut schedule_context = ScheduleContext::default();
|
||||
for (_, shard) in tenants.range(TenantShardId::tenant_range(tid.tenant_id)) {
|
||||
schedule_context.avoid(&shard.intent.all_pageservers());
|
||||
}
|
||||
|
||||
let tenant_shard = match tenants.get_mut(&tid) {
|
||||
Some(tenant_shard) => tenant_shard,
|
||||
None => {
|
||||
@@ -7233,9 +7261,6 @@ impl Service {
|
||||
}
|
||||
|
||||
if tenant_shard.deref_node(node_id) {
|
||||
// TODO(ephemeralsad): we should process all shards in a tenant at once, so
|
||||
// we can avoid settling the tenant unevenly.
|
||||
let mut schedule_context = ScheduleContext::new(ScheduleMode::Normal);
|
||||
if let Err(e) = tenant_shard.schedule(scheduler, &mut schedule_context) {
|
||||
tracing::error!(
|
||||
"Refusing to delete node, shard {} can't be rescheduled: {e}",
|
||||
|
||||
@@ -2,11 +2,12 @@ from __future__ import annotations
|
||||
|
||||
import urllib.parse
|
||||
from enum import StrEnum
|
||||
from typing import TYPE_CHECKING, final
|
||||
from typing import TYPE_CHECKING, Any, final
|
||||
|
||||
import requests
|
||||
from requests.adapters import HTTPAdapter
|
||||
from requests.auth import AuthBase
|
||||
from requests.exceptions import ReadTimeout
|
||||
from typing_extensions import override
|
||||
|
||||
from fixtures.log_helper import log
|
||||
@@ -102,6 +103,18 @@ class EndpointHttpClient(requests.Session):
|
||||
|
||||
wait_until(offloaded)
|
||||
|
||||
def promote(self, safekeepers_lsn: dict[str, Any], disconnect: bool = False):
|
||||
url = f"http://localhost:{self.external_port}/promote"
|
||||
if disconnect:
|
||||
try: # send first request to start promote and disconnect
|
||||
self.post(url, data=safekeepers_lsn, timeout=0.001)
|
||||
except ReadTimeout:
|
||||
pass # wait on second request which returns on promotion finish
|
||||
res = self.post(url, data=safekeepers_lsn)
|
||||
res.raise_for_status()
|
||||
json: dict[str, str] = res.json()
|
||||
return json
|
||||
|
||||
def database_schema(self, database: str):
|
||||
res = self.get(
|
||||
f"http://localhost:{self.external_port}/database_schema?database={urllib.parse.quote(database, safe='')}",
|
||||
|
||||
@@ -159,6 +159,9 @@ PAGESERVER_GLOBAL_METRICS: tuple[str, ...] = (
|
||||
)
|
||||
|
||||
PAGESERVER_PER_TENANT_METRICS: tuple[str, ...] = (
|
||||
# BEGIN_HADRON
|
||||
"pageserver_active_storage_operations_count",
|
||||
# END_HADRON
|
||||
"pageserver_current_logical_size",
|
||||
"pageserver_resident_physical_size",
|
||||
"pageserver_io_operations_bytes_total",
|
||||
|
||||
@@ -111,6 +111,14 @@ DEFAULT_PAGESERVER_ALLOWED_ERRORS = (
|
||||
".*stalling layer flushes for compaction backpressure.*",
|
||||
".*layer roll waiting for flush due to compaction backpressure.*",
|
||||
".*BatchSpanProcessor.*",
|
||||
# Can happen in tests that purposely wipe pageserver "local disk" data.
|
||||
".*Local data loss suspected.*",
|
||||
# Too many frozen layers error is normal during intensive benchmarks
|
||||
".*too many frozen layers.*",
|
||||
# Transient errors when resolving tenant shards by page service
|
||||
".*Fail to resolve tenant shard in attempt.*",
|
||||
# Expected warnings when pageserver has not refreshed GC info yet
|
||||
".*pitr LSN/interval not found, skipping force image creation LSN calculation.*",
|
||||
".*No broker updates received for a while.*",
|
||||
*(
|
||||
[
|
||||
|
||||
@@ -1247,3 +1247,10 @@ class PageserverHttpClient(requests.Session, MetricsGetter):
|
||||
)
|
||||
self.verbose_error(res)
|
||||
return res.json()
|
||||
|
||||
def force_refresh_feature_flag(self, tenant_id: TenantId | TenantShardId):
|
||||
res = self.post(
|
||||
f"http://localhost:{self.port}/v1/tenant/{tenant_id}/force_refresh_feature_flag",
|
||||
)
|
||||
self.verbose_error(res)
|
||||
return res.json()
|
||||
|
||||
@@ -71,7 +71,13 @@ def test_pageserver_characterize_latencies_with_1_client_and_throughput_with_man
|
||||
n_clients: int,
|
||||
):
|
||||
setup_and_run_pagebench_benchmark(
|
||||
neon_env_builder, zenbenchmark, pg_bin, n_tenants, pgbench_scale, duration, n_clients
|
||||
neon_env_builder,
|
||||
zenbenchmark,
|
||||
pg_bin,
|
||||
n_tenants,
|
||||
pgbench_scale,
|
||||
duration,
|
||||
n_clients,
|
||||
)
|
||||
|
||||
|
||||
@@ -86,7 +92,8 @@ def setup_and_run_pagebench_benchmark(
|
||||
):
|
||||
def record(metric, **kwargs):
|
||||
zenbenchmark.record(
|
||||
metric_name=f"pageserver_max_throughput_getpage_at_latest_lsn.{metric}", **kwargs
|
||||
metric_name=f"pageserver_max_throughput_getpage_at_latest_lsn.{metric}",
|
||||
**kwargs,
|
||||
)
|
||||
|
||||
params: dict[str, tuple[Any, dict[str, Any]]] = {}
|
||||
@@ -104,7 +111,7 @@ def setup_and_run_pagebench_benchmark(
|
||||
# configure cache sizes like in prod
|
||||
page_cache_size = 16384
|
||||
max_file_descriptors = 500000
|
||||
neon_env_builder.pageserver_config_override = f"page_cache_size={page_cache_size}; max_file_descriptors={max_file_descriptors}; disk_usage_based_eviction={{max_usage_pct=99, min_avail_bytes=0, period = '999y'}}"
|
||||
neon_env_builder.pageserver_config_override = f"page_cache_size={page_cache_size}; max_file_descriptors={max_file_descriptors}; disk_usage_based_eviction={{enabled = false}}"
|
||||
|
||||
tracing_config = PageserverTracingConfig(
|
||||
sampling_ratio=(0, 1000),
|
||||
@@ -120,7 +127,10 @@ def setup_and_run_pagebench_benchmark(
|
||||
page_cache_size * 8192,
|
||||
{"unit": "byte"},
|
||||
),
|
||||
"pageserver_config_override.max_file_descriptors": (max_file_descriptors, {"unit": ""}),
|
||||
"pageserver_config_override.max_file_descriptors": (
|
||||
max_file_descriptors,
|
||||
{"unit": ""},
|
||||
),
|
||||
"pageserver_config_override.sampling_ratio": (ratio, {"unit": ""}),
|
||||
}
|
||||
)
|
||||
|
||||
@@ -49,3 +49,12 @@ def test_feature_flag(neon_env_builder: NeonEnvBuilder):
|
||||
env.initial_tenant, "test-feature-flag"
|
||||
)["result"]
|
||||
)
|
||||
|
||||
env.pageserver.http_client().force_refresh_feature_flag(env.initial_tenant)
|
||||
|
||||
# Check if the properties exist
|
||||
result = env.pageserver.http_client().evaluate_feature_flag_multivariate(
|
||||
env.initial_tenant, "test-feature-flag"
|
||||
)
|
||||
assert "tenant_remote_size_mb" in result["properties"]
|
||||
assert "tenant_id" in result["properties"]
|
||||
|
||||
@@ -1,29 +1,51 @@
|
||||
"""
|
||||
File with secondary->primary promotion testing.
|
||||
|
||||
This far, only contains a test that we don't break and that the data is persisted.
|
||||
Secondary -> primary promotion testing
|
||||
"""
|
||||
|
||||
from enum import StrEnum
|
||||
from typing import cast
|
||||
|
||||
import psycopg2
|
||||
import pytest
|
||||
from fixtures.common_types import Lsn
|
||||
from fixtures.log_helper import log
|
||||
from fixtures.neon_fixtures import Endpoint, NeonEnv, wait_replica_caughtup
|
||||
from fixtures.pg_version import PgVersion
|
||||
from fixtures.utils import USE_LFC
|
||||
from psycopg2.extensions import cursor as Cursor
|
||||
from pytest import raises
|
||||
|
||||
|
||||
def stop_and_check_lsn(ep: Endpoint, expected_lsn: Lsn | None):
|
||||
ep.stop(mode="immediate-terminate")
|
||||
lsn = ep.terminate_flush_lsn
|
||||
if expected_lsn is not None:
|
||||
assert (lsn is not None) == (expected_lsn is not None), f"{lsn=}, {expected_lsn=}"
|
||||
if lsn is not None:
|
||||
assert lsn >= expected_lsn, f"{expected_lsn=} < {lsn=}"
|
||||
else:
|
||||
assert lsn == expected_lsn, f"{expected_lsn=} != {lsn=}"
|
||||
|
||||
|
||||
def test_replica_promotes(neon_simple_env: NeonEnv, pg_version: PgVersion):
|
||||
def get_lsn_triple(cur: Cursor) -> tuple[str, str, str]:
|
||||
cur.execute(
|
||||
"""
|
||||
SELECT pg_current_wal_insert_lsn(),
|
||||
pg_current_wal_lsn(),
|
||||
pg_current_wal_flush_lsn()
|
||||
"""
|
||||
)
|
||||
return cast("tuple[str, str, str]", cur.fetchone())
|
||||
|
||||
|
||||
class PromoteMethod(StrEnum):
|
||||
COMPUTE_CTL = "compute-ctl"
|
||||
POSTGRES = "postgres"
|
||||
|
||||
|
||||
METHOD_OPTIONS = [e for e in PromoteMethod]
|
||||
METHOD_IDS = [e.value for e in PromoteMethod]


@pytest.mark.skipif(not USE_LFC, reason="LFC is disabled, skipping")
@pytest.mark.parametrize("method", METHOD_OPTIONS, ids=METHOD_IDS)
def test_replica_promote(neon_simple_env: NeonEnv, method: PromoteMethod):
    """
    Test that a replica safely promotes, and can commit data updates which
    show up when the primary boots up after the promoted secondary endpoint
@@ -38,29 +60,26 @@ def test_replica_promotes(neon_simple_env: NeonEnv, pg_version: PgVersion):

    with primary.connect() as primary_conn:
        primary_cur = primary_conn.cursor()
        primary_cur.execute("create extension neon")
        primary_cur.execute(
            "create table t(pk bigint GENERATED ALWAYS AS IDENTITY, payload integer)"
        )
        primary_cur.execute("INSERT INTO t(payload) SELECT generate_series(1, 100)")
        primary_cur.execute(
            """
            SELECT pg_current_wal_insert_lsn(),
                   pg_current_wal_lsn(),
                   pg_current_wal_flush_lsn()
            """
        )
        lsn_triple = cast("tuple[str, str, str]", primary_cur.fetchone())

        lsn_triple = get_lsn_triple(primary_cur)
        log.info(f"Primary: Current LSN after workload is {lsn_triple}")
        expected_primary_lsn: Lsn = Lsn(lsn_triple[2])
        primary_cur.execute("show neon.safekeepers")
        safekeepers = primary_cur.fetchall()[0][0]

    wait_replica_caughtup(primary, secondary)
    if method == PromoteMethod.COMPUTE_CTL:
        primary.http_client().offload_lfc()
    else:
        wait_replica_caughtup(primary, secondary)

    with secondary.connect() as secondary_conn:
        secondary_cur = secondary_conn.cursor()
        secondary_cur.execute("select count(*) from t")

        assert secondary_cur.fetchone() == (100,)

        with raises(psycopg2.Error):
@@ -71,28 +90,30 @@ def test_replica_promotes(neon_simple_env: NeonEnv, pg_version: PgVersion):
        secondary_cur.execute("select count(*) from t")
        assert secondary_cur.fetchone() == (100,)

    primary_endpoint_id = primary.endpoint_id
    stop_and_check_lsn(primary, expected_primary_lsn)

    # Reconnect to the secondary to make sure we get a read-write connection
    promo_conn = secondary.connect()
    promo_cur = promo_conn.cursor()
    promo_cur.execute(f"alter system set neon.safekeepers='{safekeepers}'")
    promo_cur.execute("select pg_reload_conf()")
    if method == PromoteMethod.COMPUTE_CTL:
        client = secondary.http_client()
        client.prewarm_lfc(primary_endpoint_id)
        # control plane knows safekeepers, simulate it by querying primary
        assert (lsn := primary.terminate_flush_lsn)
        safekeepers_lsn = {"safekeepers": safekeepers, "wal_flush_lsn": lsn}
        assert client.promote(safekeepers_lsn)["status"] == "completed"
    else:
        promo_cur.execute(f"alter system set neon.safekeepers='{safekeepers}'")
        promo_cur.execute("select pg_reload_conf()")
        promo_cur.execute("SELECT * FROM pg_promote()")
        assert promo_cur.fetchone() == (True,)

    promo_cur.execute("SELECT * FROM pg_promote()")
    assert promo_cur.fetchone() == (True,)
    promo_cur.execute(
        """
        SELECT pg_current_wal_insert_lsn(),
               pg_current_wal_lsn(),
               pg_current_wal_flush_lsn()
        """
    )
    log.info(f"Secondary: LSN after promotion is {promo_cur.fetchone()}")
    lsn_triple = get_lsn_triple(promo_cur)
    log.info(f"Secondary: LSN after promotion is {lsn_triple}")

    # Reconnect to the secondary to make sure we get a read-write connection
    with secondary.connect() as new_primary_conn:
        new_primary_cur = new_primary_conn.cursor()
    with secondary.connect() as conn, conn.cursor() as new_primary_cur:
        new_primary_cur.execute("select count(*) from t")
        assert new_primary_cur.fetchone() == (100,)

@@ -101,43 +122,34 @@ def test_replica_promotes(neon_simple_env: NeonEnv, pg_version: PgVersion):
        )
        assert new_primary_cur.fetchall() == [(it,) for it in range(101, 201)]

        new_primary_cur = new_primary_conn.cursor()
        new_primary_cur = conn.cursor()
        new_primary_cur.execute("select payload from t")
        assert new_primary_cur.fetchall() == [(it,) for it in range(1, 201)]

        new_primary_cur.execute("select count(*) from t")
        assert new_primary_cur.fetchone() == (200,)
        new_primary_cur.execute(
            """
            SELECT pg_current_wal_insert_lsn(),
                   pg_current_wal_lsn(),
                   pg_current_wal_flush_lsn()
            """
        )
        log.info(f"Secondary: LSN after workload is {new_primary_cur.fetchone()}")

    with secondary.connect() as second_viewpoint_conn:
        new_primary_cur = second_viewpoint_conn.cursor()
        lsn_triple = get_lsn_triple(new_primary_cur)
        log.info(f"Secondary: LSN after workload is {lsn_triple}")
        expected_promoted_lsn = Lsn(lsn_triple[2])

    with secondary.connect() as conn, conn.cursor() as new_primary_cur:
        new_primary_cur.execute("select payload from t")
        assert new_primary_cur.fetchall() == [(it,) for it in range(1, 201)]

    # wait_for_last_flush_lsn(env, secondary, env.initial_tenant, env.initial_timeline)

    # secondaries don't sync safekeepers on finish so LSN will be None
    stop_and_check_lsn(secondary, None)
    if method == PromoteMethod.COMPUTE_CTL:
        # compute_ctl's /promote switches replica type to Primary so it syncs
        # safekeepers on finish
        stop_and_check_lsn(secondary, expected_promoted_lsn)
    else:
        # on testing postgres, we don't update replica type, secondaries don't
        # sync so lsn should be None
        stop_and_check_lsn(secondary, None)

    primary = env.endpoints.create_start(branch_name="main", endpoint_id="primary2")

    with primary.connect() as new_primary:
        new_primary_cur = new_primary.cursor()
        new_primary_cur.execute(
            """
            SELECT pg_current_wal_insert_lsn(),
                   pg_current_wal_lsn(),
                   pg_current_wal_flush_lsn()
            """
        )
        lsn_triple = cast("tuple[str, str, str]", new_primary_cur.fetchone())
    with primary.connect() as new_primary, new_primary.cursor() as new_primary_cur:
        lsn_triple = get_lsn_triple(new_primary_cur)
        expected_primary_lsn = Lsn(lsn_triple[2])
        log.info(f"New primary: Boot LSN is {lsn_triple}")

@@ -146,5 +158,39 @@ def test_replica_promotes(neon_simple_env: NeonEnv, pg_version: PgVersion):
        new_primary_cur.execute("INSERT INTO t (payload) SELECT generate_series(201, 300)")
        new_primary_cur.execute("select count(*) from t")
        assert new_primary_cur.fetchone() == (300,)

    stop_and_check_lsn(primary, expected_primary_lsn)
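The stop_and_check_lsn calls above compare Lsn values directly; the wrapper orders like the underlying 64-bit WAL position, which is what makes the >= / == asserts at the top of this file meaningful. A tiny illustration with made-up positions (this assumes the same Lsn type the module already imports; the exact parsing rules live in the fixture, not here):

# Arbitrary example values; only the ordering behaviour is being illustrated.
assert Lsn("0/2000000") > Lsn("0/1FFFFFF")
assert Lsn("1/0") > Lsn("0/FFFFFFFF")  # the high word dominates the comparison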


@pytest.mark.skipif(not USE_LFC, reason="LFC is disabled, skipping")
def test_replica_promote_handler_disconnects(neon_simple_env: NeonEnv):
    """
    Test that if a handler disconnects from the /promote route of compute_ctl, promotion still
    happens once, and no error is thrown
    """
    env: NeonEnv = neon_simple_env
    primary: Endpoint = env.endpoints.create_start(branch_name="main", endpoint_id="primary")
    secondary: Endpoint = env.endpoints.new_replica_start(origin=primary, endpoint_id="secondary")

    with primary.connect() as conn, conn.cursor() as cur:
        cur.execute("create extension neon")
        cur.execute("create table t(pk bigint GENERATED ALWAYS AS IDENTITY, payload integer)")
        cur.execute("INSERT INTO t(payload) SELECT generate_series(1, 100)")
        cur.execute("show neon.safekeepers")
        safekeepers = cur.fetchall()[0][0]

    primary.http_client().offload_lfc()
    primary_endpoint_id = primary.endpoint_id
    primary.stop(mode="immediate-terminate")
    assert (lsn := primary.terminate_flush_lsn)

    client = secondary.http_client()
    client.prewarm_lfc(primary_endpoint_id)
    safekeepers_lsn = {"safekeepers": safekeepers, "wal_flush_lsn": lsn}
    assert client.promote(safekeepers_lsn, disconnect=True)["status"] == "completed"

    with secondary.connect() as conn, conn.cursor() as cur:
        cur.execute("select count(*) from t")
        assert cur.fetchone() == (100,)
        cur.execute("INSERT INTO t (payload) SELECT generate_series(101, 200) RETURNING payload")
        cur.execute("select count(*) from t")
        assert cur.fetchone() == (200,)
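The disconnect=True call above models a caller that drops the HTTP connection mid-request while compute_ctl carries the promotion through. A rough standalone sketch of the same fire-and-forget idea, using plain requests and SQL instead of the test fixtures (the base URL, the use of POST, and the timeout value are assumptions for illustration, not compute_ctl's documented interface):

# Hypothetical sketch; endpoint URL, method, and timings are illustrative only.
import psycopg2
import requests

def fire_promote_and_verify(compute_ctl_url: str, payload: dict, dsn: str) -> bool:
    try:
        # Tiny timeout on purpose: the client disconnects early, while the
        # promotion is expected to keep running server-side.
        requests.post(f"{compute_ctl_url}/promote", json=payload, timeout=0.1)
    except requests.exceptions.RequestException:
        pass  # a dropped connection is exactly the scenario under test
    # Verify out-of-band: a promoted node reports pg_is_in_recovery() = false.
    # (A real check would poll for a while rather than look once.)
    with psycopg2.connect(dsn) as conn, conn.cursor() as cur:
        cur.execute("SELECT pg_is_in_recovery()")
        return cur.fetchone()[0] is False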

@@ -1,8 +1,11 @@
from __future__ import annotations

import os
import random
import threading
import time
from collections import defaultdict
from threading import Event
from typing import TYPE_CHECKING, Any

import pytest
@@ -1505,6 +1508,171 @@ def test_sharding_split_failures(
    env.storage_controller.consistency_check()


@pytest.mark.skip(reason="The backpressure change has not been merged yet.")
def test_back_pressure_during_split(neon_env_builder: NeonEnvBuilder):
    """
    Test that backpressure can ignore new shards during a tenant split so that, if we abort the split,
    PG can continue without being blocked.
    """
    DBNAME = "regression"

    init_shard_count = 4
    neon_env_builder.num_pageservers = init_shard_count
    stripe_size = 32

    env = neon_env_builder.init_start(
        initial_tenant_shard_count=init_shard_count, initial_tenant_shard_stripe_size=stripe_size
    )

    env.storage_controller.allowed_errors.extend(
        [
            # All split failures log a warning when they enqueue the abort operation
            ".*Enqueuing background abort.*",
            # Tolerate any error logs that mention a failpoint
            ".*failpoint.*",
        ]
    )

    endpoint = env.endpoints.create(
        "main",
        config_lines=[
            "max_replication_write_lag = 1MB",
            "databricks.max_wal_mb_per_second = 1",
            "neon.max_cluster_size = 10GB",
        ],
    )
    endpoint.respec(skip_pg_catalog_updates=False)  # Needed for databricks_system to get created.
    endpoint.start()

    endpoint.safe_psql(f"CREATE DATABASE {DBNAME}")

    endpoint.safe_psql("CREATE TABLE usertable ( YCSB_KEY INT, FIELD0 TEXT);")
    write_done = Event()

    def write_data(write_done):
        while not write_done.is_set():
            endpoint.safe_psql(
                "INSERT INTO usertable SELECT random(), repeat('a', 1000);", log_query=False
            )
        log.info("write_data thread exiting")

    writer_thread = threading.Thread(target=write_data, args=(write_done,))
    writer_thread.start()

    env.storage_controller.configure_failpoints(("shard-split-pre-complete", "return(1)"))
    # split the tenant
    with pytest.raises(StorageControllerApiException):
        env.storage_controller.tenant_shard_split(env.initial_tenant, shard_count=16)

    write_done.set()
    writer_thread.join()

    # writing more data to page servers after split is aborted
    for _i in range(5000):
        endpoint.safe_psql(
            "INSERT INTO usertable SELECT random(), repeat('a', 1000);", log_query=False
        )

    # wait until write lag becomes 0
    def check_write_lag_is_zero():
        res = endpoint.safe_psql(
            """
            SELECT
              pg_wal_lsn_diff(pg_current_wal_flush_lsn(), received_lsn) as received_lsn_lag
            FROM neon.backpressure_lsns();
            """,
            dbname="databricks_system",
            log_query=False,
        )
        log.info(f"received_lsn_lag = {res[0][0]}")
        assert res[0][0] == 0

    wait_until(check_write_lag_is_zero)
    endpoint.stop_and_destroy()
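wait_until above keeps re-running check_write_lag_is_zero until its assertion passes. A generic poller in the same spirit, shown only as a hypothetical sketch (not the project's wait_until implementation):

# Hypothetical helper illustrating the retry-until-assert pattern.
import time

def poll_until(check, timeout_s: float = 60.0, interval_s: float = 0.5):
    deadline = time.monotonic() + timeout_s
    last_err: AssertionError | None = None
    while time.monotonic() < deadline:
        try:
            return check()  # check() raises AssertionError while the condition is unmet
        except AssertionError as err:
            last_err = err
            time.sleep(interval_s)
    raise TimeoutError(f"condition not met within {timeout_s}s") from last_err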


# BEGIN_HADRON
def test_shard_resolve_during_split_abort(neon_env_builder: NeonEnvBuilder):
    """
    Tests that the page service is able to resolve the correct shard during a tenant split without causing query errors
    """
    DBNAME = "regression"
    WORKER_THREADS = 16
    ROW_COUNT = 10000

    init_shard_count = 4
    neon_env_builder.num_pageservers = 1
    stripe_size = 16

    env = neon_env_builder.init_start(
        initial_tenant_shard_count=init_shard_count, initial_tenant_shard_stripe_size=stripe_size
    )

    env.storage_controller.allowed_errors.extend(
        [
            # All split failures log a warning when they enqueue the abort operation
            ".*Enqueuing background abort.*",
            # Tolerate any error logs that mention a failpoint
            ".*failpoint.*",
        ]
    )

    endpoint = env.endpoints.create("main")
    endpoint.respec(skip_pg_catalog_updates=False)  # Needed for databricks_system to get created.
    endpoint.start()

    endpoint.safe_psql(f"CREATE DATABASE {DBNAME}")

    # generate 10MB of data
    endpoint.safe_psql(
        f"CREATE TABLE usertable AS SELECT s AS KEY, repeat('a', 1000) as VALUE from generate_series(1, {ROW_COUNT}) s;"
    )
    read_done = Event()

    def read_data(read_done):
        i = 0
        # keep issuing point reads while the split runs, and run at least 10 in total
        while not read_done.is_set() or i < 10:
            endpoint.safe_psql(
                f"SELECT * FROM usertable where KEY = {random.randint(1, ROW_COUNT)}",
                log_query=False,
            )
            i += 1
        log.info(f"read_data thread exiting. Executed {i} queries.")

    reader_threads = []
    for _i in range(WORKER_THREADS):
        reader_thread = threading.Thread(target=read_data, args=(read_done,))
        reader_thread.start()
        reader_threads.append(reader_thread)

    env.storage_controller.configure_failpoints(("shard-split-pre-complete", "return(1)"))
    # split the tenant
    with pytest.raises(StorageControllerApiException):
        env.storage_controller.tenant_shard_split(env.initial_tenant, shard_count=16)

    # wait until abort is done
    def check_tenant_status():
        active_count = 0
        for i in range(init_shard_count):
            status = env.pageserver.http_client().tenant_status(
                TenantShardId(env.initial_tenant, i, init_shard_count)
            )
            if status["state"]["slug"] == "Active":
                active_count += 1
        assert active_count == 4

    wait_until(check_tenant_status)

    read_done.set()
    for thread in reader_threads:
        thread.join()

    endpoint.stop()


# END_HADRON


def test_sharding_backpressure(neon_env_builder: NeonEnvBuilder):
    """
    Check a scenario when one of the shards is much slower than others.

@@ -332,7 +332,7 @@ def test_multiple_subscription_branching(neon_simple_env: NeonEnv):

    last_insert_lsn = query_scalar(cursor, "select pg_current_wal_insert_lsn();")

    def start_publisher_workload(table_num: int, duration: int):
    def start_publisher_workload(i: int, duration: int):
        start = time.time()
        with endpoint.cursor(dbname="publisher_db") as cur:
            while time.time() - start < duration: