Mirror of https://github.com/neondatabase/neon.git (synced 2026-01-31 01:00:36 +00:00)

Compare commits: 27 commits, jcsp/ha-te ... lr-tests-c
| Author | SHA1 | Date |
|---|---|---|
|  | 5598f712fa |  |
|  | 83855a907c |  |
|  | 1b41db8bdd |  |
|  | bac06ea1ac |  |
|  | 7ae8364b0b |  |
|  | 1f7d54f987 |  |
|  | 580e136b2e |  |
|  | 09699d4bd8 |  |
|  | 89cf714890 |  |
|  | 621ea2ec44 |  |
|  | 74d09b78c7 |  |
|  | 0cf0731d8b |  |
|  | 98723844ee |  |
|  | 73a8c97ac8 |  |
|  | 17a3c9036e |  |
|  | 8c5b310090 |  |
|  | 8224580f3e |  |
|  | 2b0f3549f7 |  |
|  | b4972d07d4 |  |
|  | 26ae7b0b3e |  |
|  | f8483cc4a3 |  |
|  | cc5d6c66b3 |  |
|  | d894d2b450 |  |
|  | b09d686335 |  |
|  | 74d24582cf |  |
|  | 4834d22d2d |  |
|  | 86e8c43ddf |  |
.github/workflows/build_and_test.yml (vendored): 4 lines changed
@@ -474,7 +474,7 @@ jobs:
TEST_RESULT_CONNSTR: ${{ secrets.REGRESS_TEST_RESULT_CONNSTR_NEW }}
CHECK_ONDISK_DATA_COMPATIBILITY: nonempty
BUILD_TAG: ${{ needs.tag.outputs.build-tag }}
PAGESERVER_VIRTUAL_FILE_IO_ENGINE: std-fs
PAGESERVER_VIRTUAL_FILE_IO_ENGINE: tokio-epoll-uring
PAGESERVER_GET_VECTORED_IMPL: vectored
# Temporary disable this step until we figure out why it's so flaky
@@ -554,7 +554,7 @@ jobs:
VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}"
PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}"
TEST_RESULT_CONNSTR: "${{ secrets.REGRESS_TEST_RESULT_CONNSTR_NEW }}"
PAGESERVER_VIRTUAL_FILE_IO_ENGINE: std-fs
PAGESERVER_VIRTUAL_FILE_IO_ENGINE: tokio-epoll-uring
# XXX: no coverage data handling here, since benchmarks are run on release builds,
# while coverage is currently collected for the debug ones
Cargo.lock (generated): 2 lines changed
@@ -282,10 +282,8 @@ dependencies = [
"control_plane",
"diesel",
"diesel_migrations",
"fail",
"futures",
"git-version",
"hex",
"humantime",
"hyper",
"metrics",
Makefile: 2 lines changed
@@ -51,7 +51,7 @@ CARGO_BUILD_FLAGS += $(filter -j1,$(MAKEFLAGS))
CARGO_CMD_PREFIX += $(if $(filter n,$(MAKEFLAGS)),,+)
# Force cargo not to print progress bar
CARGO_CMD_PREFIX += CARGO_TERM_PROGRESS_WHEN=never CI=1
# Set PQ_LIB_DIR to make sure `attachment_service` get linked with bundled libpq (through diesel)
# Set PQ_LIB_DIR to make sure `storage_controller` get linked with bundled libpq (through diesel)
CARGO_CMD_PREFIX += PQ_LIB_DIR=$(POSTGRES_INSTALL_DIR)/v16/lib
#
@@ -17,6 +17,7 @@ use chrono::{DateTime, Utc};
use futures::future::join_all;
use futures::stream::FuturesUnordered;
use futures::StreamExt;
use nix::unistd::Pid;
use postgres::error::SqlState;
use postgres::{Client, NoTls};
use tracing::{debug, error, info, instrument, warn};
@@ -722,8 +723,12 @@ impl ComputeNode {
// Stop it when it's ready
info!("waiting for postgres");
wait_for_postgres(&mut pg, Path::new(pgdata))?;
pg.kill()?;
info!("sent kill signal");
// SIGQUIT orders postgres to exit immediately. We don't want to SIGKILL
// it to avoid orphaned processes prowling around while datadir is
// wiped.
let pm_pid = Pid::from_raw(pg.id() as i32);
kill(pm_pid, Signal::SIGQUIT)?;
info!("sent SIGQUIT signal");
pg.wait()?;
info!("done prewarming");
@@ -302,9 +302,9 @@ pub fn handle_roles(spec: &ComputeSpec, client: &mut Client) -> Result<()> {
RoleAction::Create => {
// This branch only runs when roles are created through the console, so it is
// safe to add more permissions here. BYPASSRLS and REPLICATION are inherited
// from neon_superuser.
// from neon_superuser. (NOTE: REPLICATION has been removed from here for now).
let mut query: String = format!(
"CREATE ROLE {} INHERIT CREATEROLE CREATEDB BYPASSRLS REPLICATION IN ROLE neon_superuser",
"CREATE ROLE {} INHERIT CREATEROLE CREATEDB BYPASSRLS IN ROLE neon_superuser",
name.pg_quote()
);
info!("running role create query: '{}'", &query);
@@ -805,6 +805,18 @@ $$;"#,
"",
"",
// Add new migrations below.
r#"
DO $$
DECLARE
role_name TEXT;
BEGIN
FOR role_name IN SELECT rolname FROM pg_roles WHERE rolreplication IS TRUE
LOOP
RAISE NOTICE 'EXECUTING ALTER ROLE % NOREPLICATION', quote_ident(role_name);
EXECUTE 'ALTER ROLE ' || quote_ident(role_name) || ' NOREPLICATION';
END LOOP;
END
$$;"#,
];
let mut query = "CREATE SCHEMA IF NOT EXISTS neon_migration";
@@ -19,10 +19,8 @@ aws-config.workspace = true
aws-sdk-secretsmanager.workspace = true
camino.workspace = true
clap.workspace = true
fail.workspace = true
futures.workspace = true
git-version.workspace = true
hex.workspace = true
hyper.workspace = true
humantime.workspace = true
once_cell.workspace = true
@@ -1,4 +1,3 @@
use std::sync::Arc;
use std::{collections::HashMap, time::Duration};
use control_plane::endpoint::{ComputeControlPlane, EndpointStatus};
@@ -24,13 +23,10 @@ struct ShardedComputeHookTenant {
stripe_size: ShardStripeSize,
shard_count: ShardCount,
shards: Vec<(ShardNumber, NodeId)>,
// Async lock used for ensuring that remote compute hook calls are ordered identically to updates to this structure
lock: Arc<tokio::sync::Mutex<()>>,
}
enum ComputeHookTenant {
Unsharded((NodeId, Arc<tokio::sync::Mutex<()>>)),
Unsharded(NodeId),
Sharded(ShardedComputeHookTenant),
}
@@ -42,17 +38,9 @@ impl ComputeHookTenant {
shards: vec![(tenant_shard_id.shard_number, node_id)],
stripe_size,
shard_count: tenant_shard_id.shard_count,
lock: Arc::default(),
})
} else {
Self::Unsharded((node_id, Arc::default()))
}
}
fn get_lock(&self) -> &Arc<tokio::sync::Mutex<()>> {
match self {
Self::Unsharded((_node_id, lock)) => lock,
Self::Sharded(sharded_tenant) => &sharded_tenant.lock,
Self::Unsharded(node_id)
}
}
@@ -65,9 +53,7 @@ impl ComputeHookTenant {
node_id: NodeId,
) {
match self {
Self::Unsharded((existing_node_id, _lock))
if tenant_shard_id.shard_count.count() == 1 =>
{
Self::Unsharded(existing_node_id) if tenant_shard_id.shard_count.count() == 1 => {
*existing_node_id = node_id
}
Self::Sharded(sharded_tenant)
@@ -136,15 +122,9 @@ pub(crate) enum NotifyError {
}
impl ComputeHookTenant {
fn maybe_reconfigure(
&self,
tenant_id: TenantId,
) -> Option<(
ComputeHookNotifyRequest,
impl std::future::Future<Output = tokio::sync::OwnedMutexGuard<()>>,
)> {
let request = match self {
Self::Unsharded((node_id, _lock)) => Some(ComputeHookNotifyRequest {
fn maybe_reconfigure(&self, tenant_id: TenantId) -> Option<ComputeHookNotifyRequest> {
match self {
Self::Unsharded(node_id) => Some(ComputeHookNotifyRequest {
tenant_id,
shards: vec![ComputeHookNotifyRequestShard {
shard_number: ShardNumber(0),
@@ -178,9 +158,7 @@ impl ComputeHookTenant {
);
None
}
};
request.map(|r| (r, self.get_lock().clone().lock_owned()))
}
}
}
@@ -189,11 +167,8 @@ impl ComputeHookTenant {
/// the compute connection string.
pub(super) struct ComputeHook {
config: Config,
state: std::sync::Mutex<HashMap<TenantId, ComputeHookTenant>>,
state: tokio::sync::Mutex<HashMap<TenantId, ComputeHookTenant>>,
authorization_header: Option<String>,
// This lock is only used in testing enviroments, to serialize calls into neon_lock
neon_local_lock: tokio::sync::Mutex<()>,
}
impl ComputeHook {
@@ -207,7 +182,6 @@ impl ComputeHook {
state: Default::default(),
config,
authorization_header,
neon_local_lock: Default::default(),
}
}
@@ -216,10 +190,6 @@ impl ComputeHook {
&self,
reconfigure_request: ComputeHookNotifyRequest,
) -> anyhow::Result<()> {
// neon_local updates are not safe to call concurrently, use a lock to serialize
// all calls to this function
let _locked = self.neon_local_lock.lock().await;
let env = match LocalEnv::load_config() {
Ok(e) => e,
Err(e) => {
@@ -370,38 +340,30 @@ impl ComputeHook {
stripe_size: ShardStripeSize,
cancel: &CancellationToken,
) -> Result<(), NotifyError> {
let reconfigure_request = {
let mut locked = self.state.lock().unwrap();
let mut locked = self.state.lock().await;
use std::collections::hash_map::Entry;
let tenant = match locked.entry(tenant_shard_id.tenant_id) {
Entry::Vacant(e) => e.insert(ComputeHookTenant::new(
tenant_shard_id,
stripe_size,
node_id,
)),
Entry::Occupied(e) => {
let tenant = e.into_mut();
tenant.update(tenant_shard_id, stripe_size, node_id);
tenant
}
};
tenant.maybe_reconfigure(tenant_shard_id.tenant_id)
use std::collections::hash_map::Entry;
let tenant = match locked.entry(tenant_shard_id.tenant_id) {
Entry::Vacant(e) => e.insert(ComputeHookTenant::new(
tenant_shard_id,
stripe_size,
node_id,
)),
Entry::Occupied(e) => {
let tenant = e.into_mut();
tenant.update(tenant_shard_id, stripe_size, node_id);
tenant
}
};
let Some((reconfigure_request, lock_fut)) = reconfigure_request else {
let reconfigure_request = tenant.maybe_reconfigure(tenant_shard_id.tenant_id);
let Some(reconfigure_request) = reconfigure_request else {
// The tenant doesn't yet have pageservers for all its shards: we won't notify anything
// until it does.
tracing::info!("Tenant isn't yet ready to emit a notification");
return Ok(());
};
// Finish acquiring the tenant's async lock: this future was created inside the self.state
// lock above, so we are guaranteed to get this lock in the same order as callers took
// that lock. This ordering is essential: the cloud control plane must end up with the
// same end state for the tenant that we see.
let _guard = lock_fut.await;
if let Some(notify_url) = &self.config.compute_hook_url {
self.do_notify(notify_url, reconfigure_request, cancel)
.await
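The comment quoted above describes why the removed code created the owned-lock future while the synchronous `self.state` lock was still held. The following is a minimal, self-contained sketch of that shape only, not the project's code; it assumes `tokio = { version = "1", features = ["full"] }` as a dependency, and the ordering guarantee it mentions is the one claimed by the removed comment.

```rust
use std::sync::{Arc, Mutex};

#[tokio::main]
async fn main() {
    // Synchronous shared state plus one async lock for the tenant.
    let state = Mutex::new(0u32);
    let tenant_lock = Arc::new(tokio::sync::Mutex::new(()));

    // Step 1: update the state and create the owned-lock future inside one
    // synchronous critical section, mirroring maybe_reconfigure() above.
    let (request, lock_fut) = {
        let mut locked = state.lock().unwrap();
        *locked += 1;
        (*locked, tenant_lock.clone().lock_owned())
    };

    // Step 2: await the per-tenant lock outside the synchronous lock, then do
    // the slow remote call while holding only the async guard.
    let _guard = lock_fut.await;
    println!("notifying compute hook about request #{request}");
}
```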
@@ -443,7 +405,6 @@ pub(crate) mod tests {
tenant_state
.maybe_reconfigure(tenant_id)
.unwrap()
.0
.shards
.len(),
1
@@ -451,7 +412,6 @@ pub(crate) mod tests {
assert!(tenant_state
.maybe_reconfigure(tenant_id)
.unwrap()
.0
.stripe_size
.is_none());
@@ -485,7 +445,6 @@ pub(crate) mod tests {
tenant_state
.maybe_reconfigure(tenant_id)
.unwrap()
.0
.shards
.len(),
2
@@ -494,7 +453,6 @@ pub(crate) mod tests {
tenant_state
.maybe_reconfigure(tenant_id)
.unwrap()
.0
.stripe_size,
Some(ShardStripeSize(32768))
);
@@ -10,9 +10,7 @@ use pageserver_api::shard::TenantShardId;
use pageserver_client::mgmt_api;
use std::sync::Arc;
use std::time::{Duration, Instant};
use tokio_util::sync::CancellationToken;
use utils::auth::{Scope, SwappableJwtAuth};
use utils::failpoint_support::failpoints_handler;
use utils::http::endpoint::{auth_middleware, check_permission_with, request_span};
use utils::http::request::{must_get_query_param, parse_request_param};
use utils::id::{TenantId, TimelineId};
@@ -32,7 +30,7 @@ use pageserver_api::controller_api::{
};
use pageserver_api::upcall_api::{ReAttachRequest, ValidateRequest};
use control_plane::attachment_service::{AttachHookRequest, InspectRequest};
use control_plane::storage_controller::{AttachHookRequest, InspectRequest};
/// State available to HTTP request handlers
#[derive(Clone)]
@@ -440,24 +438,6 @@ async fn handle_tenants_dump(req: Request<Body>) -> Result<Response<Body>, ApiEr
state.service.tenants_dump()
}
async fn handle_balance_all(
service: Arc<Service>,
req: Request<Body>,
) -> Result<Response<Body>, ApiError> {
check_permissions(&req, Scope::Admin)?;
service.balance_all()?;
json_response(StatusCode::OK, ())
}
async fn handle_balance_attached(
service: Arc<Service>,
req: Request<Body>,
) -> Result<Response<Body>, ApiError> {
check_permissions(&req, Scope::Admin)?;
service.balance_attached()?;
json_response(StatusCode::OK, ())
}
async fn handle_scheduler_dump(req: Request<Body>) -> Result<Response<Body>, ApiError> {
check_permissions(&req, Scope::Admin)?;
@@ -574,9 +554,6 @@ pub fn make_router(
.post("/debug/v1/consistency_check", |r| {
request_span(r, handle_consistency_check)
})
.put("/debug/v1/failpoints", |r| {
request_span(r, |r| failpoints_handler(r, CancellationToken::new()))
})
.get("/control/v1/tenant/:tenant_id/locate", |r| {
tenant_service_handler(r, handle_tenant_locate)
})
@@ -595,12 +572,6 @@ pub fn make_router(
.put("/control/v1/tenant/:tenant_id/shard_split", |r| {
tenant_service_handler(r, handle_tenant_shard_split)
})
.post("/control/v1/balance/all", |r| {
tenant_service_handler(r, handle_balance_all)
})
.post("/control/v1/balance/attached", |r| {
tenant_service_handler(r, handle_balance_attached)
})
// Tenant operations
// The ^/v1/ endpoints act as a "Virtual Pageserver", enabling shard-naive clients to call into
// this service to manage tenants that actually consist of many tenant shards, as if they are a single entity.
@@ -1,54 +0,0 @@
use std::{collections::HashMap, sync::Arc};
/// A map of locks covering some arbitrary identifiers. Useful if you have a collection of objects but don't
/// want to embed a lock in each one, or if your locking granularity is different to your object granularity.
/// For example, used in the storage controller where the objects are tenant shards, but sometimes locking
/// is needed at a tenant-wide granularity.
pub(crate) struct IdLockMap<T>
where
T: Eq + PartialEq + std::hash::Hash,
{
/// A synchronous lock for getting/setting the async locks that our callers will wait on.
entities: std::sync::Mutex<std::collections::HashMap<T, Arc<tokio::sync::RwLock<()>>>>,
}
impl<T> IdLockMap<T>
where
T: Eq + PartialEq + std::hash::Hash,
{
pub(crate) fn shared(
&self,
key: T,
) -> impl std::future::Future<Output = tokio::sync::OwnedRwLockReadGuard<()>> {
let mut locked = self.entities.lock().unwrap();
let entry = locked.entry(key).or_default();
entry.clone().read_owned()
}
pub(crate) fn exclusive(
&self,
key: T,
) -> impl std::future::Future<Output = tokio::sync::OwnedRwLockWriteGuard<()>> {
let mut locked = self.entities.lock().unwrap();
let entry = locked.entry(key).or_default();
entry.clone().write_owned()
}
/// Rather than building a lock guard that re-takes the [`Self::entities`] lock, we just do
/// periodic housekeeping to avoid the map growing indefinitely
pub(crate) fn housekeeping(&self) {
let mut locked = self.entities.lock().unwrap();
locked.retain(|_k, lock| lock.try_write().is_err())
}
}
impl<T> Default for IdLockMap<T>
where
T: Eq + PartialEq + std::hash::Hash,
{
fn default() -> Self {
Self {
entities: std::sync::Mutex::new(HashMap::new()),
}
}
}
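For context on how the `IdLockMap` deleted above is meant to be used, here is a trimmed, self-contained sketch based only on the code shown in that hunk; tenant ids are reduced to `u64`, and `tokio` is assumed as a dependency.

```rust
use std::collections::HashMap;
use std::sync::{Arc, Mutex};

// Trimmed version of the deleted IdLockMap: one lazily-created async RwLock per id,
// guarded by a synchronous mutex around the map itself.
#[derive(Default)]
struct IdLockMap<T: Eq + std::hash::Hash> {
    entities: Mutex<HashMap<T, Arc<tokio::sync::RwLock<()>>>>,
}

impl<T: Eq + std::hash::Hash> IdLockMap<T> {
    fn exclusive(
        &self,
        key: T,
    ) -> impl std::future::Future<Output = tokio::sync::OwnedRwLockWriteGuard<()>> {
        let mut locked = self.entities.lock().unwrap();
        // The sync lock is only held long enough to fetch/create the per-id lock;
        // the returned future is awaited outside of it.
        locked.entry(key).or_default().clone().write_owned()
    }
}

#[tokio::main]
async fn main() {
    let locks: IdLockMap<u64> = IdLockMap::default();
    // Take a tenant-wide exclusive lock for id 42 around a multi-shard operation.
    let _guard = locks.exclusive(42).await;
    println!("holding exclusive lock for tenant 42");
}
```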
@@ -4,7 +4,6 @@ use utils::seqwait::MonotonicCounter;
mod auth;
mod compute_hook;
pub mod http;
mod id_lock_map;
pub mod metrics;
mod node;
pub mod persistence;
@@ -1,9 +1,3 @@
/// The attachment service mimics the aspects of the control plane API
/// that are required for a pageserver to operate.
///
/// This enables running & testing pageservers without a full-blown
/// deployment of the Neon cloud platform.
///
use anyhow::{anyhow, Context};
use attachment_service::http::make_router;
use attachment_service::metrics::preinitialize_metrics;
@@ -11,9 +11,6 @@ use diesel::prelude::*;
use diesel::Connection;
use pageserver_api::controller_api::{NodeSchedulingPolicy, PlacementPolicy};
use pageserver_api::models::TenantConfig;
use pageserver_api::shard::ShardConfigError;
use pageserver_api::shard::ShardIdentity;
use pageserver_api::shard::ShardStripeSize;
use pageserver_api::shard::{ShardCount, ShardNumber, TenantShardId};
use serde::{Deserialize, Serialize};
use utils::generation::Generation;
@@ -23,7 +20,7 @@ use crate::node::Node;
/// ## What do we store?
///
/// The attachment service does not store most of its state durably.
/// The storage controller service does not store most of its state durably.
///
/// The essential things to store durably are:
/// - generation numbers, as these must always advance monotonically to ensure data safety.
@@ -37,7 +34,7 @@ use crate::node::Node;
///
/// ## Performance/efficiency
///
/// The attachment service does not go via the database for most things: there are
/// The storage controller service does not go via the database for most things: there are
/// a couple of places where we must, and where efficiency matters:
/// - Incrementing generation numbers: the Reconciler has to wait for this to complete
/// before it can attach a tenant, so this acts as a bound on how fast things like
@@ -75,14 +72,6 @@ pub(crate) enum DatabaseError {
Logical(String),
}
#[must_use]
pub(crate) enum AbortShardSplitStatus {
/// We aborted the split in the database by reverting to the parent shards
Aborted,
/// The split had already been persisted.
Complete,
}
pub(crate) type DatabaseResult<T> = Result<T, DatabaseError>;
impl Persistence {
@@ -581,42 +570,6 @@ impl Persistence {
})
.await
}
/// Used when the remote part of a shard split failed: we will revert the database state to have only
/// the parent shards, with SplitState::Idle.
pub(crate) async fn abort_shard_split(
&self,
split_tenant_id: TenantId,
new_shard_count: ShardCount,
) -> DatabaseResult<AbortShardSplitStatus> {
use crate::schema::tenant_shards::dsl::*;
self.with_conn(move |conn| -> DatabaseResult<AbortShardSplitStatus> {
let aborted = conn.transaction(|conn| -> QueryResult<AbortShardSplitStatus> {
// Clear the splitting state on parent shards
let updated = diesel::update(tenant_shards)
.filter(tenant_id.eq(split_tenant_id.to_string()))
.filter(shard_count.ne(new_shard_count.literal() as i32))
.set((splitting.eq(0),))
.execute(conn)?;
// Parent shards are already gone: we cannot abort.
if updated == 0 {
return Ok(AbortShardSplitStatus::Complete);
}
// Erase child shards
diesel::delete(tenant_shards)
.filter(tenant_id.eq(split_tenant_id.to_string()))
.filter(shard_count.eq(new_shard_count.literal() as i32))
.execute(conn)?;
Ok(AbortShardSplitStatus::Aborted)
})?;
Ok(aborted)
})
.await
}
}
/// Parts of [`crate::tenant_state::TenantState`] that are stored durably
@@ -651,28 +604,6 @@ pub(crate) struct TenantShardPersistence {
pub(crate) config: String,
}
impl TenantShardPersistence {
pub(crate) fn get_shard_identity(&self) -> Result<ShardIdentity, ShardConfigError> {
if self.shard_count == 0 {
Ok(ShardIdentity::unsharded())
} else {
Ok(ShardIdentity::new(
ShardNumber(self.shard_number as u8),
ShardCount::new(self.shard_count as u8),
ShardStripeSize(self.shard_stripe_size as u32),
)?)
}
}
pub(crate) fn get_tenant_shard_id(&self) -> Result<TenantShardId, hex::FromHexError> {
Ok(TenantShardId {
tenant_id: TenantId::from_str(self.tenant_id.as_str())?,
shard_number: ShardNumber(self.shard_number as u8),
shard_count: ShardCount::new(self.shard_count as u8),
})
}
}
/// Parts of [`crate::node::Node`] that are stored durably
#[derive(Serialize, Deserialize, Queryable, Selectable, Insertable, Eq, PartialEq)]
#[diesel(table_name = crate::schema::nodes)]
@@ -1,6 +1,5 @@
use crate::persistence::Persistence;
use crate::service;
use hyper::StatusCode;
use pageserver_api::models::{
LocationConfig, LocationConfigMode, LocationConfigSecondary, TenantConfig,
};
@@ -19,8 +18,6 @@ use crate::compute_hook::{ComputeHook, NotifyError};
use crate::node::Node;
use crate::tenant_state::{IntentState, ObservedState, ObservedStateLocation};
const DEFAULT_HEATMAP_PERIOD: &str = "60s";
/// Object with the lifetime of the background reconcile task that is created
/// for tenants which have a difference between their intent and observed states.
pub(super) struct Reconciler {
@@ -488,29 +485,17 @@ impl Reconciler {
)
.await
{
Some(Ok(observed)) => Some(observed),
Some(Err(mgmt_api::Error::ApiError(status, _msg)))
if status == StatusCode::NOT_FOUND =>
{
None
}
Some(Ok(observed)) => observed,
Some(Err(e)) => return Err(e.into()),
None => return Err(ReconcileError::Cancel),
};
tracing::info!("Scanned location configuration on {attached_node}: {observed_conf:?}");
match observed_conf {
Some(conf) => {
// Pageserver returned a state: update it in observed. This may still be an indeterminate (None) state,
// if internally the pageserver's TenantSlot was being mutated (e.g. some long running API call is still running)
self.observed
.locations
.insert(attached_node.get_id(), ObservedStateLocation { conf });
}
None => {
// Pageserver returned 404: we have confirmation that there is no state for this shard on that pageserver.
self.observed.locations.remove(&attached_node.get_id());
}
}
self.observed.locations.insert(
attached_node.get_id(),
ObservedStateLocation {
conf: observed_conf,
},
);
}
Ok(())
@@ -540,12 +525,7 @@ impl Reconciler {
)));
};
let mut wanted_conf = attached_location_conf(
generation,
&self.shard,
&self.config,
!self.intent.secondary.is_empty(),
);
let mut wanted_conf = attached_location_conf(generation, &self.shard, &self.config);
match self.observed.locations.get(&node.get_id()) {
Some(conf) if conf.conf.as_ref() == Some(&wanted_conf) => {
// Nothing to do
@@ -682,26 +662,10 @@ impl Reconciler {
}
}
/// We tweak the externally-set TenantConfig while configuring
/// locations, using our awareness of whether secondary locations
/// are in use to automatically enable/disable heatmap uploads.
fn ha_aware_config(config: &TenantConfig, has_secondaries: bool) -> TenantConfig {
let mut config = config.clone();
if has_secondaries {
if config.heatmap_period.is_none() {
config.heatmap_period = Some(DEFAULT_HEATMAP_PERIOD.to_string());
}
} else {
config.heatmap_period = None;
}
config
}
pub(crate) fn attached_location_conf(
generation: Generation,
shard: &ShardIdentity,
config: &TenantConfig,
has_secondaries: bool,
) -> LocationConfig {
LocationConfig {
mode: LocationConfigMode::AttachedSingle,
@@ -710,7 +674,7 @@ pub(crate) fn attached_location_conf(
shard_number: shard.number.0,
shard_count: shard.count.literal(),
shard_stripe_size: shard.stripe_size.0,
tenant_conf: ha_aware_config(config, has_secondaries),
tenant_conf: config.clone(),
}
}
@@ -725,6 +689,6 @@ pub(crate) fn secondary_location_conf(
shard_number: shard.number.0,
shard_count: shard.count.literal(),
shard_stripe_size: shard.stripe_size.0,
tenant_conf: ha_aware_config(config, true),
tenant_conf: config.clone(),
}
}
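To make the heatmap behaviour of the `ha_aware_config` helper removed above concrete, here is a self-contained sketch with a simplified stand-in for `TenantConfig` (the real type lives in `pageserver_api`; only the `heatmap_period` field from the hunk is modelled here).

```rust
const DEFAULT_HEATMAP_PERIOD: &str = "60s";

// Simplified stand-in for the pageserver_api TenantConfig, keeping only the field
// this helper touches.
#[derive(Clone, Debug, Default)]
struct TenantConfig {
    heatmap_period: Option<String>,
}

fn ha_aware_config(config: &TenantConfig, has_secondaries: bool) -> TenantConfig {
    let mut config = config.clone();
    if has_secondaries {
        // Secondaries exist: make sure heatmap uploads are on, keeping any
        // period the caller already chose.
        if config.heatmap_period.is_none() {
            config.heatmap_period = Some(DEFAULT_HEATMAP_PERIOD.to_string());
        }
    } else {
        // No secondaries: heatmap uploads have no consumer, switch them off.
        config.heatmap_period = None;
    }
    config
}

fn main() {
    let base = TenantConfig::default();
    assert_eq!(ha_aware_config(&base, true).heatmap_period.as_deref(), Some("60s"));
    assert_eq!(ha_aware_config(&base, false).heatmap_period, None);
    println!("heatmap uploads follow the presence of secondary locations");
}
```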
File diff suppressed because it is too large
@@ -577,12 +577,7 @@ impl TenantState {
.generation
.expect("Attempted to enter attached state without a generation");
let wanted_conf = attached_location_conf(
generation,
&self.shard,
&self.config,
!self.intent.secondary.is_empty(),
);
let wanted_conf = attached_location_conf(generation, &self.shard, &self.config);
match self.observed.locations.get(&node_id) {
Some(conf) if conf.conf.as_ref() == Some(&wanted_conf) => {}
Some(_) | None => {
@@ -622,7 +617,7 @@ impl TenantState {
#[instrument(skip_all, fields(tenant_id=%self.tenant_shard_id.tenant_id, shard_id=%self.tenant_shard_id.shard_slug()))]
pub(crate) fn maybe_reconcile(
&mut self,
result_tx: tokio::sync::mpsc::UnboundedSender<ReconcileResult>,
result_tx: &tokio::sync::mpsc::UnboundedSender<ReconcileResult>,
pageservers: &Arc<HashMap<NodeId, Node>>,
compute_hook: &Arc<ComputeHook>,
service_config: &service::Config,
@@ -734,6 +729,7 @@ impl TenantState {
tenant_id=%reconciler.tenant_shard_id.tenant_id,
shard_id=%reconciler.tenant_shard_id.shard_slug());
metrics::RECONCILER.spawned.inc();
let result_tx = result_tx.clone();
let join_handle = tokio::task::spawn(
async move {
// Wait for any previous reconcile task to complete before we start
@@ -8,11 +8,11 @@
use anyhow::{anyhow, bail, Context, Result};
use clap::{value_parser, Arg, ArgAction, ArgMatches, Command, ValueEnum};
use compute_api::spec::ComputeMode;
use control_plane::attachment_service::AttachmentService;
use control_plane::endpoint::ComputeControlPlane;
use control_plane::local_env::{InitForceMode, LocalEnv};
use control_plane::pageserver::{PageServerNode, PAGESERVER_REMOTE_STORAGE_DIR};
use control_plane::safekeeper::SafekeeperNode;
use control_plane::storage_controller::StorageController;
use control_plane::{broker, local_env};
use pageserver_api::controller_api::{
NodeAvailability, NodeConfigureRequest, NodeSchedulingPolicy, PlacementPolicy,
@@ -138,7 +138,7 @@ fn main() -> Result<()> {
"start" => rt.block_on(handle_start_all(sub_args, &env)),
"stop" => rt.block_on(handle_stop_all(sub_args, &env)),
"pageserver" => rt.block_on(handle_pageserver(sub_args, &env)),
"attachment_service" => rt.block_on(handle_attachment_service(sub_args, &env)),
"storage_controller" => rt.block_on(handle_storage_controller(sub_args, &env)),
"safekeeper" => rt.block_on(handle_safekeeper(sub_args, &env)),
"endpoint" => rt.block_on(handle_endpoint(sub_args, &env)),
"mappings" => handle_mappings(sub_args, &mut env),
@@ -445,14 +445,14 @@ async fn handle_tenant(
// If tenant ID was not specified, generate one
let tenant_id = parse_tenant_id(create_match)?.unwrap_or_else(TenantId::generate);
// We must register the tenant with the attachment service, so
// We must register the tenant with the storage controller, so
// that when the pageserver restarts, it will be re-attached.
let attachment_service = AttachmentService::from_env(env);
attachment_service
let storage_controller = StorageController::from_env(env);
storage_controller
.tenant_create(TenantCreateRequest {
// Note that ::unsharded here isn't actually because the tenant is unsharded, its because the
// attachment service expecfs a shard-naive tenant_id in this attribute, and the TenantCreateRequest
// type is used both in attachment service (for creating tenants) and in pageserver (for creating shards)
// storage controller expecfs a shard-naive tenant_id in this attribute, and the TenantCreateRequest
// type is used both in storage controller (for creating tenants) and in pageserver (for creating shards)
new_tenant_id: TenantShardId::unsharded(tenant_id),
generation: None,
shard_parameters: ShardParameters {
@@ -476,9 +476,9 @@ async fn handle_tenant(
.context("Failed to parse postgres version from the argument string")?;
// FIXME: passing None for ancestor_start_lsn is not kosher in a sharded world: we can't have
// different shards picking different start lsns. Maybe we have to teach attachment service
// different shards picking different start lsns. Maybe we have to teach storage controller
// to let shard 0 branch first and then propagate the chosen LSN to other shards.
attachment_service
storage_controller
.tenant_timeline_create(
tenant_id,
TimelineCreateRequest {
@@ -528,8 +528,8 @@ async fn handle_tenant(
let new_pageserver = get_pageserver(env, matches)?;
let new_pageserver_id = new_pageserver.conf.id;
let attachment_service = AttachmentService::from_env(env);
attachment_service
let storage_controller = StorageController::from_env(env);
storage_controller
.tenant_migrate(tenant_shard_id, new_pageserver_id)
.await?;
@@ -543,8 +543,8 @@ async fn handle_tenant(
let mut tenant_synthetic_size = None;
let attachment_service = AttachmentService::from_env(env);
for shard in attachment_service.tenant_locate(tenant_id).await?.shards {
let storage_controller = StorageController::from_env(env);
for shard in storage_controller.tenant_locate(tenant_id).await?.shards {
let pageserver =
PageServerNode::from_env(env, env.get_pageserver_conf(shard.node_id)?);
@@ -585,10 +585,14 @@ async fn handle_tenant(
Some(("shard-split", matches)) => {
let tenant_id = get_tenant_id(matches, env)?;
let shard_count: u8 = matches.get_one::<u8>("shard-count").cloned().unwrap_or(0);
let shard_stripe_size: Option<ShardStripeSize> = matches
.get_one::<Option<ShardStripeSize>>("shard-stripe-size")
.cloned()
.unwrap();
let attachment_service = AttachmentService::from_env(env);
let result = attachment_service
.tenant_split(tenant_id, shard_count)
let storage_controller = StorageController::from_env(env);
let result = storage_controller
.tenant_split(tenant_id, shard_count, shard_stripe_size)
.await?;
println!(
"Split tenant {} into shards {}",
@@ -613,7 +617,7 @@ async fn handle_timeline(timeline_match: &ArgMatches, env: &mut local_env::Local
match timeline_match.subcommand() {
Some(("list", list_match)) => {
// TODO(sharding): this command shouldn't have to specify a shard ID: we should ask the attachment service
// TODO(sharding): this command shouldn't have to specify a shard ID: we should ask the storage controller
// where shard 0 is attached, and query there.
let tenant_shard_id = get_tenant_shard_id(list_match, env)?;
let timelines = pageserver.timeline_list(&tenant_shard_id).await?;
@@ -633,7 +637,7 @@ async fn handle_timeline(timeline_match: &ArgMatches, env: &mut local_env::Local
let new_timeline_id_opt = parse_timeline_id(create_match)?;
let new_timeline_id = new_timeline_id_opt.unwrap_or(TimelineId::generate());
let attachment_service = AttachmentService::from_env(env);
let storage_controller = StorageController::from_env(env);
let create_req = TimelineCreateRequest {
new_timeline_id,
ancestor_timeline_id: None,
@@ -641,7 +645,7 @@ async fn handle_timeline(timeline_match: &ArgMatches, env: &mut local_env::Local
ancestor_start_lsn: None,
pg_version: Some(pg_version),
};
let timeline_info = attachment_service
let timeline_info = storage_controller
.tenant_timeline_create(tenant_id, create_req)
.await?;
@@ -730,7 +734,7 @@ async fn handle_timeline(timeline_match: &ArgMatches, env: &mut local_env::Local
.transpose()
.context("Failed to parse ancestor start Lsn from the request")?;
let new_timeline_id = TimelineId::generate();
let attachment_service = AttachmentService::from_env(env);
let storage_controller = StorageController::from_env(env);
let create_req = TimelineCreateRequest {
new_timeline_id,
ancestor_timeline_id: Some(ancestor_timeline_id),
@@ -738,7 +742,7 @@ async fn handle_timeline(timeline_match: &ArgMatches, env: &mut local_env::Local
ancestor_start_lsn: start_lsn,
pg_version: None,
};
let timeline_info = attachment_service
let timeline_info = storage_controller
.tenant_timeline_create(tenant_id, create_req)
.await?;
@@ -767,7 +771,7 @@ async fn handle_endpoint(ep_match: &ArgMatches, env: &local_env::LocalEnv) -> Re
match sub_name {
"list" => {
// TODO(sharding): this command shouldn't have to specify a shard ID: we should ask the attachment service
// TODO(sharding): this command shouldn't have to specify a shard ID: we should ask the storage controller
// where shard 0 is attached, and query there.
let tenant_shard_id = get_tenant_shard_id(sub_args, env)?;
let timeline_infos = get_timeline_infos(env, &tenant_shard_id)
@@ -952,21 +956,21 @@ async fn handle_endpoint(ep_match: &ArgMatches, env: &local_env::LocalEnv) -> Re
(
vec![(parsed.0, parsed.1.unwrap_or(5432))],
// If caller is telling us what pageserver to use, this is not a tenant which is
// full managed by attachment service, therefore not sharded.
// full managed by storage controller, therefore not sharded.
ShardParameters::DEFAULT_STRIPE_SIZE,
)
} else {
// Look up the currently attached location of the tenant, and its striping metadata,
// to pass these on to postgres.
let attachment_service = AttachmentService::from_env(env);
let locate_result = attachment_service.tenant_locate(endpoint.tenant_id).await?;
let storage_controller = StorageController::from_env(env);
let locate_result = storage_controller.tenant_locate(endpoint.tenant_id).await?;
let pageservers = locate_result
.shards
.into_iter()
.map(|shard| {
(
Host::parse(&shard.listen_pg_addr)
.expect("Attachment service reported bad hostname"),
.expect("Storage controller reported bad hostname"),
shard.listen_pg_port,
)
})
@@ -1015,8 +1019,8 @@ async fn handle_endpoint(ep_match: &ArgMatches, env: &local_env::LocalEnv) -> Re
pageserver.pg_connection_config.port(),
)]
} else {
let attachment_service = AttachmentService::from_env(env);
attachment_service
let storage_controller = StorageController::from_env(env);
storage_controller
.tenant_locate(endpoint.tenant_id)
.await?
.shards
@@ -1024,7 +1028,7 @@ async fn handle_endpoint(ep_match: &ArgMatches, env: &local_env::LocalEnv) -> Re
.map(|shard| {
(
Host::parse(&shard.listen_pg_addr)
.expect("Attachment service reported malformed host"),
.expect("Storage controller reported malformed host"),
shard.listen_pg_port,
)
})
@@ -1100,9 +1104,8 @@ fn get_pageserver(env: &local_env::LocalEnv, args: &ArgMatches) -> Result<PageSe
async fn handle_pageserver(sub_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<()> {
match sub_match.subcommand() {
Some(("start", subcommand_args)) => {
let register = subcommand_args.get_one::<bool>("register").unwrap_or(&true);
if let Err(e) = get_pageserver(env, subcommand_args)?
.start(&pageserver_config_overrides(subcommand_args), *register)
.start(&pageserver_config_overrides(subcommand_args))
.await
{
eprintln!("pageserver start failed: {e}");
@@ -1131,7 +1134,7 @@ async fn handle_pageserver(sub_match: &ArgMatches, env: &local_env::LocalEnv) ->
}
if let Err(e) = pageserver
.start(&pageserver_config_overrides(subcommand_args), false)
.start(&pageserver_config_overrides(subcommand_args))
.await
{
eprintln!("pageserver start failed: {e}");
@@ -1144,8 +1147,8 @@ async fn handle_pageserver(sub_match: &ArgMatches, env: &local_env::LocalEnv) ->
let scheduling = subcommand_args.get_one("scheduling");
let availability = subcommand_args.get_one("availability");
let attachment_service = AttachmentService::from_env(env);
attachment_service
let storage_controller = StorageController::from_env(env);
storage_controller
.node_configure(NodeConfigureRequest {
node_id: pageserver.conf.id,
scheduling: scheduling.cloned(),
@@ -1170,11 +1173,11 @@ async fn handle_pageserver(sub_match: &ArgMatches, env: &local_env::LocalEnv) ->
Ok(())
}
async fn handle_attachment_service(
async fn handle_storage_controller(
sub_match: &ArgMatches,
env: &local_env::LocalEnv,
) -> Result<()> {
let svc = AttachmentService::from_env(env);
let svc = StorageController::from_env(env);
match sub_match.subcommand() {
Some(("start", _start_match)) => {
if let Err(e) = svc.start().await {
@@ -1194,8 +1197,8 @@ async fn handle_attachment_service(
exit(1);
}
}
Some((sub_name, _)) => bail!("Unexpected attachment_service subcommand '{}'", sub_name),
None => bail!("no attachment_service subcommand provided"),
Some((sub_name, _)) => bail!("Unexpected storage_controller subcommand '{}'", sub_name),
None => bail!("no storage_controller subcommand provided"),
}
Ok(())
}
@@ -1280,11 +1283,11 @@ async fn handle_start_all(sub_match: &ArgMatches, env: &local_env::LocalEnv) ->
broker::start_broker_process(env).await?;
// Only start the attachment service if the pageserver is configured to need it
// Only start the storage controller if the pageserver is configured to need it
if env.control_plane_api.is_some() {
let attachment_service = AttachmentService::from_env(env);
if let Err(e) = attachment_service.start().await {
eprintln!("attachment_service start failed: {:#}", e);
let storage_controller = StorageController::from_env(env);
if let Err(e) = storage_controller.start().await {
eprintln!("storage_controller start failed: {:#}", e);
try_stop_all(env, true).await;
exit(1);
}
@@ -1293,7 +1296,7 @@ async fn handle_start_all(sub_match: &ArgMatches, env: &local_env::LocalEnv) ->
for ps_conf in &env.pageservers {
let pageserver = PageServerNode::from_env(env, ps_conf);
if let Err(e) = pageserver
.start(&pageserver_config_overrides(sub_match), true)
.start(&pageserver_config_overrides(sub_match))
.await
{
eprintln!("pageserver {} start failed: {:#}", ps_conf.id, e);
@@ -1356,9 +1359,9 @@ async fn try_stop_all(env: &local_env::LocalEnv, immediate: bool) {
}
if env.control_plane_api.is_some() {
let attachment_service = AttachmentService::from_env(env);
if let Err(e) = attachment_service.stop(immediate).await {
eprintln!("attachment service stop failed: {e:#}");
let storage_controller = StorageController::from_env(env);
if let Err(e) = storage_controller.stop(immediate).await {
eprintln!("storage controller stop failed: {e:#}");
}
}
}
@@ -1586,6 +1589,7 @@ fn cli() -> Command {
.about("Increase the number of shards in the tenant")
.arg(tenant_id_arg.clone())
.arg(Arg::new("shard-count").value_parser(value_parser!(u8)).long("shard-count").action(ArgAction::Set).help("Number of shards in the new tenant (default 1)"))
.arg(Arg::new("shard-stripe-size").value_parser(value_parser!(u32)).long("shard-stripe-size").action(ArgAction::Set).help("Sharding stripe size in pages"))
)
)
.subcommand(
@@ -1596,11 +1600,7 @@ fn cli() -> Command {
.subcommand(Command::new("status"))
.subcommand(Command::new("start")
.about("Start local pageserver")
.arg(pageserver_config_args.clone()).arg(Arg::new("register")
.long("register")
.default_value("true").required(false)
.value_parser(value_parser!(bool))
.value_name("register"))
.arg(pageserver_config_args.clone())
)
.subcommand(Command::new("stop")
.about("Stop local pageserver")
@@ -1618,9 +1618,9 @@ fn cli() -> Command {
)
)
.subcommand(
Command::new("attachment_service")
Command::new("storage_controller")
.arg_required_else_help(true)
.about("Manage attachment_service")
.about("Manage storage_controller")
.subcommand(Command::new("start").about("Start local pageserver").arg(pageserver_config_args.clone()))
.subcommand(Command::new("stop").about("Stop local pageserver")
.arg(stop_mode_arg.clone()))
@@ -57,9 +57,9 @@ use serde::{Deserialize, Serialize};
use url::Host;
use utils::id::{NodeId, TenantId, TimelineId};
use crate::attachment_service::AttachmentService;
use crate::local_env::LocalEnv;
use crate::postgresql_conf::PostgresConf;
use crate::storage_controller::StorageController;
use compute_api::responses::{ComputeState, ComputeStatus};
use compute_api::spec::{Cluster, ComputeFeature, ComputeMode, ComputeSpec};
@@ -750,17 +750,17 @@ impl Endpoint {
let postgresql_conf = self.read_postgresql_conf()?;
spec.cluster.postgresql_conf = Some(postgresql_conf);
// If we weren't given explicit pageservers, query the attachment service
// If we weren't given explicit pageservers, query the storage controller
if pageservers.is_empty() {
let attachment_service = AttachmentService::from_env(&self.env);
let locate_result = attachment_service.tenant_locate(self.tenant_id).await?;
let storage_controller = StorageController::from_env(&self.env);
let locate_result = storage_controller.tenant_locate(self.tenant_id).await?;
pageservers = locate_result
.shards
.into_iter()
.map(|shard| {
(
Host::parse(&shard.listen_pg_addr)
.expect("Attachment service reported bad hostname"),
.expect("Storage controller reported bad hostname"),
shard.listen_pg_port,
)
})
@@ -6,7 +6,6 @@
//! local installations.
#![deny(clippy::undocumented_unsafe_blocks)]
pub mod attachment_service;
mod background_process;
pub mod broker;
pub mod endpoint;
@@ -14,3 +13,4 @@ pub mod local_env;
pub mod pageserver;
pub mod postgresql_conf;
pub mod safekeeper;
pub mod storage_controller;
@@ -72,13 +72,13 @@ pub struct LocalEnv {
#[serde(default)]
pub safekeepers: Vec<SafekeeperConf>,
// Control plane upcall API for pageserver: if None, we will not run attachment_service. If set, this will
// Control plane upcall API for pageserver: if None, we will not run storage_controller If set, this will
// be propagated into each pageserver's configuration.
#[serde(default)]
pub control_plane_api: Option<Url>,
// Control plane upcall API for attachment service. If set, this will be propagated into the
// attachment service's configuration.
// Control plane upcall API for storage controller. If set, this will be propagated into the
// storage controller's configuration.
#[serde(default)]
pub control_plane_compute_hook_api: Option<Url>,
@@ -227,10 +227,10 @@ impl LocalEnv {
self.neon_distrib_dir.join("pageserver")
}
pub fn attachment_service_bin(&self) -> PathBuf {
// Irrespective of configuration, attachment service binary is always
pub fn storage_controller_bin(&self) -> PathBuf {
// Irrespective of configuration, storage controller binary is always
// run from the same location as neon_local. This means that for compatibility
// tests that run old pageserver/safekeeper, they still run latest attachment service.
// tests that run old pageserver/safekeeper, they still run latest storage controller.
let neon_local_bin_dir = env::current_exe().unwrap().parent().unwrap().to_owned();
neon_local_bin_dir.join("storage_controller")
}
@@ -17,8 +17,6 @@ use std::time::Duration;
use anyhow::{bail, Context};
use camino::Utf8PathBuf;
use futures::SinkExt;
use hyper::StatusCode;
use pageserver_api::controller_api::NodeRegisterRequest;
use pageserver_api::models::{
self, LocationConfig, ShardParameters, TenantHistorySize, TenantInfo, TimelineInfo,
};
@@ -32,7 +30,6 @@ use utils::{
lsn::Lsn,
};
use crate::attachment_service::AttachmentService;
use crate::local_env::PageServerConf;
use crate::{background_process, local_env::LocalEnv};
@@ -112,7 +109,7 @@ impl PageServerNode {
control_plane_api.as_str()
));
// Attachment service uses the same auth as pageserver: if JWT is enabled
// Storage controller uses the same auth as pageserver: if JWT is enabled
// for us, we will also need it to talk to them.
if matches!(self.conf.http_auth_type, AuthType::NeonJWT) {
let jwt_token = self
@@ -164,8 +161,8 @@ impl PageServerNode {
.expect("non-Unicode path")
}
pub async fn start(&self, config_overrides: &[&str], register: bool) -> anyhow::Result<()> {
self.start_node(config_overrides, false, register).await
pub async fn start(&self, config_overrides: &[&str]) -> anyhow::Result<()> {
self.start_node(config_overrides, false).await
}
fn pageserver_init(&self, config_overrides: &[&str]) -> anyhow::Result<()> {
@@ -203,6 +200,28 @@ impl PageServerNode {
String::from_utf8_lossy(&init_output.stderr),
);
// Write metadata file, used by pageserver on startup to register itself with
// the storage controller
let metadata_path = datadir.join("metadata.json");
let (_http_host, http_port) =
parse_host_port(&self.conf.listen_http_addr).expect("Unable to parse listen_http_addr");
let http_port = http_port.unwrap_or(9898);
// Intentionally hand-craft JSON: this acts as an implicit format compat test
// in case the pageserver-side structure is edited, and reflects the real life
// situation: the metadata is written by some other script.
std::fs::write(
metadata_path,
serde_json::to_vec(&serde_json::json!({
"host": "localhost",
"port": self.pg_connection_config.port(),
"http_host": "localhost",
"http_port": http_port,
}))
.unwrap(),
)
.expect("Failed to write metadata file");
Ok(())
}
@@ -210,27 +229,7 @@ impl PageServerNode {
&self,
config_overrides: &[&str],
update_config: bool,
register: bool,
) -> anyhow::Result<()> {
// Register the node with the storage controller before starting pageserver: pageserver must be registered to
// successfully call /re-attach and finish starting up.
if register {
let attachment_service = AttachmentService::from_env(&self.env);
let (pg_host, pg_port) =
parse_host_port(&self.conf.listen_pg_addr).expect("Unable to parse listen_pg_addr");
let (http_host, http_port) = parse_host_port(&self.conf.listen_http_addr)
.expect("Unable to parse listen_http_addr");
attachment_service
.node_register(NodeRegisterRequest {
node_id: self.conf.id,
listen_pg_addr: pg_host.to_string(),
listen_pg_port: pg_port.unwrap_or(5432),
listen_http_addr: http_host.to_string(),
listen_http_port: http_port.unwrap_or(80),
})
.await?;
}
// TODO: using a thread here because start_process() is not async but we need to call check_status()
let datadir = self.repo_path();
print!(
@@ -263,11 +262,6 @@ impl PageServerNode {
match st {
Ok(()) => Ok(true),
Err(mgmt_api::Error::ReceiveBody(_)) => Ok(false),
Err(mgmt_api::Error::ApiError(status, _msg))
if status == StatusCode::SERVICE_UNAVAILABLE =>
{
Ok(false)
}
Err(e) => Err(anyhow::anyhow!("Failed to check node status: {e}")),
}
},
@@ -10,7 +10,7 @@ use pageserver_api::{
TenantCreateRequest, TenantShardSplitRequest, TenantShardSplitResponse,
TimelineCreateRequest, TimelineInfo,
},
shard::TenantShardId,
shard::{ShardStripeSize, TenantShardId},
};
use pageserver_client::mgmt_api::ResponseErrorMessageExt;
use postgres_backend::AuthType;
@@ -24,7 +24,7 @@ use utils::{
id::{NodeId, TenantId},
};
pub struct AttachmentService {
pub struct StorageController {
env: LocalEnv,
listen: String,
path: Utf8PathBuf,
@@ -36,7 +36,7 @@ pub struct AttachmentService {
const COMMAND: &str = "storage_controller";
const ATTACHMENT_SERVICE_POSTGRES_VERSION: u32 = 16;
const STORAGE_CONTROLLER_POSTGRES_VERSION: u32 = 16;
#[derive(Serialize, Deserialize)]
pub struct AttachHookRequest {
@@ -59,7 +59,7 @@ pub struct InspectResponse {
pub attachment: Option<(u32, NodeId)>,
}
impl AttachmentService {
impl StorageController {
pub fn from_env(env: &LocalEnv) -> Self {
let path = Utf8PathBuf::from_path_buf(env.base_data_dir.clone())
.unwrap()
@@ -136,27 +136,27 @@ impl AttachmentService {
}
fn pid_file(&self) -> Utf8PathBuf {
Utf8PathBuf::from_path_buf(self.env.base_data_dir.join("attachment_service.pid"))
Utf8PathBuf::from_path_buf(self.env.base_data_dir.join("storage_controller.pid"))
.expect("non-Unicode path")
}
/// PIDFile for the postgres instance used to store attachment service state
/// PIDFile for the postgres instance used to store storage controller state
fn postgres_pid_file(&self) -> Utf8PathBuf {
Utf8PathBuf::from_path_buf(
self.env
.base_data_dir
.join("attachment_service_postgres.pid"),
.join("storage_controller_postgres.pid"),
)
.expect("non-Unicode path")
}
/// Find the directory containing postgres binaries, such as `initdb` and `pg_ctl`
///
/// This usually uses ATTACHMENT_SERVICE_POSTGRES_VERSION of postgres, but will fall back
/// This usually uses STORAGE_CONTROLLER_POSTGRES_VERSION of postgres, but will fall back
/// to other versions if that one isn't found. Some automated tests create circumstances
/// where only one version is available in pg_distrib_dir, such as `test_remote_extensions`.
pub async fn get_pg_bin_dir(&self) -> anyhow::Result<Utf8PathBuf> {
let prefer_versions = [ATTACHMENT_SERVICE_POSTGRES_VERSION, 15, 14];
let prefer_versions = [STORAGE_CONTROLLER_POSTGRES_VERSION, 15, 14];
for v in prefer_versions {
let path = Utf8PathBuf::from_path_buf(self.env.pg_bin_dir(v)?).unwrap();
@@ -189,7 +189,7 @@ impl AttachmentService {
///
/// Returns the database url
pub async fn setup_database(&self) -> anyhow::Result<String> {
const DB_NAME: &str = "attachment_service";
const DB_NAME: &str = "storage_controller";
let database_url = format!("postgresql://localhost:{}/{DB_NAME}", self.postgres_port);
let pg_bin_dir = self.get_pg_bin_dir().await?;
@@ -219,10 +219,10 @@ impl AttachmentService {
}
pub async fn start(&self) -> anyhow::Result<()> {
// Start a vanilla Postgres process used by the attachment service for persistence.
// Start a vanilla Postgres process used by the storage controller for persistence.
let pg_data_path = Utf8PathBuf::from_path_buf(self.env.base_data_dir.clone())
.unwrap()
.join("attachment_service_db");
.join("storage_controller_db");
let pg_bin_dir = self.get_pg_bin_dir().await?;
let pg_log_path = pg_data_path.join("postgres.log");
@@ -245,7 +245,7 @@ impl AttachmentService {
.await?;
};
println!("Starting attachment service database...");
println!("Starting storage controller database...");
let db_start_args = [
"-w",
"-D",
@@ -256,7 +256,7 @@ impl AttachmentService {
];
background_process::start_process(
"attachment_service_db",
"storage_controller_db",
&self.env.base_data_dir,
pg_bin_dir.join("pg_ctl").as_std_path(),
db_start_args,
@@ -300,7 +300,7 @@ impl AttachmentService {
background_process::start_process(
COMMAND,
&self.env.base_data_dir,
&self.env.attachment_service_bin(),
&self.env.storage_controller_bin(),
args,
[(
"NEON_REPO_DIR".to_string(),
@@ -322,10 +322,10 @@ impl AttachmentService {
pub async fn stop(&self, immediate: bool) -> anyhow::Result<()> {
background_process::stop_process(immediate, COMMAND, &self.pid_file())?;
let pg_data_path = self.env.base_data_dir.join("attachment_service_db");
let pg_data_path = self.env.base_data_dir.join("storage_controller_db");
let pg_bin_dir = self.get_pg_bin_dir().await?;
println!("Stopping attachment service database...");
println!("Stopping storage controller database...");
let pg_stop_args = ["-D", &pg_data_path.to_string_lossy(), "stop"];
let stop_status = Command::new(pg_bin_dir.join("pg_ctl"))
.args(pg_stop_args)
@@ -344,10 +344,10 @@ impl AttachmentService {
// fine that stop failed. Otherwise it is an error that stop failed.
const PG_STATUS_NOT_RUNNING: i32 = 3;
if Some(PG_STATUS_NOT_RUNNING) == status_exitcode.code() {
println!("Attachment service data base is already stopped");
println!("Storage controller database is already stopped");
return Ok(());
} else {
anyhow::bail!("Failed to stop attachment service database: {stop_status}")
anyhow::bail!("Failed to stop storage controller database: {stop_status}")
}
}
@@ -368,7 +368,7 @@ impl AttachmentService {
}
}
/// Simple HTTP request wrapper for calling into attachment service
/// Simple HTTP request wrapper for calling into storage controller
async fn dispatch<RQ, RS>(
&self,
method: hyper::Method,
@@ -496,11 +496,15 @@ impl AttachmentService {
&self,
tenant_id: TenantId,
new_shard_count: u8,
new_stripe_size: Option<ShardStripeSize>,
) -> anyhow::Result<TenantShardSplitResponse> {
self.dispatch(
Method::PUT,
format!("control/v1/tenant/{tenant_id}/shard_split"),
Some(TenantShardSplitRequest { new_shard_count }),
Some(TenantShardSplitRequest {
new_shard_count,
new_stripe_size,
}),
)
.await
}
@@ -70,9 +70,9 @@ Should only be used e.g. for status check/tenant creation/list.
|
||||
Should only be used e.g. for status check.
|
||||
Currently also used for connection from any pageserver to any safekeeper.
|
||||
|
||||
"generations_api": Provides access to the upcall APIs served by the attachment service or the control plane.
|
||||
"generations_api": Provides access to the upcall APIs served by the storage controller or the control plane.
|
||||
|
||||
"admin": Provides access to the control plane and admin APIs of the attachment service.
|
||||
"admin": Provides access to the control plane and admin APIs of the storage controller.
|
||||
|
||||
### CLI
|
||||
CLI generates a key pair during call to `neon_local init` with the following commands:
|
||||
|
||||
@@ -29,7 +29,6 @@ pub mod launch_timestamp;
|
||||
mod wrappers;
|
||||
pub use wrappers::{CountedReader, CountedWriter};
|
||||
mod hll;
|
||||
pub mod metric_vec_duration;
|
||||
pub use hll::{HyperLogLog, HyperLogLogVec};
|
||||
#[cfg(target_os = "linux")]
|
||||
pub mod more_process_metrics;
|
||||
|
||||
@@ -1,23 +0,0 @@
|
||||
//! Helpers for observing duration on `HistogramVec` / `CounterVec` / `GaugeVec` / `MetricVec<T>`.
|
||||
|
||||
use std::{future::Future, time::Instant};
|
||||
|
||||
pub trait DurationResultObserver {
|
||||
fn observe_result<T, E>(&self, res: &Result<T, E>, duration: std::time::Duration);
|
||||
}
|
||||
|
||||
pub async fn observe_async_block_duration_by_result<
|
||||
T,
|
||||
E,
|
||||
F: Future<Output = Result<T, E>>,
|
||||
O: DurationResultObserver,
|
||||
>(
|
||||
observer: &O,
|
||||
block: F,
|
||||
) -> Result<T, E> {
|
||||
let start = Instant::now();
|
||||
let result = block.await;
|
||||
let duration = start.elapsed();
|
||||
observer.observe_result(&result, duration);
|
||||
result
|
||||
}
|
||||
@@ -88,8 +88,6 @@ impl FromStr for NodeAvailability {
|
||||
}
|
||||
}
|
||||
|
||||
/// FIXME: this is a duplicate of the type in the attachment_service crate, because the
|
||||
/// type needs to be defined with diesel traits in there.
|
||||
#[derive(Serialize, Deserialize, Clone, Copy, Eq, PartialEq)]
|
||||
pub enum NodeSchedulingPolicy {
|
||||
Active,
|
||||
|
||||
@@ -198,6 +198,13 @@ pub struct TimelineCreateRequest {
|
||||
#[derive(Serialize, Deserialize)]
|
||||
pub struct TenantShardSplitRequest {
|
||||
pub new_shard_count: u8,
|
||||
|
||||
// A tenant's stripe size is only meaningful the first time their shard count goes
|
||||
// above 1: therefore during a split from 1->N shards, we may modify the stripe size.
|
||||
//
|
||||
// If this is set while the shard count is being increased from an already >1 value,
|
||||
// then the request will fail with 400.
|
||||
pub new_stripe_size: Option<ShardStripeSize>,
|
||||
}
|
||||
|
||||
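For context, a minimal sketch of how a client might build this request. The import paths and the `ShardStripeSize` constructor are assumptions for illustration, not taken from this diff; the rule it encodes is the one described in the comment above (a new stripe size is only accepted while the tenant still has a single shard).

```rust
use pageserver_api::models::TenantShardSplitRequest;
use pageserver_api::shard::ShardStripeSize;

// Hypothetical example: split a tenant into 4 shards, optionally picking a
// stripe size at the same time. Per the comment above, setting `new_stripe_size`
// on an already-sharded tenant makes the server answer 400.
fn build_split_request(currently_sharded: bool) -> TenantShardSplitRequest {
    TenantShardSplitRequest {
        new_shard_count: 4,
        new_stripe_size: if currently_sharded {
            None // stripe size is frozen once shard_count > 1
        } else {
            Some(ShardStripeSize(32768)) // constructor assumed for the sketch
        },
    }
}
```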
#[derive(Serialize, Deserialize)]
|
||||
|
||||
@@ -6,11 +6,18 @@
|
||||
use serde::{Deserialize, Serialize};
|
||||
use utils::id::NodeId;
|
||||
|
||||
use crate::shard::TenantShardId;
|
||||
use crate::{controller_api::NodeRegisterRequest, shard::TenantShardId};
|
||||
|
||||
/// Upcall message sent by the pageserver to the configured `control_plane_api` on
|
||||
/// startup.
|
||||
#[derive(Serialize, Deserialize)]
|
||||
pub struct ReAttachRequest {
|
||||
pub node_id: NodeId,
|
||||
|
||||
/// Optional inline self-registration: this is useful with the storage controller,
|
||||
/// if the node already has a node_id set.
|
||||
#[serde(skip_serializing_if = "Option::is_none", default)]
|
||||
pub register: Option<NodeRegisterRequest>,
|
||||
}
|
||||
|
||||
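A small self-contained sketch (not from the diff) of what this change means on the wire: with `skip_serializing_if` and `default`, a pageserver without registration metadata sends the same re-attach body as before, so older peers keep parsing it. The structs below are reduced stand-ins for the real `pageserver_api` types.

```rust
use serde::{Deserialize, Serialize};

// Stand-in types; the real ones live in pageserver_api::upcall_api and
// pageserver_api::controller_api.
#[derive(Serialize, Deserialize)]
struct NodeRegisterRequest {
    node_id: u64,
    listen_http_addr: String,
    listen_http_port: u16,
}

#[derive(Serialize, Deserialize)]
struct ReAttachRequest {
    node_id: u64,
    #[serde(skip_serializing_if = "Option::is_none", default)]
    register: Option<NodeRegisterRequest>,
}

fn main() {
    // Without registration the field is omitted entirely, matching the old schema.
    let plain = ReAttachRequest { node_id: 1, register: None };
    assert_eq!(serde_json::to_string(&plain).unwrap(), r#"{"node_id":1}"#);

    // `default` lets the new type still parse bodies that lack the field.
    let parsed: ReAttachRequest = serde_json::from_str(r#"{"node_id":1}"#).unwrap();
    assert!(parsed.register.is_none());
}
```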
#[derive(Serialize, Deserialize)]
|
||||
|
||||
@@ -17,6 +17,7 @@ use remote_storage::{
|
||||
};
|
||||
use test_context::test_context;
|
||||
use test_context::AsyncTestContext;
|
||||
use tokio::io::AsyncBufReadExt;
|
||||
use tokio_util::sync::CancellationToken;
|
||||
use tracing::info;
|
||||
|
||||
@@ -484,32 +485,33 @@ async fn download_is_cancelled(ctx: &mut MaybeEnabledStorage) {
|
||||
))
|
||||
.unwrap();
|
||||
|
||||
let len = upload_large_enough_file(&ctx.client, &path, &cancel).await;
|
||||
let file_len = upload_large_enough_file(&ctx.client, &path, &cancel).await;
|
||||
|
||||
{
|
||||
let mut stream = ctx
|
||||
let stream = ctx
|
||||
.client
|
||||
.download(&path, &cancel)
|
||||
.await
|
||||
.expect("download succeeds")
|
||||
.download_stream;
|
||||
|
||||
let first = stream
|
||||
.next()
|
||||
.await
|
||||
.expect("should have the first blob")
|
||||
.expect("should have succeeded");
|
||||
let mut reader = std::pin::pin!(tokio_util::io::StreamReader::new(stream));
|
||||
|
||||
tracing::info!(len = first.len(), "downloaded first chunk");
|
||||
let first = reader.fill_buf().await.expect("should have the first blob");
|
||||
|
||||
let len = first.len();
|
||||
tracing::info!(len, "downloaded first chunk");
|
||||
|
||||
assert!(
|
||||
first.len() < len,
|
||||
first.len() < file_len,
|
||||
"uploaded file is too small, we downloaded all on first chunk"
|
||||
);
|
||||
|
||||
reader.consume(len);
|
||||
|
||||
cancel.cancel();
|
||||
|
||||
let next = stream.next().await.expect("stream should have more");
|
||||
let next = reader.fill_buf().await;
|
||||
|
||||
let e = next.expect_err("expected an error, but got a chunk?");
|
||||
|
||||
@@ -520,6 +522,10 @@ async fn download_is_cancelled(ctx: &mut MaybeEnabledStorage) {
|
||||
.is_some_and(|e| matches!(e, DownloadError::Cancelled)),
|
||||
"{inner:?}"
|
||||
);
|
||||
|
||||
let e = DownloadError::from(e);
|
||||
|
||||
assert!(matches!(e, DownloadError::Cancelled), "{e:?}");
|
||||
}
|
||||
|
||||
let cancel = CancellationToken::new();
|
||||
|
||||
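The rewritten test drives the download through `tokio_util::io::StreamReader` and the `fill_buf`/`consume` pair instead of polling the raw byte stream. A self-contained sketch of that pattern, with made-up chunk contents standing in for a remote download:

```rust
use bytes::Bytes;
use futures::stream;
use tokio::io::AsyncBufReadExt;
use tokio_util::io::StreamReader;

#[tokio::main]
async fn main() -> std::io::Result<()> {
    // Hypothetical two-chunk stream standing in for a download_stream.
    let chunks = stream::iter(vec![
        Ok::<_, std::io::Error>(Bytes::from_static(b"first chunk")),
        Ok(Bytes::from_static(b"second chunk")),
    ]);
    let mut reader = StreamReader::new(chunks);

    // fill_buf exposes the currently buffered bytes without copying them out.
    let first = reader.fill_buf().await?;
    let len = first.len();
    assert_eq!(first, &b"first chunk"[..]);

    // consume marks those bytes as read, so the next fill_buf pulls fresh data.
    reader.consume(len);
    let second = reader.fill_buf().await?;
    assert_eq!(second, &b"second chunk"[..]);
    Ok(())
}
```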
@@ -123,6 +123,12 @@ impl PageserverFeedback {
|
||||
rf.replytime = *PG_EPOCH - Duration::from_micros(-raw_time as u64);
|
||||
}
|
||||
}
|
||||
b"shard_number" => {
|
||||
let len = buf.get_i32();
|
||||
// TODO: this will be implemented in the next update,
|
||||
// for now, we just skip the value.
|
||||
buf.advance(len as usize);
|
||||
}
|
||||
_ => {
|
||||
let len = buf.get_i32();
|
||||
warn!(
|
||||
|
||||
@@ -324,11 +324,11 @@ extern "C" fn finish_sync_safekeepers(wp: *mut WalProposer, lsn: XLogRecPtr) {
|
||||
}
|
||||
}
|
||||
|
||||
extern "C" fn process_safekeeper_feedback(wp: *mut WalProposer, commit_lsn: XLogRecPtr) {
|
||||
extern "C" fn process_safekeeper_feedback(wp: *mut WalProposer) {
|
||||
unsafe {
|
||||
let callback_data = (*(*wp).config).callback_data;
|
||||
let api = callback_data as *mut Box<dyn ApiImpl>;
|
||||
(*api).process_safekeeper_feedback(&mut (*wp), commit_lsn)
|
||||
(*api).process_safekeeper_feedback(&mut (*wp))
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -142,7 +142,7 @@ pub trait ApiImpl {
|
||||
todo!()
|
||||
}
|
||||
|
||||
fn process_safekeeper_feedback(&self, _wp: &mut WalProposer, _commit_lsn: u64) {
|
||||
fn process_safekeeper_feedback(&mut self, _wp: &mut WalProposer) {
|
||||
todo!()
|
||||
}
|
||||
|
||||
|
||||
@@ -7,8 +7,9 @@
|
||||
use anyhow::{anyhow, bail, ensure, Context, Result};
|
||||
use pageserver_api::shard::TenantShardId;
|
||||
use remote_storage::{RemotePath, RemoteStorageConfig};
|
||||
use serde;
|
||||
use serde::de::IntoDeserializer;
|
||||
use std::env;
|
||||
use std::{collections::HashMap, env};
|
||||
use storage_broker::Uri;
|
||||
use utils::crashsafe::path_with_suffix_extension;
|
||||
use utils::id::ConnectionId;
|
||||
@@ -83,6 +84,10 @@ pub mod defaults {
|
||||
|
||||
pub const DEFAULT_INGEST_BATCH_SIZE: u64 = 100;
|
||||
|
||||
#[cfg(target_os = "linux")]
|
||||
pub const DEFAULT_VIRTUAL_FILE_IO_ENGINE: &str = "tokio-epoll-uring";
|
||||
|
||||
#[cfg(not(target_os = "linux"))]
|
||||
pub const DEFAULT_VIRTUAL_FILE_IO_ENGINE: &str = "std-fs";
|
||||
|
||||
pub const DEFAULT_GET_VECTORED_IMPL: &str = "sequential";
|
||||
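A small sketch of how a cfg-gated default like the one above gets resolved at runtime; the enum and the parsing function are invented for illustration and are not the pageserver's actual types.

```rust
#[cfg(target_os = "linux")]
const DEFAULT_VIRTUAL_FILE_IO_ENGINE: &str = "tokio-epoll-uring";
#[cfg(not(target_os = "linux"))]
const DEFAULT_VIRTUAL_FILE_IO_ENGINE: &str = "std-fs";

#[derive(Debug, PartialEq)]
enum IoEngine {
    StdFs,
    TokioEpollUring,
}

// Hypothetical parser: the config string picks the engine, defaulting per platform.
fn parse_engine(s: &str) -> Option<IoEngine> {
    match s {
        "std-fs" => Some(IoEngine::StdFs),
        "tokio-epoll-uring" => Some(IoEngine::TokioEpollUring),
        _ => None,
    }
}

fn main() {
    // On Linux the default is the io_uring-backed engine; elsewhere plain std::fs.
    let engine = parse_engine(DEFAULT_VIRTUAL_FILE_IO_ENGINE).expect("default must parse");
    println!("default io engine: {engine:?}");
}
```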
@@ -300,6 +305,26 @@ impl<T> BuilderValue<T> {
|
||||
}
|
||||
}
|
||||
|
||||
// Certain metadata (e.g. externally-addressable name, AZ) is delivered
|
||||
// as a separate structure. This information is not needed by the pageserver
|
||||
// itself, it is only used for registering the pageserver with the control
|
||||
// plane and/or storage controller.
|
||||
//
|
||||
#[derive(serde::Deserialize)]
|
||||
pub(crate) struct NodeMetadata {
|
||||
#[serde(rename = "host")]
|
||||
pub(crate) postgres_host: String,
|
||||
#[serde(rename = "port")]
|
||||
pub(crate) postgres_port: u16,
|
||||
pub(crate) http_host: String,
|
||||
pub(crate) http_port: u16,
|
||||
|
||||
// Deployment tools may write fields to the metadata file beyond what we
|
||||
// use in this type: this type intentionally only names the fields it requires.
|
||||
#[serde(flatten)]
|
||||
pub(crate) other: HashMap<String, serde_json::Value>,
|
||||
}
|
||||
|
||||
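A sketch of the metadata file this struct is meant to parse. The file contents below are invented for illustration: only the four named fields are required, and anything else a deployment tool writes lands in `other` via `#[serde(flatten)]`.

```rust
use std::collections::HashMap;

use serde::Deserialize;

// Local copy of the struct above, so the example is self-contained.
#[derive(Deserialize, Debug)]
struct NodeMetadata {
    #[serde(rename = "host")]
    postgres_host: String,
    #[serde(rename = "port")]
    postgres_port: u16,
    http_host: String,
    http_port: u16,
    #[serde(flatten)]
    other: HashMap<String, serde_json::Value>,
}

fn main() {
    // Hypothetical metadata.json written by a deployment tool.
    let raw = r#"{
        "host": "pageserver-3.example.internal",
        "port": 6400,
        "http_host": "pageserver-3.example.internal",
        "http_port": 9898,
        "availability_zone": "eu-west-1a"
    }"#;
    let m: NodeMetadata = serde_json::from_str(raw).unwrap();
    assert_eq!(m.other["availability_zone"], "eu-west-1a");
    println!("{m:?}");
}
```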
// needed to simplify config construction
|
||||
struct PageServerConfigBuilder {
|
||||
listen_pg_addr: BuilderValue<String>,
|
||||
@@ -757,6 +782,10 @@ impl PageServerConf {
|
||||
self.workdir.join("deletion")
|
||||
}
|
||||
|
||||
pub fn metadata_path(&self) -> Utf8PathBuf {
|
||||
self.workdir.join("metadata.json")
|
||||
}
|
||||
|
||||
pub fn deletion_list_path(&self, sequence: u64) -> Utf8PathBuf {
|
||||
// Encode a version in the filename, so that if we ever switch away from JSON we can
|
||||
// increment this.
|
||||
|
||||
@@ -2,6 +2,7 @@ use std::collections::HashMap;
|
||||
|
||||
use futures::Future;
|
||||
use pageserver_api::{
|
||||
controller_api::NodeRegisterRequest,
|
||||
shard::TenantShardId,
|
||||
upcall_api::{
|
||||
ReAttachRequest, ReAttachResponse, ValidateRequest, ValidateRequestTenant, ValidateResponse,
|
||||
@@ -12,7 +13,10 @@ use tokio_util::sync::CancellationToken;
|
||||
use url::Url;
|
||||
use utils::{backoff, generation::Generation, id::NodeId};
|
||||
|
||||
use crate::config::PageServerConf;
|
||||
use crate::{
|
||||
config::{NodeMetadata, PageServerConf},
|
||||
virtual_file::on_fatal_io_error,
|
||||
};
|
||||
|
||||
/// The Pageserver's client for using the control plane API: this is a small subset
|
||||
/// of the overall control plane API, for dealing with generations (see docs/rfcs/025-generation-numbers.md)
|
||||
@@ -32,6 +36,7 @@ pub enum RetryForeverError {
|
||||
pub trait ControlPlaneGenerationsApi {
|
||||
fn re_attach(
|
||||
&self,
|
||||
conf: &PageServerConf,
|
||||
) -> impl Future<Output = Result<HashMap<TenantShardId, Generation>, RetryForeverError>> + Send;
|
||||
fn validate(
|
||||
&self,
|
||||
@@ -110,13 +115,59 @@ impl ControlPlaneClient {
|
||||
|
||||
impl ControlPlaneGenerationsApi for ControlPlaneClient {
|
||||
/// Block until we get a successful response, or error out if we are shut down
|
||||
async fn re_attach(&self) -> Result<HashMap<TenantShardId, Generation>, RetryForeverError> {
|
||||
async fn re_attach(
|
||||
&self,
|
||||
conf: &PageServerConf,
|
||||
) -> Result<HashMap<TenantShardId, Generation>, RetryForeverError> {
|
||||
let re_attach_path = self
|
||||
.base_url
|
||||
.join("re-attach")
|
||||
.expect("Failed to build re-attach path");
|
||||
|
||||
// Include registration content in the re-attach request if a metadata file is readable
|
||||
let metadata_path = conf.metadata_path();
|
||||
let register = match tokio::fs::read_to_string(&metadata_path).await {
|
||||
Ok(metadata_str) => match serde_json::from_str::<NodeMetadata>(&metadata_str) {
|
||||
Ok(m) => {
|
||||
// Since we run one time at startup, be generous in our logging and
|
||||
// dump all metadata.
|
||||
tracing::info!(
|
||||
"Loaded node metadata: postgres {}:{}, http {}:{}, other fields: {:?}",
|
||||
m.postgres_host,
|
||||
m.postgres_port,
|
||||
m.http_host,
|
||||
m.http_port,
|
||||
m.other
|
||||
);
|
||||
|
||||
Some(NodeRegisterRequest {
|
||||
node_id: conf.id,
|
||||
listen_pg_addr: m.postgres_host,
|
||||
listen_pg_port: m.postgres_port,
|
||||
listen_http_addr: m.http_host,
|
||||
listen_http_port: m.http_port,
|
||||
})
|
||||
}
|
||||
Err(e) => {
|
||||
tracing::error!("Unreadable metadata in {metadata_path}: {e}");
|
||||
None
|
||||
}
|
||||
},
|
||||
Err(e) => {
|
||||
if e.kind() == std::io::ErrorKind::NotFound {
|
||||
// This is legal: we may have been deployed with some external script
|
||||
// doing registration for us.
|
||||
tracing::info!("Metadata file not found at {metadata_path}");
|
||||
} else {
|
||||
on_fatal_io_error(&e, &format!("Loading metadata at {metadata_path}"))
|
||||
}
|
||||
None
|
||||
}
|
||||
};
|
||||
|
||||
let request = ReAttachRequest {
|
||||
node_id: self.node_id,
|
||||
register,
|
||||
};
|
||||
|
||||
fail::fail_point!("control-plane-client-re-attach");
|
||||
|
||||
@@ -831,7 +831,10 @@ mod test {
|
||||
}
|
||||
|
||||
impl ControlPlaneGenerationsApi for MockControlPlane {
|
||||
async fn re_attach(&self) -> Result<HashMap<TenantShardId, Generation>, RetryForeverError> {
|
||||
async fn re_attach(
|
||||
&self,
|
||||
_conf: &PageServerConf,
|
||||
) -> Result<HashMap<TenantShardId, Generation>, RetryForeverError> {
|
||||
unimplemented!()
|
||||
}
|
||||
async fn validate(
|
||||
|
||||
@@ -932,6 +932,59 @@ paths:
|
||||
schema:
|
||||
$ref: "#/components/schemas/ServiceUnavailableError"
|
||||
|
||||
/v1/tenant/{tenant_shard_id}/heatmap_upload:
|
||||
parameters:
|
||||
- name: tenant_shard_id
|
||||
in: path
|
||||
required: true
|
||||
schema:
|
||||
type: string
|
||||
post:
|
||||
description: |
|
||||
If the location is in an attached mode, upload the current state to the remote heatmap
|
||||
responses:
|
||||
"200":
|
||||
description: Success
|
||||
"500":
|
||||
description: Generic operation error
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/Error"
|
||||
"503":
|
||||
description: Temporarily unavailable, please retry.
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/ServiceUnavailableError"
|
||||
|
||||
/v1/tenant/{tenant_shard_id}/secondary/download:
|
||||
parameters:
|
||||
- name: tenant_shard_id
|
||||
in: path
|
||||
required: true
|
||||
schema:
|
||||
type: string
|
||||
post:
|
||||
description: |
|
||||
If the location is in secondary mode, download latest heatmap and layers
|
||||
responses:
|
||||
"200":
|
||||
description: Success
|
||||
"500":
|
||||
description: Generic operation error
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/Error"
|
||||
"503":
|
||||
description: Temporarily unavailable, please retry.
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/ServiceUnavailableError"
|
||||
|
||||
|
||||
|
||||
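A hedged client-side sketch of calling the new secondary download endpoint described above; the base URL and the tenant shard id are placeholders, and error handling is minimal.

```rust
// Sketch only: trigger an on-demand heatmap/layer download for a secondary location.
#[tokio::main]
async fn main() -> Result<(), reqwest::Error> {
    let base = "http://localhost:9898";
    let tenant_shard_id = "3fa85f64f1e14f2fb0e3b1f59d6d8c7a-0102"; // placeholder
    let url = format!("{base}/v1/tenant/{tenant_shard_id}/secondary/download");

    let resp = reqwest::Client::new().post(url).send().await?;
    // 200 on success; 503 means "temporarily unavailable, please retry".
    println!("secondary download returned {}", resp.status());
    Ok(())
}
```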
/v1/tenant/{tenant_id}/timeline/:
|
||||
parameters:
|
||||
@@ -1391,7 +1444,7 @@ components:
|
||||
trace_read_requests:
|
||||
type: boolean
|
||||
heatmap_period:
|
||||
type: integer
|
||||
type: string
|
||||
TenantConfigResponse:
|
||||
type: object
|
||||
properties:
|
||||
|
||||
@@ -1151,7 +1151,12 @@ async fn tenant_shard_split_handler(
|
||||
|
||||
let new_shards = state
|
||||
.tenant_manager
|
||||
.shard_split(tenant_shard_id, ShardCount::new(req.new_shard_count), &ctx)
|
||||
.shard_split(
|
||||
tenant_shard_id,
|
||||
ShardCount::new(req.new_shard_count),
|
||||
req.new_stripe_size,
|
||||
&ctx,
|
||||
)
|
||||
.await
|
||||
.map_err(ApiError::InternalServerError)?;
|
||||
|
||||
@@ -2103,16 +2108,6 @@ where
|
||||
R: std::future::Future<Output = Result<Response<Body>, ApiError>> + Send + 'static,
|
||||
H: FnOnce(Request<Body>, CancellationToken) -> R + Send + Sync + 'static,
|
||||
{
|
||||
if request.uri() != &"/v1/failpoints".parse::<Uri>().unwrap() {
|
||||
fail::fail_point!("api-503", |_| Err(ApiError::ResourceUnavailable(
|
||||
"failpoint".into()
|
||||
)));
|
||||
|
||||
fail::fail_point!("api-500", |_| Err(ApiError::InternalServerError(
|
||||
anyhow::anyhow!("failpoint")
|
||||
)));
|
||||
}
|
||||
|
||||
// Spawn a new task to handle the request, to protect the handler from unexpected
|
||||
// async cancellations. Most pageserver functions are not async cancellation safe.
|
||||
// We arm a drop-guard, so that if Hyper drops the Future, we signal the task
|
||||
|
||||
@@ -1,5 +1,4 @@
|
||||
use enum_map::EnumMap;
|
||||
use metrics::metric_vec_duration::DurationResultObserver;
|
||||
use metrics::{
|
||||
register_counter_vec, register_gauge_vec, register_histogram, register_histogram_vec,
|
||||
register_int_counter, register_int_counter_pair_vec, register_int_counter_vec,
|
||||
@@ -1283,11 +1282,65 @@ pub(crate) static BASEBACKUP_QUERY_TIME: Lazy<BasebackupQueryTime> = Lazy::new(|
|
||||
})
|
||||
});
|
||||
|
||||
impl DurationResultObserver for BasebackupQueryTime {
|
||||
fn observe_result<T, E>(&self, res: &Result<T, E>, duration: std::time::Duration) {
|
||||
pub(crate) struct BasebackupQueryTimeOngoingRecording<'a, 'c> {
|
||||
parent: &'a BasebackupQueryTime,
|
||||
ctx: &'c RequestContext,
|
||||
start: std::time::Instant,
|
||||
}
|
||||
|
||||
impl BasebackupQueryTime {
|
||||
pub(crate) fn start_recording<'c: 'a, 'a>(
|
||||
&'a self,
|
||||
ctx: &'c RequestContext,
|
||||
) -> BasebackupQueryTimeOngoingRecording<'_, '_> {
|
||||
let start = Instant::now();
|
||||
match ctx.micros_spent_throttled.open() {
|
||||
Ok(()) => (),
|
||||
Err(error) => {
|
||||
use utils::rate_limit::RateLimit;
|
||||
static LOGGED: Lazy<Mutex<RateLimit>> =
|
||||
Lazy::new(|| Mutex::new(RateLimit::new(Duration::from_secs(10))));
|
||||
let mut rate_limit = LOGGED.lock().unwrap();
|
||||
rate_limit.call(|| {
|
||||
warn!(error, "error opening micros_spent_throttled; this message is logged at a global rate limit");
|
||||
});
|
||||
}
|
||||
}
|
||||
BasebackupQueryTimeOngoingRecording {
|
||||
parent: self,
|
||||
ctx,
|
||||
start,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl<'a, 'c> BasebackupQueryTimeOngoingRecording<'a, 'c> {
|
||||
pub(crate) fn observe<T, E>(self, res: &Result<T, E>) {
|
||||
let elapsed = self.start.elapsed();
|
||||
let ex_throttled = self
|
||||
.ctx
|
||||
.micros_spent_throttled
|
||||
.close_and_checked_sub_from(elapsed);
|
||||
let ex_throttled = match ex_throttled {
|
||||
Ok(ex_throttled) => ex_throttled,
|
||||
Err(error) => {
|
||||
use utils::rate_limit::RateLimit;
|
||||
static LOGGED: Lazy<Mutex<RateLimit>> =
|
||||
Lazy::new(|| Mutex::new(RateLimit::new(Duration::from_secs(10))));
|
||||
let mut rate_limit = LOGGED.lock().unwrap();
|
||||
rate_limit.call(|| {
|
||||
warn!(error, "error deducting time spent throttled; this message is logged at a global rate limit");
|
||||
});
|
||||
elapsed
|
||||
}
|
||||
};
|
||||
let label_value = if res.is_ok() { "ok" } else { "error" };
|
||||
let metric = self.0.get_metric_with_label_values(&[label_value]).unwrap();
|
||||
metric.observe(duration.as_secs_f64());
|
||||
let metric = self
|
||||
.parent
|
||||
.0
|
||||
.get_metric_with_label_values(&[label_value])
|
||||
.unwrap();
|
||||
metric.observe(ex_throttled.as_secs_f64());
|
||||
}
|
||||
}
|
||||
|
||||
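A stripped-down sketch of the pattern this replaces the removed `DurationResultObserver` with: start a recording up front, run the work, then label the observation by outcome. The types below are simplified stand-ins for the real metric types, not the pageserver API.

```rust
use std::time::{Duration, Instant};

// Simplified stand-in for BasebackupQueryTime: two buckets keyed by outcome.
#[derive(Default)]
struct QueryTime {
    ok: Vec<Duration>,
    error: Vec<Duration>,
}

struct OngoingRecording<'a> {
    parent: &'a mut QueryTime,
    start: Instant,
}

impl QueryTime {
    fn start_recording(&mut self) -> OngoingRecording<'_> {
        OngoingRecording { parent: self, start: Instant::now() }
    }
}

impl<'a> OngoingRecording<'a> {
    // Consuming `self` makes it hard to record the same request twice.
    fn observe<T, E>(self, res: &Result<T, E>) {
        let elapsed = self.start.elapsed();
        if res.is_ok() {
            self.parent.ok.push(elapsed);
        } else {
            self.parent.error.push(elapsed);
        }
    }
}

fn main() {
    let mut metric = QueryTime::default();
    let recording = metric.start_recording();
    let res: Result<(), ()> = Ok(());
    recording.observe(&res);
    assert_eq!(metric.ok.len(), 1);
}
```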
@@ -1964,10 +2017,8 @@ impl TimelineMetrics {
|
||||
pub(crate) fn resident_physical_size_get(&self) -> u64 {
|
||||
self.resident_physical_size_gauge.get()
|
||||
}
|
||||
}
|
||||
|
||||
impl Drop for TimelineMetrics {
|
||||
fn drop(&mut self) {
|
||||
pub(crate) fn shutdown(&self) {
|
||||
let tenant_id = &self.tenant_id;
|
||||
let timeline_id = &self.timeline_id;
|
||||
let shard_id = &self.shard_id;
|
||||
@@ -2623,6 +2674,12 @@ pub fn preinitialize_metrics() {
|
||||
Lazy::force(&crate::tenant::storage_layer::layer::LAYER_IMPL_METRICS);
|
||||
Lazy::force(&disk_usage_based_eviction::METRICS);
|
||||
|
||||
for state_name in pageserver_api::models::TenantState::VARIANTS {
|
||||
// initialize the metric for all gauges, otherwise the time series might seemingly show
|
||||
// values from last restart.
|
||||
TENANT_STATE_METRIC.with_label_values(&[state_name]).set(0);
|
||||
}
|
||||
|
||||
// countervecs
|
||||
[&BACKGROUND_LOOP_PERIOD_OVERRUN_COUNT]
|
||||
.into_iter()
|
||||
|
||||
@@ -1199,7 +1199,7 @@ impl PageServerHandler {
|
||||
prev_lsn: Option<Lsn>,
|
||||
full_backup: bool,
|
||||
gzip: bool,
|
||||
ctx: RequestContext,
|
||||
ctx: &RequestContext,
|
||||
) -> Result<(), QueryError>
|
||||
where
|
||||
IO: AsyncRead + AsyncWrite + Send + Sync + Unpin,
|
||||
@@ -1214,7 +1214,7 @@ impl PageServerHandler {
|
||||
if let Some(lsn) = lsn {
|
||||
// Backup was requested at a particular LSN. Wait for it to arrive.
|
||||
info!("waiting for {}", lsn);
|
||||
timeline.wait_lsn(lsn, &ctx).await?;
|
||||
timeline.wait_lsn(lsn, ctx).await?;
|
||||
timeline
|
||||
.check_lsn_is_in_scope(lsn, &latest_gc_cutoff_lsn)
|
||||
.context("invalid basebackup lsn")?;
|
||||
@@ -1236,7 +1236,7 @@ impl PageServerHandler {
|
||||
lsn,
|
||||
prev_lsn,
|
||||
full_backup,
|
||||
&ctx,
|
||||
ctx,
|
||||
)
|
||||
.await?;
|
||||
} else {
|
||||
@@ -1257,7 +1257,7 @@ impl PageServerHandler {
|
||||
lsn,
|
||||
prev_lsn,
|
||||
full_backup,
|
||||
&ctx,
|
||||
ctx,
|
||||
)
|
||||
.await?;
|
||||
// shutdown the encoder to ensure the gzip footer is written
|
||||
@@ -1269,7 +1269,7 @@ impl PageServerHandler {
|
||||
lsn,
|
||||
prev_lsn,
|
||||
full_backup,
|
||||
&ctx,
|
||||
ctx,
|
||||
)
|
||||
.await?;
|
||||
}
|
||||
@@ -1449,25 +1449,25 @@ where
|
||||
false
|
||||
};
|
||||
|
||||
::metrics::metric_vec_duration::observe_async_block_duration_by_result(
|
||||
&*metrics::BASEBACKUP_QUERY_TIME,
|
||||
async move {
|
||||
self.handle_basebackup_request(
|
||||
pgb,
|
||||
tenant_id,
|
||||
timeline_id,
|
||||
lsn,
|
||||
None,
|
||||
false,
|
||||
gzip,
|
||||
ctx,
|
||||
)
|
||||
.await?;
|
||||
pgb.write_message_noflush(&BeMessage::CommandComplete(b"SELECT 1"))?;
|
||||
Result::<(), QueryError>::Ok(())
|
||||
},
|
||||
)
|
||||
.await?;
|
||||
let metric_recording = metrics::BASEBACKUP_QUERY_TIME.start_recording(&ctx);
|
||||
let res = async {
|
||||
self.handle_basebackup_request(
|
||||
pgb,
|
||||
tenant_id,
|
||||
timeline_id,
|
||||
lsn,
|
||||
None,
|
||||
false,
|
||||
gzip,
|
||||
&ctx,
|
||||
)
|
||||
.await?;
|
||||
pgb.write_message_noflush(&BeMessage::CommandComplete(b"SELECT 1"))?;
|
||||
Result::<(), QueryError>::Ok(())
|
||||
}
|
||||
.await;
|
||||
metric_recording.observe(&res);
|
||||
res?;
|
||||
}
|
||||
// return pair of prev_lsn and last_lsn
|
||||
else if query_string.starts_with("get_last_record_rlsn ") {
|
||||
@@ -1563,7 +1563,7 @@ where
|
||||
prev_lsn,
|
||||
true,
|
||||
false,
|
||||
ctx,
|
||||
&ctx,
|
||||
)
|
||||
.await?;
|
||||
pgb.write_message_noflush(&BeMessage::CommandComplete(b"SELECT 1"))?;
|
||||
|
||||
@@ -272,9 +272,6 @@ pub enum TaskKind {
|
||||
// Task that uploads a file to remote storage
|
||||
RemoteUploadTask,
|
||||
|
||||
// Task that downloads a file from remote storage
|
||||
RemoteDownloadTask,
|
||||
|
||||
// task that handles the initial downloading of all tenants
|
||||
InitialLoad,
|
||||
|
||||
|
||||
@@ -1846,6 +1846,8 @@ impl Tenant {
|
||||
// Wait for any in-flight operations to complete
|
||||
self.gate.close().await;
|
||||
|
||||
remove_tenant_metrics(&self.tenant_shard_id);
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
@@ -3557,11 +3559,6 @@ async fn run_initdb(
|
||||
Ok(())
|
||||
}
|
||||
|
||||
impl Drop for Tenant {
|
||||
fn drop(&mut self) {
|
||||
remove_tenant_metrics(&self.tenant_shard_id);
|
||||
}
|
||||
}
|
||||
/// Dump contents of a layer file to stdout.
|
||||
pub async fn dump_layerfile_from_path(
|
||||
path: &Utf8Path,
|
||||
@@ -4628,10 +4625,7 @@ mod tests {
|
||||
drop(guard);
|
||||
|
||||
// Pick a big LSN such that we query over all the changes.
|
||||
// Technically, u64::MAX - 1 is the largest LSN supported by the read path,
|
||||
// but there seems to be a bug on the non-vectored search path which surfaces
|
||||
// in that case.
|
||||
let reads_lsn = Lsn(u64::MAX - 1000);
|
||||
let reads_lsn = Lsn(u64::MAX - 1);
|
||||
|
||||
for read in reads {
|
||||
info!("Doing vectored read on {:?}", read);
|
||||
@@ -5148,4 +5142,23 @@ mod tests {
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_read_at_max_lsn() -> anyhow::Result<()> {
|
||||
let harness = TenantHarness::create("test_read_at_max_lsn")?;
|
||||
let (tenant, ctx) = harness.load().await;
|
||||
let tline = tenant
|
||||
.create_test_timeline(TIMELINE_ID, Lsn(0x08), DEFAULT_PG_VERSION, &ctx)
|
||||
.await?;
|
||||
|
||||
let lsn = Lsn(0x10);
|
||||
bulk_insert_compact_gc(tline.clone(), &ctx, lsn, 50, 10000).await?;
|
||||
|
||||
let test_key = Key::from_hex("010000000033333333444444445500000000").unwrap();
|
||||
let read_lsn = Lsn(u64::MAX - 1);
|
||||
|
||||
assert!(tline.get(test_key, read_lsn, &ctx).await.is_ok());
|
||||
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
@@ -354,6 +354,7 @@ pub struct TenantConf {
|
||||
/// If non-zero, the period between uploads of a heatmap from attached tenants. This
|
||||
/// may be disabled if a Tenant will not have secondary locations: only secondary
|
||||
/// locations will use the heatmap uploaded by attached locations.
|
||||
#[serde(with = "humantime_serde")]
|
||||
pub heatmap_period: Duration,
|
||||
|
||||
/// If true then SLRU segments are downloaded on demand, if false SLRU segments are included in basebackup
|
||||
|
||||
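A small round-trip showing why the matching OpenAPI type for `heatmap_period` becomes a string: `humantime_serde` encodes the duration in human-readable form. The struct here is hypothetical; only the attribute and field name mirror the tenant config above.

```rust
use std::time::Duration;

use serde::{Deserialize, Serialize};

#[derive(Serialize, Deserialize)]
struct HeatmapConfig {
    #[serde(with = "humantime_serde")]
    heatmap_period: Duration,
}

fn main() {
    let cfg = HeatmapConfig { heatmap_period: Duration::from_secs(300) };
    // Serializes as a human-readable string such as "5m", not an integer.
    let json = serde_json::to_string(&cfg).unwrap();
    println!("{json}");
    let back: HeatmapConfig = serde_json::from_str(&json).unwrap();
    assert_eq!(back.heatmap_period, Duration::from_secs(300));
}
```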
@@ -6,7 +6,9 @@ use futures::stream::StreamExt;
|
||||
use itertools::Itertools;
|
||||
use pageserver_api::key::Key;
|
||||
use pageserver_api::models::ShardParameters;
|
||||
use pageserver_api::shard::{ShardCount, ShardIdentity, ShardNumber, TenantShardId};
|
||||
use pageserver_api::shard::{
|
||||
ShardCount, ShardIdentity, ShardNumber, ShardStripeSize, TenantShardId,
|
||||
};
|
||||
use rand::{distributions::Alphanumeric, Rng};
|
||||
use std::borrow::Cow;
|
||||
use std::cmp::Ordering;
|
||||
@@ -295,7 +297,7 @@ async fn init_load_generations(
|
||||
} else if let Some(client) = ControlPlaneClient::new(conf, cancel) {
|
||||
info!("Calling control plane API to re-attach tenants");
|
||||
// If we are configured to use the control plane API, then it is the source of truth for what tenants to load.
|
||||
match client.re_attach().await {
|
||||
match client.re_attach(conf).await {
|
||||
Ok(tenants) => tenants,
|
||||
Err(RetryForeverError::ShuttingDown) => {
|
||||
anyhow::bail!("Shut down while waiting for control plane re-attach response")
|
||||
@@ -1439,36 +1441,12 @@ impl TenantManager {
|
||||
&self,
|
||||
tenant_shard_id: TenantShardId,
|
||||
new_shard_count: ShardCount,
|
||||
ctx: &RequestContext,
|
||||
) -> anyhow::Result<Vec<TenantShardId>> {
|
||||
let r = self
|
||||
.do_shard_split(tenant_shard_id, new_shard_count, ctx)
|
||||
.await;
|
||||
if r.is_err() {
|
||||
// Shard splitting might have left the original shard in a partially shut down state (it
|
||||
// stops the shard's remote timeline client). Reset it to ensure we leave things in
|
||||
// a working state.
|
||||
if self.get(tenant_shard_id).is_some() {
|
||||
tracing::warn!("Resetting {tenant_shard_id} after shard split failure");
|
||||
if let Err(e) = self.reset_tenant(tenant_shard_id, false, ctx).await {
|
||||
// Log this error because our return value will still be the original error, not this one.
|
||||
tracing::warn!("Failed to reset {tenant_shard_id}: {e}");
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
r
|
||||
}
|
||||
|
||||
pub(crate) async fn do_shard_split(
|
||||
&self,
|
||||
tenant_shard_id: TenantShardId,
|
||||
new_shard_count: ShardCount,
|
||||
new_stripe_size: Option<ShardStripeSize>,
|
||||
ctx: &RequestContext,
|
||||
) -> anyhow::Result<Vec<TenantShardId>> {
|
||||
let tenant = get_tenant(tenant_shard_id, true)?;
|
||||
|
||||
// Plan: identify what the new child shards will be
|
||||
// Validate the incoming request
|
||||
if new_shard_count.count() <= tenant_shard_id.shard_count.count() {
|
||||
anyhow::bail!("Requested shard count is not an increase");
|
||||
}
|
||||
@@ -1477,10 +1455,18 @@ impl TenantManager {
|
||||
anyhow::bail!("Requested split is not a power of two");
|
||||
}
|
||||
|
||||
let parent_shard_identity = tenant.shard_identity;
|
||||
let parent_tenant_conf = tenant.get_tenant_conf();
|
||||
let parent_generation = tenant.generation;
|
||||
if let Some(new_stripe_size) = new_stripe_size {
|
||||
if tenant.get_shard_stripe_size() != new_stripe_size
|
||||
&& tenant_shard_id.shard_count.count() > 1
|
||||
{
|
||||
// This tenant already has multiple shards, it is illegal to try and change its stripe size
|
||||
anyhow::bail!(
|
||||
"Shard stripe size may not be modified once tenant has multiple shards"
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
// Plan: identify what the new child shards will be
|
||||
let child_shards = tenant_shard_id.split(new_shard_count);
|
||||
tracing::info!(
|
||||
"Shard {} splits into: {}",
|
||||
@@ -1491,9 +1477,9 @@ impl TenantManager {
|
||||
.join(",")
|
||||
);
|
||||
|
||||
fail::fail_point!("shard-split-pre-prepare", |_| Err(anyhow::anyhow!(
|
||||
"failpoint"
|
||||
)));
|
||||
let parent_shard_identity = tenant.shard_identity;
|
||||
let parent_tenant_conf = tenant.get_tenant_conf();
|
||||
let parent_generation = tenant.generation;
|
||||
|
||||
// Phase 1: Write out child shards' remote index files, in the parent tenant's current generation
|
||||
if let Err(e) = tenant.split_prepare(&child_shards).await {
|
||||
@@ -1504,10 +1490,6 @@ impl TenantManager {
|
||||
return Err(e);
|
||||
}
|
||||
|
||||
fail::fail_point!("shard-split-post-prepare", |_| Err(anyhow::anyhow!(
|
||||
"failpoint"
|
||||
)));
|
||||
|
||||
self.resources.deletion_queue_client.flush_advisory();
|
||||
|
||||
// Phase 2: Put the parent shard to InProgress and grab a reference to the parent Tenant
|
||||
@@ -1529,16 +1511,11 @@ impl TenantManager {
|
||||
anyhow::bail!("Detached parent shard in the middle of split!")
|
||||
}
|
||||
};
|
||||
fail::fail_point!("shard-split-pre-hardlink", |_| Err(anyhow::anyhow!(
|
||||
"failpoint"
|
||||
)));
|
||||
|
||||
// Optimization: hardlink layers from the parent into the children, so that they don't have to
|
||||
// re-download & duplicate the data referenced in their initial IndexPart
|
||||
self.shard_split_hardlink(parent, child_shards.clone())
|
||||
.await?;
|
||||
fail::fail_point!("shard-split-post-hardlink", |_| Err(anyhow::anyhow!(
|
||||
"failpoint"
|
||||
)));
|
||||
|
||||
// Take a snapshot of where the parent's WAL ingest had got to: we will wait for
|
||||
// child shards to reach this point.
|
||||
@@ -1553,6 +1530,9 @@ impl TenantManager {
|
||||
// Phase 3: Spawn the child shards
|
||||
for child_shard in &child_shards {
|
||||
let mut child_shard_identity = parent_shard_identity;
|
||||
if let Some(new_stripe_size) = new_stripe_size {
|
||||
child_shard_identity.stripe_size = new_stripe_size;
|
||||
}
|
||||
child_shard_identity.count = child_shard.shard_count;
|
||||
child_shard_identity.number = child_shard.shard_number;
|
||||
|
||||
@@ -1575,10 +1555,6 @@ impl TenantManager {
|
||||
.await?;
|
||||
}
|
||||
|
||||
fail::fail_point!("shard-split-post-child-conf", |_| Err(anyhow::anyhow!(
|
||||
"failpoint"
|
||||
)));
|
||||
|
||||
// Phase 4: wait for child shards' WAL ingest to catch up to target LSN
|
||||
for child_shard_id in &child_shards {
|
||||
let child_shard_id = *child_shard_id;
|
||||
@@ -1611,10 +1587,6 @@ impl TenantManager {
|
||||
timeline.timeline_id,
|
||||
target_lsn
|
||||
);
|
||||
|
||||
fail::fail_point!("shard-split-lsn-wait", |_| Err(anyhow::anyhow!(
|
||||
"failpoint"
|
||||
)));
|
||||
if let Err(e) = timeline.wait_lsn(*target_lsn, ctx).await {
|
||||
// Failure here might mean shutdown, in any case this part is an optimization
|
||||
// and we shouldn't hold up the split operation.
|
||||
@@ -1660,10 +1632,6 @@ impl TenantManager {
|
||||
},
|
||||
);
|
||||
|
||||
fail::fail_point!("shard-split-pre-finish", |_| Err(anyhow::anyhow!(
|
||||
"failpoint"
|
||||
)));
|
||||
|
||||
parent_slot_guard.drop_old_value()?;
|
||||
|
||||
// Phase 6: Release the InProgress on the parent shard
|
||||
|
||||
@@ -536,6 +536,18 @@ impl Drop for LayerInner {
|
||||
// carry this until we are finished for [`Layer::wait_drop`] support
|
||||
let _status = status;
|
||||
|
||||
let Some(timeline) = timeline.upgrade() else {
|
||||
// no need to nag that timeline is gone: under normal situation on
|
||||
// task_mgr::remove_tenant_from_memory the timeline is gone before we get dropped.
|
||||
LAYER_IMPL_METRICS.inc_deletes_failed(DeleteFailed::TimelineGone);
|
||||
return;
|
||||
};
|
||||
|
||||
let Ok(_guard) = timeline.gate.enter() else {
|
||||
LAYER_IMPL_METRICS.inc_deletes_failed(DeleteFailed::TimelineGone);
|
||||
return;
|
||||
};
|
||||
|
||||
let removed = match std::fs::remove_file(path) {
|
||||
Ok(()) => true,
|
||||
Err(e) if e.kind() == std::io::ErrorKind::NotFound => {
|
||||
@@ -554,32 +566,26 @@ impl Drop for LayerInner {
|
||||
}
|
||||
};
|
||||
|
||||
if let Some(timeline) = timeline.upgrade() {
|
||||
if removed {
|
||||
timeline.metrics.resident_physical_size_sub(file_size);
|
||||
}
|
||||
if let Some(remote_client) = timeline.remote_client.as_ref() {
|
||||
let res = remote_client.schedule_deletion_of_unlinked(vec![(file_name, meta)]);
|
||||
if removed {
|
||||
timeline.metrics.resident_physical_size_sub(file_size);
|
||||
}
|
||||
if let Some(remote_client) = timeline.remote_client.as_ref() {
|
||||
let res = remote_client.schedule_deletion_of_unlinked(vec![(file_name, meta)]);
|
||||
|
||||
if let Err(e) = res {
|
||||
// test_timeline_deletion_with_files_stuck_in_upload_queue is good at
|
||||
// demonstrating this deadlock (without spawn_blocking): stop will drop
|
||||
// queued items, which will have ResidentLayer's, and those drops would try
|
||||
// to re-entrantly lock the RemoteTimelineClient inner state.
|
||||
if !timeline.is_active() {
|
||||
tracing::info!("scheduling deletion on drop failed: {e:#}");
|
||||
} else {
|
||||
tracing::warn!("scheduling deletion on drop failed: {e:#}");
|
||||
}
|
||||
LAYER_IMPL_METRICS.inc_deletes_failed(DeleteFailed::DeleteSchedulingFailed);
|
||||
if let Err(e) = res {
|
||||
// test_timeline_deletion_with_files_stuck_in_upload_queue is good at
|
||||
// demonstrating this deadlock (without spawn_blocking): stop will drop
|
||||
// queued items, which will have ResidentLayer's, and those drops would try
|
||||
// to re-entrantly lock the RemoteTimelineClient inner state.
|
||||
if !timeline.is_active() {
|
||||
tracing::info!("scheduling deletion on drop failed: {e:#}");
|
||||
} else {
|
||||
LAYER_IMPL_METRICS.inc_completed_deletes();
|
||||
tracing::warn!("scheduling deletion on drop failed: {e:#}");
|
||||
}
|
||||
LAYER_IMPL_METRICS.inc_deletes_failed(DeleteFailed::DeleteSchedulingFailed);
|
||||
} else {
|
||||
LAYER_IMPL_METRICS.inc_completed_deletes();
|
||||
}
|
||||
} else {
|
||||
// no need to nag that timeline is gone: under normal situation on
|
||||
// task_mgr::remove_tenant_from_memory the timeline is gone before we get dropped.
|
||||
LAYER_IMPL_METRICS.inc_deletes_failed(DeleteFailed::TimelineGone);
|
||||
}
|
||||
});
|
||||
}
|
||||
@@ -880,23 +886,18 @@ impl LayerInner {
|
||||
) -> Result<heavier_once_cell::InitPermit, DownloadError> {
|
||||
debug_assert_current_span_has_tenant_and_timeline_id();
|
||||
|
||||
let task_name = format!("download layer {}", self);
|
||||
|
||||
let (tx, rx) = tokio::sync::oneshot::channel();
|
||||
|
||||
// this is sadly needed because of task_mgr::shutdown_tasks, otherwise we cannot
|
||||
// block tenant::mgr::remove_tenant_from_memory.
|
||||
|
||||
let this: Arc<Self> = self.clone();
|
||||
|
||||
crate::task_mgr::spawn(
|
||||
&tokio::runtime::Handle::current(),
|
||||
crate::task_mgr::TaskKind::RemoteDownloadTask,
|
||||
Some(self.desc.tenant_shard_id),
|
||||
Some(self.desc.timeline_id),
|
||||
&task_name,
|
||||
false,
|
||||
async move {
|
||||
let guard = timeline
|
||||
.gate
|
||||
.enter()
|
||||
.map_err(|_| DownloadError::DownloadCancelled)?;
|
||||
|
||||
tokio::task::spawn(async move {
|
||||
|
||||
let _guard = guard;
|
||||
|
||||
let client = timeline
|
||||
.remote_client
|
||||
@@ -906,7 +907,7 @@ impl LayerInner {
|
||||
let result = client.download_layer_file(
|
||||
&this.desc.filename(),
|
||||
&this.metadata(),
|
||||
&crate::task_mgr::shutdown_token()
|
||||
&timeline.cancel
|
||||
)
|
||||
.await;
|
||||
|
||||
@@ -929,7 +930,6 @@ impl LayerInner {
|
||||
|
||||
tokio::select! {
|
||||
_ = tokio::time::sleep(backoff) => {},
|
||||
_ = crate::task_mgr::shutdown_token().cancelled_owned() => {},
|
||||
_ = timeline.cancel.cancelled() => {},
|
||||
};
|
||||
|
||||
@@ -959,11 +959,10 @@ impl LayerInner {
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
.in_current_span(),
|
||||
);
|
||||
|
||||
match rx.await {
|
||||
Ok((Ok(()), permit)) => {
|
||||
if let Some(reason) = self
|
||||
@@ -1102,6 +1101,10 @@ impl LayerInner {
|
||||
return Err(EvictionCancelled::TimelineGone);
|
||||
};
|
||||
|
||||
let Ok(_gate) = timeline.gate.enter() else {
|
||||
return Err(EvictionCancelled::TimelineGone);
|
||||
};
|
||||
|
||||
// to avoid starting a new download while we evict, keep holding on to the
|
||||
// permit.
|
||||
let _permit = {
|
||||
|
||||
@@ -1257,6 +1257,8 @@ impl Timeline {
|
||||
|
||||
// Finally wait until any gate-holders are complete
|
||||
self.gate.close().await;
|
||||
|
||||
self.metrics.shutdown();
|
||||
}
|
||||
|
||||
pub(crate) fn set_state(&self, new_state: TimelineState) {
|
||||
@@ -2476,7 +2478,7 @@ impl Timeline {
|
||||
// 'prev_lsn' tracks the last LSN that we were at in our search. It's used
|
||||
// to check that each iteration make some progress, to break infinite
|
||||
// looping if something goes wrong.
|
||||
let mut prev_lsn = Lsn(u64::MAX);
|
||||
let mut prev_lsn = None;
|
||||
|
||||
let mut result = ValueReconstructResult::Continue;
|
||||
let mut cont_lsn = Lsn(request_lsn.0 + 1);
|
||||
@@ -2496,18 +2498,20 @@ impl Timeline {
|
||||
MATERIALIZED_PAGE_CACHE_HIT.inc_by(1);
|
||||
return Ok(traversal_path);
|
||||
}
|
||||
if prev_lsn <= cont_lsn {
|
||||
// Didn't make any progress in last iteration. Error out to avoid
|
||||
// getting stuck in the loop.
|
||||
return Err(layer_traversal_error(format!(
|
||||
"could not find layer with more data for key {} at LSN {}, request LSN {}, ancestor {}",
|
||||
key,
|
||||
Lsn(cont_lsn.0 - 1),
|
||||
request_lsn,
|
||||
timeline.ancestor_lsn
|
||||
), traversal_path));
|
||||
if let Some(prev) = prev_lsn {
|
||||
if prev <= cont_lsn {
|
||||
// Didn't make any progress in last iteration. Error out to avoid
|
||||
// getting stuck in the loop.
|
||||
return Err(layer_traversal_error(format!(
|
||||
"could not find layer with more data for key {} at LSN {}, request LSN {}, ancestor {}",
|
||||
key,
|
||||
Lsn(cont_lsn.0 - 1),
|
||||
request_lsn,
|
||||
timeline.ancestor_lsn
|
||||
), traversal_path));
|
||||
}
|
||||
}
|
||||
prev_lsn = cont_lsn;
|
||||
prev_lsn = Some(cont_lsn);
|
||||
}
|
||||
ValueReconstructResult::Missing => {
|
||||
return Err(layer_traversal_error(
|
||||
@@ -2537,7 +2541,7 @@ impl Timeline {
|
||||
|
||||
timeline_owned = timeline.get_ready_ancestor_timeline(ctx).await?;
|
||||
timeline = &*timeline_owned;
|
||||
prev_lsn = Lsn(u64::MAX);
|
||||
prev_lsn = None;
|
||||
continue 'outer;
|
||||
}
|
||||
|
||||
|
||||
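The progress check above replaces the `Lsn(u64::MAX)` sentinel with an `Option`, so the first iteration cannot spuriously trip the guard. A minimal, self-contained illustration of that shape, with `u64` standing in for `Lsn` and the traversal reduced to a counter:

```rust
// Minimal illustration of the loop-progress guard; u64 stands in for Lsn.
fn walk(mut cont_lsn: u64) -> Result<(), String> {
    let mut prev_lsn: Option<u64> = None;
    loop {
        if let Some(prev) = prev_lsn {
            if prev <= cont_lsn {
                // No progress since the previous iteration: bail out instead of spinning.
                return Err(format!("stuck at lsn {cont_lsn}"));
            }
        }
        prev_lsn = Some(cont_lsn);

        // Stand-in for the actual layer traversal, which normally moves cont_lsn down.
        if cont_lsn == 0 {
            return Ok(());
        }
        cont_lsn -= 1;
    }
}

fn main() {
    assert!(walk(3).is_ok());
}
```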
@@ -149,7 +149,7 @@ hnsw_check_available_memory(Size requested)
|
||||
struct sysinfo si;
|
||||
Size total;
|
||||
if (sysinfo(&si) < 0)
|
||||
elog(ERROR, "Failed to get amount of RAM: %n");
|
||||
elog(ERROR, "Failed to get amount of RAM: %m");
|
||||
|
||||
total = si.totalram*si.mem_unit;
|
||||
if ((Size)NBuffers*BLCKSZ + requested >= total)
|
||||
|
||||
@@ -21,7 +21,7 @@ SHLIB_LINK_INTERNAL = $(libpq)
|
||||
SHLIB_LINK = -lcurl
|
||||
|
||||
EXTENSION = neon
|
||||
DATA = neon--1.0.sql neon--1.0--1.1.sql neon--1.1--1.2.sql neon--1.2--1.3.sql
|
||||
DATA = neon--1.0.sql neon--1.0--1.1.sql neon--1.1--1.2.sql neon--1.2--1.3.sql neon--1.3--1.2.sql neon--1.2--1.1.sql neon--1.1--1.0.sql
|
||||
PGFILEDESC = "neon - cloud storage for PostgreSQL"
|
||||
|
||||
EXTRA_CLEAN = \
|
||||
|
||||
@@ -38,7 +38,6 @@ neon_download_extension_file_http(const char *filename, bool is_library)
|
||||
|
||||
CURLcode res;
|
||||
char *compute_ctl_url;
|
||||
char *postdata;
|
||||
bool ret = false;
|
||||
|
||||
if (handle == NULL)
|
||||
|
||||
@@ -316,6 +316,7 @@ pageserver_connect(shardno_t shard_no, int elevel)
|
||||
static uint64_t delay_us = MIN_RECONNECT_INTERVAL_USEC;
|
||||
TimestampTz now;
|
||||
uint64_t us_since_last_connect;
|
||||
bool broke_from_loop = false;
|
||||
|
||||
Assert(page_servers[shard_no].conn == NULL);
|
||||
|
||||
@@ -418,7 +419,9 @@ pageserver_connect(shardno_t shard_no, int elevel)
|
||||
|
||||
neon_shard_log(shard_no, elevel, "could not complete handshake with pageserver: %s",
|
||||
msg);
|
||||
return false;
|
||||
/* Returning from inside PG_TRY is bad, so we break/return later */
|
||||
broke_from_loop = true;
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -431,6 +434,11 @@ pageserver_connect(shardno_t shard_no, int elevel)
|
||||
}
|
||||
PG_END_TRY();
|
||||
|
||||
if (broke_from_loop)
|
||||
{
|
||||
return false;
|
||||
}
|
||||
|
||||
neon_shard_log(shard_no, LOG, "libpagestore: connected to '%s'", connstr);
|
||||
page_servers[shard_no].conn = conn;
|
||||
page_servers[shard_no].wes = wes;
|
||||
|
||||
6
pgxn/neon/neon--1.1--1.0.sql
Normal file
@@ -0,0 +1,6 @@
|
||||
-- the order of operations is important here
|
||||
-- because the view depends on the function
|
||||
|
||||
DROP VIEW IF EXISTS neon_lfc_stats CASCADE;
|
||||
|
||||
DROP FUNCTION IF EXISTS neon_get_lfc_stats CASCADE;
|
||||
1
pgxn/neon/neon--1.2--1.1.sql
Normal file
@@ -0,0 +1 @@
|
||||
DROP VIEW IF EXISTS NEON_STAT_FILE_CACHE CASCADE;
|
||||
1
pgxn/neon/neon--1.3--1.2.sql
Normal file
@@ -0,0 +1 @@
|
||||
DROP FUNCTION IF EXISTS approximate_working_set_size(bool) CASCADE;
|
||||
@@ -95,7 +95,6 @@ get_num_snap_files_lsn_threshold(void)
|
||||
DIR *dirdesc;
|
||||
struct dirent *de;
|
||||
char *snap_path = "pg_logical/snapshots/";
|
||||
int cnt = 0;
|
||||
int lsns_allocated = 1024;
|
||||
int lsns_num = 0;
|
||||
XLogRecPtr *lsns;
|
||||
@@ -161,9 +160,6 @@ get_num_snap_files_lsn_threshold(void)
|
||||
PGDLLEXPORT void
|
||||
LogicalSlotsMonitorMain(Datum main_arg)
|
||||
{
|
||||
TimestampTz now,
|
||||
last_checked;
|
||||
|
||||
/* Establish signal handlers. */
|
||||
pqsignal(SIGUSR1, procsignal_sigusr1_handler);
|
||||
pqsignal(SIGHUP, SignalHandlerForConfigReload);
|
||||
|
||||
@@ -1888,7 +1888,6 @@ neon_zeroextend(SMgrRelation reln, ForkNumber forkNum, BlockNumber blocknum,
|
||||
int nblocks, bool skipFsync)
|
||||
{
|
||||
const PGAlignedBlock buffer = {0};
|
||||
BlockNumber curblocknum = blocknum;
|
||||
int remblocks = nblocks;
|
||||
XLogRecPtr lsn = 0;
|
||||
|
||||
|
||||
@@ -1220,7 +1220,7 @@ PrepareAppendRequest(WalProposer *wp, AppendRequestHeader *req, XLogRecPtr begin
|
||||
req->epochStartLsn = wp->propEpochStartLsn;
|
||||
req->beginLsn = beginLsn;
|
||||
req->endLsn = endLsn;
|
||||
req->commitLsn = GetAcknowledgedByQuorumWALPosition(wp);
|
||||
req->commitLsn = wp->commitLsn;
|
||||
req->truncateLsn = wp->truncateLsn;
|
||||
req->proposerId = wp->greetRequest.proposerId;
|
||||
}
|
||||
@@ -1405,7 +1405,7 @@ static bool
|
||||
RecvAppendResponses(Safekeeper *sk)
|
||||
{
|
||||
WalProposer *wp = sk->wp;
|
||||
XLogRecPtr minQuorumLsn;
|
||||
XLogRecPtr newCommitLsn;
|
||||
bool readAnything = false;
|
||||
|
||||
while (true)
|
||||
@@ -1444,18 +1444,19 @@ RecvAppendResponses(Safekeeper *sk)
|
||||
if (!readAnything)
|
||||
return sk->state == SS_ACTIVE;
|
||||
|
||||
HandleSafekeeperResponse(wp);
|
||||
|
||||
/* update commit_lsn */
|
||||
newCommitLsn = GetAcknowledgedByQuorumWALPosition(wp);
|
||||
/*
|
||||
* Also send the new commit lsn to all the safekeepers.
|
||||
* Send the new value to all safekeepers.
|
||||
*/
|
||||
minQuorumLsn = GetAcknowledgedByQuorumWALPosition(wp);
|
||||
if (minQuorumLsn > wp->lastSentCommitLsn)
|
||||
if (newCommitLsn > wp->commitLsn)
|
||||
{
|
||||
wp->commitLsn = newCommitLsn;
|
||||
BroadcastAppendRequest(wp);
|
||||
wp->lastSentCommitLsn = minQuorumLsn;
|
||||
}
|
||||
|
||||
HandleSafekeeperResponse(wp);
|
||||
|
||||
return sk->state == SS_ACTIVE;
|
||||
}
|
||||
|
||||
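In Rust-flavoured pseudocode (the real change is in C), the idea of caching `commitLsn` is roughly: recompute the quorum-acknowledged position once per batch of responses and re-broadcast only when it actually advances. Everything below is an illustrative stand-in, not the walproposer API.

```rust
// Illustrative only: u64 stands in for XLogRecPtr.
struct WalProposer {
    commit_lsn: u64,
}

impl WalProposer {
    // `quorum_lsn` would come from GetAcknowledgedByQuorumWALPosition().
    fn on_append_responses(&mut self, quorum_lsn: u64, broadcast: &mut dyn FnMut(u64)) {
        if quorum_lsn > self.commit_lsn {
            self.commit_lsn = quorum_lsn;
            // Only re-broadcast when the commit position advanced, instead of
            // tracking a separate lastSentCommitLsn.
            broadcast(self.commit_lsn);
        }
    }
}

fn main() {
    let mut wp = WalProposer { commit_lsn: 0 };
    let mut sent = Vec::new();
    wp.on_append_responses(10, &mut |lsn| sent.push(lsn));
    wp.on_append_responses(10, &mut |lsn| sent.push(lsn)); // no advance, no broadcast
    assert_eq!(sent, vec![10]);
}
```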
@@ -1632,11 +1633,9 @@ GetDonor(WalProposer *wp, XLogRecPtr *donor_lsn)
|
||||
static void
|
||||
HandleSafekeeperResponse(WalProposer *wp)
|
||||
{
|
||||
XLogRecPtr minQuorumLsn;
|
||||
XLogRecPtr candidateTruncateLsn;
|
||||
|
||||
minQuorumLsn = GetAcknowledgedByQuorumWALPosition(wp);
|
||||
wp->api.process_safekeeper_feedback(wp, minQuorumLsn);
|
||||
wp->api.process_safekeeper_feedback(wp);
|
||||
|
||||
/*
|
||||
* Try to advance truncateLsn -- the last record flushed to all
|
||||
@@ -1649,7 +1648,7 @@ HandleSafekeeperResponse(WalProposer *wp)
|
||||
* can't commit entries from previous term' in Raft); 2)
|
||||
*/
|
||||
candidateTruncateLsn = CalculateMinFlushLsn(wp);
|
||||
candidateTruncateLsn = Min(candidateTruncateLsn, minQuorumLsn);
|
||||
candidateTruncateLsn = Min(candidateTruncateLsn, wp->commitLsn);
|
||||
if (candidateTruncateLsn > wp->truncateLsn)
|
||||
{
|
||||
wp->truncateLsn = candidateTruncateLsn;
|
||||
|
||||
@@ -564,7 +564,7 @@ typedef struct walproposer_api
|
||||
* backpressure feedback and to confirm WAL persistence (has been committed
|
||||
* on the quorum of safekeepers).
|
||||
*/
|
||||
void (*process_safekeeper_feedback) (WalProposer *wp, XLogRecPtr commitLsn);
|
||||
void (*process_safekeeper_feedback) (WalProposer *wp);
|
||||
|
||||
/*
|
||||
* Write a log message to the internal log processor. This is used only
|
||||
@@ -646,8 +646,8 @@ typedef struct WalProposer
|
||||
/* WAL has been generated up to this point */
|
||||
XLogRecPtr availableLsn;
|
||||
|
||||
/* last commitLsn broadcasted to safekeepers */
|
||||
XLogRecPtr lastSentCommitLsn;
|
||||
/* cached GetAcknowledgedByQuorumWALPosition result */
|
||||
XLogRecPtr commitLsn;
|
||||
|
||||
ProposerGreeting greetRequest;
|
||||
|
||||
|
||||
@@ -68,6 +68,8 @@ static WalproposerShmemState *walprop_shared;
|
||||
static WalProposerConfig walprop_config;
|
||||
static XLogRecPtr sentPtr = InvalidXLogRecPtr;
|
||||
static const walproposer_api walprop_pg;
|
||||
static volatile sig_atomic_t got_SIGUSR2 = false;
|
||||
static bool reported_sigusr2 = false;
|
||||
|
||||
static void nwp_shmem_startup_hook(void);
|
||||
static void nwp_register_gucs(void);
|
||||
@@ -101,6 +103,8 @@ static void add_nwr_event_set(Safekeeper *sk, uint32 events);
|
||||
static void update_nwr_event_set(Safekeeper *sk, uint32 events);
|
||||
static void rm_safekeeper_event_set(Safekeeper *to_remove, bool is_sk);
|
||||
|
||||
static void CheckGracefulShutdown(WalProposer *wp);
|
||||
|
||||
static XLogRecPtr GetLogRepRestartLSN(WalProposer *wp);
|
||||
|
||||
static void
|
||||
@@ -492,6 +496,24 @@ walprop_pg_init_standalone_sync_safekeepers(void)
|
||||
BackgroundWorkerUnblockSignals();
|
||||
}
|
||||
|
||||
/*
|
||||
* We pretend to be a walsender process, and the lifecycle of a walsender is
|
||||
* slightly different from other processes. At shutdown, walsender processes
|
||||
* stay alive until the very end, after the checkpointer has written the
|
||||
* shutdown checkpoint. When the checkpointer exits, the postmaster sends all
|
||||
* remaining walsender processes SIGUSR2. On receiving SIGUSR2, we try to send
|
||||
* the remaining WAL, and then exit. This ensures that the checkpoint record
|
||||
* reaches durable storage (in safekeepers), before the server shuts down
|
||||
* completely.
|
||||
*/
|
||||
static void
|
||||
walprop_sigusr2(SIGNAL_ARGS)
|
||||
{
|
||||
got_SIGUSR2 = true;
|
||||
|
||||
SetLatch(MyLatch);
|
||||
}
|
||||
|
||||
static void
|
||||
walprop_pg_init_bgworker(void)
|
||||
{
|
||||
@@ -503,6 +525,7 @@ walprop_pg_init_bgworker(void)
|
||||
pqsignal(SIGUSR1, procsignal_sigusr1_handler);
|
||||
pqsignal(SIGHUP, SignalHandlerForConfigReload);
|
||||
pqsignal(SIGTERM, die);
|
||||
pqsignal(SIGUSR2, walprop_sigusr2);
|
||||
|
||||
BackgroundWorkerUnblockSignals();
|
||||
|
||||
@@ -1026,7 +1049,7 @@ static void
|
||||
StartProposerReplication(WalProposer *wp, StartReplicationCmd *cmd)
|
||||
{
|
||||
XLogRecPtr FlushPtr;
|
||||
TimeLineID currTLI;
|
||||
__attribute__((unused)) TimeLineID currTLI;
|
||||
|
||||
#if PG_VERSION_NUM < 150000
|
||||
if (ThisTimeLineID == 0)
|
||||
@@ -1075,14 +1098,26 @@ StartProposerReplication(WalProposer *wp, StartReplicationCmd *cmd)
|
||||
#endif
|
||||
|
||||
/*
|
||||
* When we first start replication the standby will be behind the primary.
|
||||
* For some applications, for example synchronous replication, it is
|
||||
* important to have a clear state for this initial catchup mode, so we
|
||||
* can trigger actions when we change streaming state later. We may stay
|
||||
* in this state for a long time, which is exactly why we want to be able
|
||||
* to monitor whether or not we are still here.
|
||||
* XXX: Move straight to STOPPING state, skipping the STREAMING state.
|
||||
*
|
||||
* This is a bit weird. Normal walsenders stay in STREAMING state, until
|
||||
* the checkpointer signals them that it is about to start writing the
|
||||
* shutdown checkpoint. The walsenders acknowledge that they have received
|
||||
* that signal by switching to STOPPING state. That tells the walsenders
|
||||
* that they must not write any new WAL.
|
||||
*
|
||||
* However, we cannot easily intercept that signal from the checkpointer.
|
||||
* It's sent by WalSndInitStopping(), using
|
||||
* SendProcSignal(PROCSIGNAL_WALSND_INIT_STOPPING). It's received by
|
||||
* HandleWalSndInitStopping, which sets a process-local got_STOPPING flag.
|
||||
* However, that's all private to walsender.c.
|
||||
*
|
||||
* We don't need to do anything special upon receiving the signal, the
|
||||
* walproposer doesn't write any WAL anyway, so we skip the STREAMING
|
||||
* state and go directly to STOPPING mode. That way, the checkpointer
|
||||
* won't wait for us.
|
||||
*/
|
||||
WalSndSetState(WALSNDSTATE_CATCHUP);
|
||||
WalSndSetState(WALSNDSTATE_STOPPING);
|
||||
|
||||
/*
|
||||
* Don't allow a request to stream from a future point in WAL that hasn't
|
||||
@@ -1122,6 +1157,8 @@ StartProposerReplication(WalProposer *wp, StartReplicationCmd *cmd)
|
||||
static void
|
||||
WalSndLoop(WalProposer *wp)
|
||||
{
|
||||
XLogRecPtr flushPtr;
|
||||
|
||||
/* Clear any already-pending wakeups */
|
||||
ResetLatch(MyLatch);
|
||||
|
||||
@@ -1130,9 +1167,6 @@ WalSndLoop(WalProposer *wp)
|
||||
CHECK_FOR_INTERRUPTS();
|
||||
|
||||
XLogBroadcastWalProposer(wp);
|
||||
|
||||
if (MyWalSnd->state == WALSNDSTATE_CATCHUP)
|
||||
WalSndSetState(WALSNDSTATE_STREAMING);
|
||||
WalProposerPoll(wp);
|
||||
}
|
||||
}
|
||||
@@ -1230,7 +1264,6 @@ WalProposerRecovery(WalProposer *wp, Safekeeper *sk)
|
||||
TimeLineID timeline;
|
||||
XLogRecPtr startpos;
|
||||
XLogRecPtr endpos;
|
||||
uint64 download_range_mb;
|
||||
|
||||
startpos = GetLogRepRestartLSN(wp);
|
||||
if (startpos == InvalidXLogRecPtr)
|
||||
@@ -1745,6 +1778,9 @@ walprop_pg_wait_event_set(WalProposer *wp, long timeout, Safekeeper **sk, uint32
|
||||
{
|
||||
ConditionVariableCancelSleep();
|
||||
ResetLatch(MyLatch);
|
||||
|
||||
CheckGracefulShutdown(wp);
|
||||
|
||||
*events = WL_LATCH_SET;
|
||||
return 1;
|
||||
}
|
||||
@@ -1798,6 +1834,41 @@ walprop_pg_finish_sync_safekeepers(WalProposer *wp, XLogRecPtr lsn)
|
||||
exit(0);
|
||||
}
|
||||
|
||||
/*
|
||||
* Like vanilla walsender, on sigusr2 send all remaining WAL and exit.
|
||||
*
|
||||
* Note that unlike sync-safekeepers waiting here is not reliable: we
|
||||
* don't check that a majority of safekeepers received and persisted
|
||||
* commit_lsn -- only that walproposer reached it (which immediately
|
||||
* broadcasts new value). Doing that without incurring redundant control
|
||||
* file syncing would need wp -> sk protocol change. OTOH unlike
|
||||
* sync-safekeepers which must bump commit_lsn or basebackup will fail,
|
||||
* this catchup is important only for tests where safekeepers/network
|
||||
* don't crash on their own.
|
||||
*/
|
||||
static void
|
||||
CheckGracefulShutdown(WalProposer *wp)
|
||||
{
|
||||
if (got_SIGUSR2)
|
||||
{
|
||||
if (!reported_sigusr2)
|
||||
{
|
||||
XLogRecPtr flushPtr = walprop_pg_get_flush_rec_ptr(wp);
|
||||
|
||||
wpg_log(LOG, "walproposer will send and wait for remaining WAL between %X/%X and %X/%X",
|
||||
LSN_FORMAT_ARGS(wp->commitLsn), LSN_FORMAT_ARGS(flushPtr));
|
||||
reported_sigusr2 = true;
|
||||
}
|
||||
|
||||
if (wp->commitLsn >= walprop_pg_get_flush_rec_ptr(wp))
|
||||
{
|
||||
wpg_log(LOG, "walproposer sent all WAL up to %X/%X, exiting",
|
||||
LSN_FORMAT_ARGS(wp->commitLsn));
|
||||
proc_exit(0);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* Choose most advanced PageserverFeedback and set it to *rf.
|
||||
*/
|
||||
@@ -1878,7 +1949,7 @@ CombineHotStanbyFeedbacks(HotStandbyFeedback *hs, WalProposer *wp)
|
||||
* None of that is functional in sync-safekeepers.
|
||||
*/
|
||||
static void
|
||||
walprop_pg_process_safekeeper_feedback(WalProposer *wp, XLogRecPtr commitLsn)
|
||||
walprop_pg_process_safekeeper_feedback(WalProposer *wp)
|
||||
{
|
||||
HotStandbyFeedback hsFeedback;
|
||||
XLogRecPtr oldDiskConsistentLsn;
|
||||
@@ -1893,10 +1964,10 @@ walprop_pg_process_safekeeper_feedback(WalProposer *wp, XLogRecPtr commitLsn)
|
||||
replication_feedback_set(&quorumFeedback.rf);
|
||||
SetZenithCurrentClusterSize(quorumFeedback.rf.currentClusterSize);
|
||||
|
||||
if (commitLsn > quorumFeedback.flushLsn || oldDiskConsistentLsn != quorumFeedback.rf.disk_consistent_lsn)
|
||||
if (wp->commitLsn > quorumFeedback.flushLsn || oldDiskConsistentLsn != quorumFeedback.rf.disk_consistent_lsn)
|
||||
{
|
||||
if (commitLsn > quorumFeedback.flushLsn)
|
||||
quorumFeedback.flushLsn = commitLsn;
|
||||
if (wp->commitLsn > quorumFeedback.flushLsn)
|
||||
quorumFeedback.flushLsn = wp->commitLsn;
|
||||
|
||||
/*
|
||||
* Advance the replication slot to commitLsn. WAL before it is
|
||||
@@ -1929,6 +2000,8 @@ walprop_pg_process_safekeeper_feedback(WalProposer *wp, XLogRecPtr commitLsn)
|
||||
XidFromFullTransactionId(hsFeedback.catalog_xmin),
|
||||
EpochFromFullTransactionId(hsFeedback.catalog_xmin));
|
||||
}
|
||||
|
||||
CheckGracefulShutdown(wp);
|
||||
}
|
||||
|
||||
static XLogRecPtr
|
||||
|
||||
@@ -182,8 +182,6 @@ test_consume_memory(PG_FUNCTION_ARGS)
|
||||
Datum
|
||||
test_release_memory(PG_FUNCTION_ARGS)
|
||||
{
|
||||
TimestampTz start;
|
||||
|
||||
if (PG_ARGISNULL(0))
|
||||
{
|
||||
if (consume_cxt)
|
||||
|
||||
@@ -220,6 +220,9 @@ enter_seccomp_mode(void)
|
||||
}
|
||||
#endif /* HAVE_LIBSECCOMP */
|
||||
|
||||
PGDLLEXPORT void
|
||||
WalRedoMain(int argc, char *argv[]);
|
||||
|
||||
/*
|
||||
* Entry point for the WAL redo process.
|
||||
*
|
||||
|
||||
@@ -73,7 +73,7 @@ pub mod errors {
                // Status 406: endpoint is disabled (we don't allow connections).
                format!("{REQUEST_FAILED}: endpoint is disabled")
            }
            http::StatusCode::LOCKED => {
            http::StatusCode::LOCKED | http::StatusCode::UNPROCESSABLE_ENTITY => {
                // Status 423: project might be in maintenance mode (or bad state), or quotas exceeded.
                format!("{REQUEST_FAILED}: endpoint is temporary unavailable. check your quotas and/or contact our support")
            }
@@ -91,6 +91,12 @@ pub mod errors {
                status: http::StatusCode::NOT_FOUND | http::StatusCode::NOT_ACCEPTABLE,
                ..
            } => crate::error::ErrorKind::User,
            ApiError::Console {
                status: http::StatusCode::UNPROCESSABLE_ENTITY,
                text,
            } if text.contains("compute time quota of non-primary branches is exceeded") => {
                crate::error::ErrorKind::User
            }
            ApiError::Console {
                status: http::StatusCode::LOCKED,
                text,
@@ -120,6 +126,11 @@ pub mod errors {
                status: http::StatusCode::BAD_REQUEST,
                ..
            } => true,
            // don't retry when quotas are exceeded
            Self::Console {
                status: http::StatusCode::UNPROCESSABLE_ENTITY,
                ref text,
            } => !text.contains("compute time quota of non-primary branches is exceeded"),
            // locked can be returned when the endpoint was in transition
            // or when quotas are exceeded. don't retry when quotas are exceeded
            Self::Console {
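The two hunks above make the proxy treat a 422 response carrying the non-primary-branch quota message like a 423: it is reported as a user error and never retried. A minimal standalone sketch of that retry decision, assuming the `http` crate; `ConsoleError` and `could_retry` are illustrative names, not the proxy's real types:

```rust
use http::StatusCode;

/// Illustrative stand-in for a console API error: a status plus response text.
struct ConsoleError {
    status: StatusCode,
    text: String,
}

/// Retry only transient failures: quota-exhaustion responses are permanent
/// from the caller's point of view, so retrying them only wastes time.
fn could_retry(e: &ConsoleError) -> bool {
    let quota_msg = "compute time quota of non-primary branches is exceeded";
    match e.status {
        StatusCode::BAD_REQUEST => true,
        StatusCode::UNPROCESSABLE_ENTITY => !e.text.contains(quota_msg),
        // 423 can mean "endpoint in transition" (retryable) or quotas exceeded
        // (not); the real classification inspects the response text.
        StatusCode::LOCKED => !e.text.contains("quota"),
        _ => false,
    }
}

fn main() {
    let quota = ConsoleError {
        status: StatusCode::UNPROCESSABLE_ENTITY,
        text: "compute time quota of non-primary branches is exceeded".into(),
    };
    assert!(!could_retry(&quota));
}
```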
@@ -17,7 +17,7 @@ use pin_project_lite::pin_project;
use tokio::io::{AsyncRead, AsyncReadExt, AsyncWrite, ReadBuf};
use uuid::Uuid;

use crate::{metrics::NUM_CLIENT_CONNECTION_GAUGE, serverless::tls_listener::AsyncAccept};
use crate::metrics::NUM_CLIENT_CONNECTION_GAUGE;

pub struct ProxyProtocolAccept {
    pub incoming: AddrIncoming,
@@ -331,15 +331,15 @@ impl<T: AsyncRead> AsyncRead for WithClientIp<T> {
    }
}

impl AsyncAccept for ProxyProtocolAccept {
    type Connection = WithConnectionGuard<WithClientIp<AddrStream>>;
impl Accept for ProxyProtocolAccept {
    type Conn = WithConnectionGuard<WithClientIp<AddrStream>>;

    type Error = io::Error;

    fn poll_accept(
        mut self: Pin<&mut Self>,
        cx: &mut Context<'_>,
    ) -> Poll<Option<Result<Self::Connection, Self::Error>>> {
    ) -> Poll<Option<Result<Self::Conn, Self::Error>>> {
        let conn = ready!(Pin::new(&mut self.incoming).poll_accept(cx)?);
        tracing::info!(protocol = self.protocol, "accepted new TCP connection");
        let Some(conn) = conn else {
@@ -69,6 +69,12 @@ fn report_error(e: &WakeComputeError, retry: bool) {
        {
            "quota_exceeded"
        }
        WakeComputeError::ApiError(ApiError::Console {
            status: StatusCode::UNPROCESSABLE_ENTITY,
            ref text,
        }) if text.contains("compute time quota of non-primary branches is exceeded") => {
            "quota_exceeded"
        }
        WakeComputeError::ApiError(ApiError::Console {
            status: StatusCode::LOCKED,
            ..
@@ -21,24 +21,19 @@ pub use reqwest_retry::{policies::ExponentialBackoff, RetryTransientMiddleware};
use tokio_util::task::TaskTracker;

use crate::context::RequestMonitoring;
use crate::metrics::TLS_HANDSHAKE_FAILURES;
use crate::protocol2::{ProxyProtocolAccept, WithClientIp, WithConnectionGuard};
use crate::rate_limiter::EndpointRateLimiter;
use crate::serverless::backend::PoolingBackend;
use crate::{cancellation::CancellationHandler, config::ProxyConfig};
use futures::StreamExt;
use hyper::{
    server::{
        accept,
        conn::{AddrIncoming, AddrStream},
    },
    server::conn::{AddrIncoming, AddrStream},
    Body, Method, Request, Response,
};

use std::convert::Infallible;
use std::net::IpAddr;
use std::sync::Arc;
use std::task::Poll;
use std::{future::ready, sync::Arc};
use tls_listener::TlsListener;
use tokio::net::TcpListener;
use tokio_util::sync::CancellationToken;
@@ -105,19 +100,12 @@ pub async fn task_main(
    let ws_connections = tokio_util::task::task_tracker::TaskTracker::new();
    ws_connections.close(); // allows `ws_connections.wait to complete`

    let tls_listener = TlsListener::new(tls_acceptor, addr_incoming).filter(|conn| {
        if let Err(err) = conn {
            error!(
                protocol = "http",
                "failed to accept TLS connection: {err:?}"
            );
            TLS_HANDSHAKE_FAILURES.inc();
            ready(false)
        } else {
            info!(protocol = "http", "accepted new TLS connection");
            ready(true)
        }
    });
    let tls_listener = TlsListener::new(
        tls_acceptor,
        addr_incoming,
        "http",
        config.handshake_timeout,
    );

    let make_svc = hyper::service::make_service_fn(
        |stream: &tokio_rustls::server::TlsStream<
@@ -174,7 +162,7 @@ pub async fn task_main(
        },
    );

    hyper::Server::builder(accept::from_stream(tls_listener))
    hyper::Server::builder(tls_listener)
        .serve(make_svc)
        .with_graceful_shutdown(cancellation_token.cancelled())
        .await?;
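With the rewritten `TlsListener` implementing hyper's `Accept` trait itself, the `accept::from_stream` adapter and the manual `filter` over handshake results go away; the listener is handed to `hyper::Server::builder` directly. As a reminder of what that trait involves, here is a minimal sketch of an `Accept` implementation over a plain TCP listener, assuming hyper 0.14; `TcpAcceptor` is an illustrative type, not part of this codebase:

```rust
use std::convert::Infallible;
use std::pin::Pin;
use std::task::{Context, Poll};

use hyper::server::accept::Accept;
use hyper::service::{make_service_fn, service_fn};
use hyper::{Body, Response, Server};
use tokio::net::{TcpListener, TcpStream};

/// Illustrative acceptor: yields plain TCP connections to hyper.
/// A real proxy would run a TLS handshake on each connection first.
struct TcpAcceptor {
    listener: TcpListener,
}

impl Accept for TcpAcceptor {
    type Conn = TcpStream;
    type Error = std::io::Error;

    fn poll_accept(
        self: Pin<&mut Self>,
        cx: &mut Context<'_>,
    ) -> Poll<Option<Result<Self::Conn, Self::Error>>> {
        // TcpListener is Unpin, so get_mut() is fine here.
        match self.get_mut().listener.poll_accept(cx) {
            Poll::Ready(Ok((stream, _peer))) => Poll::Ready(Some(Ok(stream))),
            Poll::Ready(Err(e)) => Poll::Ready(Some(Err(e))),
            Poll::Pending => Poll::Pending,
        }
    }
}

#[tokio::main]
async fn main() -> Result<(), Box<dyn std::error::Error>> {
    let listener = TcpListener::bind("127.0.0.1:0").await?;
    let acceptor = TcpAcceptor { listener };

    // The server is built straight from the acceptor, analogous to
    // `hyper::Server::builder(tls_listener)` in the diff above.
    let make_svc = make_service_fn(|_conn: &TcpStream| async {
        Ok::<_, Infallible>(service_fn(|_req| async {
            Ok::<_, Infallible>(Response::new(Body::from("ok")))
        }))
    });
    Server::builder(acceptor).serve(make_svc).await?;
    Ok(())
}
```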
@@ -12,6 +12,7 @@ use crate::{
        CachedNodeInfo,
    },
    context::RequestMonitoring,
    error::{ErrorKind, ReportableError, UserFacingError},
    proxy::connect_compute::ConnectMechanism,
};

@@ -117,6 +118,30 @@ pub enum HttpConnError {
    WakeCompute(#[from] WakeComputeError),
}

impl ReportableError for HttpConnError {
    fn get_error_kind(&self) -> ErrorKind {
        match self {
            HttpConnError::ConnectionClosedAbruptly(_) => ErrorKind::Compute,
            HttpConnError::ConnectionError(p) => p.get_error_kind(),
            HttpConnError::GetAuthInfo(a) => a.get_error_kind(),
            HttpConnError::AuthError(a) => a.get_error_kind(),
            HttpConnError::WakeCompute(w) => w.get_error_kind(),
        }
    }
}

impl UserFacingError for HttpConnError {
    fn to_string_client(&self) -> String {
        match self {
            HttpConnError::ConnectionClosedAbruptly(_) => self.to_string(),
            HttpConnError::ConnectionError(p) => p.to_string(),
            HttpConnError::GetAuthInfo(c) => c.to_string_client(),
            HttpConnError::AuthError(c) => c.to_string_client(),
            HttpConnError::WakeCompute(c) => c.to_string_client(),
        }
    }
}

struct TokioMechanism {
    pool: Arc<GlobalConnPool<tokio_postgres::Client>>,
    conn_info: ConnInfo,
@@ -119,16 +119,12 @@ impl<C: ClientInnerExt> EndpointConnPool<C> {
        }
    }

    fn put(
        pool: &RwLock<Self>,
        conn_info: &ConnInfo,
        client: ClientInner<C>,
    ) -> anyhow::Result<()> {
    fn put(pool: &RwLock<Self>, conn_info: &ConnInfo, client: ClientInner<C>) {
        let conn_id = client.conn_id;

        if client.is_closed() {
            info!(%conn_id, "pool: throwing away connection '{conn_info}' because connection is closed");
            return Ok(());
            return;
        }
        let global_max_conn = pool.read().global_pool_size_max_conns;
        if pool
@@ -138,7 +134,7 @@ impl<C: ClientInnerExt> EndpointConnPool<C> {
            >= global_max_conn
        {
            info!(%conn_id, "pool: throwing away connection '{conn_info}' because pool is full");
            return Ok(());
            return;
        }

        // return connection to the pool
@@ -172,8 +168,6 @@ impl<C: ClientInnerExt> EndpointConnPool<C> {
        } else {
            info!(%conn_id, "pool: throwing away connection '{conn_info}' because pool is full, total_conns={total_conns}");
        }

        Ok(())
    }
}

@@ -612,13 +606,6 @@ impl<C: ClientInnerExt> Client<C> {
        let inner = inner.as_mut().expect("client inner should not be removed");
        (&mut inner.inner, Discard { pool, conn_info })
    }

    pub fn check_idle(&mut self, status: ReadyForQueryStatus) {
        self.inner().1.check_idle(status)
    }
    pub fn discard(&mut self) {
        self.inner().1.discard()
    }
}

impl<C: ClientInnerExt> Discard<'_, C> {
@@ -660,7 +647,7 @@ impl<C: ClientInnerExt> Client<C> {
        // return connection to the pool
        return Some(move || {
            let _span = current_span.enter();
            let _ = EndpointConnPool::put(&conn_pool, &conn_info, client);
            EndpointConnPool::put(&conn_pool, &conn_info, client);
        });
    }
    None
@@ -739,7 +726,7 @@ mod tests {
    {
        let mut client = Client::new(create_inner(), conn_info.clone(), ep_pool.clone());
        assert_eq!(0, pool.get_global_connections_count());
        client.discard();
        client.inner().1.discard();
        // Discard should not add the connection from the pool.
        assert_eq!(0, pool.get_global_connections_count());
    }
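The `put` signature change above reflects that returning a connection to the pool can no longer fail: a closed connection or a full pool just drops the client. A reduced sketch of that infallible-put shape, with `MiniPool` and `Conn` as illustrative stand-ins rather than the proxy's real types:

```rust
/// Illustrative connection handle: just enough state to decide poolability.
struct Conn {
    id: u64,
    closed: bool,
}

struct MiniPool {
    conns: Vec<Conn>,
    max_conns: usize,
}

impl MiniPool {
    /// Infallible put: connections that cannot be pooled are silently dropped,
    /// mirroring the plain `return;` paths in the diff above.
    fn put(&mut self, conn: Conn) {
        if conn.closed {
            println!("pool: throwing away connection {} because it is closed", conn.id);
            return;
        }
        if self.conns.len() >= self.max_conns {
            println!("pool: throwing away connection {} because pool is full", conn.id);
            return;
        }
        self.conns.push(conn);
    }
}

fn main() {
    let mut pool = MiniPool { conns: Vec::new(), max_conns: 1 };
    pool.put(Conn { id: 1, closed: false });
    pool.put(Conn { id: 2, closed: false }); // dropped: pool is full
    assert_eq!(pool.conns.len(), 1);
}
```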
@@ -1,7 +1,11 @@
|
||||
use std::pin::pin;
|
||||
use std::sync::Arc;
|
||||
|
||||
use anyhow::bail;
|
||||
use futures::future::select;
|
||||
use futures::future::try_join;
|
||||
use futures::future::Either;
|
||||
use futures::StreamExt;
|
||||
use futures::TryFutureExt;
|
||||
use hyper::body::HttpBody;
|
||||
use hyper::header;
|
||||
use hyper::http::HeaderName;
|
||||
@@ -11,13 +15,16 @@ use hyper::StatusCode;
|
||||
use hyper::{Body, HeaderMap, Request};
|
||||
use serde_json::json;
|
||||
use serde_json::Value;
|
||||
use tokio::try_join;
|
||||
use tokio::time;
|
||||
use tokio_postgres::error::DbError;
|
||||
use tokio_postgres::error::ErrorPosition;
|
||||
use tokio_postgres::error::SqlState;
|
||||
use tokio_postgres::GenericClient;
|
||||
use tokio_postgres::IsolationLevel;
|
||||
use tokio_postgres::NoTls;
|
||||
use tokio_postgres::ReadyForQueryStatus;
|
||||
use tokio_postgres::Transaction;
|
||||
use tokio_util::sync::CancellationToken;
|
||||
use tracing::error;
|
||||
use tracing::info;
|
||||
use url::Url;
|
||||
@@ -30,9 +37,13 @@ use crate::auth::ComputeUserInfoParseError;
|
||||
use crate::config::ProxyConfig;
|
||||
use crate::config::TlsConfig;
|
||||
use crate::context::RequestMonitoring;
|
||||
use crate::error::ErrorKind;
|
||||
use crate::error::ReportableError;
|
||||
use crate::error::UserFacingError;
|
||||
use crate::metrics::HTTP_CONTENT_LENGTH;
|
||||
use crate::metrics::NUM_CONNECTION_REQUESTS_GAUGE;
|
||||
use crate::proxy::NeonOptions;
|
||||
use crate::serverless::backend::HttpConnError;
|
||||
use crate::DbName;
|
||||
use crate::RoleName;
|
||||
|
||||
@@ -40,6 +51,7 @@ use super::backend::PoolingBackend;
|
||||
use super::conn_pool::ConnInfo;
|
||||
use super::json::json_to_pg_text;
|
||||
use super::json::pg_text_row_to_json;
|
||||
use super::json::JsonConversionError;
|
||||
|
||||
#[derive(serde::Deserialize)]
|
||||
#[serde(rename_all = "camelCase")]
|
||||
@@ -110,6 +122,18 @@ pub enum ConnInfoError {
|
||||
MalformedEndpoint,
|
||||
}
|
||||
|
||||
impl ReportableError for ConnInfoError {
|
||||
fn get_error_kind(&self) -> ErrorKind {
|
||||
ErrorKind::User
|
||||
}
|
||||
}
|
||||
|
||||
impl UserFacingError for ConnInfoError {
|
||||
fn to_string_client(&self) -> String {
|
||||
self.to_string()
|
||||
}
|
||||
}
|
||||
|
||||
fn get_conn_info(
|
||||
ctx: &mut RequestMonitoring,
|
||||
headers: &HeaderMap,
|
||||
@@ -194,108 +218,123 @@ pub async fn handle(
|
||||
request: Request<Body>,
|
||||
backend: Arc<PoolingBackend>,
|
||||
) -> Result<Response<Body>, ApiError> {
|
||||
let result = tokio::time::timeout(
|
||||
config.http_config.request_timeout,
|
||||
handle_inner(config, &mut ctx, request, backend),
|
||||
)
|
||||
.await;
|
||||
let cancel = CancellationToken::new();
|
||||
let cancel2 = cancel.clone();
|
||||
let handle = tokio::spawn(async move {
|
||||
time::sleep(config.http_config.request_timeout).await;
|
||||
cancel2.cancel();
|
||||
});
|
||||
|
||||
let result = handle_inner(cancel, config, &mut ctx, request, backend).await;
|
||||
handle.abort();
|
||||
|
||||
let mut response = match result {
|
||||
Ok(r) => match r {
|
||||
Ok(r) => {
|
||||
ctx.set_success();
|
||||
r
|
||||
}
|
||||
Err(e) => {
|
||||
// TODO: ctx.set_error_kind(e.get_error_type());
|
||||
|
||||
let mut message = format!("{:?}", e);
|
||||
let db_error = e
|
||||
.downcast_ref::<tokio_postgres::Error>()
|
||||
.and_then(|e| e.as_db_error());
|
||||
fn get<'a, T: serde::Serialize>(
|
||||
db: Option<&'a DbError>,
|
||||
x: impl FnOnce(&'a DbError) -> T,
|
||||
) -> Value {
|
||||
db.map(x)
|
||||
.and_then(|t| serde_json::to_value(t).ok())
|
||||
.unwrap_or_default()
|
||||
}
|
||||
|
||||
if let Some(db_error) = db_error {
|
||||
db_error.message().clone_into(&mut message);
|
||||
}
|
||||
|
||||
let position = db_error.and_then(|db| db.position());
|
||||
let (position, internal_position, internal_query) = match position {
|
||||
Some(ErrorPosition::Original(position)) => (
|
||||
Value::String(position.to_string()),
|
||||
Value::Null,
|
||||
Value::Null,
|
||||
),
|
||||
Some(ErrorPosition::Internal { position, query }) => (
|
||||
Value::Null,
|
||||
Value::String(position.to_string()),
|
||||
Value::String(query.clone()),
|
||||
),
|
||||
None => (Value::Null, Value::Null, Value::Null),
|
||||
};
|
||||
|
||||
let code = get(db_error, |db| db.code().code());
|
||||
let severity = get(db_error, |db| db.severity());
|
||||
let detail = get(db_error, |db| db.detail());
|
||||
let hint = get(db_error, |db| db.hint());
|
||||
let where_ = get(db_error, |db| db.where_());
|
||||
let table = get(db_error, |db| db.table());
|
||||
let column = get(db_error, |db| db.column());
|
||||
let schema = get(db_error, |db| db.schema());
|
||||
let datatype = get(db_error, |db| db.datatype());
|
||||
let constraint = get(db_error, |db| db.constraint());
|
||||
let file = get(db_error, |db| db.file());
|
||||
let line = get(db_error, |db| db.line().map(|l| l.to_string()));
|
||||
let routine = get(db_error, |db| db.routine());
|
||||
|
||||
error!(
|
||||
?code,
|
||||
"sql-over-http per-client task finished with an error: {e:#}"
|
||||
);
|
||||
// TODO: this shouldn't always be bad request.
|
||||
json_response(
|
||||
StatusCode::BAD_REQUEST,
|
||||
json!({
|
||||
"message": message,
|
||||
"code": code,
|
||||
"detail": detail,
|
||||
"hint": hint,
|
||||
"position": position,
|
||||
"internalPosition": internal_position,
|
||||
"internalQuery": internal_query,
|
||||
"severity": severity,
|
||||
"where": where_,
|
||||
"table": table,
|
||||
"column": column,
|
||||
"schema": schema,
|
||||
"dataType": datatype,
|
||||
"constraint": constraint,
|
||||
"file": file,
|
||||
"line": line,
|
||||
"routine": routine,
|
||||
}),
|
||||
)?
|
||||
}
|
||||
},
|
||||
Err(_) => {
|
||||
// TODO: when http error classification is done, distinguish between
|
||||
// timeout on sql vs timeout in proxy/cplane
|
||||
// ctx.set_error_kind(crate::error::ErrorKind::RateLimit);
|
||||
Ok(r) => {
|
||||
ctx.set_success();
|
||||
r
|
||||
}
|
||||
Err(e @ SqlOverHttpError::Cancelled(_)) => {
|
||||
let error_kind = e.get_error_kind();
|
||||
ctx.set_error_kind(error_kind);
|
||||
|
||||
let message = format!(
|
||||
"HTTP-Connection timed out, execution time exceeded {} seconds",
|
||||
config.http_config.request_timeout.as_secs()
|
||||
"Query cancelled, runtime exceeded. SQL queries over HTTP must not exceed {} seconds of runtime. Please consider using our websocket based connections",
|
||||
config.http_config.request_timeout.as_secs_f64()
|
||||
);
|
||||
error!(message);
|
||||
|
||||
tracing::info!(
|
||||
kind=error_kind.to_metric_label(),
|
||||
error=%e,
|
||||
msg=message,
|
||||
"forwarding error to user"
|
||||
);
|
||||
|
||||
json_response(
|
||||
StatusCode::GATEWAY_TIMEOUT,
|
||||
json!({ "message": message, "code": StatusCode::GATEWAY_TIMEOUT.as_u16() }),
|
||||
StatusCode::BAD_REQUEST,
|
||||
json!({ "message": message, "code": SqlState::PROTOCOL_VIOLATION.code() }),
|
||||
)?
|
||||
}
|
||||
Err(e) => {
|
||||
let error_kind = e.get_error_kind();
|
||||
ctx.set_error_kind(error_kind);
|
||||
|
||||
let mut message = e.to_string_client();
|
||||
let db_error = match &e {
|
||||
SqlOverHttpError::ConnectCompute(HttpConnError::ConnectionError(e))
|
||||
| SqlOverHttpError::Postgres(e) => e.as_db_error(),
|
||||
_ => None,
|
||||
};
|
||||
fn get<'a, T: serde::Serialize>(
|
||||
db: Option<&'a DbError>,
|
||||
x: impl FnOnce(&'a DbError) -> T,
|
||||
) -> Value {
|
||||
db.map(x)
|
||||
.and_then(|t| serde_json::to_value(t).ok())
|
||||
.unwrap_or_default()
|
||||
}
|
||||
|
||||
if let Some(db_error) = db_error {
|
||||
db_error.message().clone_into(&mut message);
|
||||
}
|
||||
|
||||
let position = db_error.and_then(|db| db.position());
|
||||
let (position, internal_position, internal_query) = match position {
|
||||
Some(ErrorPosition::Original(position)) => (
|
||||
Value::String(position.to_string()),
|
||||
Value::Null,
|
||||
Value::Null,
|
||||
),
|
||||
Some(ErrorPosition::Internal { position, query }) => (
|
||||
Value::Null,
|
||||
Value::String(position.to_string()),
|
||||
Value::String(query.clone()),
|
||||
),
|
||||
None => (Value::Null, Value::Null, Value::Null),
|
||||
};
|
||||
|
||||
let code = get(db_error, |db| db.code().code());
|
||||
let severity = get(db_error, |db| db.severity());
|
||||
let detail = get(db_error, |db| db.detail());
|
||||
let hint = get(db_error, |db| db.hint());
|
||||
let where_ = get(db_error, |db| db.where_());
|
||||
let table = get(db_error, |db| db.table());
|
||||
let column = get(db_error, |db| db.column());
|
||||
let schema = get(db_error, |db| db.schema());
|
||||
let datatype = get(db_error, |db| db.datatype());
|
||||
let constraint = get(db_error, |db| db.constraint());
|
||||
let file = get(db_error, |db| db.file());
|
||||
let line = get(db_error, |db| db.line().map(|l| l.to_string()));
|
||||
let routine = get(db_error, |db| db.routine());
|
||||
|
||||
tracing::info!(
|
||||
kind=error_kind.to_metric_label(),
|
||||
error=%e,
|
||||
msg=message,
|
||||
"forwarding error to user"
|
||||
);
|
||||
|
||||
// TODO: this shouldn't always be bad request.
|
||||
json_response(
|
||||
StatusCode::BAD_REQUEST,
|
||||
json!({
|
||||
"message": message,
|
||||
"code": code,
|
||||
"detail": detail,
|
||||
"hint": hint,
|
||||
"position": position,
|
||||
"internalPosition": internal_position,
|
||||
"internalQuery": internal_query,
|
||||
"severity": severity,
|
||||
"where": where_,
|
||||
"table": table,
|
||||
"column": column,
|
||||
"schema": schema,
|
||||
"dataType": datatype,
|
||||
"constraint": constraint,
|
||||
"file": file,
|
||||
"line": line,
|
||||
"routine": routine,
|
||||
}),
|
||||
)?
|
||||
}
|
||||
};
|
||||
@@ -307,12 +346,101 @@ pub async fn handle(
|
||||
Ok(response)
|
||||
}
|
||||
|
||||
#[derive(Debug, thiserror::Error)]
|
||||
pub enum SqlOverHttpError {
|
||||
#[error("{0}")]
|
||||
ReadPayload(#[from] ReadPayloadError),
|
||||
#[error("{0}")]
|
||||
ConnectCompute(#[from] HttpConnError),
|
||||
#[error("{0}")]
|
||||
ConnInfo(#[from] ConnInfoError),
|
||||
#[error("request is too large (max is {MAX_REQUEST_SIZE} bytes)")]
|
||||
RequestTooLarge,
|
||||
#[error("response is too large (max is {MAX_RESPONSE_SIZE} bytes)")]
|
||||
ResponseTooLarge,
|
||||
#[error("invalid isolation level")]
|
||||
InvalidIsolationLevel,
|
||||
#[error("{0}")]
|
||||
Postgres(#[from] tokio_postgres::Error),
|
||||
#[error("{0}")]
|
||||
JsonConversion(#[from] JsonConversionError),
|
||||
#[error("{0}")]
|
||||
Cancelled(SqlOverHttpCancel),
|
||||
}
|
||||
|
||||
impl ReportableError for SqlOverHttpError {
|
||||
fn get_error_kind(&self) -> ErrorKind {
|
||||
match self {
|
||||
SqlOverHttpError::ReadPayload(e) => e.get_error_kind(),
|
||||
SqlOverHttpError::ConnectCompute(e) => e.get_error_kind(),
|
||||
SqlOverHttpError::ConnInfo(e) => e.get_error_kind(),
|
||||
SqlOverHttpError::RequestTooLarge => ErrorKind::User,
|
||||
SqlOverHttpError::ResponseTooLarge => ErrorKind::User,
|
||||
SqlOverHttpError::InvalidIsolationLevel => ErrorKind::User,
|
||||
SqlOverHttpError::Postgres(p) => p.get_error_kind(),
|
||||
SqlOverHttpError::JsonConversion(_) => ErrorKind::Postgres,
|
||||
SqlOverHttpError::Cancelled(c) => c.get_error_kind(),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl UserFacingError for SqlOverHttpError {
|
||||
fn to_string_client(&self) -> String {
|
||||
match self {
|
||||
SqlOverHttpError::ReadPayload(p) => p.to_string(),
|
||||
SqlOverHttpError::ConnectCompute(c) => c.to_string_client(),
|
||||
SqlOverHttpError::ConnInfo(c) => c.to_string_client(),
|
||||
SqlOverHttpError::RequestTooLarge => self.to_string(),
|
||||
SqlOverHttpError::ResponseTooLarge => self.to_string(),
|
||||
SqlOverHttpError::InvalidIsolationLevel => self.to_string(),
|
||||
SqlOverHttpError::Postgres(p) => p.to_string(),
|
||||
SqlOverHttpError::JsonConversion(_) => "could not parse postgres response".to_string(),
|
||||
SqlOverHttpError::Cancelled(_) => self.to_string(),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Debug, thiserror::Error)]
|
||||
pub enum ReadPayloadError {
|
||||
#[error("could not read the HTTP request body: {0}")]
|
||||
Read(#[from] hyper::Error),
|
||||
#[error("could not parse the HTTP request body: {0}")]
|
||||
Parse(#[from] serde_json::Error),
|
||||
}
|
||||
|
||||
impl ReportableError for ReadPayloadError {
|
||||
fn get_error_kind(&self) -> ErrorKind {
|
||||
match self {
|
||||
ReadPayloadError::Read(_) => ErrorKind::ClientDisconnect,
|
||||
ReadPayloadError::Parse(_) => ErrorKind::User,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Debug, thiserror::Error)]
|
||||
pub enum SqlOverHttpCancel {
|
||||
#[error("query was cancelled")]
|
||||
Postgres,
|
||||
#[error("query was cancelled while stuck trying to connect to the database")]
|
||||
Connect,
|
||||
}
|
||||
|
||||
impl ReportableError for SqlOverHttpCancel {
|
||||
fn get_error_kind(&self) -> ErrorKind {
|
||||
match self {
|
||||
SqlOverHttpCancel::Postgres => ErrorKind::RateLimit,
|
||||
SqlOverHttpCancel::Connect => ErrorKind::ServiceRateLimit,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
async fn handle_inner(
|
||||
cancel: CancellationToken,
|
||||
config: &'static ProxyConfig,
|
||||
ctx: &mut RequestMonitoring,
|
||||
request: Request<Body>,
|
||||
backend: Arc<PoolingBackend>,
|
||||
) -> anyhow::Result<Response<Body>> {
|
||||
) -> Result<Response<Body>, SqlOverHttpError> {
|
||||
let _request_gauge = NUM_CONNECTION_REQUESTS_GAUGE
|
||||
.with_label_values(&[ctx.protocol])
|
||||
.guard();
|
||||
@@ -345,7 +473,7 @@ async fn handle_inner(
|
||||
b"ReadUncommitted" => IsolationLevel::ReadUncommitted,
|
||||
b"ReadCommitted" => IsolationLevel::ReadCommitted,
|
||||
b"RepeatableRead" => IsolationLevel::RepeatableRead,
|
||||
_ => bail!("invalid isolation level"),
|
||||
_ => return Err(SqlOverHttpError::InvalidIsolationLevel),
|
||||
}),
|
||||
None => None,
|
||||
};
|
||||
@@ -363,19 +491,16 @@ async fn handle_inner(
|
||||
// we don't have a streaming request support yet so this is to prevent OOM
|
||||
// from a malicious user sending an extremely large request body
|
||||
if request_content_length > MAX_REQUEST_SIZE {
|
||||
return Err(anyhow::anyhow!(
|
||||
"request is too large (max is {MAX_REQUEST_SIZE} bytes)"
|
||||
));
|
||||
return Err(SqlOverHttpError::RequestTooLarge);
|
||||
}
|
||||
|
||||
let fetch_and_process_request = async {
|
||||
let body = hyper::body::to_bytes(request.into_body())
|
||||
.await
|
||||
.map_err(anyhow::Error::from)?;
|
||||
let body = hyper::body::to_bytes(request.into_body()).await?;
|
||||
info!(length = body.len(), "request payload read");
|
||||
let payload: Payload = serde_json::from_slice(&body)?;
|
||||
Ok::<Payload, anyhow::Error>(payload) // Adjust error type accordingly
|
||||
};
|
||||
Ok::<Payload, ReadPayloadError>(payload) // Adjust error type accordingly
|
||||
}
|
||||
.map_err(SqlOverHttpError::from);
|
||||
|
||||
let authenticate_and_connect = async {
|
||||
let keys = backend.authenticate(ctx, &conn_info).await?;
|
||||
@@ -385,11 +510,25 @@ async fn handle_inner(
|
||||
// not strictly necessary to mark success here,
|
||||
// but it's just insurance for if we forget it somewhere else
|
||||
ctx.latency_timer.success();
|
||||
Ok::<_, anyhow::Error>(client)
|
||||
};
|
||||
Ok::<_, HttpConnError>(client)
|
||||
}
|
||||
.map_err(SqlOverHttpError::from);
|
||||
|
||||
// Run both operations in parallel
|
||||
let (payload, mut client) = try_join!(fetch_and_process_request, authenticate_and_connect)?;
|
||||
let (payload, mut client) = match select(
|
||||
try_join(
|
||||
pin!(fetch_and_process_request),
|
||||
pin!(authenticate_and_connect),
|
||||
),
|
||||
pin!(cancel.cancelled()),
|
||||
)
|
||||
.await
|
||||
{
|
||||
Either::Left((result, _cancelled)) => result?,
|
||||
Either::Right((_cancelled, _)) => {
|
||||
return Err(SqlOverHttpError::Cancelled(SqlOverHttpCancel::Connect))
|
||||
}
|
||||
};
|
||||
|
||||
let mut response = Response::builder()
|
||||
.status(StatusCode::OK)
|
||||
@@ -401,19 +540,64 @@ async fn handle_inner(
|
||||
let mut size = 0;
|
||||
let result = match payload {
|
||||
Payload::Single(stmt) => {
|
||||
let (status, results) =
|
||||
query_to_json(&*client, stmt, &mut 0, raw_output, default_array_mode)
|
||||
.await
|
||||
.map_err(|e| {
|
||||
client.discard();
|
||||
e
|
||||
})?;
|
||||
client.check_idle(status);
|
||||
results
|
||||
let mut size = 0;
|
||||
let (inner, mut discard) = client.inner();
|
||||
let cancel_token = inner.cancel_token();
|
||||
let query = pin!(query_to_json(
|
||||
&*inner,
|
||||
stmt,
|
||||
&mut size,
|
||||
raw_output,
|
||||
default_array_mode
|
||||
));
|
||||
let cancelled = pin!(cancel.cancelled());
|
||||
let res = select(query, cancelled).await;
|
||||
match res {
|
||||
Either::Left((Ok((status, results)), _cancelled)) => {
|
||||
discard.check_idle(status);
|
||||
results
|
||||
}
|
||||
Either::Left((Err(e), _cancelled)) => {
|
||||
discard.discard();
|
||||
return Err(e);
|
||||
}
|
||||
Either::Right((_cancelled, query)) => {
|
||||
if let Err(err) = cancel_token.cancel_query(NoTls).await {
|
||||
tracing::error!(?err, "could not cancel query");
|
||||
}
|
||||
match time::timeout(time::Duration::from_millis(100), query).await {
|
||||
Ok(Ok((status, results))) => {
|
||||
discard.check_idle(status);
|
||||
results
|
||||
}
|
||||
Ok(Err(error)) => {
|
||||
let db_error = match &error {
|
||||
SqlOverHttpError::ConnectCompute(
|
||||
HttpConnError::ConnectionError(e),
|
||||
)
|
||||
| SqlOverHttpError::Postgres(e) => e.as_db_error(),
|
||||
_ => None,
|
||||
};
|
||||
|
||||
// if errored for some other reason, it might not be safe to return
|
||||
if !db_error.is_some_and(|e| *e.code() == SqlState::QUERY_CANCELED) {
|
||||
discard.discard();
|
||||
}
|
||||
|
||||
return Err(SqlOverHttpError::Cancelled(SqlOverHttpCancel::Postgres));
|
||||
}
|
||||
Err(_timeout) => {
|
||||
discard.discard();
|
||||
return Err(SqlOverHttpError::Cancelled(SqlOverHttpCancel::Postgres));
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
Payload::Batch(statements) => {
|
||||
info!("starting transaction");
|
||||
let (inner, mut discard) = client.inner();
|
||||
let cancel_token = inner.cancel_token();
|
||||
let mut builder = inner.build_transaction();
|
||||
if let Some(isolation_level) = txn_isolation_level {
|
||||
builder = builder.isolation_level(isolation_level);
|
||||
@@ -433,6 +617,7 @@ async fn handle_inner(
|
||||
})?;
|
||||
|
||||
let results = match query_batch(
|
||||
cancel.child_token(),
|
||||
&transaction,
|
||||
statements,
|
||||
&mut size,
|
||||
@@ -452,6 +637,15 @@ async fn handle_inner(
|
||||
discard.check_idle(status);
|
||||
results
|
||||
}
|
||||
Err(SqlOverHttpError::Cancelled(_)) => {
|
||||
if let Err(err) = cancel_token.cancel_query(NoTls).await {
|
||||
tracing::error!(?err, "could not cancel query");
|
||||
}
|
||||
// TODO: after cancelling, wait to see if we can get a status. maybe the connection is still safe.
|
||||
discard.discard();
|
||||
|
||||
return Err(SqlOverHttpError::Cancelled(SqlOverHttpCancel::Postgres));
|
||||
}
|
||||
Err(err) => {
|
||||
info!("rollback");
|
||||
let status = transaction.rollback().await.map_err(|e| {
|
||||
@@ -466,16 +660,10 @@ async fn handle_inner(
|
||||
};
|
||||
|
||||
if txn_read_only {
|
||||
response = response.header(
|
||||
TXN_READ_ONLY.clone(),
|
||||
HeaderValue::try_from(txn_read_only.to_string())?,
|
||||
);
|
||||
response = response.header(TXN_READ_ONLY.clone(), &HEADER_VALUE_TRUE);
|
||||
}
|
||||
if txn_deferrable {
|
||||
response = response.header(
|
||||
TXN_DEFERRABLE.clone(),
|
||||
HeaderValue::try_from(txn_deferrable.to_string())?,
|
||||
);
|
||||
response = response.header(TXN_DEFERRABLE.clone(), &HEADER_VALUE_TRUE);
|
||||
}
|
||||
if let Some(txn_isolation_level) = txn_isolation_level_raw {
|
||||
response = response.header(TXN_ISOLATION_LEVEL.clone(), txn_isolation_level);
|
||||
@@ -503,19 +691,37 @@ async fn handle_inner(
|
||||
}
|
||||
|
||||
async fn query_batch(
|
||||
cancel: CancellationToken,
|
||||
transaction: &Transaction<'_>,
|
||||
queries: BatchQueryData,
|
||||
total_size: &mut usize,
|
||||
raw_output: bool,
|
||||
array_mode: bool,
|
||||
) -> anyhow::Result<Vec<Value>> {
|
||||
) -> Result<Vec<Value>, SqlOverHttpError> {
|
||||
let mut results = Vec::with_capacity(queries.queries.len());
|
||||
let mut current_size = 0;
|
||||
for stmt in queries.queries {
|
||||
// TODO: maybe we should check that the transaction bit is set here
|
||||
let (_, values) =
|
||||
query_to_json(transaction, stmt, &mut current_size, raw_output, array_mode).await?;
|
||||
results.push(values);
|
||||
let query = pin!(query_to_json(
|
||||
transaction,
|
||||
stmt,
|
||||
&mut current_size,
|
||||
raw_output,
|
||||
array_mode
|
||||
));
|
||||
let cancelled = pin!(cancel.cancelled());
|
||||
let res = select(query, cancelled).await;
|
||||
match res {
|
||||
// TODO: maybe we should check that the transaction bit is set here
|
||||
Either::Left((Ok((_, values)), _cancelled)) => {
|
||||
results.push(values);
|
||||
}
|
||||
Either::Left((Err(e), _cancelled)) => {
|
||||
return Err(e);
|
||||
}
|
||||
Either::Right((_cancelled, _)) => {
|
||||
return Err(SqlOverHttpError::Cancelled(SqlOverHttpCancel::Postgres));
|
||||
}
|
||||
}
|
||||
}
|
||||
*total_size += current_size;
|
||||
Ok(results)
|
||||
@@ -527,7 +733,7 @@ async fn query_to_json<T: GenericClient>(
|
||||
current_size: &mut usize,
|
||||
raw_output: bool,
|
||||
default_array_mode: bool,
|
||||
) -> anyhow::Result<(ReadyForQueryStatus, Value)> {
|
||||
) -> Result<(ReadyForQueryStatus, Value), SqlOverHttpError> {
|
||||
info!("executing query");
|
||||
let query_params = data.params;
|
||||
let mut row_stream = std::pin::pin!(client.query_raw_txt(&data.query, query_params).await?);
|
||||
@@ -544,9 +750,7 @@ async fn query_to_json<T: GenericClient>(
|
||||
// we don't have a streaming response support yet so this is to prevent OOM
|
||||
// from a malicious query (eg a cross join)
|
||||
if *current_size > MAX_RESPONSE_SIZE {
|
||||
return Err(anyhow::anyhow!(
|
||||
"response is too large (max is {MAX_RESPONSE_SIZE} bytes)"
|
||||
));
|
||||
return Err(SqlOverHttpError::ResponseTooLarge);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -1,186 +1,110 @@
|
||||
use std::{
|
||||
convert::Infallible,
|
||||
pin::Pin,
|
||||
task::{Context, Poll},
|
||||
time::Duration,
|
||||
};
|
||||
|
||||
use futures::{Future, Stream, StreamExt};
|
||||
use hyper::server::{accept::Accept, conn::AddrStream};
|
||||
use pin_project_lite::pin_project;
|
||||
use thiserror::Error;
|
||||
use tokio::{
|
||||
io::{AsyncRead, AsyncWrite},
|
||||
task::JoinSet,
|
||||
time::timeout,
|
||||
};
|
||||
use tokio_rustls::{server::TlsStream, TlsAcceptor};
|
||||
use tracing::{info, warn};
|
||||
|
||||
/// Default timeout for the TLS handshake.
|
||||
pub const DEFAULT_HANDSHAKE_TIMEOUT: Duration = Duration::from_secs(10);
|
||||
use crate::{
|
||||
metrics::TLS_HANDSHAKE_FAILURES,
|
||||
protocol2::{WithClientIp, WithConnectionGuard},
|
||||
};
|
||||
|
||||
/// Trait for TLS implementation.
|
||||
///
|
||||
/// Implementations are provided by the rustls and native-tls features.
|
||||
pub trait AsyncTls<C: AsyncRead + AsyncWrite>: Clone {
|
||||
/// The type of the TLS stream created from the underlying stream.
|
||||
type Stream: Send + 'static;
|
||||
/// Error type for completing the TLS handshake
|
||||
type Error: std::error::Error + Send + 'static;
|
||||
/// Type of the Future for the TLS stream that is accepted.
|
||||
type AcceptFuture: Future<Output = Result<Self::Stream, Self::Error>> + Send + 'static;
|
||||
|
||||
/// Accept a TLS connection on an underlying stream
|
||||
fn accept(&self, stream: C) -> Self::AcceptFuture;
|
||||
pin_project! {
|
||||
/// Wraps a `Stream` of connections (such as a TCP listener) so that each connection is itself
|
||||
/// encrypted using TLS.
|
||||
pub(crate) struct TlsListener<A: Accept> {
|
||||
#[pin]
|
||||
listener: A,
|
||||
tls: TlsAcceptor,
|
||||
waiting: JoinSet<Option<TlsStream<A::Conn>>>,
|
||||
timeout: Duration,
|
||||
protocol: &'static str,
|
||||
}
|
||||
}
|
||||
|
||||
/// Asynchronously accept connections.
|
||||
pub trait AsyncAccept {
|
||||
/// The type of the connection that is accepted.
|
||||
type Connection: AsyncRead + AsyncWrite;
|
||||
/// The type of error that may be returned.
|
||||
type Error;
|
||||
|
||||
/// Poll to accept the next connection.
|
||||
fn poll_accept(
|
||||
self: Pin<&mut Self>,
|
||||
cx: &mut Context<'_>,
|
||||
) -> Poll<Option<Result<Self::Connection, Self::Error>>>;
|
||||
|
||||
/// Return a new `AsyncAccept` that stops accepting connections after
|
||||
/// `ender` completes.
|
||||
///
|
||||
/// Useful for graceful shutdown.
|
||||
///
|
||||
/// See [examples/echo.rs](https://github.com/tmccombs/tls-listener/blob/main/examples/echo.rs)
|
||||
/// for example of how to use.
|
||||
fn until<F: Future>(self, ender: F) -> Until<Self, F>
|
||||
where
|
||||
Self: Sized,
|
||||
{
|
||||
Until {
|
||||
acceptor: self,
|
||||
ender,
|
||||
impl<A: Accept> TlsListener<A> {
|
||||
/// Create a `TlsListener` with default options.
|
||||
pub(crate) fn new(
|
||||
tls: TlsAcceptor,
|
||||
listener: A,
|
||||
protocol: &'static str,
|
||||
timeout: Duration,
|
||||
) -> Self {
|
||||
TlsListener {
|
||||
listener,
|
||||
tls,
|
||||
waiting: JoinSet::new(),
|
||||
timeout,
|
||||
protocol,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
pin_project! {
|
||||
///
|
||||
/// Wraps a `Stream` of connections (such as a TCP listener) so that each connection is itself
|
||||
/// encrypted using TLS.
|
||||
///
|
||||
/// It is similar to:
|
||||
///
|
||||
/// ```ignore
|
||||
/// tcpListener.and_then(|s| tlsAcceptor.accept(s))
|
||||
/// ```
|
||||
///
|
||||
/// except that it has the ability to accept multiple transport-level connections
|
||||
/// simultaneously while the TLS handshake is pending for other connections.
|
||||
///
|
||||
/// By default, if a client fails the TLS handshake, that is treated as an error, and the
|
||||
/// `TlsListener` will return an `Err`. If the `TlsListener` is passed directly to a hyper
|
||||
/// [`Server`][1], then an invalid handshake can cause the server to stop accepting connections.
|
||||
/// See [`http-stream.rs`][2] or [`http-low-level`][3] examples, for examples of how to avoid this.
|
||||
///
|
||||
/// Note that if the maximum number of pending connections is greater than 1, the resulting
|
||||
/// [`T::Stream`][4] connections may come in a different order than the connections produced by the
|
||||
/// underlying listener.
|
||||
///
|
||||
/// [1]: https://docs.rs/hyper/latest/hyper/server/struct.Server.html
|
||||
/// [2]: https://github.com/tmccombs/tls-listener/blob/main/examples/http-stream.rs
|
||||
/// [3]: https://github.com/tmccombs/tls-listener/blob/main/examples/http-low-level.rs
|
||||
/// [4]: AsyncTls::Stream
|
||||
///
|
||||
#[allow(clippy::type_complexity)]
|
||||
pub struct TlsListener<A: AsyncAccept, T: AsyncTls<A::Connection>> {
|
||||
#[pin]
|
||||
listener: A,
|
||||
tls: T,
|
||||
waiting: JoinSet<Result<Result<T::Stream, T::Error>, tokio::time::error::Elapsed>>,
|
||||
timeout: Duration,
|
||||
}
|
||||
}
|
||||
|
||||
/// Builder for `TlsListener`.
|
||||
#[derive(Clone)]
|
||||
pub struct Builder<T> {
|
||||
tls: T,
|
||||
handshake_timeout: Duration,
|
||||
}
|
||||
|
||||
/// Wraps errors from either the listener or the TLS Acceptor
|
||||
#[derive(Debug, Error)]
|
||||
pub enum Error<LE: std::error::Error, TE: std::error::Error> {
|
||||
/// An error that arose from the listener ([AsyncAccept::Error])
|
||||
#[error("{0}")]
|
||||
ListenerError(#[source] LE),
|
||||
/// An error that occurred during the TLS accept handshake
|
||||
#[error("{0}")]
|
||||
TlsAcceptError(#[source] TE),
|
||||
}
|
||||
|
||||
impl<A: AsyncAccept, T> TlsListener<A, T>
|
||||
impl<A> Accept for TlsListener<A>
|
||||
where
|
||||
T: AsyncTls<A::Connection>,
|
||||
{
|
||||
/// Create a `TlsListener` with default options.
|
||||
pub fn new(tls: T, listener: A) -> Self {
|
||||
builder(tls).listen(listener)
|
||||
}
|
||||
}
|
||||
|
||||
impl<A, T> TlsListener<A, T>
|
||||
where
|
||||
A: AsyncAccept,
|
||||
A: Accept<Conn = WithConnectionGuard<WithClientIp<AddrStream>>>,
|
||||
A::Error: std::error::Error,
|
||||
T: AsyncTls<A::Connection>,
|
||||
A::Conn: AsyncRead + AsyncWrite + Unpin + Send + 'static,
|
||||
{
|
||||
/// Accept the next connection
|
||||
///
|
||||
/// This is essentially an alias to `self.next()` with a more domain-appropriate name.
|
||||
pub async fn accept(&mut self) -> Option<<Self as Stream>::Item>
|
||||
where
|
||||
Self: Unpin,
|
||||
{
|
||||
self.next().await
|
||||
}
|
||||
type Conn = TlsStream<A::Conn>;
|
||||
|
||||
/// Replaces the Tls Acceptor configuration, which will be used for new connections.
|
||||
///
|
||||
/// This can be used to change the certificate used at runtime.
|
||||
pub fn replace_acceptor(&mut self, acceptor: T) {
|
||||
self.tls = acceptor;
|
||||
}
|
||||
type Error = Infallible;
|
||||
|
||||
/// Replaces the Tls Acceptor configuration from a pinned reference to `Self`.
|
||||
///
|
||||
/// This is useful if your listener is `!Unpin`.
|
||||
///
|
||||
/// This can be used to change the certificate used at runtime.
|
||||
pub fn replace_acceptor_pin(self: Pin<&mut Self>, acceptor: T) {
|
||||
*self.project().tls = acceptor;
|
||||
}
|
||||
}
|
||||
|
||||
impl<A, T> Stream for TlsListener<A, T>
|
||||
where
|
||||
A: AsyncAccept,
|
||||
A::Error: std::error::Error,
|
||||
T: AsyncTls<A::Connection>,
|
||||
{
|
||||
type Item = Result<T::Stream, Error<A::Error, T::Error>>;
|
||||
|
||||
fn poll_next(self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll<Option<Self::Item>> {
|
||||
fn poll_accept(
|
||||
self: Pin<&mut Self>,
|
||||
cx: &mut Context<'_>,
|
||||
) -> Poll<Option<Result<Self::Conn, Self::Error>>> {
|
||||
let mut this = self.project();
|
||||
|
||||
loop {
|
||||
match this.listener.as_mut().poll_accept(cx) {
|
||||
Poll::Pending => break,
|
||||
Poll::Ready(Some(Ok(conn))) => {
|
||||
this.waiting
|
||||
.spawn(timeout(*this.timeout, this.tls.accept(conn)));
|
||||
Poll::Ready(Some(Ok(mut conn))) => {
|
||||
let t = *this.timeout;
|
||||
let tls = this.tls.clone();
|
||||
let protocol = *this.protocol;
|
||||
this.waiting.spawn(async move {
|
||||
let peer_addr = match conn.inner.wait_for_addr().await {
|
||||
Ok(Some(addr)) => addr,
|
||||
Err(e) => {
|
||||
tracing::error!("failed to accept TCP connection: invalid PROXY protocol V2 header: {e:#}");
|
||||
return None;
|
||||
}
|
||||
Ok(None) => conn.inner.inner.remote_addr()
|
||||
};
|
||||
|
||||
let accept = tls.accept(conn);
|
||||
match timeout(t, accept).await {
|
||||
Ok(Ok(conn)) => Some(conn),
|
||||
// The handshake failed, try getting another connection from the queue
|
||||
Ok(Err(e)) => {
|
||||
TLS_HANDSHAKE_FAILURES.inc();
|
||||
warn!(%peer_addr, protocol, "failed to accept TLS connection: {e:?}");
|
||||
None
|
||||
}
|
||||
// The handshake timed out, try getting another connection from the queue
|
||||
Err(_) => {
|
||||
TLS_HANDSHAKE_FAILURES.inc();
|
||||
warn!(%peer_addr, protocol, "failed to accept TLS connection: timeout");
|
||||
None
|
||||
}
|
||||
}
|
||||
});
|
||||
}
|
||||
Poll::Ready(Some(Err(e))) => {
|
||||
return Poll::Ready(Some(Err(Error::ListenerError(e))));
|
||||
tracing::error!("error accepting TCP connection: {e}");
|
||||
continue;
|
||||
}
|
||||
Poll::Ready(None) => return Poll::Ready(None),
|
||||
}
|
||||
@@ -188,96 +112,19 @@ where
|
||||
|
||||
loop {
|
||||
return match this.waiting.poll_join_next(cx) {
|
||||
Poll::Ready(Some(Ok(Ok(conn)))) => {
|
||||
Poll::Ready(Some(conn.map_err(Error::TlsAcceptError)))
|
||||
Poll::Ready(Some(Ok(Some(conn)))) => {
|
||||
info!(protocol = this.protocol, "accepted new TLS connection");
|
||||
Poll::Ready(Some(Ok(conn)))
|
||||
}
|
||||
// The handshake timed out, try getting another connection from the queue
|
||||
Poll::Ready(Some(Ok(Err(_)))) => continue,
|
||||
// The handshake panicked
|
||||
Poll::Ready(Some(Err(e))) if e.is_panic() => {
|
||||
std::panic::resume_unwind(e.into_panic())
|
||||
// The handshake failed to complete, try getting another connection from the queue
|
||||
Poll::Ready(Some(Ok(None))) => continue,
|
||||
// The handshake panicked or was cancelled. ignore and get another connection
|
||||
Poll::Ready(Some(Err(e))) => {
|
||||
tracing::warn!("handshake aborted: {e}");
|
||||
continue;
|
||||
}
|
||||
// The handshake was externally aborted
|
||||
Poll::Ready(Some(Err(_))) => unreachable!("handshake tasks are never aborted"),
|
||||
_ => Poll::Pending,
|
||||
};
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl<C: AsyncRead + AsyncWrite + Unpin + Send + 'static> AsyncTls<C> for tokio_rustls::TlsAcceptor {
|
||||
type Stream = tokio_rustls::server::TlsStream<C>;
|
||||
type Error = std::io::Error;
|
||||
type AcceptFuture = tokio_rustls::Accept<C>;
|
||||
|
||||
fn accept(&self, conn: C) -> Self::AcceptFuture {
|
||||
tokio_rustls::TlsAcceptor::accept(self, conn)
|
||||
}
|
||||
}
|
||||
|
||||
impl<T> Builder<T> {
|
||||
/// Set the timeout for handshakes.
|
||||
///
|
||||
/// If a timeout takes longer than `timeout`, then the handshake will be
|
||||
/// aborted and the underlying connection will be dropped.
|
||||
///
|
||||
/// Defaults to `DEFAULT_HANDSHAKE_TIMEOUT`.
|
||||
pub fn handshake_timeout(&mut self, timeout: Duration) -> &mut Self {
|
||||
self.handshake_timeout = timeout;
|
||||
self
|
||||
}
|
||||
|
||||
/// Create a `TlsListener` from the builder
|
||||
///
|
||||
/// Actually build the `TlsListener`. The `listener` argument should be
|
||||
/// an implementation of the `AsyncAccept` trait that accepts new connections
|
||||
/// that the `TlsListener` will encrypt using TLS.
|
||||
pub fn listen<A: AsyncAccept>(&self, listener: A) -> TlsListener<A, T>
|
||||
where
|
||||
T: AsyncTls<A::Connection>,
|
||||
{
|
||||
TlsListener {
|
||||
listener,
|
||||
tls: self.tls.clone(),
|
||||
waiting: JoinSet::new(),
|
||||
timeout: self.handshake_timeout,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Create a new Builder for a TlsListener
|
||||
///
|
||||
/// `server_config` will be used to configure the TLS sessions.
|
||||
pub fn builder<T>(tls: T) -> Builder<T> {
|
||||
Builder {
|
||||
tls,
|
||||
handshake_timeout: DEFAULT_HANDSHAKE_TIMEOUT,
|
||||
}
|
||||
}
|
||||
|
||||
pin_project! {
|
||||
/// See [`AsyncAccept::until`]
|
||||
pub struct Until<A, E> {
|
||||
#[pin]
|
||||
acceptor: A,
|
||||
#[pin]
|
||||
ender: E,
|
||||
}
|
||||
}
|
||||
|
||||
impl<A: AsyncAccept, E: Future> AsyncAccept for Until<A, E> {
|
||||
type Connection = A::Connection;
|
||||
type Error = A::Error;
|
||||
|
||||
fn poll_accept(
|
||||
self: Pin<&mut Self>,
|
||||
cx: &mut Context<'_>,
|
||||
) -> Poll<Option<Result<Self::Connection, Self::Error>>> {
|
||||
let this = self.project();
|
||||
|
||||
match this.ender.poll(cx) {
|
||||
Poll::Pending => this.acceptor.poll_accept(cx),
|
||||
Poll::Ready(_) => Poll::Ready(None),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -140,6 +140,13 @@ pub static BROKER_ITERATION_TIMELINES: Lazy<Histogram> = Lazy::new(|| {
|
||||
)
|
||||
.expect("Failed to register safekeeper_broker_iteration_timelines histogram vec")
|
||||
});
|
||||
pub static RECEIVED_PS_FEEDBACKS: Lazy<IntCounter> = Lazy::new(|| {
|
||||
register_int_counter!(
|
||||
"safekeeper_received_ps_feedbacks_total",
|
||||
"Number of pageserver feedbacks received"
|
||||
)
|
||||
.expect("Failed to register safekeeper_received_ps_feedbacks_total counter")
|
||||
});
|
||||
|
||||
pub const LABEL_UNKNOWN: &str = "unknown";
|
||||
|
||||
@@ -301,7 +308,8 @@ pub async fn time_io_closure<E: Into<anyhow::Error>>(
|
||||
#[derive(Clone)]
|
||||
pub struct FullTimelineInfo {
|
||||
pub ttid: TenantTimelineId,
|
||||
pub ps_feedback: PageserverFeedback,
|
||||
pub ps_feedback_count: u64,
|
||||
pub last_ps_feedback: PageserverFeedback,
|
||||
pub wal_backup_active: bool,
|
||||
pub timeline_is_active: bool,
|
||||
pub num_computes: u32,
|
||||
@@ -327,6 +335,7 @@ pub struct TimelineCollector {
|
||||
remote_consistent_lsn: GenericGaugeVec<AtomicU64>,
|
||||
ps_last_received_lsn: GenericGaugeVec<AtomicU64>,
|
||||
feedback_last_time_seconds: GenericGaugeVec<AtomicU64>,
|
||||
ps_feedback_count: GenericGaugeVec<AtomicU64>,
|
||||
timeline_active: GenericGaugeVec<AtomicU64>,
|
||||
wal_backup_active: GenericGaugeVec<AtomicU64>,
|
||||
connected_computes: IntGaugeVec,
|
||||
@@ -430,6 +439,15 @@ impl TimelineCollector {
|
||||
.unwrap();
|
||||
descs.extend(feedback_last_time_seconds.desc().into_iter().cloned());
|
||||
|
||||
let ps_feedback_count = GenericGaugeVec::new(
|
||||
Opts::new(
|
||||
"safekeeper_ps_feedback_count_total",
|
||||
"Number of feedbacks received from the pageserver",
|
||||
),
|
||||
&["tenant_id", "timeline_id"],
|
||||
)
|
||||
.unwrap();
|
||||
|
||||
let timeline_active = GenericGaugeVec::new(
|
||||
Opts::new(
|
||||
"safekeeper_timeline_active",
|
||||
@@ -538,6 +556,7 @@ impl TimelineCollector {
|
||||
remote_consistent_lsn,
|
||||
ps_last_received_lsn,
|
||||
feedback_last_time_seconds,
|
||||
ps_feedback_count,
|
||||
timeline_active,
|
||||
wal_backup_active,
|
||||
connected_computes,
|
||||
@@ -570,6 +589,7 @@ impl Collector for TimelineCollector {
|
||||
self.remote_consistent_lsn.reset();
|
||||
self.ps_last_received_lsn.reset();
|
||||
self.feedback_last_time_seconds.reset();
|
||||
self.ps_feedback_count.reset();
|
||||
self.timeline_active.reset();
|
||||
self.wal_backup_active.reset();
|
||||
self.connected_computes.reset();
|
||||
@@ -646,9 +666,12 @@ impl Collector for TimelineCollector {
|
||||
|
||||
self.ps_last_received_lsn
|
||||
.with_label_values(labels)
|
||||
.set(tli.ps_feedback.last_received_lsn.0);
|
||||
.set(tli.last_ps_feedback.last_received_lsn.0);
|
||||
self.ps_feedback_count
|
||||
.with_label_values(labels)
|
||||
.set(tli.ps_feedback_count);
|
||||
if let Ok(unix_time) = tli
|
||||
.ps_feedback
|
||||
.last_ps_feedback
|
||||
.replytime
|
||||
.duration_since(SystemTime::UNIX_EPOCH)
|
||||
{
|
||||
@@ -679,6 +702,7 @@ impl Collector for TimelineCollector {
|
||||
mfs.extend(self.remote_consistent_lsn.collect());
|
||||
mfs.extend(self.ps_last_received_lsn.collect());
|
||||
mfs.extend(self.feedback_last_time_seconds.collect());
|
||||
mfs.extend(self.ps_feedback_count.collect());
|
||||
mfs.extend(self.timeline_active.collect());
|
||||
mfs.extend(self.wal_backup_active.collect());
|
||||
mfs.extend(self.connected_computes.collect());
|
||||
|
||||
@@ -36,11 +36,15 @@ use tokio::time::Instant;
|
||||
use tracing::*;
|
||||
use utils::id::TenantTimelineId;
|
||||
use utils::lsn::Lsn;
|
||||
use utils::pageserver_feedback::PageserverFeedback;
|
||||
|
||||
const DEFAULT_FEEDBACK_CAPACITY: usize = 8;
|
||||
|
||||
/// Registry of WalReceivers (compute connections). Timeline holds it (wrapped
|
||||
/// in Arc).
|
||||
pub struct WalReceivers {
|
||||
mutex: Mutex<WalReceiversShared>,
|
||||
pageserver_feedback_tx: tokio::sync::broadcast::Sender<PageserverFeedback>,
|
||||
}
|
||||
|
||||
/// Id under which walreceiver is registered in shmem.
|
||||
@@ -48,8 +52,12 @@ type WalReceiverId = usize;
|
||||
|
||||
impl WalReceivers {
|
||||
pub fn new() -> Arc<WalReceivers> {
|
||||
let (pageserver_feedback_tx, _) =
|
||||
tokio::sync::broadcast::channel(DEFAULT_FEEDBACK_CAPACITY);
|
||||
|
||||
Arc::new(WalReceivers {
|
||||
mutex: Mutex::new(WalReceiversShared { slots: Vec::new() }),
|
||||
pageserver_feedback_tx,
|
||||
})
|
||||
}
|
||||
|
||||
@@ -116,6 +124,12 @@ impl WalReceivers {
|
||||
let mut shared = self.mutex.lock();
|
||||
shared.slots[id] = None;
|
||||
}
|
||||
|
||||
/// Broadcast pageserver feedback to connected walproposers.
|
||||
pub fn broadcast_pageserver_feedback(&self, feedback: PageserverFeedback) {
|
||||
// Err means there is no subscribers, it is fine.
|
||||
let _ = self.pageserver_feedback_tx.send(feedback);
|
||||
}
|
||||
}
|
||||
|
||||
/// Only a few connections are expected (normally one), so store in Vec.
|
||||
@@ -197,17 +211,28 @@ impl SafekeeperPostgresHandler {
|
||||
// sends, so this avoids deadlocks.
|
||||
let mut pgb_reader = pgb.split().context("START_WAL_PUSH split")?;
|
||||
let peer_addr = *pgb.get_peer_addr();
|
||||
let network_reader = NetworkReader {
|
||||
let mut network_reader = NetworkReader {
|
||||
ttid: self.ttid,
|
||||
conn_id: self.conn_id,
|
||||
pgb_reader: &mut pgb_reader,
|
||||
peer_addr,
|
||||
acceptor_handle: &mut acceptor_handle,
|
||||
};
|
||||
let res = tokio::select! {
|
||||
// todo: add read|write .context to these errors
|
||||
r = network_reader.run(msg_tx, msg_rx, reply_tx) => r,
|
||||
r = network_write(pgb, reply_rx) => r,
|
||||
|
||||
// Read first message and create timeline if needed.
|
||||
let res = network_reader.read_first_message().await;
|
||||
|
||||
let res = if let Ok((tli, next_msg)) = res {
|
||||
let pageserver_feedback_rx: tokio::sync::broadcast::Receiver<PageserverFeedback> =
|
||||
tli.get_walreceivers().pageserver_feedback_tx.subscribe();
|
||||
|
||||
tokio::select! {
|
||||
// todo: add read|write .context to these errors
|
||||
r = network_reader.run(msg_tx, msg_rx, reply_tx, tli.clone(), next_msg) => r,
|
||||
r = network_write(pgb, reply_rx, pageserver_feedback_rx) => r,
|
||||
}
|
||||
} else {
|
||||
res.map(|_| ())
|
||||
};
|
||||
|
||||
// Join pg backend back.
|
||||
@@ -251,12 +276,9 @@ struct NetworkReader<'a, IO> {
|
||||
}
|
||||
|
||||
impl<'a, IO: AsyncRead + AsyncWrite + Unpin> NetworkReader<'a, IO> {
|
||||
async fn run(
|
||||
self,
|
||||
msg_tx: Sender<ProposerAcceptorMessage>,
|
||||
msg_rx: Receiver<ProposerAcceptorMessage>,
|
||||
reply_tx: Sender<AcceptorProposerMessage>,
|
||||
) -> Result<(), CopyStreamHandlerEnd> {
|
||||
async fn read_first_message(
|
||||
&mut self,
|
||||
) -> Result<(Arc<Timeline>, ProposerAcceptorMessage), CopyStreamHandlerEnd> {
|
||||
// Receive information about server to create timeline, if not yet.
|
||||
let next_msg = read_message(self.pgb_reader).await?;
|
||||
let tli = match next_msg {
|
||||
@@ -278,9 +300,19 @@ impl<'a, IO: AsyncRead + AsyncWrite + Unpin> NetworkReader<'a, IO> {
|
||||
)))
|
||||
}
|
||||
};
|
||||
Ok((tli, next_msg))
|
||||
}
|
||||
|
||||
async fn run(
|
||||
self,
|
||||
msg_tx: Sender<ProposerAcceptorMessage>,
|
||||
msg_rx: Receiver<ProposerAcceptorMessage>,
|
||||
reply_tx: Sender<AcceptorProposerMessage>,
|
||||
tli: Arc<Timeline>,
|
||||
next_msg: ProposerAcceptorMessage,
|
||||
) -> Result<(), CopyStreamHandlerEnd> {
|
||||
*self.acceptor_handle = Some(WalAcceptor::spawn(
|
||||
tli.clone(),
|
||||
tli,
|
||||
msg_rx,
|
||||
reply_tx,
|
||||
Some(self.conn_id),
|
||||
@@ -320,18 +352,46 @@ async fn read_network_loop<IO: AsyncRead + AsyncWrite + Unpin>(
|
||||
async fn network_write<IO: AsyncRead + AsyncWrite + Unpin>(
|
||||
pgb_writer: &mut PostgresBackend<IO>,
|
||||
mut reply_rx: Receiver<AcceptorProposerMessage>,
|
||||
mut pageserver_feedback_rx: tokio::sync::broadcast::Receiver<PageserverFeedback>,
|
||||
) -> Result<(), CopyStreamHandlerEnd> {
|
||||
let mut buf = BytesMut::with_capacity(128);
|
||||
|
||||
// storing append_response to inject PageserverFeedback into it
|
||||
let mut last_append_response = None;
|
||||
|
||||
loop {
|
||||
match reply_rx.recv().await {
|
||||
Some(msg) => {
|
||||
buf.clear();
|
||||
msg.serialize(&mut buf)?;
|
||||
pgb_writer.write_message(&BeMessage::CopyData(&buf)).await?;
|
||||
// trying to read either AcceptorProposerMessage or PageserverFeedback
|
||||
let msg = tokio::select! {
|
||||
reply = reply_rx.recv() => {
|
||||
if let Some(msg) = reply {
|
||||
if let AcceptorProposerMessage::AppendResponse(append_response) = &msg {
|
||||
last_append_response = Some(append_response.clone());
|
||||
}
|
||||
Some(msg)
|
||||
} else {
|
||||
return Ok(()); // chan closed, WalAcceptor terminated
|
||||
}
|
||||
}
|
||||
None => return Ok(()), // chan closed, WalAcceptor terminated
|
||||
}
|
||||
|
||||
feedback = pageserver_feedback_rx.recv() =>
|
||||
match (feedback, &last_append_response) {
|
||||
(Ok(feedback), Some(append_response)) => {
|
||||
// clone AppendResponse and inject PageserverFeedback into it
|
||||
let mut append_response = append_response.clone();
|
||||
append_response.pageserver_feedback = Some(feedback);
|
||||
Some(AcceptorProposerMessage::AppendResponse(append_response))
|
||||
}
|
||||
_ => None,
|
||||
}
|
||||
};
|
||||
|
||||
let Some(msg) = msg else {
|
||||
continue;
|
||||
};
|
||||
|
||||
buf.clear();
|
||||
msg.serialize(&mut buf)?;
|
||||
pgb_writer.write_message(&BeMessage::CopyData(&buf)).await?;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -321,7 +321,7 @@ pub struct AppendRequestHeader {
}

/// Report safekeeper state to proposer
#[derive(Debug, Serialize)]
#[derive(Debug, Serialize, Clone)]
pub struct AppendResponse {
    // Current term of the safekeeper; if it is higher than proposer's, the
    // compute is out of date.
@@ -334,7 +334,7 @@ pub struct AppendResponse {
    // a criterion for walproposer --sync mode exit
    pub commit_lsn: Lsn,
    pub hs_feedback: HotStandbyFeedback,
    pub pageserver_feedback: PageserverFeedback,
    pub pageserver_feedback: Option<PageserverFeedback>,
}

impl AppendResponse {
@@ -344,7 +344,7 @@ impl AppendResponse {
            flush_lsn: Lsn(0),
            commit_lsn: Lsn(0),
            hs_feedback: HotStandbyFeedback::empty(),
            pageserver_feedback: PageserverFeedback::empty(),
            pageserver_feedback: None,
        }
    }
}
@@ -462,7 +462,11 @@ impl AcceptorProposerMessage {
                buf.put_u64_le(msg.hs_feedback.xmin);
                buf.put_u64_le(msg.hs_feedback.catalog_xmin);

                msg.pageserver_feedback.serialize(buf);
                // AsyncReadMessage in walproposer.c will not try to decode pageserver_feedback
                // if it is not present.
                if let Some(ref msg) = msg.pageserver_feedback {
                    msg.serialize(buf);
                }
            }
        }
    }

@@ -681,7 +685,7 @@ where
            commit_lsn: self.state.commit_lsn,
            // will be filled by the upper code to avoid bothering safekeeper
            hs_feedback: HotStandbyFeedback::empty(),
            pageserver_feedback: PageserverFeedback::empty(),
            pageserver_feedback: None,
        };
        trace!("formed AppendResponse {:?}", ar);
        ar
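Turning `pageserver_feedback` into an `Option` means an `AppendResponse` can be sent before any pageserver feedback exists, and the feedback bytes become a strictly trailing, optional part of the message that older readers never attempt to decode; the feedback itself now reaches walproposers through the broadcast channel introduced in the `WalReceivers` hunks nearby. A reduced sketch of that layout, assuming the `bytes` crate; `Feedback` and `Reply` are illustrative stand-ins for the real safekeeper messages:

```rust
use bytes::{BufMut, BytesMut};

/// Illustrative stand-in for PageserverFeedback.
struct Feedback {
    last_received_lsn: u64,
}

/// Illustrative stand-in for AppendResponse: fixed fields plus optional feedback.
struct Reply {
    term: u64,
    flush_lsn: u64,
    feedback: Option<Feedback>,
}

fn serialize(reply: &Reply, buf: &mut BytesMut) {
    buf.put_u64_le(reply.term);
    buf.put_u64_le(reply.flush_lsn);
    // The optional part is strictly trailing: a reader that stops after the
    // fixed-size prefix never attempts to decode it.
    if let Some(fb) = &reply.feedback {
        buf.put_u64_le(fb.last_received_lsn);
    }
}

fn main() {
    let mut buf = BytesMut::new();
    serialize(&Reply { term: 3, flush_lsn: 42, feedback: None }, &mut buf);
    assert_eq!(buf.len(), 16); // no trailing feedback bytes

    buf.clear();
    serialize(
        &Reply { term: 3, flush_lsn: 42, feedback: Some(Feedback { last_received_lsn: 7 }) },
        &mut buf,
    );
    assert_eq!(buf.len(), 24);
}
```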
@@ -2,6 +2,8 @@
|
||||
//! with the "START_REPLICATION" message, and registry of walsenders.
|
||||
|
||||
use crate::handler::SafekeeperPostgresHandler;
|
||||
use crate::metrics::RECEIVED_PS_FEEDBACKS;
|
||||
use crate::receive_wal::WalReceivers;
|
||||
use crate::safekeeper::{Term, TermLsn};
|
||||
use crate::timeline::Timeline;
|
||||
use crate::wal_service::ConnectionId;
|
||||
@@ -21,7 +23,7 @@ use utils::failpoint_support;
|
||||
use utils::id::TenantTimelineId;
|
||||
use utils::pageserver_feedback::PageserverFeedback;
|
||||
|
||||
use std::cmp::{max, min};
|
||||
use std::cmp::min;
|
||||
use std::net::SocketAddr;
|
||||
use std::str;
|
||||
use std::sync::Arc;
|
||||
@@ -90,12 +92,14 @@ pub struct StandbyFeedback {
|
||||
/// WalSenders registry. Timeline holds it (wrapped in Arc).
|
||||
pub struct WalSenders {
|
||||
mutex: Mutex<WalSendersShared>,
|
||||
walreceivers: Arc<WalReceivers>,
|
||||
}
|
||||
|
||||
impl WalSenders {
|
||||
pub fn new() -> Arc<WalSenders> {
|
||||
pub fn new(walreceivers: Arc<WalReceivers>) -> Arc<WalSenders> {
|
||||
Arc::new(WalSenders {
|
||||
mutex: Mutex::new(WalSendersShared::new()),
|
||||
walreceivers,
|
||||
})
|
||||
}
|
||||
|
||||
@@ -151,22 +155,29 @@ impl WalSenders {
.min()
}

/// Get aggregated pageserver feedback.
pub fn get_ps_feedback(self: &Arc<WalSenders>) -> PageserverFeedback {
self.mutex.lock().agg_ps_feedback
/// Returns total counter of pageserver feedbacks received and last feedback.
pub fn get_ps_feedback_stats(self: &Arc<WalSenders>) -> (u64, PageserverFeedback) {
let shared = self.mutex.lock();
(shared.ps_feedback_counter, shared.last_ps_feedback)
}

/// Get aggregated pageserver and hot standby feedback (we send them to compute).
pub fn get_feedbacks(self: &Arc<WalSenders>) -> (PageserverFeedback, HotStandbyFeedback) {
let shared = self.mutex.lock();
(shared.agg_ps_feedback, shared.agg_hs_feedback)
/// Get aggregated hot standby feedback (we send it to compute).
pub fn get_hotstandby(self: &Arc<WalSenders>) -> HotStandbyFeedback {
self.mutex.lock().agg_hs_feedback
}

/// Record new pageserver feedback, update aggregated values.
fn record_ps_feedback(self: &Arc<WalSenders>, id: WalSenderId, feedback: &PageserverFeedback) {
let mut shared = self.mutex.lock();
shared.get_slot_mut(id).feedback = ReplicationFeedback::Pageserver(*feedback);
shared.update_ps_feedback();
shared.last_ps_feedback = *feedback;
shared.ps_feedback_counter += 1;
drop(shared);

RECEIVED_PS_FEEDBACKS.inc();

// send feedback to connected walproposers
self.walreceivers.broadcast_pageserver_feedback(*feedback);
}

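record_ps_feedback above keeps two cheap pieces of shared state, a running counter and the last feedback seen, and then fans the feedback out to connected walproposers outside the lock. A rough sketch of the same bookkeeping pattern in Python (class and method names are made up for illustration):

import threading

class FeedbackRegistry:
    def __init__(self, broadcast):
        self._lock = threading.Lock()
        self._count = 0              # total feedbacks ever received
        self._last = None            # last feedback message, if any
        self._broadcast = broadcast  # callable that forwards feedback to walproposers

    def record(self, feedback):
        with self._lock:
            self._count += 1
            self._last = feedback
        # Broadcast outside the lock, mirroring the drop(shared) before the send.
        self._broadcast(feedback)

    def stats(self):
        with self._lock:
            return self._count, self._last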
/// Record standby reply.
|
||||
@@ -222,8 +233,10 @@ impl WalSenders {
|
||||
struct WalSendersShared {
|
||||
// aggregated over all walsenders value
|
||||
agg_hs_feedback: HotStandbyFeedback,
|
||||
// aggregated over all walsenders value
|
||||
agg_ps_feedback: PageserverFeedback,
|
||||
// last feedback ever received from any pageserver, empty if none
|
||||
last_ps_feedback: PageserverFeedback,
|
||||
// total counter of pageserver feedbacks received
|
||||
ps_feedback_counter: u64,
|
||||
slots: Vec<Option<WalSenderState>>,
|
||||
}
|
||||
|
||||
@@ -231,7 +244,8 @@ impl WalSendersShared {
|
||||
fn new() -> Self {
|
||||
WalSendersShared {
|
||||
agg_hs_feedback: HotStandbyFeedback::empty(),
|
||||
agg_ps_feedback: PageserverFeedback::empty(),
|
||||
last_ps_feedback: PageserverFeedback::empty(),
|
||||
ps_feedback_counter: 0,
|
||||
slots: Vec::new(),
|
||||
}
|
||||
}
|
||||
@@ -276,37 +290,6 @@ impl WalSendersShared {
|
||||
}
|
||||
self.agg_hs_feedback = agg;
|
||||
}
|
||||
|
||||
/// Update aggregated pageserver feedback. LSNs (last_received,
|
||||
/// disk_consistent, remote_consistent) and reply timestamp are just
|
||||
/// maximized; timeline_size is taken from feedback with highest
|
||||
/// last_received lsn. This is generally reasonable, but we might want to
|
||||
/// implement other policies once multiple pageservers start to be actively
|
||||
/// used.
|
||||
fn update_ps_feedback(&mut self) {
|
||||
let init = PageserverFeedback::empty();
|
||||
let acc =
|
||||
self.slots
|
||||
.iter()
|
||||
.flatten()
|
||||
.fold(init, |mut acc, ws_state| match ws_state.feedback {
|
||||
ReplicationFeedback::Pageserver(feedback) => {
|
||||
if feedback.last_received_lsn > acc.last_received_lsn {
|
||||
acc.current_timeline_size = feedback.current_timeline_size;
|
||||
}
|
||||
acc.last_received_lsn =
|
||||
max(feedback.last_received_lsn, acc.last_received_lsn);
|
||||
acc.disk_consistent_lsn =
|
||||
max(feedback.disk_consistent_lsn, acc.disk_consistent_lsn);
|
||||
acc.remote_consistent_lsn =
|
||||
max(feedback.remote_consistent_lsn, acc.remote_consistent_lsn);
|
||||
acc.replytime = max(feedback.replytime, acc.replytime);
|
||||
acc
|
||||
}
|
||||
ReplicationFeedback::Standby(_) => acc,
|
||||
});
|
||||
self.agg_ps_feedback = acc;
|
||||
}
|
||||
}
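The removed update_ps_feedback above documents the old aggregation policy: LSNs and reply timestamps are maximized across walsenders, while current_timeline_size follows whichever feedback has the highest last_received_lsn. A small Python sketch of that fold, using a simplified feedback record (illustrative, not the real PageserverFeedback type):

from dataclasses import dataclass

@dataclass
class Feedback:
    last_received_lsn: int = 0
    disk_consistent_lsn: int = 0
    remote_consistent_lsn: int = 0
    replytime: float = 0.0
    current_timeline_size: int = 0

def aggregate(feedbacks):
    acc = Feedback()
    for fb in feedbacks:
        # timeline size follows the feedback with the most recent WAL position
        if fb.last_received_lsn > acc.last_received_lsn:
            acc.current_timeline_size = fb.current_timeline_size
        acc.last_received_lsn = max(acc.last_received_lsn, fb.last_received_lsn)
        acc.disk_consistent_lsn = max(acc.disk_consistent_lsn, fb.disk_consistent_lsn)
        acc.remote_consistent_lsn = max(acc.remote_consistent_lsn, fb.remote_consistent_lsn)
        acc.replytime = max(acc.replytime, fb.replytime)
    return acc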
|
||||
|
||||
// Serialized is used only for pretty printing in json.
|
||||
@@ -443,7 +426,7 @@ impl SafekeeperPostgresHandler {
|
||||
};
|
||||
let mut reply_reader = ReplyReader {
|
||||
reader,
|
||||
ws_guard,
|
||||
ws_guard: ws_guard.clone(),
|
||||
tli,
|
||||
};
|
||||
|
||||
@@ -452,6 +435,18 @@ impl SafekeeperPostgresHandler {
|
||||
r = sender.run() => r,
|
||||
r = reply_reader.run() => r,
|
||||
};
|
||||
|
||||
let ws_state = ws_guard
|
||||
.walsenders
|
||||
.mutex
|
||||
.lock()
|
||||
.get_slot(ws_guard.id)
|
||||
.clone();
|
||||
info!(
|
||||
"finished streaming to {}, feedback={:?}",
|
||||
ws_state.addr, ws_state.feedback,
|
||||
);
|
||||
|
||||
// Join pg backend back.
|
||||
pgb.unsplit(reply_reader.reader)?;
|
||||
|
||||
@@ -733,7 +728,6 @@ async fn wait_for_lsn(
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use postgres_protocol::PG_EPOCH;
|
||||
use utils::id::{TenantId, TimelineId};
|
||||
|
||||
use super::*;
|
||||
@@ -792,27 +786,4 @@ mod tests {
|
||||
wss.update_hs_feedback();
|
||||
assert_eq!(wss.agg_hs_feedback.xmin, 42);
|
||||
}
|
||||
|
||||
// form pageserver feedback with given last_record_lsn / tli size and the
|
||||
// rest set to dummy values.
|
||||
fn ps_feedback(current_timeline_size: u64, last_received_lsn: Lsn) -> ReplicationFeedback {
|
||||
ReplicationFeedback::Pageserver(PageserverFeedback {
|
||||
current_timeline_size,
|
||||
last_received_lsn,
|
||||
disk_consistent_lsn: Lsn::INVALID,
|
||||
remote_consistent_lsn: Lsn::INVALID,
|
||||
replytime: *PG_EPOCH,
|
||||
})
|
||||
}
|
||||
|
||||
// test that ps aggregation works as expected
|
||||
#[test]
|
||||
fn test_ps_feedback() {
|
||||
let mut wss = WalSendersShared::new();
|
||||
push_feedback(&mut wss, ps_feedback(8, Lsn(42)));
|
||||
push_feedback(&mut wss, ps_feedback(4, Lsn(84)));
|
||||
wss.update_ps_feedback();
|
||||
assert_eq!(wss.agg_ps_feedback.current_timeline_size, 4);
|
||||
assert_eq!(wss.agg_ps_feedback.last_received_lsn, Lsn(84));
|
||||
}
|
||||
}
|
||||
|
||||
@@ -402,6 +402,7 @@ impl Timeline {
|
||||
)));
|
||||
let (cancellation_tx, cancellation_rx) = watch::channel(false);
|
||||
|
||||
let walreceivers = WalReceivers::new();
|
||||
Ok(Timeline {
|
||||
ttid,
|
||||
wal_backup_launcher_tx,
|
||||
@@ -410,8 +411,8 @@ impl Timeline {
|
||||
term_flush_lsn_watch_tx,
|
||||
term_flush_lsn_watch_rx,
|
||||
mutex: Mutex::new(shared_state),
|
||||
walsenders: WalSenders::new(),
|
||||
walreceivers: WalReceivers::new(),
|
||||
walsenders: WalSenders::new(walreceivers.clone()),
|
||||
walreceivers,
|
||||
cancellation_rx,
|
||||
cancellation_tx,
|
||||
timeline_dir: conf.timeline_dir(&ttid),
|
||||
@@ -435,6 +436,7 @@ impl Timeline {
|
||||
let state =
|
||||
TimelinePersistentState::new(&ttid, server_info, vec![], commit_lsn, local_start_lsn);
|
||||
|
||||
let walreceivers = WalReceivers::new();
|
||||
Ok(Timeline {
|
||||
ttid,
|
||||
wal_backup_launcher_tx,
|
||||
@@ -443,8 +445,8 @@ impl Timeline {
|
||||
term_flush_lsn_watch_tx,
|
||||
term_flush_lsn_watch_rx,
|
||||
mutex: Mutex::new(SharedState::create_new(conf, &ttid, state)?),
|
||||
walsenders: WalSenders::new(),
|
||||
walreceivers: WalReceivers::new(),
|
||||
walsenders: WalSenders::new(walreceivers.clone()),
|
||||
walreceivers,
|
||||
cancellation_rx,
|
||||
cancellation_tx,
|
||||
timeline_dir: conf.timeline_dir(&ttid),
|
||||
@@ -656,12 +658,9 @@ impl Timeline {
|
||||
let mut shared_state = self.write_shared_state().await;
|
||||
rmsg = shared_state.sk.process_msg(msg).await?;
|
||||
|
||||
// if this is AppendResponse, fill in proper pageserver and hot
|
||||
// standby feedback.
|
||||
// if this is AppendResponse, fill in proper hot standby feedback.
|
||||
if let Some(AcceptorProposerMessage::AppendResponse(ref mut resp)) = rmsg {
|
||||
let (ps_feedback, hs_feedback) = self.walsenders.get_feedbacks();
|
||||
resp.hs_feedback = hs_feedback;
|
||||
resp.pageserver_feedback = ps_feedback;
|
||||
resp.hs_feedback = self.walsenders.get_hotstandby();
|
||||
}
|
||||
|
||||
commit_lsn = shared_state.sk.state.inmem.commit_lsn;
|
||||
@@ -898,12 +897,13 @@ impl Timeline {
|
||||
return None;
|
||||
}
|
||||
|
||||
let ps_feedback = self.walsenders.get_ps_feedback();
|
||||
let (ps_feedback_count, last_ps_feedback) = self.walsenders.get_ps_feedback_stats();
|
||||
let state = self.write_shared_state().await;
|
||||
if state.active {
|
||||
Some(FullTimelineInfo {
|
||||
ttid: self.ttid,
|
||||
ps_feedback,
|
||||
ps_feedback_count,
|
||||
last_ps_feedback,
|
||||
wal_backup_active: state.wal_backup_active,
|
||||
timeline_is_active: state.active,
|
||||
num_computes: self.walreceivers.get_num() as u32,
|
||||
|
||||
@@ -196,6 +196,7 @@ pub struct SimulationApi {
|
||||
safekeepers: RefCell<Vec<SafekeeperConn>>,
|
||||
disk: Arc<DiskWalProposer>,
|
||||
redo_start_lsn: Option<Lsn>,
|
||||
last_logged_commit_lsn: u64,
|
||||
shmem: UnsafeCell<walproposer::bindings::WalproposerShmemState>,
|
||||
config: Config,
|
||||
event_set: RefCell<Option<EventSet>>,
|
||||
@@ -228,6 +229,7 @@ impl SimulationApi {
|
||||
safekeepers: RefCell::new(sk_conns),
|
||||
disk: args.disk,
|
||||
redo_start_lsn: args.redo_start_lsn,
|
||||
last_logged_commit_lsn: 0,
|
||||
shmem: UnsafeCell::new(walproposer::bindings::WalproposerShmemState {
|
||||
mutex: 0,
|
||||
feedback: PageserverFeedback {
|
||||
@@ -596,14 +598,11 @@ impl ApiImpl for SimulationApi {
|
||||
}
|
||||
}
|
||||
|
||||
fn process_safekeeper_feedback(
|
||||
&self,
|
||||
wp: &mut walproposer::bindings::WalProposer,
|
||||
commit_lsn: u64,
|
||||
) {
|
||||
debug!("process_safekeeper_feedback, commit_lsn={}", commit_lsn);
|
||||
if commit_lsn > wp.lastSentCommitLsn {
|
||||
self.os.log_event(format!("commit_lsn;{}", commit_lsn));
|
||||
fn process_safekeeper_feedback(&mut self, wp: &mut walproposer::bindings::WalProposer) {
|
||||
debug!("process_safekeeper_feedback, commit_lsn={}", wp.commitLsn);
|
||||
if wp.commitLsn > self.last_logged_commit_lsn {
|
||||
self.os.log_event(format!("commit_lsn;{}", wp.commitLsn));
|
||||
self.last_logged_commit_lsn = wp.commitLsn;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -15,7 +15,8 @@ FLAKY_TESTS_QUERY = """
|
||||
DISTINCT parent_suite, suite, name
|
||||
FROM results
|
||||
WHERE
|
||||
started_at > CURRENT_DATE - INTERVAL '%s' day
|
||||
started_at > CURRENT_DATE - INTERVAL '10' day
|
||||
AND started_at > '2024-03-11 14:50:11.845+00' -- we switched the default PAGESERVER_VIRTUAL_FILE_IO_ENGINE to `tokio-epoll-uring` from `std-fs` on this date, we want to ignore the flaky tests for `std-fs`
|
||||
AND (
|
||||
(status IN ('failed', 'broken') AND reference = 'refs/heads/main')
|
||||
OR flaky
|
||||
@@ -46,11 +47,14 @@ def main(args: argparse.Namespace):
|
||||
logging.error("cannot fetch flaky tests from the DB due to an error", exc)
|
||||
rows = []
|
||||
|
||||
# If a test run has non-default PAGESERVER_VIRTUAL_FILE_IO_ENGINE (i.e. not empty, not std-fs),
|
||||
# If a test run has non-default PAGESERVER_VIRTUAL_FILE_IO_ENGINE (i.e. not empty, not tokio-epoll-uring),
|
||||
# use it to parametrize test name along with build_type and pg_version
|
||||
#
|
||||
# See test_runner/fixtures/parametrize.py for details
|
||||
if (io_engine := os.getenv("PAGESERVER_VIRTUAL_FILE_IO_ENGINE", "")) not in ("", "std-fs"):
|
||||
if (io_engine := os.getenv("PAGESERVER_VIRTUAL_FILE_IO_ENGINE", "")) not in (
|
||||
"",
|
||||
"tokio-epoll-uring",
|
||||
):
|
||||
pageserver_virtual_file_io_engine_parameter = f"-{io_engine}"
|
||||
else:
|
||||
pageserver_virtual_file_io_engine_parameter = ""
|
||||
|
||||
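Both this helper and the pytest parametrization hunk later in the diff apply the same rule: only a non-default io engine (anything other than empty or tokio-epoll-uring after the default switch) gets baked into the parametrized test name. A tiny sketch of that rule as a standalone helper (hypothetical function name):

import os

DEFAULT_IO_ENGINES = ("", "tokio-epoll-uring")

def io_engine_suffix() -> str:
    # Non-default engines become part of the parametrized test name;
    # the default engine keeps names unchanged so test statistics stay comparable.
    io_engine = os.getenv("PAGESERVER_VIRTUAL_FILE_IO_ENGINE", "")
    return f"-{io_engine}" if io_engine not in DEFAULT_IO_ENGINES else ""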
@@ -15,11 +15,11 @@ import threading
|
||||
import time
|
||||
import uuid
|
||||
from contextlib import closing, contextmanager
|
||||
from dataclasses import dataclass, field
|
||||
from dataclasses import dataclass
|
||||
from datetime import datetime
|
||||
from enum import Enum
|
||||
from fcntl import LOCK_EX, LOCK_UN, flock
|
||||
from functools import cached_property
|
||||
from functools import cached_property, partial
|
||||
from itertools import chain, product
|
||||
from pathlib import Path
|
||||
from types import TracebackType
|
||||
@@ -70,6 +70,8 @@ from fixtures.remote_storage import (
|
||||
default_remote_storage,
|
||||
remote_storage_to_toml_inline_table,
|
||||
)
|
||||
from fixtures.safekeeper.http import SafekeeperHttpClient
|
||||
from fixtures.safekeeper.utils import are_walreceivers_absent
|
||||
from fixtures.types import Lsn, TenantId, TenantShardId, TimelineId
|
||||
from fixtures.utils import (
|
||||
ATTACHMENT_NAME_REGEX,
|
||||
@@ -517,9 +519,9 @@ class NeonEnvBuilder:
|
||||
self.env = NeonEnv(self)
|
||||
return self.env
|
||||
|
||||
def start(self):
|
||||
def start(self, register_pageservers=False):
|
||||
assert self.env is not None, "environment is not already initialized, call init() first"
|
||||
self.env.start()
|
||||
self.env.start(register_pageservers=register_pageservers)
|
||||
|
||||
def init_start(
|
||||
self,
|
||||
@@ -1012,24 +1014,24 @@ class NeonEnv:
|
||||
self.initial_tenant = config.initial_tenant
|
||||
self.initial_timeline = config.initial_timeline
|
||||
|
||||
# Find two adjacent ports for attachment service and its postgres DB. This
|
||||
# Find two adjacent ports for storage controller and its postgres DB. This
|
||||
# loop would eventually throw from get_port() if we run out of ports (extremely
|
||||
# unlikely): usually we find two adjacent free ports on the first iteration.
|
||||
while True:
|
||||
self.attachment_service_port = self.port_distributor.get_port()
|
||||
attachment_service_pg_port = self.port_distributor.get_port()
|
||||
if attachment_service_pg_port == self.attachment_service_port + 1:
|
||||
self.storage_controller_port = self.port_distributor.get_port()
|
||||
storage_controller_pg_port = self.port_distributor.get_port()
|
||||
if storage_controller_pg_port == self.storage_controller_port + 1:
|
||||
break
|
||||
|
||||
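The loop above simply retries until the port distributor happens to hand out two adjacent ports for the storage controller API and its postgres DB. The same search, extracted as a helper (hypothetical name, assuming get_port() never returns the same port twice):

def find_adjacent_port_pair(get_port):
    # Draw ports until two consecutive draws are numerically adjacent.
    while True:
        api_port = get_port()
        pg_port = get_port()
        if pg_port == api_port + 1:
            return api_port, pg_port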
# The URL for the pageserver to use as its control_plane_api config
|
||||
self.control_plane_api: str = f"http://127.0.0.1:{self.attachment_service_port}/upcall/v1"
|
||||
# The base URL of the attachment service
|
||||
self.attachment_service_api: str = f"http://127.0.0.1:{self.attachment_service_port}"
|
||||
self.control_plane_api: str = f"http://127.0.0.1:{self.storage_controller_port}/upcall/v1"
|
||||
# The base URL of the storage controller
|
||||
self.storage_controller_api: str = f"http://127.0.0.1:{self.storage_controller_port}"
|
||||
|
||||
# For testing this with a fake HTTP server, enable passing through a URL from config
|
||||
self.control_plane_compute_hook_api = config.control_plane_compute_hook_api
|
||||
|
||||
self.attachment_service: NeonAttachmentService = NeonAttachmentService(
|
||||
self.storage_controller: NeonStorageController = NeonStorageController(
|
||||
self, config.auth_enabled
|
||||
)
|
||||
|
||||
@@ -1110,17 +1112,22 @@ class NeonEnv:
|
||||
log.info(f"Config: {cfg}")
|
||||
self.neon_cli.init(cfg, force=config.config_init_force)
|
||||
|
||||
def start(self):
|
||||
# Attachment service starts first, so that pageserver /re-attach calls don't
|
||||
def start(self, register_pageservers=False):
|
||||
# storage controller starts first, so that pageserver /re-attach calls don't
|
||||
# bounce through retries on startup
|
||||
self.attachment_service.start()
|
||||
self.storage_controller.start()
|
||||
|
||||
def attachment_service_ready():
|
||||
assert self.attachment_service.ready() is True
|
||||
def storage_controller_ready():
|
||||
assert self.storage_controller.ready() is True
|
||||
|
||||
# Wait for attachment service readiness to prevent unnecessary post start-up
|
||||
# Wait for storage controller readiness to prevent unnecessary post start-up
|
||||
# reconcile.
|
||||
wait_until(30, 1, attachment_service_ready)
|
||||
wait_until(30, 1, storage_controller_ready)
|
||||
|
||||
if register_pageservers:
|
||||
# Special case for forward compat tests, this can be removed later.
|
||||
for pageserver in self.pageservers:
|
||||
self.storage_controller.node_register(pageserver)
|
||||
|
||||
# Start up broker, pageserver and all safekeepers
|
||||
futs = []
|
||||
@@ -1151,7 +1158,7 @@ class NeonEnv:
|
||||
if ps_assert_metric_no_errors:
|
||||
pageserver.assert_no_metric_errors()
|
||||
pageserver.stop(immediate=immediate)
|
||||
self.attachment_service.stop(immediate=immediate)
|
||||
self.storage_controller.stop(immediate=immediate)
|
||||
self.broker.stop(immediate=immediate)
|
||||
|
||||
@property
|
||||
@@ -1186,9 +1193,9 @@ class NeonEnv:
|
||||
def get_tenant_pageserver(self, tenant_id: Union[TenantId, TenantShardId]):
|
||||
"""
|
||||
Get the NeonPageserver where this tenant shard is currently attached, according
|
||||
to the attachment service.
|
||||
to the storage controller.
|
||||
"""
|
||||
meta = self.attachment_service.inspect(tenant_id)
|
||||
meta = self.storage_controller.inspect(tenant_id)
|
||||
if meta is None:
|
||||
return None
|
||||
pageserver_id = meta[1]
|
||||
@@ -1518,7 +1525,6 @@ class NeonCli(AbstractNeonCli):
|
||||
conf: Optional[Dict[str, Any]] = None,
|
||||
shard_count: Optional[int] = None,
|
||||
shard_stripe_size: Optional[int] = None,
|
||||
placement_policy: Optional[str] = None,
|
||||
set_default: bool = False,
|
||||
) -> Tuple[TenantId, TimelineId]:
|
||||
"""
|
||||
@@ -1552,9 +1558,6 @@ class NeonCli(AbstractNeonCli):
|
||||
if shard_stripe_size is not None:
|
||||
args.extend(["--shard-stripe-size", str(shard_stripe_size)])
|
||||
|
||||
if placement_policy is not None:
|
||||
args.extend(["--placement-policy", str(placement_policy)])
|
||||
|
||||
res = self.raw_cli(args)
|
||||
res.check_returncode()
|
||||
return tenant_id, timeline_id
|
||||
@@ -1699,12 +1702,12 @@ class NeonCli(AbstractNeonCli):
|
||||
res.check_returncode()
|
||||
return res
|
||||
|
||||
def attachment_service_start(self):
|
||||
cmd = ["attachment_service", "start"]
|
||||
def storage_controller_start(self):
|
||||
cmd = ["storage_controller", "start"]
|
||||
return self.raw_cli(cmd)
|
||||
|
||||
def attachment_service_stop(self, immediate: bool):
|
||||
cmd = ["attachment_service", "stop"]
|
||||
def storage_controller_stop(self, immediate: bool):
|
||||
cmd = ["storage_controller", "stop"]
|
||||
if immediate:
|
||||
cmd.extend(["-m", "immediate"])
|
||||
return self.raw_cli(cmd)
|
||||
@@ -1714,10 +1717,8 @@ class NeonCli(AbstractNeonCli):
|
||||
id: int,
|
||||
overrides: Tuple[str, ...] = (),
|
||||
extra_env_vars: Optional[Dict[str, str]] = None,
|
||||
register: bool = True,
|
||||
) -> "subprocess.CompletedProcess[str]":
|
||||
register_str = "true" if register else "false"
|
||||
start_args = ["pageserver", "start", f"--id={id}", *overrides, f"--register={register_str}"]
|
||||
start_args = ["pageserver", "start", f"--id={id}", *overrides]
|
||||
storage = self.env.pageserver_remote_storage
|
||||
append_pageserver_param_overrides(
|
||||
params_to_update=start_args,
|
||||
@@ -1944,14 +1945,14 @@ class Pagectl(AbstractNeonCli):
|
||||
return IndexPartDump.from_json(parsed)
|
||||
|
||||
|
||||
class AttachmentServiceApiException(Exception):
|
||||
class StorageControllerApiException(Exception):
|
||||
def __init__(self, message, status_code: int):
|
||||
super().__init__(message)
|
||||
self.message = message
|
||||
self.status_code = status_code
|
||||
|
||||
|
||||
class NeonAttachmentService(MetricsGetter):
|
||||
class NeonStorageController(MetricsGetter):
|
||||
def __init__(self, env: NeonEnv, auth_enabled: bool):
|
||||
self.env = env
|
||||
self.running = False
|
||||
@@ -1959,13 +1960,13 @@ class NeonAttachmentService(MetricsGetter):
|
||||
|
||||
def start(self):
|
||||
assert not self.running
|
||||
self.env.neon_cli.attachment_service_start()
|
||||
self.env.neon_cli.storage_controller_start()
|
||||
self.running = True
|
||||
return self
|
||||
|
||||
def stop(self, immediate: bool = False) -> "NeonAttachmentService":
|
||||
def stop(self, immediate: bool = False) -> "NeonStorageController":
|
||||
if self.running:
|
||||
self.env.neon_cli.attachment_service_stop(immediate)
|
||||
self.env.neon_cli.storage_controller_stop(immediate)
|
||||
self.running = False
|
||||
return self
|
||||
|
||||
@@ -1978,22 +1979,22 @@ class NeonAttachmentService(MetricsGetter):
|
||||
msg = res.json()["msg"]
|
||||
except: # noqa: E722
|
||||
msg = ""
|
||||
raise AttachmentServiceApiException(msg, res.status_code) from e
|
||||
raise StorageControllerApiException(msg, res.status_code) from e
|
||||
|
||||
def pageserver_api(self) -> PageserverHttpClient:
|
||||
"""
|
||||
The attachment service implements a subset of the pageserver REST API, for mapping
|
||||
The storage controller implements a subset of the pageserver REST API, for mapping
|
||||
per-tenant actions into per-shard actions (e.g. timeline creation). Tests should invoke those
|
||||
functions via the HttpClient, as an implicit check that these APIs remain compatible.
|
||||
"""
|
||||
auth_token = None
|
||||
if self.auth_enabled:
|
||||
auth_token = self.env.auth_keys.generate_token(scope=TokenScope.PAGE_SERVER_API)
|
||||
return PageserverHttpClient(self.env.attachment_service_port, lambda: True, auth_token)
|
||||
return PageserverHttpClient(self.env.storage_controller_port, lambda: True, auth_token)
|
||||
|
||||
def request(self, method, *args, **kwargs) -> requests.Response:
|
||||
resp = requests.request(method, *args, **kwargs)
|
||||
NeonAttachmentService.raise_api_exception(resp)
|
||||
NeonStorageController.raise_api_exception(resp)
|
||||
|
||||
return resp
|
||||
|
||||
@@ -2006,15 +2007,15 @@ class NeonAttachmentService(MetricsGetter):
|
||||
return headers
|
||||
|
||||
def get_metrics(self) -> Metrics:
|
||||
res = self.request("GET", f"{self.env.attachment_service_api}/metrics")
|
||||
res = self.request("GET", f"{self.env.storage_controller_api}/metrics")
|
||||
return parse_metrics(res.text)
|
||||
|
||||
def ready(self) -> bool:
|
||||
status = None
|
||||
try:
|
||||
resp = self.request("GET", f"{self.env.attachment_service_api}/ready")
|
||||
resp = self.request("GET", f"{self.env.storage_controller_api}/ready")
|
||||
status = resp.status_code
|
||||
except AttachmentServiceApiException as e:
|
||||
except StorageControllerApiException as e:
|
||||
status = e.status_code
|
||||
|
||||
if status == 503:
|
||||
@@ -2029,7 +2030,7 @@ class NeonAttachmentService(MetricsGetter):
|
||||
) -> int:
|
||||
response = self.request(
|
||||
"POST",
|
||||
f"{self.env.attachment_service_api}/debug/v1/attach-hook",
|
||||
f"{self.env.storage_controller_api}/debug/v1/attach-hook",
|
||||
json={"tenant_shard_id": str(tenant_shard_id), "node_id": pageserver_id},
|
||||
headers=self.headers(TokenScope.ADMIN),
|
||||
)
|
||||
@@ -2040,7 +2041,7 @@ class NeonAttachmentService(MetricsGetter):
|
||||
def attach_hook_drop(self, tenant_shard_id: Union[TenantId, TenantShardId]):
|
||||
self.request(
|
||||
"POST",
|
||||
f"{self.env.attachment_service_api}/debug/v1/attach-hook",
|
||||
f"{self.env.storage_controller_api}/debug/v1/attach-hook",
|
||||
json={"tenant_shard_id": str(tenant_shard_id), "node_id": None},
|
||||
headers=self.headers(TokenScope.ADMIN),
|
||||
)
|
||||
@@ -2051,7 +2052,7 @@ class NeonAttachmentService(MetricsGetter):
|
||||
"""
|
||||
response = self.request(
|
||||
"POST",
|
||||
f"{self.env.attachment_service_api}/debug/v1/inspect",
|
||||
f"{self.env.storage_controller_api}/debug/v1/inspect",
|
||||
json={"tenant_shard_id": str(tenant_shard_id)},
|
||||
headers=self.headers(TokenScope.ADMIN),
|
||||
)
|
||||
@@ -2068,11 +2069,13 @@ class NeonAttachmentService(MetricsGetter):
|
||||
"node_id": int(node.id),
|
||||
"listen_http_addr": "localhost",
|
||||
"listen_http_port": node.service_port.http,
|
||||
"listen_pg_addr": "localhost",
|
||||
"listen_pg_port": node.service_port.pg,
|
||||
}
|
||||
log.info(f"node_register({body})")
|
||||
self.request(
|
||||
"POST",
|
||||
f"{self.env.attachment_service_api}/control/v1/node",
|
||||
f"{self.env.storage_controller_api}/control/v1/node",
|
||||
json=body,
|
||||
headers=self.headers(TokenScope.ADMIN),
|
||||
)
|
||||
@@ -2080,7 +2083,7 @@ class NeonAttachmentService(MetricsGetter):
|
||||
def node_list(self):
|
||||
response = self.request(
|
||||
"GET",
|
||||
f"{self.env.attachment_service_api}/control/v1/node",
|
||||
f"{self.env.storage_controller_api}/control/v1/node",
|
||||
headers=self.headers(TokenScope.ADMIN),
|
||||
)
|
||||
return response.json()
|
||||
@@ -2090,7 +2093,7 @@ class NeonAttachmentService(MetricsGetter):
|
||||
body["node_id"] = node_id
|
||||
self.request(
|
||||
"PUT",
|
||||
f"{self.env.attachment_service_api}/control/v1/node/{node_id}/config",
|
||||
f"{self.env.storage_controller_api}/control/v1/node/{node_id}/config",
|
||||
json=body,
|
||||
headers=self.headers(TokenScope.ADMIN),
|
||||
)
|
||||
@@ -2120,7 +2123,7 @@ class NeonAttachmentService(MetricsGetter):
|
||||
|
||||
response = self.request(
|
||||
"POST",
|
||||
f"{self.env.attachment_service_api}/v1/tenant",
|
||||
f"{self.env.storage_controller_api}/v1/tenant",
|
||||
json=body,
|
||||
headers=self.headers(TokenScope.PAGE_SERVER_API),
|
||||
)
|
||||
@@ -2132,18 +2135,20 @@ class NeonAttachmentService(MetricsGetter):
|
||||
"""
|
||||
response = self.request(
|
||||
"GET",
|
||||
f"{self.env.attachment_service_api}/control/v1/tenant/{tenant_id}/locate",
|
||||
f"{self.env.storage_controller_api}/control/v1/tenant/{tenant_id}/locate",
|
||||
headers=self.headers(TokenScope.ADMIN),
|
||||
)
|
||||
body = response.json()
|
||||
shards: list[dict[str, Any]] = body["shards"]
|
||||
return shards
|
||||
|
||||
def tenant_shard_split(self, tenant_id: TenantId, shard_count: int) -> list[TenantShardId]:
|
||||
def tenant_shard_split(
|
||||
self, tenant_id: TenantId, shard_count: int, shard_stripe_size: Optional[int] = None
|
||||
) -> list[TenantShardId]:
|
||||
response = self.request(
|
||||
"PUT",
|
||||
f"{self.env.attachment_service_api}/control/v1/tenant/{tenant_id}/shard_split",
|
||||
json={"new_shard_count": shard_count},
|
||||
f"{self.env.storage_controller_api}/control/v1/tenant/{tenant_id}/shard_split",
|
||||
json={"new_shard_count": shard_count, "new_stripe_size": shard_stripe_size},
|
||||
headers=self.headers(TokenScope.ADMIN),
|
||||
)
|
||||
body = response.json()
|
||||
@@ -2154,7 +2159,7 @@ class NeonAttachmentService(MetricsGetter):
|
||||
def tenant_shard_migrate(self, tenant_shard_id: TenantShardId, dest_ps_id: int):
|
||||
self.request(
|
||||
"PUT",
|
||||
f"{self.env.attachment_service_api}/control/v1/tenant/{tenant_shard_id}/migrate",
|
||||
f"{self.env.storage_controller_api}/control/v1/tenant/{tenant_shard_id}/migrate",
|
||||
json={"tenant_shard_id": str(tenant_shard_id), "node_id": dest_ps_id},
|
||||
headers=self.headers(TokenScope.ADMIN),
|
||||
)
|
||||
@@ -2167,43 +2172,12 @@ class NeonAttachmentService(MetricsGetter):
|
||||
"""
|
||||
self.request(
|
||||
"POST",
|
||||
f"{self.env.attachment_service_api}/debug/v1/consistency_check",
|
||||
f"{self.env.storage_controller_api}/debug/v1/consistency_check",
|
||||
headers=self.headers(TokenScope.ADMIN),
|
||||
)
|
||||
log.info("Attachment service passed consistency check")
|
||||
log.info("storage controller passed consistency check")
|
||||
|
||||
def configure_failpoints(self, config_strings: Tuple[str, str] | List[Tuple[str, str]]):
|
||||
if isinstance(config_strings, tuple):
|
||||
pairs = [config_strings]
|
||||
else:
|
||||
pairs = config_strings
|
||||
|
||||
log.info(f"Requesting config failpoints: {repr(pairs)}")
|
||||
|
||||
res = self.request(
|
||||
"PUT",
|
||||
f"{self.env.attachment_service_api}/debug/v1/failpoints",
|
||||
json=[{"name": name, "actions": actions} for name, actions in pairs],
|
||||
headers=self.headers(TokenScope.ADMIN),
|
||||
)
|
||||
log.info(f"Got failpoints request response code {res.status_code}")
|
||||
res.raise_for_status()
|
||||
|
||||
def balance_all(self):
|
||||
self.request(
|
||||
"POST",
|
||||
f"{self.env.attachment_service_api}/control/v1/balance/all",
|
||||
headers=self.headers(TokenScope.ADMIN),
|
||||
)
|
||||
|
||||
def balance_attached(self):
|
||||
self.request(
|
||||
"POST",
|
||||
f"{self.env.attachment_service_api}/control/v1/balance/attached",
|
||||
headers=self.headers(TokenScope.ADMIN),
|
||||
)
|
||||
|
||||
def __enter__(self) -> "NeonAttachmentService":
|
||||
def __enter__(self) -> "NeonStorageController":
|
||||
return self
|
||||
|
||||
def __exit__(
|
||||
@@ -2266,7 +2240,6 @@ class NeonPageserver(PgProtocol):
|
||||
self,
|
||||
overrides: Tuple[str, ...] = (),
|
||||
extra_env_vars: Optional[Dict[str, str]] = None,
|
||||
register: bool = True,
|
||||
) -> "NeonPageserver":
|
||||
"""
|
||||
Start the page server.
|
||||
@@ -2276,7 +2249,7 @@ class NeonPageserver(PgProtocol):
|
||||
assert self.running is False
|
||||
|
||||
self.env.neon_cli.pageserver_start(
|
||||
self.id, overrides=overrides, extra_env_vars=extra_env_vars, register=register
|
||||
self.id, overrides=overrides, extra_env_vars=extra_env_vars
|
||||
)
|
||||
self.running = True
|
||||
return self
|
||||
@@ -2357,16 +2330,16 @@ class NeonPageserver(PgProtocol):
|
||||
def assert_no_errors(self):
|
||||
logfile = self.workdir / "pageserver.log"
|
||||
if not logfile.exists():
|
||||
log.warning(f"Skipping log check on pageserver {self.id}: {logfile} does not exist")
|
||||
log.warning(f"Skipping log check: {logfile} does not exist")
|
||||
return
|
||||
|
||||
with logfile.open("r") as f:
|
||||
errors = scan_pageserver_log_for_errors(f, self.allowed_errors)
|
||||
|
||||
for _lineno, error in errors:
|
||||
log.info(f"not allowed error (pageserver {self.id}): {error.strip()}")
|
||||
log.info(f"not allowed error: {error.strip()}")
|
||||
|
||||
assert not errors, f"Pageserver {self.id}: {errors}"
|
||||
assert not errors
|
||||
|
||||
def assert_no_metric_errors(self):
|
||||
"""
|
||||
@@ -2434,7 +2407,7 @@ class NeonPageserver(PgProtocol):
|
||||
"""
|
||||
client = self.http_client()
|
||||
if generation is None:
|
||||
generation = self.env.attachment_service.attach_hook_issue(tenant_id, self.id)
|
||||
generation = self.env.storage_controller.attach_hook_issue(tenant_id, self.id)
|
||||
return client.tenant_attach(
|
||||
tenant_id,
|
||||
config,
|
||||
@@ -2443,14 +2416,14 @@ class NeonPageserver(PgProtocol):
|
||||
)
|
||||
|
||||
def tenant_detach(self, tenant_id: TenantId):
|
||||
self.env.attachment_service.attach_hook_drop(tenant_id)
|
||||
self.env.storage_controller.attach_hook_drop(tenant_id)
|
||||
|
||||
client = self.http_client()
|
||||
return client.tenant_detach(tenant_id)
|
||||
|
||||
def tenant_location_configure(self, tenant_id: TenantId, config: dict[str, Any], **kwargs):
|
||||
if config["mode"].startswith("Attached") and "generation" not in config:
|
||||
config["generation"] = self.env.attachment_service.attach_hook_issue(tenant_id, self.id)
|
||||
config["generation"] = self.env.storage_controller.attach_hook_issue(tenant_id, self.id)
|
||||
|
||||
client = self.http_client()
|
||||
return client.tenant_location_conf(tenant_id, config, **kwargs)
|
||||
@@ -2474,14 +2447,14 @@ class NeonPageserver(PgProtocol):
|
||||
generation: Optional[int] = None,
|
||||
) -> TenantId:
|
||||
if generation is None:
|
||||
generation = self.env.attachment_service.attach_hook_issue(tenant_id, self.id)
|
||||
generation = self.env.storage_controller.attach_hook_issue(tenant_id, self.id)
|
||||
client = self.http_client(auth_token=auth_token)
|
||||
return client.tenant_create(tenant_id, conf, generation=generation)
|
||||
|
||||
def tenant_load(self, tenant_id: TenantId):
|
||||
client = self.http_client()
|
||||
return client.tenant_load(
|
||||
tenant_id, generation=self.env.attachment_service.attach_hook_issue(tenant_id, self.id)
|
||||
tenant_id, generation=self.env.storage_controller.attach_hook_issue(tenant_id, self.id)
|
||||
)
|
||||
|
||||
|
||||
@@ -2582,6 +2555,20 @@ class PgBin:
|
||||
)
|
||||
return base_path
|
||||
|
||||
def get_pg_controldata_checkpoint_lsn(self, pgdata: str) -> Lsn:
|
||||
"""
|
||||
Run pg_controldata on given datadir and extract checkpoint lsn.
|
||||
"""
|
||||
|
||||
pg_controldata_path = os.path.join(self.pg_bin_path, "pg_controldata")
|
||||
cmd = f"{pg_controldata_path} -D {pgdata}"
|
||||
result = subprocess.run(cmd, capture_output=True, text=True, shell=True)
|
||||
checkpoint_lsn = re.findall(
|
||||
"Latest checkpoint location:\\s+([0-9A-F]+/[0-9A-F]+)", result.stdout
|
||||
)[0]
|
||||
log.info(f"last checkpoint at {checkpoint_lsn}")
|
||||
return Lsn(checkpoint_lsn)
|
||||
|
||||
|
||||
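The helper above shells out to pg_controldata and extracts "Latest checkpoint location" from its output; later in this diff it is used to request a basebackup at the shutdown checkpoint. A usage sketch under the usual fixture wiring:

# after the endpoint has been stopped, read the shutdown checkpoint LSN
checkpoint_lsn = pg_bin.get_pg_controldata_checkpoint_lsn(endpoint.pg_data_dir_path())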
@pytest.fixture(scope="function")
|
||||
def pg_bin(test_output_dir: Path, pg_distrib_dir: Path, pg_version: PgVersion) -> PgBin:
|
||||
@@ -2878,6 +2865,7 @@ class NeonProxy(PgProtocol):
|
||||
self.auth_backend = auth_backend
|
||||
self.metric_collection_endpoint = metric_collection_endpoint
|
||||
self.metric_collection_interval = metric_collection_interval
|
||||
self.http_timeout_seconds = 15
|
||||
self._popen: Optional[subprocess.Popen[bytes]] = None
|
||||
|
||||
def start(self) -> NeonProxy:
|
||||
@@ -2916,6 +2904,7 @@ class NeonProxy(PgProtocol):
|
||||
*["--proxy", f"{self.host}:{self.proxy_port}"],
|
||||
*["--mgmt", f"{self.host}:{self.mgmt_port}"],
|
||||
*["--wss", f"{self.host}:{self.external_http_port}"],
|
||||
*["--sql-over-http-timeout", f"{self.http_timeout_seconds}s"],
|
||||
*["-c", str(crt_path)],
|
||||
*["-k", str(key_path)],
|
||||
*self.auth_backend.extra_args(),
|
||||
@@ -2956,6 +2945,8 @@ class NeonProxy(PgProtocol):
|
||||
password = quote(kwargs["password"])
|
||||
expected_code = kwargs.get("expected_code")
|
||||
|
||||
log.info(f"Executing http query: {query}")
|
||||
|
||||
connstr = f"postgresql://{user}:{password}@{self.domain}:{self.proxy_port}/postgres"
|
||||
response = requests.post(
|
||||
f"https://{self.domain}:{self.external_http_port}/sql",
|
||||
@@ -2978,6 +2969,8 @@ class NeonProxy(PgProtocol):
|
||||
password = kwargs["password"]
|
||||
expected_code = kwargs.get("expected_code")
|
||||
|
||||
log.info(f"Executing http2 query: {query}")
|
||||
|
||||
connstr = f"postgresql://{user}:{password}@{self.domain}:{self.proxy_port}/postgres"
|
||||
async with httpx.AsyncClient(
|
||||
http2=True, verify=str(self.test_output_dir / "proxy.crt")
|
||||
@@ -3600,220 +3593,6 @@ class Safekeeper:
|
||||
return segments
|
||||
|
||||
|
||||
# Walreceiver as returned by sk's timeline status endpoint.
|
||||
@dataclass
|
||||
class Walreceiver:
|
||||
conn_id: int
|
||||
state: str
|
||||
|
||||
|
||||
@dataclass
|
||||
class SafekeeperTimelineStatus:
|
||||
acceptor_epoch: int
|
||||
pg_version: int # Not exactly a PgVersion, safekeeper returns version as int, for example 150002 for 15.2
|
||||
flush_lsn: Lsn
|
||||
commit_lsn: Lsn
|
||||
timeline_start_lsn: Lsn
|
||||
backup_lsn: Lsn
|
||||
peer_horizon_lsn: Lsn
|
||||
remote_consistent_lsn: Lsn
|
||||
walreceivers: List[Walreceiver]
|
||||
|
||||
|
||||
@dataclass
|
||||
class SafekeeperMetrics:
|
||||
# These are metrics from Prometheus which uses float64 internally.
|
||||
# As a consequence, values may differ from real original int64s.
|
||||
flush_lsn_inexact: Dict[Tuple[TenantId, TimelineId], int] = field(default_factory=dict)
|
||||
commit_lsn_inexact: Dict[Tuple[TenantId, TimelineId], int] = field(default_factory=dict)
|
||||
|
||||
|
||||
class SafekeeperHttpClient(requests.Session):
|
||||
HTTPError = requests.HTTPError
|
||||
|
||||
def __init__(self, port: int, auth_token: Optional[str] = None, is_testing_enabled=False):
|
||||
super().__init__()
|
||||
self.port = port
|
||||
self.auth_token = auth_token
|
||||
self.is_testing_enabled = is_testing_enabled
|
||||
|
||||
if auth_token is not None:
|
||||
self.headers["Authorization"] = f"Bearer {auth_token}"
|
||||
|
||||
def check_status(self):
|
||||
self.get(f"http://localhost:{self.port}/v1/status").raise_for_status()
|
||||
|
||||
def is_testing_enabled_or_skip(self):
|
||||
if not self.is_testing_enabled:
|
||||
pytest.skip("safekeeper was built without 'testing' feature")
|
||||
|
||||
def configure_failpoints(self, config_strings: Tuple[str, str] | List[Tuple[str, str]]):
|
||||
self.is_testing_enabled_or_skip()
|
||||
|
||||
if isinstance(config_strings, tuple):
|
||||
pairs = [config_strings]
|
||||
else:
|
||||
pairs = config_strings
|
||||
|
||||
log.info(f"Requesting config failpoints: {repr(pairs)}")
|
||||
|
||||
res = self.put(
|
||||
f"http://localhost:{self.port}/v1/failpoints",
|
||||
json=[{"name": name, "actions": actions} for name, actions in pairs],
|
||||
)
|
||||
log.info(f"Got failpoints request response code {res.status_code}")
|
||||
res.raise_for_status()
|
||||
res_json = res.json()
|
||||
assert res_json is None
|
||||
return res_json
|
||||
|
||||
def debug_dump(self, params: Optional[Dict[str, str]] = None) -> Dict[str, Any]:
|
||||
params = params or {}
|
||||
res = self.get(f"http://localhost:{self.port}/v1/debug_dump", params=params)
|
||||
res.raise_for_status()
|
||||
res_json = json.loads(res.text)
|
||||
assert isinstance(res_json, dict)
|
||||
return res_json
|
||||
|
||||
def patch_control_file(
|
||||
self,
|
||||
tenant_id: TenantId,
|
||||
timeline_id: TimelineId,
|
||||
patch: Dict[str, Any],
|
||||
) -> Dict[str, Any]:
|
||||
res = self.patch(
|
||||
f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}/control_file",
|
||||
json={
|
||||
"updates": patch,
|
||||
"apply_fields": list(patch.keys()),
|
||||
},
|
||||
)
|
||||
res.raise_for_status()
|
||||
res_json = res.json()
|
||||
assert isinstance(res_json, dict)
|
||||
return res_json
|
||||
|
||||
def pull_timeline(self, body: Dict[str, Any]) -> Dict[str, Any]:
|
||||
res = self.post(f"http://localhost:{self.port}/v1/pull_timeline", json=body)
|
||||
res.raise_for_status()
|
||||
res_json = res.json()
|
||||
assert isinstance(res_json, dict)
|
||||
return res_json
|
||||
|
||||
def copy_timeline(self, tenant_id: TenantId, timeline_id: TimelineId, body: Dict[str, Any]):
|
||||
res = self.post(
|
||||
f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}/copy",
|
||||
json=body,
|
||||
)
|
||||
res.raise_for_status()
|
||||
|
||||
def timeline_digest(
|
||||
self, tenant_id: TenantId, timeline_id: TimelineId, from_lsn: Lsn, until_lsn: Lsn
|
||||
) -> Dict[str, Any]:
|
||||
res = self.get(
|
||||
f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}/digest",
|
||||
params={
|
||||
"from_lsn": str(from_lsn),
|
||||
"until_lsn": str(until_lsn),
|
||||
},
|
||||
)
|
||||
res.raise_for_status()
|
||||
res_json = res.json()
|
||||
assert isinstance(res_json, dict)
|
||||
return res_json
|
||||
|
||||
def timeline_create(
|
||||
self,
|
||||
tenant_id: TenantId,
|
||||
timeline_id: TimelineId,
|
||||
pg_version: int, # Not exactly a PgVersion, safekeeper returns version as int, for example 150002 for 15.2
|
||||
commit_lsn: Lsn,
|
||||
):
|
||||
body = {
|
||||
"tenant_id": str(tenant_id),
|
||||
"timeline_id": str(timeline_id),
|
||||
"pg_version": pg_version,
|
||||
"commit_lsn": str(commit_lsn),
|
||||
}
|
||||
res = self.post(f"http://localhost:{self.port}/v1/tenant/timeline", json=body)
|
||||
res.raise_for_status()
|
||||
|
||||
def timeline_status(
|
||||
self, tenant_id: TenantId, timeline_id: TimelineId
|
||||
) -> SafekeeperTimelineStatus:
|
||||
res = self.get(f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}")
|
||||
res.raise_for_status()
|
||||
resj = res.json()
|
||||
walreceivers = [Walreceiver(wr["conn_id"], wr["status"]) for wr in resj["walreceivers"]]
|
||||
return SafekeeperTimelineStatus(
|
||||
acceptor_epoch=resj["acceptor_state"]["epoch"],
|
||||
pg_version=resj["pg_info"]["pg_version"],
|
||||
flush_lsn=Lsn(resj["flush_lsn"]),
|
||||
commit_lsn=Lsn(resj["commit_lsn"]),
|
||||
timeline_start_lsn=Lsn(resj["timeline_start_lsn"]),
|
||||
backup_lsn=Lsn(resj["backup_lsn"]),
|
||||
peer_horizon_lsn=Lsn(resj["peer_horizon_lsn"]),
|
||||
remote_consistent_lsn=Lsn(resj["remote_consistent_lsn"]),
|
||||
walreceivers=walreceivers,
|
||||
)
|
||||
|
||||
def record_safekeeper_info(self, tenant_id: TenantId, timeline_id: TimelineId, body):
|
||||
res = self.post(
|
||||
f"http://localhost:{self.port}/v1/record_safekeeper_info/{tenant_id}/{timeline_id}",
|
||||
json=body,
|
||||
)
|
||||
res.raise_for_status()
|
||||
|
||||
# only_local doesn't remove segments in the remote storage.
|
||||
def timeline_delete(
|
||||
self, tenant_id: TenantId, timeline_id: TimelineId, only_local: bool = False
|
||||
) -> Dict[Any, Any]:
|
||||
res = self.delete(
|
||||
f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}",
|
||||
params={
|
||||
"only_local": str(only_local).lower(),
|
||||
},
|
||||
)
|
||||
res.raise_for_status()
|
||||
res_json = res.json()
|
||||
assert isinstance(res_json, dict)
|
||||
return res_json
|
||||
|
||||
def tenant_delete_force(self, tenant_id: TenantId) -> Dict[Any, Any]:
|
||||
res = self.delete(f"http://localhost:{self.port}/v1/tenant/{tenant_id}")
|
||||
res.raise_for_status()
|
||||
res_json = res.json()
|
||||
assert isinstance(res_json, dict)
|
||||
return res_json
|
||||
|
||||
def get_metrics_str(self) -> str:
|
||||
request_result = self.get(f"http://localhost:{self.port}/metrics")
|
||||
request_result.raise_for_status()
|
||||
return request_result.text
|
||||
|
||||
def get_metrics(self) -> SafekeeperMetrics:
|
||||
all_metrics_text = self.get_metrics_str()
|
||||
|
||||
metrics = SafekeeperMetrics()
|
||||
for match in re.finditer(
|
||||
r'^safekeeper_flush_lsn{tenant_id="([0-9a-f]+)",timeline_id="([0-9a-f]+)"} (\S+)$',
|
||||
all_metrics_text,
|
||||
re.MULTILINE,
|
||||
):
|
||||
metrics.flush_lsn_inexact[(TenantId(match.group(1)), TimelineId(match.group(2)))] = int(
|
||||
match.group(3)
|
||||
)
|
||||
for match in re.finditer(
|
||||
r'^safekeeper_commit_lsn{tenant_id="([0-9a-f]+)",timeline_id="([0-9a-f]+)"} (\S+)$',
|
||||
all_metrics_text,
|
||||
re.MULTILINE,
|
||||
):
|
||||
metrics.commit_lsn_inexact[
|
||||
(TenantId(match.group(1)), TimelineId(match.group(2)))
|
||||
] = int(match.group(3))
|
||||
return metrics
|
||||
|
||||
|
||||
class S3Scrubber:
|
||||
def __init__(self, env: NeonEnvBuilder, log_dir: Optional[Path] = None):
|
||||
self.env = env
|
||||
@@ -4123,32 +3902,29 @@ def list_files_to_compare(pgdata_dir: Path) -> List[str]:
|
||||
|
||||
# pg is the existing and running compute node, that we want to compare with a basebackup
|
||||
def check_restored_datadir_content(test_output_dir: Path, env: NeonEnv, endpoint: Endpoint):
|
||||
pg_bin = PgBin(test_output_dir, env.pg_distrib_dir, env.pg_version)
|
||||
|
||||
# Get the timeline ID. We need it for the 'basebackup' command
|
||||
timeline_id = TimelineId(endpoint.safe_psql("SHOW neon.timeline_id")[0][0])
|
||||
|
||||
# many tests already checkpoint, but do it just in case
|
||||
with closing(endpoint.connect()) as conn:
|
||||
with conn.cursor() as cur:
|
||||
cur.execute("CHECKPOINT")
|
||||
|
||||
# wait for pageserver to catch up
|
||||
wait_for_last_flush_lsn(env, endpoint, endpoint.tenant_id, timeline_id)
|
||||
# stop postgres to ensure that files won't change
|
||||
endpoint.stop()
|
||||
|
||||
# Read the shutdown checkpoint's LSN
|
||||
checkpoint_lsn = pg_bin.get_pg_controldata_checkpoint_lsn(endpoint.pg_data_dir_path())
|
||||
|
||||
# Take a basebackup from pageserver
|
||||
restored_dir_path = env.repo_dir / f"{endpoint.endpoint_id}_restored_datadir"
|
||||
restored_dir_path.mkdir(exist_ok=True)
|
||||
|
||||
pg_bin = PgBin(test_output_dir, env.pg_distrib_dir, env.pg_version)
|
||||
psql_path = os.path.join(pg_bin.pg_bin_path, "psql")
|
||||
|
||||
pageserver_id = env.attachment_service.locate(endpoint.tenant_id)[0]["node_id"]
|
||||
pageserver_id = env.storage_controller.locate(endpoint.tenant_id)[0]["node_id"]
|
||||
cmd = rf"""
|
||||
{psql_path} \
|
||||
--no-psqlrc \
|
||||
postgres://localhost:{env.get_pageserver(pageserver_id).service_port.pg} \
|
||||
-c 'basebackup {endpoint.tenant_id} {timeline_id}' \
|
||||
-c 'basebackup {endpoint.tenant_id} {timeline_id} {checkpoint_lsn}' \
|
||||
| tar -x -C {restored_dir_path}
|
||||
"""
|
||||
|
||||
@@ -4207,18 +3983,27 @@ def check_restored_datadir_content(test_output_dir: Path, env: NeonEnv, endpoint
|
||||
|
||||
def logical_replication_sync(subscriber: VanillaPostgres, publisher: Endpoint) -> Lsn:
|
||||
"""Wait logical replication subscriber to sync with publisher."""
|
||||
publisher_lsn = Lsn(publisher.safe_psql("SELECT pg_current_wal_flush_lsn()")[0][0])
|
||||
while True:
|
||||
|
||||
def is_synced(publisher_lsn):
|
||||
# Even if pg_stat_subscription.latest_end_lsn has caught up, some tables
# might not be synced yet, because the main apply keeps advancing while
# sync workers are still running.
|
||||
rels_synced = subscriber.safe_psql(
|
||||
"select count(*) = 0 from pg_subscription_rel where srsubstate != 'r'"
|
||||
)[0][0]
|
||||
log.info(f"number of not synced rels: {rels_synced}")
|
||||
assert rels_synced
|
||||
res = subscriber.safe_psql("select latest_end_lsn from pg_catalog.pg_stat_subscription")[0][
|
||||
0
|
||||
]
|
||||
if res:
|
||||
log.info(f"subscriber_lsn={res}")
|
||||
subscriber_lsn = Lsn(res)
|
||||
log.info(f"Subscriber LSN={subscriber_lsn}, publisher LSN={ publisher_lsn}")
|
||||
if subscriber_lsn >= publisher_lsn:
|
||||
return subscriber_lsn
|
||||
time.sleep(0.5)
|
||||
log.info(f"subscriber_lsn={res}")
|
||||
subscriber_lsn = Lsn(res)
|
||||
log.info(f"Subscriber LSN={subscriber_lsn}, publisher LSN={ publisher_lsn}")
|
||||
assert subscriber_lsn >= publisher_lsn
|
||||
|
||||
publisher_lsn = Lsn(publisher.safe_psql("SELECT pg_current_wal_flush_lsn()")[0][0])
|
||||
wait_until(30, 0.5, partial(is_synced, publisher_lsn))
|
||||
return publisher_lsn
|
||||
|
||||
|
||||
def tenant_get_shards(
|
||||
@@ -4230,7 +4015,7 @@ def tenant_get_shards(
|
||||
us to figure out the shards for a tenant.
|
||||
|
||||
If the caller provides `pageserver_id`, it will be used for all shards, even
|
||||
if the shard is indicated by attachment service to be on some other pageserver.
|
||||
if the shard is indicated by storage controller to be on some other pageserver.
|
||||
|
||||
Caller should iterate over the response to apply their per-pageserver action to
each shard.
|
||||
@@ -4246,7 +4031,7 @@ def tenant_get_shards(
|
||||
TenantShardId.parse(s["shard_id"]),
|
||||
override_pageserver or env.get_pageserver(s["node_id"]),
|
||||
)
|
||||
for s in env.attachment_service.locate(tenant_id)
|
||||
for s in env.storage_controller.locate(tenant_id)
|
||||
]
|
||||
else:
|
||||
# Assume an unsharded tenant
|
||||
@@ -4297,6 +4082,49 @@ def wait_for_last_flush_lsn(
|
||||
return min(results)
|
||||
|
||||
|
||||
def flush_ep_to_pageserver(
|
||||
env: NeonEnv,
|
||||
ep: Endpoint,
|
||||
tenant: TenantId,
|
||||
timeline: TimelineId,
|
||||
pageserver_id: Optional[int] = None,
|
||||
) -> Lsn:
|
||||
"""
|
||||
Stop endpoint and wait until all committed WAL reaches the pageserver
|
||||
(last_record_lsn). This is for use by tests which want everything written so
|
||||
far to reach pageserver *and* expect that no more data will arrive until
|
||||
endpoint starts again, so unlike wait_for_last_flush_lsn it polls
|
||||
safekeepers instead of compute to learn LSN.
|
||||
|
||||
Returns the catch up LSN.
|
||||
"""
|
||||
ep.stop()
|
||||
|
||||
commit_lsn: Lsn = Lsn(0)
|
||||
# In principle, in the absence of failures, polling a single sk would be enough.
|
||||
for sk in env.safekeepers:
|
||||
cli = sk.http_client()
|
||||
# wait until compute connections are gone
|
||||
wait_until(30, 0.5, partial(are_walreceivers_absent, cli, tenant, timeline))
|
||||
commit_lsn = max(cli.get_commit_lsn(tenant, timeline), commit_lsn)
|
||||
|
||||
# Note: depending on WAL filtering implementation, probably most shards
|
||||
# won't be able to reach commit_lsn (unless gaps are also ack'ed), so this
|
||||
# is broken in sharded case.
|
||||
shards = tenant_get_shards(env, tenant, pageserver_id)
|
||||
for tenant_shard_id, pageserver in shards:
|
||||
log.info(
|
||||
f"flush_ep_to_pageserver: waiting for {commit_lsn} on shard {tenant_shard_id} on pageserver {pageserver.id})"
|
||||
)
|
||||
waited = wait_for_last_record_lsn(
|
||||
pageserver.http_client(), tenant_shard_id, timeline, commit_lsn
|
||||
)
|
||||
|
||||
assert waited >= commit_lsn
|
||||
|
||||
return commit_lsn
|
||||
|
||||
|
||||
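flush_ep_to_pageserver is the stop-then-drain counterpart of wait_for_last_flush_lsn; a typical call from a test might look like this (tenant/timeline variables assumed to be in scope):

last_flush_lsn = flush_ep_to_pageserver(env, endpoint, tenant_id, timeline_id)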
def wait_for_wal_insert_lsn(
|
||||
env: NeonEnv,
|
||||
endpoint: Endpoint,
|
||||
|
||||
@@ -318,6 +318,13 @@ class PageserverHttpClient(requests.Session, MetricsGetter):
|
||||
assert isinstance(res_json["tenant_shards"], list)
|
||||
return res_json
|
||||
|
||||
def tenant_get_location(self, tenant_id: TenantShardId):
|
||||
res = self.get(
|
||||
f"http://localhost:{self.port}/v1/location_config/{tenant_id}",
|
||||
)
|
||||
self.verbose_error(res)
|
||||
return res.json()
|
||||
|
||||
def tenant_delete(self, tenant_id: Union[TenantId, TenantShardId]):
|
||||
res = self.delete(f"http://localhost:{self.port}/v1/tenant/{tenant_id}")
|
||||
self.verbose_error(res)
|
||||
|
||||
@@ -43,7 +43,7 @@ def single_timeline(
|
||||
log.info("detach template tenant form pageserver")
|
||||
env.pageserver.tenant_detach(template_tenant)
|
||||
env.pageserver.allowed_errors.append(
|
||||
# tenant detach causes this because the underlying attach-hook removes the tenant from attachment_service entirely
|
||||
# tenant detach causes this because the underlying attach-hook removes the tenant from storage controller entirely
|
||||
".*Dropped remote consistent LSN updates.*",
|
||||
)
|
||||
|
||||
|
||||
@@ -46,9 +46,12 @@ def pytest_generate_tests(metafunc: Metafunc):
|
||||
|
||||
metafunc.parametrize("pg_version", pg_versions, ids=map(lambda v: f"pg{v}", pg_versions))
|
||||
|
||||
# A hacky way to parametrize tests only for `pageserver_virtual_file_io_engine=tokio-epoll-uring`
|
||||
# And do not change test name for default `pageserver_virtual_file_io_engine=std-fs` to keep tests statistics
|
||||
if (io_engine := os.getenv("PAGESERVER_VIRTUAL_FILE_IO_ENGINE", "")) not in ("", "std-fs"):
|
||||
# A hacky way to parametrize tests only for `pageserver_virtual_file_io_engine=std-fs`
|
||||
# And do not change test name for default `pageserver_virtual_file_io_engine=tokio-epoll-uring` to keep tests statistics
|
||||
if (io_engine := os.getenv("PAGESERVER_VIRTUAL_FILE_IO_ENGINE", "")) not in (
|
||||
"",
|
||||
"tokio-epoll-uring",
|
||||
):
|
||||
metafunc.parametrize("pageserver_virtual_file_io_engine", [io_engine])
|
||||
|
||||
# For performance tests, parametrize also by platform
|
||||
|
||||
test_runner/fixtures/safekeeper/__init__.py (new file, 0 lines)
test_runner/fixtures/safekeeper/http.py (new file, 227 lines)
@@ -0,0 +1,227 @@
|
||||
import json
|
||||
import re
|
||||
from dataclasses import dataclass, field
|
||||
from typing import Any, Dict, List, Optional, Tuple, Union
|
||||
|
||||
import pytest
|
||||
import requests
|
||||
|
||||
from fixtures.log_helper import log
|
||||
from fixtures.types import Lsn, TenantId, TimelineId
|
||||
|
||||
|
||||
# Walreceiver as returned by sk's timeline status endpoint.
|
||||
@dataclass
|
||||
class Walreceiver:
|
||||
conn_id: int
|
||||
state: str
|
||||
|
||||
|
||||
@dataclass
|
||||
class SafekeeperTimelineStatus:
|
||||
acceptor_epoch: int
|
||||
pg_version: int # Not exactly a PgVersion, safekeeper returns version as int, for example 150002 for 15.2
|
||||
flush_lsn: Lsn
|
||||
commit_lsn: Lsn
|
||||
timeline_start_lsn: Lsn
|
||||
backup_lsn: Lsn
|
||||
peer_horizon_lsn: Lsn
|
||||
remote_consistent_lsn: Lsn
|
||||
walreceivers: List[Walreceiver]
|
||||
|
||||
|
||||
@dataclass
|
||||
class SafekeeperMetrics:
|
||||
# These are metrics from Prometheus which uses float64 internally.
|
||||
# As a consequence, values may differ from real original int64s.
|
||||
flush_lsn_inexact: Dict[Tuple[TenantId, TimelineId], int] = field(default_factory=dict)
|
||||
commit_lsn_inexact: Dict[Tuple[TenantId, TimelineId], int] = field(default_factory=dict)
|
||||
|
||||
|
||||
class SafekeeperHttpClient(requests.Session):
|
||||
HTTPError = requests.HTTPError
|
||||
|
||||
def __init__(self, port: int, auth_token: Optional[str] = None, is_testing_enabled=False):
|
||||
super().__init__()
|
||||
self.port = port
|
||||
self.auth_token = auth_token
|
||||
self.is_testing_enabled = is_testing_enabled
|
||||
|
||||
if auth_token is not None:
|
||||
self.headers["Authorization"] = f"Bearer {auth_token}"
|
||||
|
||||
def check_status(self):
|
||||
self.get(f"http://localhost:{self.port}/v1/status").raise_for_status()
|
||||
|
||||
def is_testing_enabled_or_skip(self):
|
||||
if not self.is_testing_enabled:
|
||||
pytest.skip("safekeeper was built without 'testing' feature")
|
||||
|
||||
def configure_failpoints(self, config_strings: Union[Tuple[str, str], List[Tuple[str, str]]]):
|
||||
self.is_testing_enabled_or_skip()
|
||||
|
||||
if isinstance(config_strings, tuple):
|
||||
pairs = [config_strings]
|
||||
else:
|
||||
pairs = config_strings
|
||||
|
||||
log.info(f"Requesting config failpoints: {repr(pairs)}")
|
||||
|
||||
res = self.put(
|
||||
f"http://localhost:{self.port}/v1/failpoints",
|
||||
json=[{"name": name, "actions": actions} for name, actions in pairs],
|
||||
)
|
||||
log.info(f"Got failpoints request response code {res.status_code}")
|
||||
res.raise_for_status()
|
||||
res_json = res.json()
|
||||
assert res_json is None
|
||||
return res_json
|
||||
|
||||
def debug_dump(self, params: Optional[Dict[str, str]] = None) -> Dict[str, Any]:
|
||||
params = params or {}
|
||||
res = self.get(f"http://localhost:{self.port}/v1/debug_dump", params=params)
|
||||
res.raise_for_status()
|
||||
res_json = json.loads(res.text)
|
||||
assert isinstance(res_json, dict)
|
||||
return res_json
|
||||
|
||||
def patch_control_file(
|
||||
self,
|
||||
tenant_id: TenantId,
|
||||
timeline_id: TimelineId,
|
||||
patch: Dict[str, Any],
|
||||
) -> Dict[str, Any]:
|
||||
res = self.patch(
|
||||
f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}/control_file",
|
||||
json={
|
||||
"updates": patch,
|
||||
"apply_fields": list(patch.keys()),
|
||||
},
|
||||
)
|
||||
res.raise_for_status()
|
||||
res_json = res.json()
|
||||
assert isinstance(res_json, dict)
|
||||
return res_json
|
||||
|
||||
def pull_timeline(self, body: Dict[str, Any]) -> Dict[str, Any]:
|
||||
res = self.post(f"http://localhost:{self.port}/v1/pull_timeline", json=body)
|
||||
res.raise_for_status()
|
||||
res_json = res.json()
|
||||
assert isinstance(res_json, dict)
|
||||
return res_json
|
||||
|
||||
def copy_timeline(self, tenant_id: TenantId, timeline_id: TimelineId, body: Dict[str, Any]):
|
||||
res = self.post(
|
||||
f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}/copy",
|
||||
json=body,
|
||||
)
|
||||
res.raise_for_status()
|
||||
|
||||
def timeline_digest(
|
||||
self, tenant_id: TenantId, timeline_id: TimelineId, from_lsn: Lsn, until_lsn: Lsn
|
||||
) -> Dict[str, Any]:
|
||||
res = self.get(
|
||||
f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}/digest",
|
||||
params={
|
||||
"from_lsn": str(from_lsn),
|
||||
"until_lsn": str(until_lsn),
|
||||
},
|
||||
)
|
||||
res.raise_for_status()
|
||||
res_json = res.json()
|
||||
assert isinstance(res_json, dict)
|
||||
return res_json

    def timeline_create(
        self,
        tenant_id: TenantId,
        timeline_id: TimelineId,
        pg_version: int,  # Not exactly a PgVersion, safekeeper returns version as int, for example 150002 for 15.2
        commit_lsn: Lsn,
    ):
        body = {
            "tenant_id": str(tenant_id),
            "timeline_id": str(timeline_id),
            "pg_version": pg_version,
            "commit_lsn": str(commit_lsn),
        }
        res = self.post(f"http://localhost:{self.port}/v1/tenant/timeline", json=body)
        res.raise_for_status()

    def timeline_status(
        self, tenant_id: TenantId, timeline_id: TimelineId
    ) -> SafekeeperTimelineStatus:
        res = self.get(f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}")
        res.raise_for_status()
        resj = res.json()
        walreceivers = [Walreceiver(wr["conn_id"], wr["status"]) for wr in resj["walreceivers"]]
        return SafekeeperTimelineStatus(
            acceptor_epoch=resj["acceptor_state"]["epoch"],
            pg_version=resj["pg_info"]["pg_version"],
            flush_lsn=Lsn(resj["flush_lsn"]),
            commit_lsn=Lsn(resj["commit_lsn"]),
            timeline_start_lsn=Lsn(resj["timeline_start_lsn"]),
            backup_lsn=Lsn(resj["backup_lsn"]),
            peer_horizon_lsn=Lsn(resj["peer_horizon_lsn"]),
            remote_consistent_lsn=Lsn(resj["remote_consistent_lsn"]),
            walreceivers=walreceivers,
        )

    def get_commit_lsn(self, tenant_id: TenantId, timeline_id: TimelineId) -> Lsn:
        return self.timeline_status(tenant_id, timeline_id).commit_lsn
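
Illustrative sketch (not part of this diff): get_commit_lsn makes it easy to assert that every safekeeper has committed WAL up to the same point; the sk.http_client() accessor on the Safekeeper fixture is assumed here.

def all_safekeepers_at_same_commit_lsn(env, tenant_id, timeline_id) -> bool:
    # Collect the commit LSN reported by each safekeeper's HTTP API and compare.
    commit_lsns = [
        sk.http_client().get_commit_lsn(tenant_id, timeline_id) for sk in env.safekeepers
    ]
    return min(commit_lsns) == max(commit_lsns)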

    def record_safekeeper_info(self, tenant_id: TenantId, timeline_id: TimelineId, body):
        res = self.post(
            f"http://localhost:{self.port}/v1/record_safekeeper_info/{tenant_id}/{timeline_id}",
            json=body,
        )
        res.raise_for_status()

    # only_local doesn't remove segments in the remote storage.
    def timeline_delete(
        self, tenant_id: TenantId, timeline_id: TimelineId, only_local: bool = False
    ) -> Dict[Any, Any]:
        res = self.delete(
            f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}",
            params={
                "only_local": str(only_local).lower(),
            },
        )
        res.raise_for_status()
        res_json = res.json()
        assert isinstance(res_json, dict)
        return res_json

    def tenant_delete_force(self, tenant_id: TenantId) -> Dict[Any, Any]:
        res = self.delete(f"http://localhost:{self.port}/v1/tenant/{tenant_id}")
        res.raise_for_status()
        res_json = res.json()
        assert isinstance(res_json, dict)
        return res_json

    def get_metrics_str(self) -> str:
        request_result = self.get(f"http://localhost:{self.port}/metrics")
        request_result.raise_for_status()
        return request_result.text

    def get_metrics(self) -> SafekeeperMetrics:
        all_metrics_text = self.get_metrics_str()

        metrics = SafekeeperMetrics()
        for match in re.finditer(
            r'^safekeeper_flush_lsn{tenant_id="([0-9a-f]+)",timeline_id="([0-9a-f]+)"} (\S+)$',
            all_metrics_text,
            re.MULTILINE,
        ):
            metrics.flush_lsn_inexact[(TenantId(match.group(1)), TimelineId(match.group(2)))] = int(
                match.group(3)
            )
        for match in re.finditer(
            r'^safekeeper_commit_lsn{tenant_id="([0-9a-f]+)",timeline_id="([0-9a-f]+)"} (\S+)$',
            all_metrics_text,
            re.MULTILINE,
        ):
            metrics.commit_lsn_inexact[
                (TenantId(match.group(1)), TimelineId(match.group(2)))
            ] = int(match.group(3))
        return metrics
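
For reference, the two regexes above match Prometheus exposition lines of the form safekeeper_flush_lsn{tenant_id="...",timeline_id="..."} <value>. A rough usage sketch (not part of this diff), treating the scraped value only as a lower bound, which is presumably why the fixture names the field flush_lsn_inexact:

def flush_lsn_reached(sk_http_cli, tenant_id, timeline_id, expected_lsn_int: int) -> bool:
    # flush_lsn_inexact stores the raw integer scraped from the metrics endpoint.
    metrics = sk_http_cli.get_metrics()
    return metrics.flush_lsn_inexact[(tenant_id, timeline_id)] >= expected_lsn_int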

test_runner/fixtures/safekeeper/utils.py (new file, 11 lines)
@@ -0,0 +1,11 @@
from fixtures.log_helper import log
from fixtures.safekeeper.http import SafekeeperHttpClient
from fixtures.types import TenantId, TimelineId


def are_walreceivers_absent(
    sk_http_cli: SafekeeperHttpClient, tenant_id: TenantId, timeline_id: TimelineId
):
    status = sk_http_cli.timeline_status(tenant_id, timeline_id)
    log.info(f"waiting for walreceivers to be gone, currently {status.walreceivers}")
    return len(status.walreceivers) == 0
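
A sketch of how a test might poll this predicate (not part of this diff). It assumes the wait_until retry helper from fixtures.utils, which re-runs a callable until it stops raising:

from fixtures.utils import wait_until


def wait_walreceivers_absent(
    sk_http_cli: SafekeeperHttpClient, tenant_id: TenantId, timeline_id: TimelineId
):
    def check():
        # Keep raising until all walreceivers have disconnected,
        # e.g. after the compute endpoint has been stopped.
        assert are_walreceivers_absent(sk_http_cli, tenant_id, timeline_id)

    wait_until(30, 1, check)
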
@@ -1,4 +1,3 @@
import threading
from typing import Optional

from fixtures.log_helper import log
@@ -12,10 +11,6 @@ from fixtures.neon_fixtures import (
from fixtures.pageserver.utils import wait_for_last_record_lsn, wait_for_upload
from fixtures.types import TenantId, TimelineId

# neon_local doesn't handle creating/modifying endpoints concurrently, so we use a mutex
# to ensure we don't do that: this enables running lots of Workloads in parallel safely.
ENDPOINT_LOCK = threading.Lock()


class Workload:
"""
@@ -46,30 +41,17 @@ class Workload:

self._endpoint: Optional[Endpoint] = None

def reconfigure(self):
"""
Request the endpoint to reconfigure based on location reported by storage controller
"""
if self._endpoint is not None:
with ENDPOINT_LOCK:
self._endpoint.reconfigure()

def endpoint(self, pageserver_id: Optional[int] = None) -> Endpoint:
# We may be running alongside other Workloads for different tenants. Full TTID is
# obnoxiously long for use here, but a cut-down version is still unique enough for tests.
endpoint_id = f"ep-workload-{str(self.tenant_id)[0:4]}-{str(self.timeline_id)[0:4]}"

with ENDPOINT_LOCK:
if self._endpoint is None:
self._endpoint = self.env.endpoints.create(
self.branch_name,
tenant_id=self.tenant_id,
pageserver_id=pageserver_id,
endpoint_id=endpoint_id,
)
self._endpoint.start(pageserver_id=pageserver_id)
else:
self._endpoint.reconfigure(pageserver_id=pageserver_id)
if self._endpoint is None:
self._endpoint = self.env.endpoints.create(
self.branch_name,
tenant_id=self.tenant_id,
pageserver_id=pageserver_id,
endpoint_id="ep-workload",
)
self._endpoint.start(pageserver_id=pageserver_id)
else:
self._endpoint.reconfigure(pageserver_id=pageserver_id)

connstring = self._endpoint.safe_psql(
"SELECT setting FROM pg_settings WHERE name='neon.pageserver_connstring'"
@@ -112,7 +94,7 @@ class Workload:
else:
return False

def churn_rows(self, n, pageserver_id: Optional[int] = None, upload=True, ingest=True):
def churn_rows(self, n, pageserver_id: Optional[int] = None, upload=True):
assert self.expect_rows >= n

max_iters = 10
@@ -150,28 +132,22 @@ class Workload:
]
)

if ingest:
# Wait for written data to be ingested by the pageserver
for tenant_shard_id, pageserver in tenant_get_shards(
self.env, self.tenant_id, pageserver_id
):
last_flush_lsn = wait_for_last_flush_lsn(
self.env,
endpoint,
self.tenant_id,
self.timeline_id,
pageserver_id=pageserver_id,
)
ps_http = pageserver.http_client()
wait_for_last_record_lsn(ps_http, tenant_shard_id, self.timeline_id, last_flush_lsn)
for tenant_shard_id, pageserver in tenant_get_shards(
self.env, self.tenant_id, pageserver_id
):
last_flush_lsn = wait_for_last_flush_lsn(
self.env, endpoint, self.tenant_id, self.timeline_id, pageserver_id=pageserver_id
)
ps_http = pageserver.http_client()
wait_for_last_record_lsn(ps_http, tenant_shard_id, self.timeline_id, last_flush_lsn)

if upload:
# Wait for written data to be uploaded to S3 (force a checkpoint to trigger upload)
ps_http.timeline_checkpoint(tenant_shard_id, self.timeline_id)
wait_for_upload(ps_http, tenant_shard_id, self.timeline_id, last_flush_lsn)
log.info(f"Churn: waiting for remote LSN {last_flush_lsn}")
else:
log.info(f"Churn: not waiting for upload, disk LSN {last_flush_lsn}")
if upload:
# force a checkpoint to trigger upload
ps_http.timeline_checkpoint(tenant_shard_id, self.timeline_id)
wait_for_upload(ps_http, tenant_shard_id, self.timeline_id, last_flush_lsn)
log.info(f"Churn: waiting for remote LSN {last_flush_lsn}")
else:
log.info(f"Churn: not waiting for upload, disk LSN {last_flush_lsn}")

def validate(self, pageserver_id: Optional[int] = None):
endpoint = self.endpoint(pageserver_id)
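
Illustrative only: a typical caller drives the fixture roughly as below, churning rows and validating against whichever pageserver currently holds the attachment. The Workload constructor, init() and write_rows() are assumed from the parts of the fixture not shown in this hunk.

workload = Workload(env, tenant_id, timeline_id)
workload.init()
workload.write_rows(256)
for pageserver in env.pageservers:
    # churn_rows waits for ingest (and optionally S3 upload); validate reads the rows back
    workload.churn_rows(128, pageserver.id)
    workload.validate(pageserver.id)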

@@ -56,7 +56,7 @@ def setup_env(
template_tenant, template_timeline = env.neon_cli.create_tenant(set_default=True)
env.pageserver.tenant_detach(template_tenant)
env.pageserver.allowed_errors.append(
# tenant detach causes this because the underlying attach-hook removes the tenant from attachment_service entirely
# tenant detach causes this because the underlying attach-hook removes the tenant from storage controller entirely
".*Dropped remote consistent LSN updates.*",
)
env.pageserver.tenant_attach(template_tenant, config)

@@ -92,7 +92,7 @@ def setup_tenant_template(env: NeonEnv, n_txns: int):
template_tenant, template_timeline = env.neon_cli.create_tenant(set_default=True)
env.pageserver.tenant_detach(template_tenant)
env.pageserver.allowed_errors.append(
# tenant detach causes this because the underlying attach-hook removes the tenant from attachment_service entirely
# tenant detach causes this because the underlying attach-hook removes the tenant from storage controller entirely
".*Dropped remote consistent LSN updates.*",
)
env.pageserver.tenant_attach(template_tenant, config)

@@ -114,7 +114,7 @@ def setup_tenant_template(env: NeonEnv, pg_bin: PgBin, scale: int):
template_tenant, template_timeline = env.neon_cli.create_tenant(set_default=True)
env.pageserver.tenant_detach(template_tenant)
env.pageserver.allowed_errors.append(
# tenant detach causes this because the underlying attach-hook removes the tenant from attachment_service entirely
# tenant detach causes this because the underlying attach-hook removes the tenant from storage controller entirely
".*Dropped remote consistent LSN updates.*",
)
env.pageserver.tenant_attach(template_tenant, config)

@@ -56,12 +56,12 @@ def measure_recovery_time(env: NeonCompare):
# Delete the Tenant in the pageserver: this will drop local and remote layers, such that
# when we "create" the Tenant again, we will replay the WAL from the beginning.
#
# This is a "weird" thing to do, and can confuse the attachment service as we're re-using
# This is a "weird" thing to do, and can confuse the storage controller as we're re-using
# the same tenant ID for a tenant that is logically different from the pageserver's point
# of view, but the same as far as the safekeeper/WAL is concerned. To work around that,
# we will explicitly create the tenant in the same generation that it was previously
# attached in.
attach_status = env.env.attachment_service.inspect(tenant_shard_id=env.tenant)
attach_status = env.env.storage_controller.inspect(tenant_shard_id=env.tenant)
assert attach_status is not None
(attach_gen, _) = attach_status


@@ -137,7 +137,7 @@ def test_no_config(positive_env: NeonEnv, content_type: Optional[str]):
ps_http.tenant_detach(tenant_id)
assert tenant_id not in [TenantId(t["id"]) for t in ps_http.tenant_list()]

body = {"generation": env.attachment_service.attach_hook_issue(tenant_id, env.pageserver.id)}
body = {"generation": env.storage_controller.attach_hook_issue(tenant_id, env.pageserver.id)}

ps_http.post(
f"{ps_http.base_url}/v1/tenant/{tenant_id}/attach",

@@ -85,9 +85,9 @@ def test_change_pageserver(neon_env_builder: NeonEnvBuilder):
# the endpoint. Whereas the previous reconfiguration was like a healthy migration, this
# is more like what happens in an unexpected pageserver failure.
#
# Since we're dual-attached, need to tip-off attachment service to treat the one we're
# Since we're dual-attached, need to tip-off storage controller to treat the one we're
# about to start as the attached pageserver
env.attachment_service.attach_hook_issue(env.initial_tenant, env.pageservers[0].id)
env.storage_controller.attach_hook_issue(env.initial_tenant, env.pageservers[0].id)
env.pageservers[0].start()
env.pageservers[1].stop()

@@ -97,9 +97,9 @@ def test_change_pageserver(neon_env_builder: NeonEnvBuilder):
assert fetchone() == (100000,)

env.pageservers[0].stop()
# Since we're dual-attached, need to tip-off attachment service to treat the one we're
# Since we're dual-attached, need to tip-off storage controller to treat the one we're
# about to start as the attached pageserver
env.attachment_service.attach_hook_issue(env.initial_tenant, env.pageservers[1].id)
env.storage_controller.attach_hook_issue(env.initial_tenant, env.pageservers[1].id)
env.pageservers[1].start()

# Test a (former) bug where a child process spins without updating its connection string

@@ -133,7 +133,7 @@ def test_create_snapshot(
for sk in env.safekeepers:
sk.stop()
env.pageserver.stop()
env.attachment_service.stop()
env.storage_controller.stop()

# Directory `compatibility_snapshot_dir` is uploaded to S3 in a workflow, keep the name in sync with it
compatibility_snapshot_dir = (
@@ -242,7 +242,7 @@ def test_forward_compatibility(
# everything else: our test code is written for latest CLI args.
env.neon_local_binpath = neon_local_binpath

neon_env_builder.start()
neon_env_builder.start(register_pageservers=True)

check_neon_works(
env,

@@ -4,12 +4,11 @@ import pytest
from fixtures.log_helper import log
from fixtures.neon_fixtures import (
NeonEnvBuilder,
flush_ep_to_pageserver,
wait_for_last_flush_lsn,
)
from fixtures.pageserver.utils import wait_for_last_record_lsn, wait_for_upload
from fixtures.pageserver.utils import wait_for_upload
from fixtures.remote_storage import RemoteStorageKind
from fixtures.types import Lsn
from fixtures.utils import query_scalar


# Creates a few layers, ensures that we can evict them (removing locally but keeping track of them anyway)
@@ -46,14 +45,15 @@ def test_basic_eviction(
FROM generate_series(1, 5000000) g
"""
)
current_lsn = Lsn(query_scalar(cur, "SELECT pg_current_wal_flush_lsn()"))

wait_for_last_record_lsn(client, tenant_id, timeline_id, current_lsn)
# stops the endpoint
current_lsn = flush_ep_to_pageserver(env, endpoint, tenant_id, timeline_id)

client.timeline_checkpoint(tenant_id, timeline_id)
wait_for_upload(client, tenant_id, timeline_id, current_lsn)

# disable compute & sks to avoid on-demand downloads by walreceiver / getpage
endpoint.stop()
# stop sks to avoid on-demand downloads by walreceiver / getpage; endpoint
# has already been stopped by flush_ep_to_pageserver
for sk in env.safekeepers:
sk.stop()


@@ -1,7 +1,7 @@
import time

from fixtures.log_helper import log
from fixtures.neon_fixtures import NeonEnvBuilder
from fixtures.neon_fixtures import NeonEnvBuilder, flush_ep_to_pageserver
from fixtures.pageserver.types import (
DeltaLayerFileName,
ImageLayerFileName,
@@ -115,8 +115,7 @@ def test_issue_5878(neon_env_builder: NeonEnvBuilder):
)
== 0
)

endpoint.stop()
last_record_lsn = flush_ep_to_pageserver(env, endpoint, tenant_id, timeline_id)

wait_for_upload_queue_empty(ps_http, tenant_id, timeline_id)

@@ -160,7 +159,7 @@ def test_issue_5878(neon_env_builder: NeonEnvBuilder):
time.sleep(1.1) # so that we can use change in pre_stat.st_mtime to detect overwrites

def get_generation_number():
attachment = env.attachment_service.inspect(tenant_id)
attachment = env.storage_controller.inspect(tenant_id)
assert attachment is not None
return attachment[0]


@@ -15,7 +15,7 @@ def test_migrations(neon_simple_env: NeonEnv):

endpoint.wait_for_migrations()

num_migrations = 8
num_migrations = 9

with endpoint.cursor() as cur:
cur.execute("SELECT id FROM neon_migration.migration_id")

@@ -133,7 +133,7 @@ def test_cli_start_stop(neon_env_builder: NeonEnvBuilder):
# Stop default ps/sk
env.neon_cli.pageserver_stop(env.pageserver.id)
env.neon_cli.safekeeper_stop()
env.neon_cli.attachment_service_stop(False)
env.neon_cli.storage_controller_stop(False)

# Keep NeonEnv state up to date, it usually owns starting/stopping services
env.pageserver.running = False
@@ -175,7 +175,7 @@ def test_cli_start_stop_multi(neon_env_builder: NeonEnvBuilder):
env.neon_cli.safekeeper_stop(neon_env_builder.safekeepers_id_start + 2)

# Stop this to get out of the way of the following `start`
env.neon_cli.attachment_service_stop(False)
env.neon_cli.storage_controller_stop(False)

# Default start
res = env.neon_cli.raw_cli(["start"])

@@ -29,3 +29,34 @@ def test_neon_extension(neon_env_builder: NeonEnvBuilder):
log.info(res)
assert len(res) == 1
assert len(res[0]) == 5


# Verify that the neon extension can be upgraded/downgraded.
def test_neon_extension_compatibility(neon_env_builder: NeonEnvBuilder):
env = neon_env_builder.init_start()
env.neon_cli.create_branch("test_neon_extension_compatibility")

endpoint_main = env.endpoints.create("test_neon_extension_compatibility")
# don't skip pg_catalog updates - it runs CREATE EXTENSION neon
endpoint_main.respec(skip_pg_catalog_updates=False)
endpoint_main.start()

with closing(endpoint_main.connect()) as conn:
with conn.cursor() as cur:
all_versions = ["1.3", "1.2", "1.1", "1.0"]
current_version = "1.3"
for idx, begin_version in enumerate(all_versions):
for target_version in all_versions[idx + 1 :]:
if current_version != begin_version:
cur.execute(
f"ALTER EXTENSION neon UPDATE TO '{begin_version}'; -- {current_version}->{begin_version}"
)
current_version = begin_version
# downgrade
cur.execute(
f"ALTER EXTENSION neon UPDATE TO '{target_version}'; -- {begin_version}->{target_version}"
)
# upgrade
cur.execute(
f"ALTER EXTENSION neon UPDATE TO '{begin_version}'; -- {target_version}->{begin_version}"
)
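
For clarity, the nested loop above visits every (newer, older) version pair, downgrading and then upgrading back. Enumerated explicitly:

all_versions = ["1.3", "1.2", "1.1", "1.0"]
pairs = [
    (begin, target)
    for idx, begin in enumerate(all_versions)
    for target in all_versions[idx + 1 :]
]
# pairs == [("1.3", "1.2"), ("1.3", "1.1"), ("1.3", "1.0"),
#           ("1.2", "1.1"), ("1.2", "1.0"), ("1.1", "1.0")]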

@@ -8,6 +8,7 @@ from typing import Any, DefaultDict, Dict, Tuple
from fixtures.log_helper import log
from fixtures.neon_fixtures import (
NeonEnvBuilder,
flush_ep_to_pageserver,
last_flush_lsn_upload,
wait_for_last_flush_lsn,
)
@@ -517,7 +518,7 @@ def test_compaction_downloads_on_demand_without_image_creation(neon_env_builder:

with endpoint.cursor() as cur:
cur.execute("update a set id = -id")
wait_for_last_flush_lsn(env, endpoint, tenant_id, timeline_id)
flush_ep_to_pageserver(env, endpoint, tenant_id, timeline_id)
pageserver_http.timeline_checkpoint(tenant_id, timeline_id)

layers = pageserver_http.layer_map_info(tenant_id, timeline_id)

@@ -73,7 +73,7 @@ def check_client(env: NeonEnv, client: PageserverHttpClient):
# create new tenant and check it is also there
tenant_id = TenantId.generate()
client.tenant_create(
tenant_id, generation=env.attachment_service.attach_hook_issue(tenant_id, env.pageserver.id)
tenant_id, generation=env.storage_controller.attach_hook_issue(tenant_id, env.pageserver.id)
)
assert tenant_id in {TenantId(t["id"]) for t in client.tenant_list()}


@@ -203,7 +203,10 @@ def test_generations_upgrade(neon_env_builder: NeonEnvBuilder):
env.broker.try_start()
for sk in env.safekeepers:
sk.start()
env.attachment_service.start()
env.storage_controller.start()

# We will start a pageserver with no control_plane_api set, so it won't be able to self-register
env.storage_controller.node_register(env.pageserver)

env.pageserver.start(overrides=('--pageserver-config-override=control_plane_api=""',))

@@ -285,7 +288,7 @@ def test_deferred_deletion(neon_env_builder: NeonEnvBuilder):
neon_env_builder.num_pageservers = 2
env = neon_env_builder.init_start(initial_tenant_conf=TENANT_CONF)

attached_to_id = env.attachment_service.locate(env.initial_tenant)[0]["node_id"]
attached_to_id = env.storage_controller.locate(env.initial_tenant)[0]["node_id"]
main_pageserver = env.get_pageserver(attached_to_id)
other_pageserver = [p for p in env.pageservers if p.id != attached_to_id][0]

@@ -310,7 +313,7 @@ def test_deferred_deletion(neon_env_builder: NeonEnvBuilder):

# Now advance the generation in the control plane: subsequent validations
# from the running pageserver will fail. No more deletions should happen.
env.attachment_service.attach_hook_issue(env.initial_tenant, other_pageserver.id)
env.storage_controller.attach_hook_issue(env.initial_tenant, other_pageserver.id)
generate_uploads_and_deletions(env, init=False, pageserver=main_pageserver)

assert_deletion_queue(ps_http, lambda n: n > 0)
@@ -366,7 +369,7 @@ def test_deletion_queue_recovery(
neon_env_builder.num_pageservers = 2
env = neon_env_builder.init_start(initial_tenant_conf=TENANT_CONF)

attached_to_id = env.attachment_service.locate(env.initial_tenant)[0]["node_id"]
attached_to_id = env.storage_controller.locate(env.initial_tenant)[0]["node_id"]
main_pageserver = env.get_pageserver(attached_to_id)
other_pageserver = [p for p in env.pageservers if p.id != attached_to_id][0]

@@ -428,7 +431,7 @@ def test_deletion_queue_recovery(

if keep_attachment == KeepAttachment.LOSE:
some_other_pageserver = other_pageserver.id
env.attachment_service.attach_hook_issue(env.initial_tenant, some_other_pageserver)
env.storage_controller.attach_hook_issue(env.initial_tenant, some_other_pageserver)

main_pageserver.start()

@@ -494,7 +497,7 @@ def test_emergency_mode(neon_env_builder: NeonEnvBuilder, pg_bin: PgBin):
)

# Simulate a major incident: the control plane goes offline
env.attachment_service.stop()
env.storage_controller.stop()

# Remember how many validations had happened before the control plane went offline
validated = get_deletion_queue_validated(ps_http)
@@ -511,7 +514,6 @@ def test_emergency_mode(neon_env_builder: NeonEnvBuilder, pg_bin: PgBin):
env.pageserver.stop() # Non-immediate: implicitly checking that shutdown doesn't hang waiting for CP
env.pageserver.start(
overrides=("--pageserver-config-override=control_plane_emergency_mode=true",),
register=False,
)

# The pageserver should provide service to clients
@@ -525,7 +527,7 @@ def test_emergency_mode(neon_env_builder: NeonEnvBuilder, pg_bin: PgBin):
assert get_deletion_queue_executed(ps_http) == 0

# When the control plane comes back up, normal service should resume
env.attachment_service.start()
env.storage_controller.start()

ps_http.deletion_queue_flush(execute=True)
assert get_deletion_queue_depth(ps_http) == 0

@@ -157,7 +157,7 @@ def test_location_conf_churn(neon_env_builder: NeonEnvBuilder, seed: int):
workload.churn_rows(rng.randint(128, 256), pageserver.id)
workload.validate(pageserver.id)
elif last_state_ps[0].startswith("Attached"):
# The `attachment_service` will only re-attach on startup when a pageserver was the
# The `storage_controller` will only re-attach on startup when a pageserver was the
# holder of the latest generation: otherwise the pageserver will revert to detached
# state if it was running attached with a stale generation
last_state[pageserver.id] = ("Detached", None)
@@ -182,12 +182,12 @@ def test_location_conf_churn(neon_env_builder: NeonEnvBuilder, seed: int):
generation = last_state_ps[1]
else:
# Switch generations, while also jumping between attached states
generation = env.attachment_service.attach_hook_issue(
generation = env.storage_controller.attach_hook_issue(
tenant_id, pageserver.id
)
latest_attached = pageserver.id
else:
generation = env.attachment_service.attach_hook_issue(tenant_id, pageserver.id)
generation = env.storage_controller.attach_hook_issue(tenant_id, pageserver.id)
latest_attached = pageserver.id
else:
generation = None
@@ -273,7 +273,7 @@ def test_live_migration(neon_env_builder: NeonEnvBuilder):
# Encourage the new location to download while still in secondary mode
pageserver_b.http_client().tenant_secondary_download(tenant_id)

migrated_generation = env.attachment_service.attach_hook_issue(tenant_id, pageserver_b.id)
migrated_generation = env.storage_controller.attach_hook_issue(tenant_id, pageserver_b.id)
log.info(f"Acquired generation {migrated_generation} for destination pageserver")
assert migrated_generation == initial_generation + 1

@@ -436,7 +436,7 @@ def test_secondary_downloads(neon_env_builder: NeonEnvBuilder):
remote_storage_kind=RemoteStorageKind.MOCK_S3,
)
env = neon_env_builder.init_start(initial_tenant_conf=TENANT_CONF)
assert env.attachment_service is not None
assert env.storage_controller is not None
assert isinstance(env.pageserver_remote_storage, S3Storage) # Satisfy linter

tenant_id = env.initial_tenant

@@ -564,3 +564,35 @@ async def test_sql_over_http2(static_proxy: NeonProxy):
"select 42 as answer", [], user="http", password="http", expected_code=200
)
assert resp["rows"] == [{"answer": 42}]


def test_sql_over_http_timeout_cancel(static_proxy: NeonProxy):
static_proxy.safe_psql("create role http with login password 'http' superuser")

static_proxy.safe_psql("create table test_table ( id int primary key )")

# insert into a table, with a unique constraint, after sleeping for n seconds
query = "WITH temp AS ( \
SELECT pg_sleep($1) as sleep, $2::int as id \
) INSERT INTO test_table (id) SELECT id FROM temp"

# expect to fail with timeout
res = static_proxy.http_query(
query,
[static_proxy.http_timeout_seconds + 1, 1],
user="http",
password="http",
expected_code=400,
)
assert "Query cancelled, runtime exceeded" in res["message"], "HTTP query should time out"

time.sleep(2)

res = static_proxy.http_query(query, [1, 1], user="http", password="http", expected_code=200)
assert res["command"] == "INSERT", "HTTP query should insert"
assert res["rowCount"] == 1, "HTTP query should insert"

res = static_proxy.http_query(query, [0, 1], user="http", password="http", expected_code=400)
assert (
"duplicate key value violates unique constraint" in res["message"]
), "HTTP query should conflict"

@@ -169,7 +169,7 @@ def test_remote_storage_backup_and_restore(
# Ensure that even though the tenant is broken, retrying the attachment fails
with pytest.raises(Exception, match="Tenant state is Broken"):
# Use same generation as in previous attempt
gen_state = env.attachment_service.inspect(tenant_id)
gen_state = env.storage_controller.inspect(tenant_id)
assert gen_state is not None
generation = gen_state[0]
env.pageserver.tenant_attach(tenant_id, generation=generation)
@@ -355,7 +355,7 @@ def test_remote_storage_upload_queue_retries(
env.pageserver.stop(immediate=True)
env.endpoints.stop_all()

# We are about to forcibly drop local dirs. Attachment service will increment generation in re-attach before
# We are about to forcibly drop local dirs. Storage controller will increment generation in re-attach before
# we later increment when actually attaching it again, leading to skipping a generation and potentially getting
# these warnings if there was a durable but un-executed deletion list at time of restart.
env.pageserver.allowed_errors.extend(

@@ -80,7 +80,7 @@ def test_tenant_s3_restore(
assert (
ps_http.get_metric_value("pageserver_tenant_manager_slots") == 0
), "tenant removed before deletion was issued"
env.attachment_service.attach_hook_drop(tenant_id)
env.storage_controller.attach_hook_drop(tenant_id)

tenant_path = env.pageserver.tenant_dir(tenant_id)
assert not tenant_path.exists()
@@ -103,7 +103,7 @@ def test_tenant_s3_restore(
tenant_id, timestamp=ts_before_deletion, done_if_after=ts_after_deletion
)

generation = env.attachment_service.attach_hook_issue(tenant_id, env.pageserver.id)
generation = env.storage_controller.attach_hook_issue(tenant_id, env.pageserver.id)

ps_http.tenant_attach(tenant_id, generation=generation)
env.pageserver.quiesce_tenants()

Some files were not shown because too many files have changed in this diff.