Mirror of https://github.com/neondatabase/neon.git

Compare commits: release-pr...jcsp/attac (21 commits)
Commits:
e22c5e7a9f, 71ff404e6c, e9f7510abf, 3e214f91de, 9beceff829, 2aeb3d49bb, a5af60269d,
2c0f9b65a8, f15291b606, 7d47363568, 00060a5e8f, c570c78816, 150c3c79a5, 79094c20bd,
73de3568d6, 06b8131668, b16960b3a6, 50997249dd, 914a06ada7, 6ff6242827, 48e373c147
Cargo.lock (generated, 22 lines changed)
@@ -256,6 +256,28 @@ dependencies = [
 "critical-section",
]

[[package]]
name = "attachment_service"
version = "0.1.0"
dependencies = [
 "anyhow",
 "clap",
 "control_plane",
 "futures",
 "hyper",
 "pageserver_api",
 "postgres_connection",
 "reqwest",
 "serde",
 "serde_json",
 "thiserror",
 "tokio",
 "tokio-util",
 "tracing",
 "utils",
 "workspace_hack",
]

[[package]]
name = "autocfg"
version = "1.1.0"

Cargo.toml (workspace members, 1 line added)
@@ -3,6 +3,7 @@ resolver = "2"
members = [
    "compute_tools",
    "control_plane",
    "control_plane/attachment_service",
    "pageserver",
    "pageserver/ctl",
    "pageserver/client",
control_plane/attachment_service/Cargo.toml (new file, 25 lines)
@@ -0,0 +1,25 @@
[package]
name = "attachment_service"
version = "0.1.0"
edition.workspace = true
license.workspace = true

[dependencies]
anyhow.workspace = true
clap.workspace = true
futures.workspace = true
hyper.workspace = true
pageserver_api.workspace = true
postgres_connection.workspace = true
reqwest.workspace = true
serde.workspace = true
serde_json.workspace = true
thiserror.workspace = true
tokio.workspace = true
tokio-util.workspace = true
tracing.workspace = true

utils = { path = "../../libs/utils/" }
control_plane = { path = ".." }
workspace_hack = { version = "0.1", path = "../../workspace_hack" }
control_plane/attachment_service/src/compute_hook.rs (new file, 108 lines)
@@ -0,0 +1,108 @@
use std::collections::HashMap;

use control_plane::endpoint::ComputeControlPlane;
use control_plane::local_env::LocalEnv;
use pageserver_api::shard::{ShardIndex, TenantShardId};
use postgres_connection::parse_host_port;
use utils::id::{NodeId, TenantId};

pub(super) struct ComputeHookTenant {
    shards: Vec<(ShardIndex, NodeId)>,
}

impl ComputeHookTenant {
    pub(super) async fn maybe_reconfigure(&mut self, tenant_id: TenantId) -> anyhow::Result<()> {
        // Find the highest shard count and drop any shards that aren't
        // for that shard count.
        let shard_count = self.shards.iter().map(|(k, _v)| k.shard_count).max();
        let Some(shard_count) = shard_count else {
            // No shards, nothing to do.
            tracing::info!("ComputeHookTenant::maybe_reconfigure: no shards");
            return Ok(());
        };

        self.shards.retain(|(k, _v)| k.shard_count == shard_count);
        self.shards
            .sort_by_key(|(shard, _node_id)| shard.shard_number);

        if self.shards.len() == shard_count.0 as usize {
            // We have pageservers for all the shards: proceed to reconfigure compute
            let env = LocalEnv::load_config().expect("Error loading config");
            let cplane = ComputeControlPlane::load(env.clone())
                .expect("Error loading compute control plane");

            let compute_pageservers = self
                .shards
                .iter()
                .map(|(_shard, node_id)| {
                    let ps_conf = env
                        .get_pageserver_conf(*node_id)
                        .expect("Unknown pageserver");
                    let (pg_host, pg_port) = parse_host_port(&ps_conf.listen_pg_addr)
                        .expect("Unable to parse listen_pg_addr");
                    (pg_host, pg_port.unwrap_or(5432))
                })
                .collect::<Vec<_>>();

            for (endpoint_name, endpoint) in &cplane.endpoints {
                if endpoint.tenant_id == tenant_id && endpoint.status() == "running" {
                    tracing::info!("🔁 Reconfiguring endpoint {}", endpoint_name,);
                    endpoint.reconfigure(compute_pageservers.clone()).await?;
                }
            }
        } else {
            tracing::info!(
                "ComputeHookTenant::maybe_reconfigure: not enough shards ({}/{})",
                self.shards.len(),
                shard_count.0
            );
        }

        Ok(())
    }
}

/// The compute hook is a destination for notifications about changes to tenant:pageserver
/// mapping. It aggregates updates for the shards in a tenant, and when appropriate reconfigures
/// the compute connection string.
pub(super) struct ComputeHook {
    state: tokio::sync::Mutex<HashMap<TenantId, ComputeHookTenant>>,
}

impl ComputeHook {
    pub(super) fn new() -> Self {
        Self {
            state: Default::default(),
        }
    }

    pub(super) async fn notify(
        &self,
        tenant_shard_id: TenantShardId,
        node_id: NodeId,
    ) -> anyhow::Result<()> {
        tracing::info!("ComputeHook::notify: {}->{}", tenant_shard_id, node_id);
        let mut locked = self.state.lock().await;
        let entry = locked
            .entry(tenant_shard_id.tenant_id)
            .or_insert_with(|| ComputeHookTenant { shards: Vec::new() });

        let shard_index = ShardIndex {
            shard_count: tenant_shard_id.shard_count,
            shard_number: tenant_shard_id.shard_number,
        };

        let mut set = false;
        for (existing_shard, existing_node) in &mut entry.shards {
            if *existing_shard == shard_index {
                *existing_node = node_id;
                set = true;
            }
        }
        if !set {
            entry.shards.push((shard_index, node_id));
        }

        entry.maybe_reconfigure(tenant_shard_id.tenant_id).await
    }
}
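The doc comment above describes the aggregation rule: keep only the shards belonging to the highest shard count seen so far, and only reconfigure compute once a pageserver is known for every one of those shards. A minimal standalone sketch of that rule, using plain tuples in place of ShardIndex and NodeId (names and types here are illustrative, not part of the change):

// Sketch of the filtering rule in ComputeHookTenant::maybe_reconfigure.
// (u8, u8) stands in for ShardIndex { shard_number, shard_count }; u32 for NodeId.
fn ready_to_reconfigure(shards: &mut Vec<((u8, u8), u32)>) -> bool {
    // Highest shard count wins; stale entries from before a split are dropped.
    let Some(max_count) = shards.iter().map(|((_n, c), _ps)| *c).max() else {
        return false; // no shards known yet
    };
    shards.retain(|((_n, c), _ps)| *c == max_count);
    shards.sort_by_key(|((n, _c), _ps)| *n);
    // Only notify compute once every shard has a known pageserver.
    shards.len() == max_count as usize
}

fn main() {
    let mut shards = vec![((0, 2), 1), ((1, 2), 2), ((0, 1), 1)];
    assert!(ready_to_reconfigure(&mut shards)); // the old shard_count=1 entry is dropped
}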
control_plane/attachment_service/src/http.rs (new file, 195 lines)
@@ -0,0 +1,195 @@
use crate::reconciler::ReconcileError;
use crate::service::Service;
use hyper::StatusCode;
use hyper::{Body, Request, Response};
use pageserver_api::models::{TenantCreateRequest, TenantShardSplitRequest, TimelineCreateRequest};
use pageserver_api::shard::TenantShardId;
use std::sync::Arc;
use utils::http::endpoint::request_span;
use utils::http::request::parse_request_param;
use utils::id::TenantId;

use utils::{
    http::{
        endpoint::{self},
        error::ApiError,
        json::{json_request, json_response},
        RequestExt, RouterBuilder,
    },
    id::NodeId,
};

use pageserver_api::control_api::{ReAttachRequest, ValidateRequest};

use control_plane::attachment_service::{
    AttachHookRequest, InspectRequest, NodeConfigureRequest, NodeRegisterRequest,
    TenantShardMigrateRequest,
};

/// State available to HTTP request handlers
#[derive(Clone)]
pub struct HttpState {
    service: Arc<crate::service::Service>,
}

impl HttpState {
    pub fn new(service: Arc<crate::service::Service>) -> Self {
        Self { service }
    }
}

#[inline(always)]
fn get_state(request: &Request<Body>) -> &HttpState {
    request
        .data::<Arc<HttpState>>()
        .expect("unknown state type")
        .as_ref()
}

/// Pageserver calls into this on startup, to learn which tenants it should attach
async fn handle_re_attach(mut req: Request<Body>) -> Result<Response<Body>, ApiError> {
    let reattach_req = json_request::<ReAttachRequest>(&mut req).await?;
    let state = get_state(&req);
    json_response(StatusCode::OK, state.service.re_attach(reattach_req))
}

/// Pageserver calls into this before doing deletions, to confirm that it still
/// holds the latest generation for the tenants with deletions enqueued
async fn handle_validate(mut req: Request<Body>) -> Result<Response<Body>, ApiError> {
    let validate_req = json_request::<ValidateRequest>(&mut req).await?;
    let state = get_state(&req);
    json_response(StatusCode::OK, state.service.validate(validate_req))
}

/// Call into this before attaching a tenant to a pageserver, to acquire a generation number
/// (in the real control plane this is unnecessary, because the same program is managing
/// generation numbers and doing attachments).
async fn handle_attach_hook(mut req: Request<Body>) -> Result<Response<Body>, ApiError> {
    let attach_req = json_request::<AttachHookRequest>(&mut req).await?;
    let state = get_state(&req);

    json_response(StatusCode::OK, state.service.attach_hook(attach_req))
}

async fn handle_inspect(mut req: Request<Body>) -> Result<Response<Body>, ApiError> {
    let inspect_req = json_request::<InspectRequest>(&mut req).await?;

    let state = get_state(&req);

    json_response(StatusCode::OK, state.service.inspect(inspect_req))
}

async fn handle_tenant_create(mut req: Request<Body>) -> Result<Response<Body>, ApiError> {
    let create_req = json_request::<TenantCreateRequest>(&mut req).await?;
    let state = get_state(&req);
    json_response(
        StatusCode::OK,
        state.service.tenant_create(create_req).await?,
    )
}

async fn handle_tenant_timeline_create(mut req: Request<Body>) -> Result<Response<Body>, ApiError> {
    let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?;
    let create_req = json_request::<TimelineCreateRequest>(&mut req).await?;

    let state = get_state(&req);
    json_response(
        StatusCode::OK,
        state
            .service
            .tenant_timeline_create(tenant_id, create_req)
            .await?,
    )
}

async fn handle_tenant_locate(req: Request<Body>) -> Result<Response<Body>, ApiError> {
    let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?;
    let state = get_state(&req);

    json_response(StatusCode::OK, state.service.tenant_locate(tenant_id)?)
}

async fn handle_node_register(mut req: Request<Body>) -> Result<Response<Body>, ApiError> {
    let register_req = json_request::<NodeRegisterRequest>(&mut req).await?;
    let state = get_state(&req);
    state.service.node_register(register_req);
    json_response(StatusCode::OK, ())
}

async fn handle_node_configure(mut req: Request<Body>) -> Result<Response<Body>, ApiError> {
    let node_id: NodeId = parse_request_param(&req, "node_id")?;
    let config_req = json_request::<NodeConfigureRequest>(&mut req).await?;
    if node_id != config_req.node_id {
        return Err(ApiError::BadRequest(anyhow::anyhow!(
            "Path and body node_id differ"
        )));
    }
    let state = get_state(&req);

    json_response(StatusCode::OK, state.service.node_configure(config_req)?)
}

async fn handle_tenant_shard_split(mut req: Request<Body>) -> Result<Response<Body>, ApiError> {
    let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?;
    let split_req = json_request::<TenantShardSplitRequest>(&mut req).await?;
    let state = get_state(&req);

    json_response(
        StatusCode::OK,
        state
            .service
            .tenant_shard_split(tenant_id, split_req)
            .await?,
    )
}

async fn handle_tenant_shard_migrate(mut req: Request<Body>) -> Result<Response<Body>, ApiError> {
    let tenant_shard_id: TenantShardId = parse_request_param(&req, "tenant_shard_id")?;
    let migrate_req = json_request::<TenantShardMigrateRequest>(&mut req).await?;
    let state = get_state(&req);
    json_response(
        StatusCode::OK,
        state
            .service
            .tenant_shard_migrate(tenant_shard_id, migrate_req)
            .await?,
    )
}

/// Status endpoint is just used for checking that our HTTP listener is up
async fn handle_status(_req: Request<Body>) -> Result<Response<Body>, ApiError> {
    json_response(StatusCode::OK, ())
}

impl From<ReconcileError> for ApiError {
    fn from(value: ReconcileError) -> Self {
        ApiError::Conflict(format!("Reconciliation error: {}", value))
    }
}

pub fn make_router(service: Arc<Service>) -> RouterBuilder<hyper::Body, ApiError> {
    endpoint::make_router()
        .data(Arc::new(HttpState { service }))
        .get("/status", |r| request_span(r, handle_status))
        .post("/re-attach", |r| request_span(r, handle_re_attach))
        .post("/validate", |r| request_span(r, handle_validate))
        .post("/attach-hook", |r| request_span(r, handle_attach_hook))
        .post("/inspect", |r| request_span(r, handle_inspect))
        .post("/node", |r| request_span(r, handle_node_register))
        .put("/node/:node_id/config", |r| {
            request_span(r, handle_node_configure)
        })
        .post("/tenant", |r| request_span(r, handle_tenant_create))
        .post("/tenant/:tenant_id/timeline", |r| {
            request_span(r, handle_tenant_timeline_create)
        })
        .get("/tenant/:tenant_id/locate", |r| {
            request_span(r, handle_tenant_locate)
        })
        .put("/tenant/:tenant_id/shard_split", |r| {
            request_span(r, handle_tenant_shard_split)
        })
        .put("/tenant/:tenant_shard_id/migrate", |r| {
            request_span(r, handle_tenant_shard_migrate)
        })
}
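Every handler above follows the same shape: parse any path parameters, deserialize the JSON body, call into the shared Service, and wrap the result with json_response. As a hypothetical illustration only (this route, handler, and Service method are not part of the change), one more endpoint would slot in like this:

// Hypothetical example of the handler pattern; not part of this diff.
async fn handle_tenant_describe(req: Request<Body>) -> Result<Response<Body>, ApiError> {
    let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?;
    let state = get_state(&req);
    // Assumes a corresponding Service::tenant_describe method exists.
    json_response(StatusCode::OK, state.service.tenant_describe(tenant_id)?)
}

// Registered in make_router alongside the existing routes:
//   .get("/tenant/:tenant_id/describe", |r| request_span(r, handle_tenant_describe))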
control_plane/attachment_service/src/lib.rs (new file, 49 lines)
@@ -0,0 +1,49 @@
use utils::seqwait::MonotonicCounter;

mod compute_hook;
pub mod http;
mod node;
mod reconciler;
mod scheduler;
pub mod service;
mod tenant_state;

#[derive(Clone)]
enum PlacementPolicy {
    /// Cheapest way to attach a tenant: just one pageserver, no secondary
    Single,
    /// Production-ready way to attach a tenant: one attached pageserver and
    /// some number of secondaries.
    Double(usize),
}

#[derive(Ord, PartialOrd, Eq, PartialEq, Copy, Clone)]
struct Sequence(u64);

impl std::fmt::Display for Sequence {
    fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
        write!(f, "{}", self.0)
    }
}

impl MonotonicCounter<Sequence> for Sequence {
    fn cnt_advance(&mut self, v: Sequence) {
        assert!(*self <= v);
        *self = v;
    }
    fn cnt_value(&self) -> Sequence {
        *self
    }
}

impl Sequence {
    fn next(&self) -> Sequence {
        Sequence(self.0 + 1)
    }
}

impl Default for PlacementPolicy {
    fn default() -> Self {
        PlacementPolicy::Double(1)
    }
}
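Sequence is the monotonic counter used with utils::seqwait so that callers can wait until reconciliation has advanced past a given point. A sketch of a unit test that could sit in this module to pin down that contract (illustrative, not part of the change):

// Hypothetical test sketch for the Sequence type defined above.
#[cfg(test)]
mod tests {
    use super::Sequence;
    use utils::seqwait::MonotonicCounter;

    #[test]
    fn sequence_only_moves_forward() {
        let mut s = Sequence(1);
        let next = s.next();
        assert!(s < next);
        s.cnt_advance(next);
        assert!(s.cnt_value() == Sequence(2));
        // s.cnt_advance(Sequence(1)) would trip the assert!(*self <= v) above.
    }
}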
control_plane/attachment_service/src/main.rs (new file, 65 lines)
@@ -0,0 +1,65 @@
/// The attachment service mimics the aspects of the control plane API
/// that are required for a pageserver to operate.
///
/// This enables running & testing pageservers without a full-blown
/// deployment of the Neon cloud platform.
///
use anyhow::anyhow;
use attachment_service::http::make_router;
use attachment_service::service::Service;
use clap::Parser;
use std::path::PathBuf;
use utils::logging::{self, LogFormat};
use utils::signals::{ShutdownSignals, Signal};

use utils::tcp_listener;

#[derive(Parser)]
#[command(author, version, about, long_about = None)]
#[command(arg_required_else_help(true))]
struct Cli {
    /// Host and port to listen on, like `127.0.0.1:1234`
    #[arg(short, long)]
    listen: std::net::SocketAddr,

    /// Path to the .json file to store state (will be created if it doesn't exist)
    #[arg(short, long)]
    path: PathBuf,
}

#[tokio::main]
async fn main() -> anyhow::Result<()> {
    logging::init(
        LogFormat::Plain,
        logging::TracingErrorLayerEnablement::Disabled,
        logging::Output::Stdout,
    )?;

    let args = Cli::parse();
    tracing::info!(
        "Starting, state at {}, listening on {}",
        args.path.to_string_lossy(),
        args.listen
    );

    let service = Service::spawn();

    let http_listener = tcp_listener::bind(args.listen)?;
    let router = make_router(service).build().map_err(|err| anyhow!(err))?;
    let service = utils::http::RouterService::new(router).unwrap();
    let server = hyper::Server::from_tcp(http_listener)?.serve(service);

    tracing::info!("Serving on {0}", args.listen);

    tokio::task::spawn(server);

    ShutdownSignals::handle(|signal| match signal {
        Signal::Interrupt | Signal::Terminate | Signal::Quit => {
            tracing::info!("Got {}. Terminating", signal.name());
            // We're just a test helper: no graceful shutdown.
            std::process::exit(0);
        }
    })?;

    Ok(())
}
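Given the Cli definition above, the service can be started locally with something along the lines of `cargo run -p attachment_service -- --listen 127.0.0.1:1234 --path ./attachments.json` (the address and state-file path here are illustrative), after which the /status route registered in http.rs can serve as a liveness probe.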
control_plane/attachment_service/src/node.rs (new file, 40 lines)
@@ -0,0 +1,40 @@
use control_plane::attachment_service::{NodeAvailability, NodeSchedulingPolicy};
use utils::id::NodeId;

#[derive(Clone)]
pub(crate) struct Node {
    pub(crate) id: NodeId,

    pub(crate) availability: NodeAvailability,
    pub(crate) scheduling: NodeSchedulingPolicy,

    pub(crate) listen_http_addr: String,
    pub(crate) listen_http_port: u16,

    pub(crate) listen_pg_addr: String,
    pub(crate) listen_pg_port: u16,
}

impl Node {
    pub(crate) fn base_url(&self) -> String {
        format!(
            "http://{}:{}/v1",
            self.listen_http_addr, self.listen_http_port
        )
    }

    /// Is this node eligible to have work scheduled onto it?
    pub(crate) fn may_schedule(&self) -> bool {
        match self.availability {
            NodeAvailability::Active => {}
            NodeAvailability::Offline => return false,
        }

        match self.scheduling {
            NodeSchedulingPolicy::Active => true,
            NodeSchedulingPolicy::Draining => false,
            NodeSchedulingPolicy::Filling => true,
            NodeSchedulingPolicy::Pause => false,
        }
    }
}
control_plane/attachment_service/src/reconciler.rs (new file, 478 lines)
@@ -0,0 +1,478 @@
use control_plane::attachment_service::NodeAvailability;
use control_plane::local_env::LocalEnv;
use control_plane::pageserver::PageServerNode;
use hyper::Method;
use pageserver_api::models::{
    LocationConfig, LocationConfigMode, LocationConfigSecondary, TenantConfig,
    TenantLocationConfigRequest,
};
use pageserver_api::shard::{ShardIdentity, TenantShardId};
use reqwest::Client;
use std::collections::HashMap;
use std::sync::Arc;
use std::time::Duration;
use tokio_util::sync::CancellationToken;
use utils::generation::Generation;
use utils::id::{NodeId, TimelineId};
use utils::lsn::Lsn;

use crate::compute_hook::ComputeHook;
use crate::node::Node;
use crate::tenant_state::{IntentState, ObservedState, ObservedStateLocation};

/// Object with the lifetime of the background reconcile task that is created
/// for tenants which have a difference between their intent and observed states.
pub(super) struct Reconciler {
    /// See [`crate::tenant_state::TenantState`] for the meanings of these fields: they are a snapshot
    /// of a tenant's state from when we spawned a reconcile task.
    pub(super) tenant_shard_id: TenantShardId,
    pub(crate) shard: ShardIdentity,
    pub(crate) generation: Generation,
    pub(crate) intent: IntentState,
    pub(crate) config: TenantConfig,
    pub(crate) observed: ObservedState,

    /// A snapshot of the pageservers as they were when we were asked
    /// to reconcile.
    pub(crate) pageservers: Arc<HashMap<NodeId, Node>>,

    /// A hook to notify the running postgres instances when we change the location
    /// of a tenant
    pub(crate) compute_hook: Arc<ComputeHook>,

    /// A means to abort background reconciliation: it is essential to
    /// call this when something changes in the original TenantState that
    /// will make this reconciliation impossible or unnecessary, for
    /// example when a pageserver node goes offline, or the PlacementPolicy for
    /// the tenant is changed.
    pub(crate) cancel: CancellationToken,
}

#[derive(thiserror::Error, Debug)]
pub enum ReconcileError {
    #[error(transparent)]
    Other(#[from] anyhow::Error),
}

impl Reconciler {
    async fn location_config(
        &mut self,
        node_id: NodeId,
        config: LocationConfig,
    ) -> anyhow::Result<()> {
        let node = self
            .pageservers
            .get(&node_id)
            .expect("Pageserver may not be removed while referenced");

        self.observed
            .locations
            .insert(node.id, ObservedStateLocation { conf: None });

        let configure_request = TenantLocationConfigRequest {
            tenant_id: self.tenant_shard_id,
            config: config.clone(),
        };

        let client = Client::new();
        let response = client
            .request(
                Method::PUT,
                format!(
                    "{}/tenant/{}/location_config",
                    node.base_url(),
                    self.tenant_shard_id
                ),
            )
            .json(&configure_request)
            .send()
            .await?;

        self.observed
            .locations
            .insert(node.id, ObservedStateLocation { conf: Some(config) });

        response.error_for_status()?;

        Ok(())
    }

    async fn maybe_live_migrate(&mut self) -> Result<(), ReconcileError> {
        let destination = if let Some(node_id) = self.intent.attached {
            match self.observed.locations.get(&node_id) {
                Some(conf) => {
                    // We will do a live migration only if the intended destination is not
                    // currently in an attached state.
                    match &conf.conf {
                        Some(conf) if conf.mode == LocationConfigMode::Secondary => {
                            // Fall through to do a live migration
                            node_id
                        }
                        None | Some(_) => {
                            // Attached or uncertain: don't do a live migration, proceed
                            // with a general-case reconciliation
                            tracing::info!("maybe_live_migrate: destination is None or attached");
                            return Ok(());
                        }
                    }
                }
                None => {
                    // Our destination is not attached: maybe live migrate if some other
                    // node is currently attached. Fall through.
                    node_id
                }
            }
        } else {
            // No intent to be attached
            tracing::info!("maybe_live_migrate: no attached intent");
            return Ok(());
        };

        let mut origin = None;
        for (node_id, state) in &self.observed.locations {
            if let Some(observed_conf) = &state.conf {
                if observed_conf.mode == LocationConfigMode::AttachedSingle {
                    let node = self
                        .pageservers
                        .get(node_id)
                        .expect("Nodes may not be removed while referenced");
                    // We will only attempt live migration if the origin is not offline: this
                    // avoids trying to do it while reconciling after responding to an HA failover.
                    if !matches!(node.availability, NodeAvailability::Offline) {
                        origin = Some(*node_id);
                        break;
                    }
                }
            }
        }

        let Some(origin) = origin else {
            tracing::info!("maybe_live_migrate: no origin found");
            return Ok(());
        };

        // We have an origin and a destination: proceed to do the live migration
        let env = LocalEnv::load_config().expect("Error loading config");
        let origin_ps = PageServerNode::from_env(
            &env,
            env.get_pageserver_conf(origin)
                .expect("Conf missing pageserver"),
        );
        let destination_ps = PageServerNode::from_env(
            &env,
            env.get_pageserver_conf(destination)
                .expect("Conf missing pageserver"),
        );

        tracing::info!(
            "Live migrating {}->{}",
            origin_ps.conf.id,
            destination_ps.conf.id
        );
        self.live_migrate(origin_ps, destination_ps).await?;

        Ok(())
    }

    pub async fn live_migrate(
        &mut self,
        origin_ps: PageServerNode,
        dest_ps: PageServerNode,
    ) -> anyhow::Result<()> {
        // `maybe_live_migrate` is responsible for sanity of inputs
        assert!(origin_ps.conf.id != dest_ps.conf.id);

        fn build_location_config(
            shard: &ShardIdentity,
            config: &TenantConfig,
            mode: LocationConfigMode,
            generation: Option<Generation>,
            secondary_conf: Option<LocationConfigSecondary>,
        ) -> LocationConfig {
            LocationConfig {
                mode,
                generation: generation.map(|g| g.into().unwrap()),
                secondary_conf,
                tenant_conf: config.clone(),
                shard_number: shard.number.0,
                shard_count: shard.count.0,
                shard_stripe_size: shard.stripe_size.0,
            }
        }

        async fn get_lsns(
            tenant_shard_id: TenantShardId,
            pageserver: &PageServerNode,
        ) -> anyhow::Result<HashMap<TimelineId, Lsn>> {
            let timelines = pageserver.timeline_list(&tenant_shard_id).await?;
            Ok(timelines
                .into_iter()
                .map(|t| (t.timeline_id, t.last_record_lsn))
                .collect())
        }

        async fn await_lsn(
            tenant_shard_id: TenantShardId,
            pageserver: &PageServerNode,
            baseline: HashMap<TimelineId, Lsn>,
        ) -> anyhow::Result<()> {
            loop {
                let latest = match get_lsns(tenant_shard_id, pageserver).await {
                    Ok(l) => l,
                    Err(e) => {
                        println!(
                            "🕑 Can't get LSNs on pageserver {} yet, waiting ({e})",
                            pageserver.conf.id
                        );
                        std::thread::sleep(Duration::from_millis(500));
                        continue;
                    }
                };

                let mut any_behind: bool = false;
                for (timeline_id, baseline_lsn) in &baseline {
                    match latest.get(timeline_id) {
                        Some(latest_lsn) => {
                            println!("🕑 LSN origin {baseline_lsn} vs destination {latest_lsn}");
                            if latest_lsn < baseline_lsn {
                                any_behind = true;
                            }
                        }
                        None => {
                            // Expected timeline isn't yet visible on migration destination.
                            // (IRL we would have to account for timeline deletion, but this
                            // is just a test helper)
                            any_behind = true;
                        }
                    }
                }

                if !any_behind {
                    println!("✅ LSN caught up. Proceeding...");
                    break;
                } else {
                    std::thread::sleep(Duration::from_millis(500));
                }
            }

            Ok(())
        }

        tracing::info!(
            "🔁 Switching origin pageserver {} to stale mode",
            origin_ps.conf.id
        );

        // FIXME: it is incorrect to use self.generation here, we should use the generation
        // from the ObservedState of the origin pageserver (it might be older than self.generation)
        let stale_conf = build_location_config(
            &self.shard,
            &self.config,
            LocationConfigMode::AttachedStale,
            Some(self.generation),
            None,
        );
        origin_ps
            .location_config(
                self.tenant_shard_id,
                stale_conf,
                Some(Duration::from_secs(10)),
            )
            .await?;

        let baseline_lsns = Some(get_lsns(self.tenant_shard_id, &origin_ps).await?);

        // Increment generation before attaching to new pageserver
        self.generation = self.generation.next();

        let dest_conf = build_location_config(
            &self.shard,
            &self.config,
            LocationConfigMode::AttachedMulti,
            Some(self.generation),
            None,
        );

        tracing::info!("🔁 Attaching to pageserver {}", dest_ps.conf.id);
        dest_ps
            .location_config(self.tenant_shard_id, dest_conf, None)
            .await?;

        if let Some(baseline) = baseline_lsns {
            tracing::info!("🕑 Waiting for LSN to catch up...");
            await_lsn(self.tenant_shard_id, &dest_ps, baseline).await?;
        }

        tracing::info!("🔁 Notifying compute to use pageserver {}", dest_ps.conf.id);
        self.compute_hook
            .notify(self.tenant_shard_id, dest_ps.conf.id)
            .await?;

        // Downgrade the origin to secondary. If the tenant's policy is PlacementPolicy::Single, then
        // this location will be deleted in the general case reconciliation that runs after this.
        let origin_secondary_conf = build_location_config(
            &self.shard,
            &self.config,
            LocationConfigMode::Secondary,
            None,
            Some(LocationConfigSecondary { warm: true }),
        );
        origin_ps
            .location_config(self.tenant_shard_id, origin_secondary_conf.clone(), None)
            .await?;
        // TODO: we should also be setting the ObservedState on earlier API calls, in case we fail
        // partway through. In fact, all location conf API calls should be in a wrapper that sets
        // the observed state to None, then runs, then sets it to what we wrote.
        self.observed.locations.insert(
            origin_ps.conf.id,
            ObservedStateLocation {
                conf: Some(origin_secondary_conf),
            },
        );

        println!(
            "🔁 Switching to AttachedSingle mode on pageserver {}",
            dest_ps.conf.id
        );
        let dest_final_conf = build_location_config(
            &self.shard,
            &self.config,
            LocationConfigMode::AttachedSingle,
            Some(self.generation),
            None,
        );
        dest_ps
            .location_config(self.tenant_shard_id, dest_final_conf.clone(), None)
            .await?;
        self.observed.locations.insert(
            dest_ps.conf.id,
            ObservedStateLocation {
                conf: Some(dest_final_conf),
            },
        );

        println!("✅ Migration complete");

        Ok(())
    }

    /// Reconciling a tenant makes API calls to pageservers until the observed state
    /// matches the intended state.
    ///
    /// First we apply special case handling (e.g. for live migrations), and then a
    /// general case reconciliation where we walk through the intent by pageserver
    /// and call out to the pageserver to apply the desired state.
    pub(crate) async fn reconcile(&mut self) -> Result<(), ReconcileError> {
        // TODO: if any of self.observed is None, call to remote pageservers
        // to learn correct state.

        // Special case: live migration
        self.maybe_live_migrate().await?;

        // If the intended attached pageserver is not yet attached, attach it now.
        if let Some(node_id) = self.intent.attached {
            let mut wanted_conf =
                attached_location_conf(self.generation, &self.shard, &self.config);
            match self.observed.locations.get(&node_id) {
                Some(conf) if conf.conf.as_ref() == Some(&wanted_conf) => {
                    // Nothing to do
                    tracing::info!("Observed configuration already correct.")
                }
                Some(_) | None => {
                    // If there is no observed configuration, or if its value does not equal our intent, then we must call out to the pageserver.
                    self.generation = self.generation.next();
                    wanted_conf.generation = self.generation.into();
                    tracing::info!("Observed configuration requires update.");
                    self.location_config(node_id, wanted_conf).await?;
                    if let Err(e) = self
                        .compute_hook
                        .notify(self.tenant_shard_id, node_id)
                        .await
                    {
                        tracing::warn!(
                            "Failed to notify compute of newly attached pageserver {node_id}: {e}"
                        );
                    }
                }
            }
        }

        // Configure secondary locations: if these were previously attached this
        // implicitly downgrades them from attached to secondary.
        let mut changes = Vec::new();
        for node_id in &self.intent.secondary {
            let wanted_conf = secondary_location_conf(&self.shard, &self.config);
            match self.observed.locations.get(node_id) {
                Some(conf) if conf.conf.as_ref() == Some(&wanted_conf) => {
                    // Nothing to do
                    tracing::info!(%node_id, "Observed configuration already correct.")
                }
                Some(_) | None => {
                    // If there is no observed configuration, or if its value does not equal our intent, then we must call out to the pageserver.
                    tracing::info!(%node_id, "Observed configuration requires update.");
                    changes.push((*node_id, wanted_conf))
                }
            }
        }

        // Detach any extraneous pageservers that are no longer referenced
        // by our intent.
        let all_pageservers = self.intent.all_pageservers();
        for node_id in self.observed.locations.keys() {
            if all_pageservers.contains(node_id) {
                // We are only detaching pageservers that aren't used at all.
                continue;
            }

            changes.push((
                *node_id,
                LocationConfig {
                    mode: LocationConfigMode::Detached,
                    generation: None,
                    secondary_conf: None,
                    shard_number: self.shard.number.0,
                    shard_count: self.shard.count.0,
                    shard_stripe_size: self.shard.stripe_size.0,
                    tenant_conf: self.config.clone(),
                },
            ));
        }

        for (node_id, conf) in changes {
            self.location_config(node_id, conf).await?;
        }

        Ok(())
    }
}

pub(crate) fn attached_location_conf(
    generation: Generation,
    shard: &ShardIdentity,
    config: &TenantConfig,
) -> LocationConfig {
    LocationConfig {
        mode: LocationConfigMode::AttachedSingle,
        generation: generation.into(),
        secondary_conf: None,
        shard_number: shard.number.0,
        shard_count: shard.count.0,
        shard_stripe_size: shard.stripe_size.0,
        tenant_conf: config.clone(),
    }
}

pub(crate) fn secondary_location_conf(
    shard: &ShardIdentity,
    config: &TenantConfig,
) -> LocationConfig {
    LocationConfig {
        mode: LocationConfigMode::Secondary,
        generation: None,
        secondary_conf: Some(LocationConfigSecondary { warm: true }),
        shard_number: shard.number.0,
        shard_count: shard.count.0,
        shard_stripe_size: shard.stripe_size.0,
        tenant_conf: config.clone(),
    }
}
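attached_location_conf and secondary_location_conf are the two canonical payload builders used throughout reconciliation. A hedged sketch of what the attached variant yields for an unsharded tenant (the generation value is arbitrary and this assumes TenantConfig implements Default, which is not shown in this diff):

// Illustrative only: shape of the config produced for an unsharded tenant.
let conf = attached_location_conf(
    Generation::new(7),            // hypothetical generation
    &ShardIdentity::unsharded(),
    &TenantConfig::default(),      // assumption: TenantConfig: Default
);
assert!(matches!(conf.mode, LocationConfigMode::AttachedSingle));
assert!(conf.secondary_conf.is_none());
assert!(conf.shard_count == 0);    // ShardCount(0) means "unsharded"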
control_plane/attachment_service/src/scheduler.rs (new file, 89 lines)
@@ -0,0 +1,89 @@
use pageserver_api::shard::TenantShardId;
use std::collections::{BTreeMap, HashMap};
use utils::{http::error::ApiError, id::NodeId};

use crate::{node::Node, tenant_state::TenantState};

/// Scenarios in which we cannot find a suitable location for a tenant shard
#[derive(thiserror::Error, Debug)]
pub enum ScheduleError {
    #[error("No pageservers found")]
    NoPageservers,
    #[error("No pageserver found matching constraint")]
    ImpossibleConstraint,
}

impl From<ScheduleError> for ApiError {
    fn from(value: ScheduleError) -> Self {
        ApiError::Conflict(format!("Scheduling error: {}", value))
    }
}

pub(crate) struct Scheduler {
    tenant_counts: HashMap<NodeId, usize>,
}

impl Scheduler {
    pub(crate) fn new(
        tenants: &BTreeMap<TenantShardId, TenantState>,
        nodes: &HashMap<NodeId, Node>,
    ) -> Self {
        let mut tenant_counts = HashMap::new();
        for node_id in nodes.keys() {
            tenant_counts.insert(*node_id, 0);
        }

        for tenant in tenants.values() {
            if let Some(ps) = tenant.intent.attached {
                let entry = tenant_counts.entry(ps).or_insert(0);
                *entry += 1;
            }
        }

        for (node_id, node) in nodes {
            if !node.may_schedule() {
                tenant_counts.remove(node_id);
            }
        }

        Self { tenant_counts }
    }

    pub(crate) fn schedule_shard(
        &mut self,
        hard_exclude: &[NodeId],
    ) -> Result<NodeId, ScheduleError> {
        if self.tenant_counts.is_empty() {
            return Err(ScheduleError::NoPageservers);
        }

        let mut tenant_counts: Vec<(NodeId, usize)> = self
            .tenant_counts
            .iter()
            .filter_map(|(k, v)| {
                if hard_exclude.contains(k) {
                    None
                } else {
                    Some((*k, *v))
                }
            })
            .collect();

        // Sort by tenant count. Nodes with the same tenant count are sorted by ID.
        tenant_counts.sort_by_key(|i| (i.1, i.0));

        if tenant_counts.is_empty() {
            // After applying constraints, no pageservers were left
            return Err(ScheduleError::ImpossibleConstraint);
        }

        for (node_id, count) in &tenant_counts {
            tracing::info!("tenant_counts[{node_id}]={count}");
        }

        let node_id = tenant_counts.first().unwrap().0;
        tracing::info!("scheduler selected node {node_id}");
        *self.tenant_counts.get_mut(&node_id).unwrap() += 1;
        Ok(node_id)
    }
}
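schedule_shard places each new shard on the schedulable node with the fewest attached tenants, breaking ties by node ID, and skips anything in hard_exclude. A standalone sketch of that selection rule on plain data (plain u64s stand in for NodeId; this helper is illustrative, not part of the change):

// Sketch of the selection rule in schedule_shard.
fn pick(counts: &[(u64, usize)], hard_exclude: &[u64]) -> Option<u64> {
    let mut candidates: Vec<(u64, usize)> = counts
        .iter()
        .filter(|(id, _)| !hard_exclude.contains(id))
        .copied()
        .collect();
    // Sort by (tenant count, node id), as schedule_shard does.
    candidates.sort_by_key(|&(id, count)| (count, id));
    candidates.first().map(|&(id, _)| id)
}

fn main() {
    assert_eq!(pick(&[(1, 3), (2, 1), (3, 1)], &[]), Some(2));  // least loaded, lowest id
    assert_eq!(pick(&[(1, 3), (2, 1), (3, 1)], &[2]), Some(3)); // exclusion respected
}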
control_plane/attachment_service/src/service.rs (new file, 922 lines)
@@ -0,0 +1,922 @@
|
||||
use std::{
|
||||
collections::{BTreeMap, HashMap},
|
||||
sync::Arc,
|
||||
time::{Duration, Instant},
|
||||
};
|
||||
|
||||
use control_plane::attachment_service::{
|
||||
AttachHookRequest, AttachHookResponse, InspectRequest, InspectResponse, NodeAvailability,
|
||||
NodeConfigureRequest, NodeRegisterRequest, NodeSchedulingPolicy, TenantCreateResponse,
|
||||
TenantCreateResponseShard, TenantLocateResponse, TenantLocateResponseShard,
|
||||
TenantShardMigrateRequest, TenantShardMigrateResponse,
|
||||
};
|
||||
use hyper::Method;
|
||||
use pageserver_api::{
|
||||
control_api::{
|
||||
ReAttachRequest, ReAttachResponse, ReAttachResponseTenant, ValidateRequest,
|
||||
ValidateResponse, ValidateResponseTenant,
|
||||
},
|
||||
models::{
|
||||
ShardParameters, TenantCreateRequest, TenantShardSplitRequest, TenantShardSplitResponse,
|
||||
TimelineCreateRequest, TimelineInfo,
|
||||
},
|
||||
shard::{ShardCount, ShardIdentity, ShardNumber, TenantShardId},
|
||||
};
|
||||
use reqwest::Client;
|
||||
use utils::{
|
||||
generation::Generation,
|
||||
http::error::ApiError,
|
||||
id::{NodeId, TenantId},
|
||||
};
|
||||
|
||||
use crate::{
|
||||
compute_hook::ComputeHook,
|
||||
node::Node,
|
||||
reconciler::attached_location_conf,
|
||||
scheduler::Scheduler,
|
||||
tenant_state::{
|
||||
IntentState, ObservedState, ObservedStateLocation, ReconcileResult, ReconcilerWaiter,
|
||||
TenantState,
|
||||
},
|
||||
PlacementPolicy,
|
||||
};
|
||||
|
||||
const RECONCILE_TIMEOUT: Duration = Duration::from_secs(30);
|
||||
|
||||
// Top level state available to all HTTP handlers
|
||||
struct ServiceState {
|
||||
tenants: BTreeMap<TenantShardId, TenantState>,
|
||||
|
||||
nodes: Arc<HashMap<NodeId, Node>>,
|
||||
|
||||
compute_hook: Arc<ComputeHook>,
|
||||
|
||||
result_tx: tokio::sync::mpsc::UnboundedSender<ReconcileResult>,
|
||||
}
|
||||
|
||||
impl ServiceState {
|
||||
fn new(result_tx: tokio::sync::mpsc::UnboundedSender<ReconcileResult>) -> Self {
|
||||
Self {
|
||||
tenants: BTreeMap::new(),
|
||||
nodes: Arc::new(HashMap::new()),
|
||||
compute_hook: Arc::new(ComputeHook::new()),
|
||||
result_tx,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
pub struct Service {
|
||||
inner: Arc<std::sync::RwLock<ServiceState>>,
|
||||
}
|
||||
|
||||
impl Service {
|
||||
pub fn spawn() -> Arc<Self> {
|
||||
let (result_tx, mut result_rx) = tokio::sync::mpsc::unbounded_channel();
|
||||
|
||||
let this = Arc::new(Self {
|
||||
inner: Arc::new(std::sync::RwLock::new(ServiceState::new(result_tx))),
|
||||
});
|
||||
|
||||
let result_task_this = this.clone();
|
||||
tokio::task::spawn(async move {
|
||||
while let Some(result) = result_rx.recv().await {
|
||||
tracing::info!(
|
||||
"Reconcile result for sequence {}, ok={}",
|
||||
result.sequence,
|
||||
result.result.is_ok()
|
||||
);
|
||||
let mut locked = result_task_this.inner.write().unwrap();
|
||||
if let Some(tenant) = locked.tenants.get_mut(&result.tenant_shard_id) {
|
||||
tenant.generation = result.generation;
|
||||
match result.result {
|
||||
Ok(()) => {
|
||||
for (node_id, loc) in &result.observed.locations {
|
||||
if let Some(conf) = &loc.conf {
|
||||
tracing::info!(
|
||||
"Updating observed location {}: {:?}",
|
||||
node_id,
|
||||
conf
|
||||
);
|
||||
} else {
|
||||
tracing::info!("Setting observed location {} to None", node_id,)
|
||||
}
|
||||
}
|
||||
tenant.observed = result.observed;
|
||||
tenant.waiter.advance(result.sequence);
|
||||
}
|
||||
Err(e) => {
|
||||
// TODO: some observability, record on teh tenant its last reconcile error
|
||||
tracing::warn!(
|
||||
"Reconcile error on tenant {}: {}",
|
||||
tenant.tenant_shard_id,
|
||||
e
|
||||
);
|
||||
|
||||
for (node_id, o) in result.observed.locations {
|
||||
tenant.observed.locations.insert(node_id, o);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
});
|
||||
|
||||
this
|
||||
}
|
||||
|
||||
pub(crate) fn attach_hook(&self, attach_req: AttachHookRequest) -> AttachHookResponse {
|
||||
let mut locked = self.inner.write().unwrap();
|
||||
|
||||
let tenant_state = locked
|
||||
.tenants
|
||||
.entry(attach_req.tenant_shard_id)
|
||||
.or_insert_with(|| {
|
||||
TenantState::new(
|
||||
attach_req.tenant_shard_id,
|
||||
ShardIdentity::unsharded(),
|
||||
PlacementPolicy::Single,
|
||||
)
|
||||
});
|
||||
|
||||
if let Some(attaching_pageserver) = attach_req.node_id.as_ref() {
|
||||
tenant_state.generation = tenant_state.generation.next();
|
||||
tracing::info!(
|
||||
tenant_id = %attach_req.tenant_shard_id,
|
||||
ps_id = %attaching_pageserver,
|
||||
generation = ?tenant_state.generation,
|
||||
"issuing",
|
||||
);
|
||||
} else if let Some(ps_id) = tenant_state.intent.attached {
|
||||
tracing::info!(
|
||||
tenant_id = %attach_req.tenant_shard_id,
|
||||
%ps_id,
|
||||
generation = ?tenant_state.generation,
|
||||
"dropping",
|
||||
);
|
||||
} else {
|
||||
tracing::info!(
|
||||
tenant_id = %attach_req.tenant_shard_id,
|
||||
"no-op: tenant already has no pageserver");
|
||||
}
|
||||
tenant_state.intent.attached = attach_req.node_id;
|
||||
let generation = tenant_state.generation;
|
||||
|
||||
tracing::info!(
|
||||
"attach_hook: tenant {} set generation {:?}, pageserver {}",
|
||||
attach_req.tenant_shard_id,
|
||||
tenant_state.generation,
|
||||
attach_req.node_id.unwrap_or(utils::id::NodeId(0xfffffff))
|
||||
);
|
||||
|
||||
AttachHookResponse {
|
||||
gen: attach_req.node_id.map(|_| generation.into().unwrap()),
|
||||
}
|
||||
}
|
||||
|
||||
pub(crate) fn inspect(&self, inspect_req: InspectRequest) -> InspectResponse {
|
||||
let locked = self.inner.read().unwrap();
|
||||
|
||||
let tenant_state = locked.tenants.get(&inspect_req.tenant_shard_id);
|
||||
|
||||
InspectResponse {
|
||||
attachment: tenant_state.and_then(|s| {
|
||||
s.intent
|
||||
.attached
|
||||
.map(|ps| (s.generation.into().unwrap(), ps))
|
||||
}),
|
||||
}
|
||||
}
|
||||
|
||||
pub(crate) fn re_attach(&self, reattach_req: ReAttachRequest) -> ReAttachResponse {
|
||||
let mut locked = self.inner.write().unwrap();
|
||||
|
||||
let mut response = ReAttachResponse {
|
||||
tenants: Vec::new(),
|
||||
};
|
||||
for (t, state) in &mut locked.tenants {
|
||||
if state.intent.attached == Some(reattach_req.node_id) {
|
||||
state.generation = state.generation.next();
|
||||
response.tenants.push(ReAttachResponseTenant {
|
||||
id: *t,
|
||||
gen: state.generation.into().unwrap(),
|
||||
});
|
||||
}
|
||||
}
|
||||
response
|
||||
}
|
||||
|
||||
pub(crate) fn validate(&self, validate_req: ValidateRequest) -> ValidateResponse {
|
||||
let locked = self.inner.read().unwrap();
|
||||
|
||||
let mut response = ValidateResponse {
|
||||
tenants: Vec::new(),
|
||||
};
|
||||
|
||||
for req_tenant in validate_req.tenants {
|
||||
if let Some(tenant_state) = locked.tenants.get(&req_tenant.id) {
|
||||
let valid = tenant_state.generation == Generation::new(req_tenant.gen);
|
||||
tracing::info!(
|
||||
"handle_validate: {}(gen {}): valid={valid} (latest {:?})",
|
||||
req_tenant.id,
|
||||
req_tenant.gen,
|
||||
tenant_state.generation
|
||||
);
|
||||
response.tenants.push(ValidateResponseTenant {
|
||||
id: req_tenant.id,
|
||||
valid,
|
||||
});
|
||||
}
|
||||
}
|
||||
response
|
||||
}
|
||||
|
||||
pub(crate) async fn tenant_create(
|
||||
&self,
|
||||
create_req: TenantCreateRequest,
|
||||
) -> Result<TenantCreateResponse, ApiError> {
|
||||
let (waiters, response_shards) = {
|
||||
let mut locked = self.inner.write().unwrap();
|
||||
|
||||
tracing::info!(
|
||||
"Creating tenant {}, shard_count={:?}, have {} pageservers",
|
||||
create_req.new_tenant_id,
|
||||
create_req.shard_parameters.count,
|
||||
locked.nodes.len()
|
||||
);
|
||||
|
||||
// This service expects to handle sharding itself: it is an error to try and directly create
|
||||
// a particular shard here.
|
||||
let tenant_id = if create_req.new_tenant_id.shard_count > ShardCount(1) {
|
||||
return Err(ApiError::BadRequest(anyhow::anyhow!(
|
||||
"Attempted to create a specific shard, this API is for creating the whole tenant"
|
||||
)));
|
||||
} else {
|
||||
create_req.new_tenant_id.tenant_id
|
||||
};
|
||||
|
||||
// Shard count 0 is valid: it means create a single shard (ShardCount(0) means "unsharded")
|
||||
let literal_shard_count = if create_req.shard_parameters.is_unsharded() {
|
||||
1
|
||||
} else {
|
||||
create_req.shard_parameters.count.0
|
||||
};
|
||||
|
||||
let mut response_shards = Vec::new();
|
||||
|
||||
let mut scheduler = Scheduler::new(&locked.tenants, &locked.nodes);
|
||||
|
||||
for i in 0..literal_shard_count {
|
||||
let shard_number = ShardNumber(i);
|
||||
|
||||
let tenant_shard_id = TenantShardId {
|
||||
tenant_id,
|
||||
shard_number,
|
||||
shard_count: create_req.shard_parameters.count,
|
||||
};
|
||||
tracing::info!("Creating shard {tenant_shard_id}...");
|
||||
|
||||
use std::collections::btree_map::Entry;
|
||||
match locked.tenants.entry(tenant_shard_id) {
|
||||
Entry::Occupied(mut entry) => {
|
||||
tracing::info!(
|
||||
"Tenant shard {tenant_shard_id} already exists while creating"
|
||||
);
|
||||
|
||||
// TODO: schedule() should take an anti-affinity expression that pushes
|
||||
// attached and secondary locations (independently) away frorm those
|
||||
// pageservers also holding a shard for this tenant.
|
||||
|
||||
entry.get_mut().schedule(&mut scheduler).map_err(|e| {
|
||||
ApiError::Conflict(format!(
|
||||
"Failed to schedule shard {tenant_shard_id}: {e}"
|
||||
))
|
||||
})?;
|
||||
|
||||
response_shards.push(TenantCreateResponseShard {
|
||||
node_id: entry
|
||||
.get()
|
||||
.intent
|
||||
.attached
|
||||
.expect("We just set pageserver if it was None"),
|
||||
generation: entry.get().generation.into().unwrap(),
|
||||
});
|
||||
|
||||
continue;
|
||||
}
|
||||
Entry::Vacant(entry) => {
|
||||
let mut state = TenantState::new(
|
||||
tenant_shard_id,
|
||||
ShardIdentity::from_params(shard_number, &create_req.shard_parameters),
|
||||
PlacementPolicy::Single,
|
||||
);
|
||||
|
||||
if let Some(create_gen) = create_req.generation {
|
||||
state.generation = Generation::new(create_gen);
|
||||
}
|
||||
state.config = create_req.config.clone();
|
||||
|
||||
state.schedule(&mut scheduler).map_err(|e| {
|
||||
ApiError::Conflict(format!(
|
||||
"Failed to schedule shard {tenant_shard_id}: {e}"
|
||||
))
|
||||
})?;
|
||||
|
||||
response_shards.push(TenantCreateResponseShard {
|
||||
node_id: state
|
||||
.intent
|
||||
.attached
|
||||
.expect("We just set pageserver if it was None"),
|
||||
generation: state.generation.into().unwrap(),
|
||||
});
|
||||
entry.insert(state)
|
||||
}
|
||||
};
|
||||
}
|
||||
|
||||
// Take a snapshot of pageservers
|
||||
let pageservers = locked.nodes.clone();
|
||||
|
||||
let mut waiters = Vec::new();
|
||||
let result_tx = locked.result_tx.clone();
|
||||
let compute_hook = locked.compute_hook.clone();
|
||||
|
||||
for (_tenant_shard_id, shard) in locked
|
||||
.tenants
|
||||
.range_mut(TenantShardId::tenant_range(tenant_id))
|
||||
{
|
||||
if let Some(waiter) =
|
||||
shard.maybe_reconcile(result_tx.clone(), &pageservers, &compute_hook)
|
||||
{
|
||||
waiters.push(waiter);
|
||||
}
|
||||
}
|
||||
(waiters, response_shards)
|
||||
};
|
||||
|
||||
let deadline = Instant::now().checked_add(Duration::from_secs(5)).unwrap();
|
||||
for waiter in waiters {
|
||||
let timeout = deadline.duration_since(Instant::now());
|
||||
waiter.wait_timeout(timeout).await.map_err(|_| {
|
||||
ApiError::Timeout(
|
||||
format!(
|
||||
"Timeout waiting for reconciliation of tenant shard {}",
|
||||
waiter.tenant_shard_id
|
||||
)
|
||||
.into(),
|
||||
)
|
||||
})?;
|
||||
}
|
||||
Ok(TenantCreateResponse {
|
||||
shards: response_shards,
|
||||
})
|
||||
}
|
||||
|
||||
pub(crate) async fn tenant_timeline_create(
|
||||
&self,
|
||||
tenant_id: TenantId,
|
||||
mut create_req: TimelineCreateRequest,
|
||||
) -> Result<TimelineInfo, ApiError> {
|
||||
let mut timeline_info = None;
|
||||
let ensure_waiters = {
|
||||
let locked = self.inner.write().unwrap();
|
||||
|
||||
tracing::info!(
|
||||
"Creating timeline {}/{}, have {} pageservers",
|
||||
tenant_id,
|
||||
create_req.new_timeline_id,
|
||||
locked.nodes.len()
|
||||
);
|
||||
|
||||
ensure_attached(locked, tenant_id).map_err(ApiError::InternalServerError)?
|
||||
};
|
||||
|
||||
let deadline = Instant::now().checked_add(Duration::from_secs(5)).unwrap();
|
||||
for waiter in ensure_waiters {
|
||||
let timeout = deadline.duration_since(Instant::now());
|
||||
waiter.wait_timeout(timeout).await.map_err(|_| {
|
||||
ApiError::Timeout(
|
||||
format!(
|
||||
"Timeout waiting for reconciliation of tenant shard {}",
|
||||
waiter.tenant_shard_id
|
||||
)
|
||||
.into(),
|
||||
)
|
||||
})?;
|
||||
}
|
||||
|
||||
let targets = {
|
||||
let locked = self.inner.read().unwrap();
|
||||
let mut targets = Vec::new();
|
||||
|
||||
for (tenant_shard_id, shard) in
|
||||
locked.tenants.range(TenantShardId::tenant_range(tenant_id))
|
||||
{
|
||||
let node_id = shard.intent.attached.ok_or_else(|| {
|
||||
ApiError::InternalServerError(anyhow::anyhow!("Shard not scheduled"))
|
||||
})?;
|
||||
let node = locked
|
||||
.nodes
|
||||
.get(&node_id)
|
||||
.expect("Pageservers may not be deleted while referenced");
|
||||
|
||||
targets.push((*tenant_shard_id, node.clone()));
|
||||
}
|
||||
targets
|
||||
};
|
||||
|
||||
if targets.is_empty() {
|
||||
return Err(ApiError::NotFound(
|
||||
anyhow::anyhow!("Tenant not found").into(),
|
||||
));
|
||||
}
|
||||
|
||||
for (tenant_shard_id, node) in targets {
|
||||
// TODO: issue shard timeline creates in parallel, once the 0th is done.
|
||||
|
||||
let shard_timeline_info = timeline_create(&tenant_shard_id, &node, &create_req)
|
||||
.await
|
||||
.map_err(|e| ApiError::Conflict(format!("Failed to create timeline: {e}")))?;
|
||||
|
||||
if timeline_info.is_none() {
|
||||
// If the caller specified an ancestor but no ancestor LSN, we are responsible for
|
||||
// propagating the LSN chosen by the first shard to the other shards: it is important
|
||||
// that all shards end up with the same ancestor_start_lsn.
|
||||
if create_req.ancestor_timeline_id.is_some()
|
||||
&& create_req.ancestor_start_lsn.is_none()
|
||||
{
|
||||
create_req.ancestor_start_lsn = shard_timeline_info.ancestor_lsn;
|
||||
}
|
||||
|
||||
// We will return the TimelineInfo from the first shard
|
||||
timeline_info = Some(shard_timeline_info);
|
||||
}
|
||||
}
|
||||
Ok(timeline_info.expect("targets cannot be empty"))
|
||||
}
|
||||
|
||||
pub(crate) fn tenant_locate(
|
||||
&self,
|
||||
tenant_id: TenantId,
|
||||
) -> Result<TenantLocateResponse, ApiError> {
|
||||
let locked = self.inner.read().unwrap();
|
||||
tracing::info!("Locating shards for tenant {tenant_id}");
|
||||
|
||||
// Take a snapshot of pageservers
|
||||
let pageservers = locked.nodes.clone();
|
||||
|
||||
let mut result = Vec::new();
|
||||
let mut shard_params: Option<ShardParameters> = None;
|
||||
|
||||
for (tenant_shard_id, shard) in locked.tenants.range(TenantShardId::tenant_range(tenant_id))
|
||||
{
|
||||
let node_id = shard
|
||||
.intent
|
||||
.attached
|
||||
.ok_or(ApiError::BadRequest(anyhow::anyhow!(
|
||||
"Cannot locate a tenant that is not attached"
|
||||
)))?;
|
||||
|
||||
let node = pageservers
|
||||
.get(&node_id)
|
||||
.expect("Pageservers may not be deleted while referenced");
|
||||
|
||||
result.push(TenantLocateResponseShard {
|
||||
shard_id: *tenant_shard_id,
|
||||
node_id,
|
||||
listen_http_addr: node.listen_http_addr.clone(),
|
||||
listen_http_port: node.listen_http_port,
|
||||
listen_pg_addr: node.listen_pg_addr.clone(),
|
||||
listen_pg_port: node.listen_pg_port,
|
||||
});
|
||||
|
||||
match &shard_params {
|
||||
None => {
|
||||
shard_params = Some(ShardParameters {
|
||||
stripe_size: Some(shard.shard.stripe_size),
|
||||
count: shard.shard.count,
|
||||
});
|
||||
}
|
||||
Some(params) => {
|
||||
if params.stripe_size != Some(shard.shard.stripe_size) {
|
||||
// This should never happen. We enforce at runtime because it's simpler than
|
||||
// adding an extra per-tenant data structure to store the things that should be the same
|
||||
return Err(ApiError::InternalServerError(anyhow::anyhow!(
|
||||
"Inconsistent shard stripe size parameters!"
|
||||
)));
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if result.is_empty() {
|
||||
return Err(ApiError::NotFound(
|
||||
anyhow::anyhow!("No shards for this tenant ID found").into(),
|
||||
));
|
||||
}
|
||||
let shard_params = shard_params.expect("result is non-empty, therefore this is set");
|
||||
tracing::info!(
|
||||
"Located tenant {} with params {:?} on shards {}",
|
||||
tenant_id,
|
||||
shard_params,
|
||||
result
|
||||
.iter()
|
||||
.map(|s| format!("{:?}", s))
|
||||
.collect::<Vec<_>>()
|
||||
.join(",")
|
||||
);
|
||||
|
||||
Ok(TenantLocateResponse {
|
||||
shards: result,
|
||||
shard_params,
|
||||
})
|
||||
}
|
||||
|
||||
    pub(crate) async fn tenant_shard_split(
        &self,
        tenant_id: TenantId,
        split_req: TenantShardSplitRequest,
    ) -> Result<TenantShardSplitResponse, ApiError> {
        let mut policy = None;
        let (targets, compute_hook) = {
            let mut locked = self.inner.write().unwrap();

            let pageservers = locked.nodes.clone();

            let mut targets = Vec::new();

            for (tenant_shard_id, shard) in locked
                .tenants
                .range_mut(TenantShardId::tenant_range(tenant_id))
            {
                if policy.is_none() {
                    policy = Some(shard.policy.clone());
                }

                if tenant_shard_id.shard_count == ShardCount(split_req.new_shard_count) {
                    tracing::warn!(
                        "Tenant shard {} already has shard count {}",
                        tenant_shard_id,
                        split_req.new_shard_count
                    );
                    continue;
                }

                let node_id = shard
                    .intent
                    .attached
                    .ok_or(ApiError::BadRequest(anyhow::anyhow!(
                        "Cannot split a tenant that is not attached"
                    )))?;

                let node = pageservers
                    .get(&node_id)
                    .expect("Pageservers may not be deleted while referenced");

                // TODO: if any reconciliation is currently in progress for this shard, wait for it.

                targets.push((*tenant_shard_id, node.clone()));
            }
            (targets, locked.compute_hook.clone())
        };

        let mut replacements = HashMap::new();
        for (tenant_shard_id, node) in targets {
            let client = Client::new();
            let response = client
                .request(
                    Method::PUT,
                    format!("{}/tenant/{}/shard_split", node.base_url(), tenant_shard_id),
                )
                .json(&TenantShardSplitRequest {
                    new_shard_count: split_req.new_shard_count,
                })
                .send()
                .await
                .map_err(|e| {
                    ApiError::Conflict(format!("Failed to split {}: {}", tenant_shard_id, e))
                })?;
            response.error_for_status_ref().map_err(|e| {
                ApiError::Conflict(format!("Failed to split {}: {}", tenant_shard_id, e))
            })?;
            let response: TenantShardSplitResponse = response.json().await.map_err(|e| {
                ApiError::InternalServerError(anyhow::anyhow!(
                    "Malformed response from pageserver: {}",
                    e
                ))
            })?;

            tracing::info!(
                "Split {} into {}",
                tenant_shard_id,
                response
                    .new_shards
                    .iter()
                    .map(|s| format!("{:?}", s))
                    .collect::<Vec<_>>()
                    .join(",")
            );

            replacements.insert(tenant_shard_id, response.new_shards);
        }

        // TODO: concurrency: we're dropping the state lock while issuing split API calls.
        // We should add some marker to the TenantState that causes any other change
        // to refuse until the split is complete. This will be related to a persistent
        // splitting marker that will ensure resume after crash.

        // Replace all the shards we just split with their children
        let mut response = TenantShardSplitResponse {
            new_shards: Vec::new(),
        };
        let mut child_locations = Vec::new();
        {
            let mut locked = self.inner.write().unwrap();
            for (replaced, children) in replacements.into_iter() {
                let (pageserver, generation, shard_ident, config) = {
                    let old_state = locked
                        .tenants
                        .remove(&replaced)
                        .expect("It was present, we just split it");
                    (
                        old_state.intent.attached.unwrap(),
                        old_state.generation,
                        old_state.shard,
                        old_state.config.clone(),
                    )
                };

                locked.tenants.remove(&replaced);

                for child in children {
                    let mut child_shard = shard_ident;
                    child_shard.number = child.shard_number;
                    child_shard.count = child.shard_count;

                    let mut child_observed: HashMap<NodeId, ObservedStateLocation> = HashMap::new();
                    child_observed.insert(
                        pageserver,
                        ObservedStateLocation {
                            conf: Some(attached_location_conf(generation, &child_shard, &config)),
                        },
                    );

                    let mut child_state = TenantState::new(
                        child,
                        child_shard,
                        policy
                            .clone()
                            .expect("We set this if any replacements are pushed"),
                    );
                    child_state.intent = IntentState::single(Some(pageserver));
                    child_state.observed = ObservedState {
                        locations: child_observed,
                    };
                    child_state.generation = generation;
                    child_state.config = config.clone();

                    child_locations.push((child, pageserver));

                    locked.tenants.insert(child, child_state);
                    response.new_shards.push(child);
                }
            }
        }

        for (child_id, child_ps) in child_locations {
            if let Err(e) = compute_hook.notify(child_id, child_ps).await {
                tracing::warn!("Failed to update compute of {}->{} during split, proceeding anyway to complete split ({e})",
                    child_id, child_ps);
            }
        }
        Ok(response)
    }

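The split loop above issues one PUT per parent shard against the pageserver's shard_split endpoint. A standalone sketch of the same call driven with reqwest; `base_url` and the shard id are hypothetical values, and the request/response field names mirror the code above:

// Illustrative only: ask a pageserver to split one tenant shard.
use serde_json::json;

async fn split_shard(base_url: &str, tenant_shard_id: &str, new_shard_count: u8) -> anyhow::Result<()> {
    let client = reqwest::Client::new();
    let response = client
        .put(format!("{base_url}/tenant/{tenant_shard_id}/shard_split"))
        .json(&json!({ "new_shard_count": new_shard_count }))
        .send()
        .await?
        .error_for_status()?;
    // The response lists the child shards created by the split.
    let body: serde_json::Value = response.json().await?;
    println!("new shards: {}", body["new_shards"]);
    Ok(())
}
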
    pub(crate) async fn tenant_shard_migrate(
        &self,
        tenant_shard_id: TenantShardId,
        migrate_req: TenantShardMigrateRequest,
    ) -> Result<TenantShardMigrateResponse, ApiError> {
        let waiter = {
            let mut locked = self.inner.write().unwrap();

            let result_tx = locked.result_tx.clone();
            let pageservers = locked.nodes.clone();
            let compute_hook = locked.compute_hook.clone();

            let Some(shard) = locked.tenants.get_mut(&tenant_shard_id) else {
                return Err(ApiError::NotFound(
                    anyhow::anyhow!("Tenant shard not found").into(),
                ));
            };

            if shard.intent.attached == Some(migrate_req.node_id) {
                // No-op case: we will still proceed to wait for reconciliation in case it is
                // incomplete from an earlier update to the intent.
                tracing::info!("Migrating: intent is unchanged {:?}", shard.intent);
            } else {
                let old_attached = shard.intent.attached;

                shard.intent.attached = Some(migrate_req.node_id);
                match shard.policy {
                    PlacementPolicy::Single => {
                        shard.intent.secondary.clear();
                    }
                    PlacementPolicy::Double(_n) => {
                        // If our new attached node was a secondary, it no longer should be.
                        shard.intent.secondary.retain(|s| s != &migrate_req.node_id);

                        // If we were already attached to something, demote that to a secondary
                        if let Some(old_attached) = old_attached {
                            shard.intent.secondary.push(old_attached);
                        }
                    }
                }

                tracing::info!("Migrating: new intent {:?}", shard.intent);
                shard.sequence = shard.sequence.next();
            }

            shard.maybe_reconcile(result_tx, &pageservers, &compute_hook)
        };

        if let Some(waiter) = waiter {
            waiter
                .wait_timeout(RECONCILE_TIMEOUT)
                .await
                .map_err(|e| ApiError::Timeout(format!("{}", e).into()))?;
        } else {
            tracing::warn!("Migration is a no-op");
        }

        Ok(TenantShardMigrateResponse {})
    }

    pub(crate) fn node_register(&self, register_req: NodeRegisterRequest) {
        let mut locked = self.inner.write().unwrap();

        let mut new_nodes = (*locked.nodes).clone();

        new_nodes.insert(
            register_req.node_id,
            Node {
                id: register_req.node_id,
                listen_http_addr: register_req.listen_http_addr,
                listen_http_port: register_req.listen_http_port,
                listen_pg_addr: register_req.listen_pg_addr,
                listen_pg_port: register_req.listen_pg_port,
                scheduling: NodeSchedulingPolicy::Filling,
                // TODO: we shouldn't really call this Active until we've heartbeated it.
                availability: NodeAvailability::Active,
            },
        );

        locked.nodes = Arc::new(new_nodes);

        tracing::info!(
            "Registered pageserver {}, now have {} pageservers",
            register_req.node_id,
            locked.nodes.len()
        );
    }

    pub(crate) fn node_configure(&self, config_req: NodeConfigureRequest) -> Result<(), ApiError> {
        let mut locked = self.inner.write().unwrap();
        let result_tx = locked.result_tx.clone();
        let compute_hook = locked.compute_hook.clone();

        let mut new_nodes = (*locked.nodes).clone();

        let Some(node) = new_nodes.get_mut(&config_req.node_id) else {
            return Err(ApiError::NotFound(
                anyhow::anyhow!("Node not registered").into(),
            ));
        };

        let mut offline_transition = false;
        let mut active_transition = false;

        if let Some(availability) = &config_req.availability {
            match (availability, &node.availability) {
                (NodeAvailability::Offline, NodeAvailability::Active) => {
                    tracing::info!("Node {} transition to offline", config_req.node_id);
                    offline_transition = true;
                }
                (NodeAvailability::Active, NodeAvailability::Offline) => {
                    tracing::info!("Node {} transition to active", config_req.node_id);
                    active_transition = true;
                }
                _ => {
                    tracing::info!("Node {} no change during config", config_req.node_id);
                    // No change
                }
            };
            node.availability = *availability;
        }

        if let Some(scheduling) = config_req.scheduling {
            node.scheduling = scheduling;

            // TODO: once we have a background scheduling ticker for fill/drain, kick it
            // to wake up and start working.
        }

        let new_nodes = Arc::new(new_nodes);

        let mut scheduler = Scheduler::new(&locked.tenants, &new_nodes);
        if offline_transition {
            for (tenant_shard_id, tenant_state) in &mut locked.tenants {
                if let Some(observed_loc) =
                    tenant_state.observed.locations.get_mut(&config_req.node_id)
                {
                    // When a node goes offline, we set its observed configuration to None, indicating unknown: we will
                    // not assume our knowledge of the node's configuration is accurate until it comes back online
                    observed_loc.conf = None;
                }

                if tenant_state.intent.notify_offline(config_req.node_id) {
                    tenant_state.sequence = tenant_state.sequence.next();
                    match tenant_state.schedule(&mut scheduler) {
                        Err(e) => {
                            // It is possible that some tenants will become unschedulable when too many pageservers
                            // go offline: in this case there isn't much we can do other than make the issue observable.
                            // TODO: give TenantState a scheduling error attribute to be queried later.
                            tracing::warn!(%tenant_shard_id, "Scheduling error when marking pageserver {} offline: {e}", config_req.node_id);
                        }
                        Ok(()) => {
                            tenant_state.maybe_reconcile(
                                result_tx.clone(),
                                &new_nodes,
                                &compute_hook,
                            );
                        }
                    }
                }
            }
        }

        if active_transition {
            // When a node comes back online, we must reconcile any tenant that has a None observed
            // location on the node.
            for tenant_state in locked.tenants.values_mut() {
                if let Some(observed_loc) =
                    tenant_state.observed.locations.get_mut(&config_req.node_id)
                {
                    if observed_loc.conf.is_none() {
                        tenant_state.maybe_reconcile(result_tx.clone(), &new_nodes, &compute_hook);
                    }
                }
            }

            // TODO: in the background, we should balance work back onto this pageserver
        }

        locked.nodes = new_nodes;

        Ok(())
    }
}

// TODO: use mgmt_api client
async fn timeline_create(
    tenant_shard_id: &TenantShardId,
    node: &Node,
    req: &TimelineCreateRequest,
) -> anyhow::Result<TimelineInfo> {
    let client = Client::new();
    let response = client
        .request(
            Method::POST,
            format!("{}/tenant/{}/timeline", node.base_url(), tenant_shard_id),
        )
        .json(req)
        .send()
        .await?;
    response.error_for_status_ref()?;

    Ok(response.json().await?)
}

fn ensure_attached(
    mut locked: std::sync::RwLockWriteGuard<'_, ServiceState>,
    tenant_id: TenantId,
) -> Result<Vec<ReconcilerWaiter>, anyhow::Error> {
    let mut waiters = Vec::new();
    let result_tx = locked.result_tx.clone();
    let compute_hook = locked.compute_hook.clone();
    let mut scheduler = Scheduler::new(&locked.tenants, &locked.nodes);
    let pageservers = locked.nodes.clone();

    for (_tenant_shard_id, shard) in locked
        .tenants
        .range_mut(TenantShardId::tenant_range(tenant_id))
    {
        shard.schedule(&mut scheduler)?;

        if let Some(waiter) = shard.maybe_reconcile(result_tx.clone(), &pageservers, &compute_hook)
        {
            waiters.push(waiter);
        }
    }
    Ok(waiters)
}
351
control_plane/attachment_service/src/tenant_state.rs
Normal file
@@ -0,0 +1,351 @@
use std::{collections::HashMap, sync::Arc, time::Duration};

use control_plane::attachment_service::NodeAvailability;
use pageserver_api::{
    models::{LocationConfig, TenantConfig},
    shard::{ShardIdentity, TenantShardId},
};
use tokio::task::JoinHandle;
use tokio_util::sync::CancellationToken;
use utils::{
    generation::Generation,
    id::NodeId,
    seqwait::{SeqWait, SeqWaitError},
};

use crate::{
    compute_hook::ComputeHook,
    node::Node,
    reconciler::{attached_location_conf, secondary_location_conf, ReconcileError, Reconciler},
    scheduler::{ScheduleError, Scheduler},
    PlacementPolicy, Sequence,
};

pub(crate) struct TenantState {
    pub(crate) tenant_shard_id: TenantShardId,

    pub(crate) shard: ShardIdentity,

    pub(crate) sequence: Sequence,

    // Latest generation number: next time we attach, increment this
    // and use the incremented number when attaching
    pub(crate) generation: Generation,

    // High level description of how the tenant should be set up. Provided
    // externally.
    pub(crate) policy: PlacementPolicy,

    // Low level description of exactly which pageservers should fulfil
    // which role. Generated by `Self::schedule`.
    pub(crate) intent: IntentState,

    // Low level description of how the tenant is configured on pageservers:
    // if this does not match `Self::intent` then the tenant needs reconciliation
    // with `Self::reconcile`.
    pub(crate) observed: ObservedState,

    pub(crate) config: TenantConfig,

    /// If a reconcile task is currently in flight, it may be joined here (it is
    /// only safe to join if either the result has been received or the reconciler's
    /// cancellation token has been fired)
    pub(crate) reconciler: Option<ReconcilerHandle>,

    /// Optionally wait for reconciliation to complete up to a particular
    /// sequence number.
    pub(crate) waiter: std::sync::Arc<SeqWait<Sequence, Sequence>>,
}

#[derive(Default, Clone, Debug)]
pub(crate) struct IntentState {
    pub(crate) attached: Option<NodeId>,
    pub(crate) secondary: Vec<NodeId>,
}

#[derive(Default, Clone)]
pub(crate) struct ObservedState {
    pub(crate) locations: HashMap<NodeId, ObservedStateLocation>,
}

/// Our latest knowledge of how this tenant is configured in the outside world.
///
/// Meaning:
///   * No instance of this type exists for a node: we are certain that we have nothing configured on that
///     node for this shard.
///   * Instance exists with conf==None: we *might* have some state on that node, but we don't know
///     what it is (e.g. we failed partway through configuring it)
///   * Instance exists with conf==Some: this tells us what we last successfully configured on this node,
///     and that configuration will still be present unless something external interfered.
#[derive(Clone)]
pub(crate) struct ObservedStateLocation {
    /// If None, it means we do not know the status of this shard's location on this node, but
    /// we know that we might have some state on this node.
    pub(crate) conf: Option<LocationConfig>,
}
pub(crate) struct ReconcilerWaiter {
    // For observability purposes, remember the ID of the shard we're
    // waiting for.
    pub(crate) tenant_shard_id: TenantShardId,

    seq_wait: std::sync::Arc<SeqWait<Sequence, Sequence>>,
    seq: Sequence,
}

impl ReconcilerWaiter {
    pub(crate) async fn wait_timeout(&self, timeout: Duration) -> Result<(), SeqWaitError> {
        self.seq_wait.wait_for_timeout(self.seq, timeout).await
    }
}

/// Having spawned a reconciler task, the tenant shard's state will carry enough
/// information to optionally cancel & await it later.
pub(crate) struct ReconcilerHandle {
    sequence: Sequence,
    handle: JoinHandle<()>,
    cancel: CancellationToken,
}

/// When a reconcile task completes, it sends this result object
/// to be applied to the primary TenantState.
pub(crate) struct ReconcileResult {
    pub(crate) sequence: Sequence,
    /// On errors, `observed` should be treated as an incomplete description
    /// of state (i.e. any nodes present in the result should override nodes
    /// present in the parent tenant state, but any unmentioned nodes should
    /// not be removed from parent tenant state)
    pub(crate) result: Result<(), ReconcileError>,

    pub(crate) tenant_shard_id: TenantShardId,
    pub(crate) generation: Generation,
    pub(crate) observed: ObservedState,
}

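The doc comment on ObservedStateLocation describes three levels of knowledge about a node. A small illustrative helper that spells the same classification out; the types here are placeholders standing in for the crate's own:

// Illustrative only: the three-valued "what do we know about this node" semantics.
use std::collections::HashMap;

#[derive(Clone)]
struct LocationConfig; // placeholder for pageserver_api::models::LocationConfig

struct ObservedStateLocation {
    conf: Option<LocationConfig>,
}

enum NodeKnowledge {
    NothingConfigured,     // no entry for the node at all
    Unknown,               // entry with conf == None: we might have state there
    Known(LocationConfig), // entry with conf == Some: last successfully applied config
}

fn classify(locations: &HashMap<u64, ObservedStateLocation>, node_id: u64) -> NodeKnowledge {
    match locations.get(&node_id) {
        None => NodeKnowledge::NothingConfigured,
        Some(loc) => match &loc.conf {
            None => NodeKnowledge::Unknown,
            Some(conf) => NodeKnowledge::Known(conf.clone()),
        },
    }
}
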
impl IntentState {
    pub(crate) fn all_pageservers(&self) -> Vec<NodeId> {
        let mut result = Vec::new();
        if let Some(p) = self.attached {
            result.push(p)
        }

        result.extend(self.secondary.iter().copied());

        result
    }

    pub(crate) fn single(node_id: Option<NodeId>) -> Self {
        Self {
            attached: node_id,
            secondary: vec![],
        }
    }

    /// When a node goes offline, we update intents to avoid using it
    /// as their attached pageserver.
    ///
    /// Returns true if a change was made
    pub(crate) fn notify_offline(&mut self, node_id: NodeId) -> bool {
        if self.attached == Some(node_id) {
            self.attached = None;
            self.secondary.push(node_id);
            true
        } else {
            false
        }
    }
}

impl TenantState {
    pub(crate) fn new(
        tenant_shard_id: TenantShardId,
        shard: ShardIdentity,
        policy: PlacementPolicy,
    ) -> Self {
        Self {
            tenant_shard_id,
            policy,
            intent: IntentState::default(),
            generation: Generation::new(0),
            shard,
            observed: ObservedState::default(),
            config: TenantConfig::default(),
            reconciler: None,
            sequence: Sequence(1),
            waiter: Arc::new(SeqWait::new(Sequence(0))),
        }
    }

    pub(crate) fn schedule(&mut self, scheduler: &mut Scheduler) -> Result<(), ScheduleError> {
        // TODO: before scheduling new nodes, check if any existing content in
        // self.intent refers to pageservers that are offline, and pick other
        // pageservers if so.

        // Build the set of pageservers already in use by this tenant, to avoid scheduling
        // more work on the same pageservers we're already using.
        let mut used_pageservers = self.intent.all_pageservers();
        let mut modified = false;

        use PlacementPolicy::*;
        match self.policy {
            Single => {
                // Should have exactly one attached, and zero secondaries
                if self.intent.attached.is_none() {
                    let node_id = scheduler.schedule_shard(&used_pageservers)?;
                    self.intent.attached = Some(node_id);
                    used_pageservers.push(node_id);
                    modified = true;
                }
                if !self.intent.secondary.is_empty() {
                    self.intent.secondary.clear();
                    modified = true;
                }
            }
            Double(secondary_count) => {
                // Should have exactly one attached, and N secondaries
                if self.intent.attached.is_none() {
                    let node_id = scheduler.schedule_shard(&used_pageservers)?;
                    self.intent.attached = Some(node_id);
                    used_pageservers.push(node_id);
                    modified = true;
                }

                while self.intent.secondary.len() < secondary_count {
                    let node_id = scheduler.schedule_shard(&used_pageservers)?;
                    self.intent.secondary.push(node_id);
                    used_pageservers.push(node_id);
                    modified = true;
                }
            }
        }

        if modified {
            self.sequence.0 += 1;
        }

        Ok(())
    }

    fn dirty(&self) -> bool {
        if let Some(node_id) = self.intent.attached {
            let wanted_conf = attached_location_conf(self.generation, &self.shard, &self.config);
            match self.observed.locations.get(&node_id) {
                Some(conf) if conf.conf.as_ref() == Some(&wanted_conf) => {}
                Some(_) | None => {
                    return true;
                }
            }
        }

        for node_id in &self.intent.secondary {
            let wanted_conf = secondary_location_conf(&self.shard, &self.config);
            match self.observed.locations.get(node_id) {
                Some(conf) if conf.conf.as_ref() == Some(&wanted_conf) => {}
                Some(_) | None => {
                    return true;
                }
            }
        }

        false
    }

    pub(crate) fn maybe_reconcile(
        &mut self,
        result_tx: tokio::sync::mpsc::UnboundedSender<ReconcileResult>,
        pageservers: &Arc<HashMap<NodeId, Node>>,
        compute_hook: &Arc<ComputeHook>,
    ) -> Option<ReconcilerWaiter> {
        // If there are any ambiguous observed states, and the nodes they refer to are available,
        // we should reconcile to clean them up.
        let mut dirty_observed = false;
        for (node_id, observed_loc) in &self.observed.locations {
            let node = pageservers
                .get(node_id)
                .expect("Nodes may not be removed while referenced");
            if observed_loc.conf.is_none()
                && !matches!(node.availability, NodeAvailability::Offline)
            {
                dirty_observed = true;
                break;
            }
        }

        if !self.dirty() && !dirty_observed {
            tracing::info!("Not dirty, no reconciliation needed.");
            return None;
        }

        // Reconcile already in flight for the current sequence?
        if let Some(handle) = &self.reconciler {
            if handle.sequence == self.sequence {
                return Some(ReconcilerWaiter {
                    tenant_shard_id: self.tenant_shard_id,
                    seq_wait: self.waiter.clone(),
                    seq: self.sequence,
                });
            }
        }

        // Reconcile in flight for a stale sequence? Our sequence's task will wait for it before
        // doing our sequence's work.
        let old_handle = self.reconciler.take();

        let cancel = CancellationToken::new();
        let mut reconciler = Reconciler {
            tenant_shard_id: self.tenant_shard_id,
            shard: self.shard,
            generation: self.generation,
            intent: self.intent.clone(),
            config: self.config.clone(),
            observed: self.observed.clone(),
            pageservers: pageservers.clone(),
            compute_hook: compute_hook.clone(),
            cancel: cancel.clone(),
        };

        let reconcile_seq = self.sequence;

        tracing::info!("Spawning Reconciler for sequence {}", self.sequence);
        let join_handle = tokio::task::spawn(async move {
            // Wait for any previous reconcile task to complete before we start
            if let Some(old_handle) = old_handle {
                old_handle.cancel.cancel();
                if let Err(e) = old_handle.handle.await {
                    // We can't do much with this other than log it: the task is done, so
                    // we may proceed with our work.
                    tracing::error!("Unexpected join error waiting for reconcile task: {e}");
                }
            }

            // Early check for cancellation before doing any work
            // TODO: wrap all remote API operations in cancellation check
            // as well.
            if reconciler.cancel.is_cancelled() {
                return;
            }

            let result = reconciler.reconcile().await;
            result_tx
                .send(ReconcileResult {
                    sequence: reconcile_seq,
                    result,
                    tenant_shard_id: reconciler.tenant_shard_id,
                    generation: reconciler.generation,
                    observed: reconciler.observed,
                })
                .ok();
        });

        self.reconciler = Some(ReconcilerHandle {
            sequence: self.sequence,
            handle: join_handle,
            cancel,
        });

        Some(ReconcilerWaiter {
            tenant_shard_id: self.tenant_shard_id,
            seq_wait: self.waiter.clone(),
            seq: self.sequence,
        })
    }
}
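maybe_reconcile hands back a ReconcilerWaiter tied to the shard's current sequence number, and callers such as tenant_shard_migrate wait on it with a timeout. A rough standalone analogue of that sequence/waiter pattern, modelled here with tokio's watch channel rather than the crate's SeqWait type:

// Illustrative only: wait until an observed sequence number reaches a target, with a timeout.
use std::time::Duration;
use tokio::sync::watch;

#[tokio::main]
async fn main() {
    // The "current sequence" advances every time a reconcile pass completes.
    let (tx, mut rx) = watch::channel(0u64);

    // A waiter for sequence 2: resolves once the observed value reaches it.
    let waiter = tokio::spawn(async move {
        tokio::time::timeout(Duration::from_secs(5), async {
            while *rx.borrow_and_update() < 2 {
                rx.changed().await.expect("sender dropped");
            }
        })
        .await
    });

    // Simulate two reconcile passes completing.
    tx.send(1).unwrap();
    tx.send(2).unwrap();

    waiter.await.unwrap().expect("timed out waiting for sequence");
}
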
@@ -1,8 +1,18 @@
use crate::{background_process, local_env::LocalEnv};
use anyhow::anyhow;
use camino::Utf8PathBuf;
use serde::{Deserialize, Serialize};
use std::{path::PathBuf, process::Child};
use hyper::{Method, StatusCode};
use pageserver_api::{
    models::{
        ShardParameters, TenantCreateRequest, TenantShardSplitRequest, TenantShardSplitResponse,
        TimelineCreateRequest, TimelineInfo,
    },
    shard::TenantShardId,
};
use postgres_connection::parse_host_port;
use serde::{de::DeserializeOwned, Deserialize, Serialize};
use std::{path::PathBuf, process::Child, str::FromStr};
use tracing::instrument;
use utils::id::{NodeId, TenantId};

pub struct AttachmentService {
@@ -16,7 +26,7 @@ const COMMAND: &str = "attachment_service";

#[derive(Serialize, Deserialize)]
pub struct AttachHookRequest {
    pub tenant_id: TenantId,
    pub tenant_shard_id: TenantShardId,
    pub node_id: Option<NodeId>,
}

@@ -27,7 +37,7 @@ pub struct AttachHookResponse {

#[derive(Serialize, Deserialize)]
pub struct InspectRequest {
    pub tenant_id: TenantId,
    pub tenant_shard_id: TenantShardId,
}

#[derive(Serialize, Deserialize)]
@@ -35,6 +45,117 @@ pub struct InspectResponse {
    pub attachment: Option<(u32, NodeId)>,
}

#[derive(Serialize, Deserialize)]
pub struct TenantCreateResponseShard {
    pub node_id: NodeId,
    pub generation: u32,
}

#[derive(Serialize, Deserialize)]
pub struct TenantCreateResponse {
    pub shards: Vec<TenantCreateResponseShard>,
}

#[derive(Serialize, Deserialize)]
pub struct NodeRegisterRequest {
    pub node_id: NodeId,

    pub listen_pg_addr: String,
    pub listen_pg_port: u16,

    pub listen_http_addr: String,
    pub listen_http_port: u16,
}

#[derive(Serialize, Deserialize)]
pub struct NodeConfigureRequest {
    pub node_id: NodeId,

    pub availability: Option<NodeAvailability>,
    pub scheduling: Option<NodeSchedulingPolicy>,
}

#[derive(Serialize, Deserialize, Debug)]
pub struct TenantLocateResponseShard {
    pub shard_id: TenantShardId,
    pub node_id: NodeId,

    pub listen_pg_addr: String,
    pub listen_pg_port: u16,

    pub listen_http_addr: String,
    pub listen_http_port: u16,
}

#[derive(Serialize, Deserialize)]
pub struct TenantLocateResponse {
    pub shards: Vec<TenantLocateResponseShard>,
    pub shard_params: ShardParameters,
}

/// Explicitly migrating a particular shard is a low level operation
/// TODO: higher level "Reschedule tenant" operation where the request
/// specifies some constraints, e.g. asking it to get off particular node(s)
#[derive(Serialize, Deserialize, Debug)]
pub struct TenantShardMigrateRequest {
    pub tenant_shard_id: TenantShardId,
    pub node_id: NodeId,
}

#[derive(Serialize, Deserialize, Clone, Copy)]
pub enum NodeAvailability {
    // Normal, happy state
    Active,
    // Offline: Tenants shouldn't try to attach here, but they may assume that their
    // secondary locations on this node still exist. Newly added nodes are in this
    // state until we successfully contact them.
    Offline,
}

impl FromStr for NodeAvailability {
    type Err = anyhow::Error;

    fn from_str(s: &str) -> Result<Self, Self::Err> {
        match s {
            "active" => Ok(Self::Active),
            "offline" => Ok(Self::Offline),
            _ => Err(anyhow::anyhow!("Unknown availability state '{s}'")),
        }
    }
}

#[derive(Serialize, Deserialize, Clone, Copy)]
pub enum NodeSchedulingPolicy {
    // Normal, happy state
    Active,

    // A newly added node: gradually move some work here.
    Filling,

    // Do not schedule new work here, but leave configured locations in place.
    Pause,

    // Do not schedule work here. Gracefully move work away, as resources allow.
    Draining,
}

impl FromStr for NodeSchedulingPolicy {
    type Err = anyhow::Error;

    fn from_str(s: &str) -> Result<Self, Self::Err> {
        match s {
            "active" => Ok(Self::Active),
            "filling" => Ok(Self::Filling),
            "pause" => Ok(Self::Pause),
            "draining" => Ok(Self::Draining),
            _ => Err(anyhow::anyhow!("Unknown scheduling state '{s}'")),
        }
    }
}

#[derive(Serialize, Deserialize, Debug)]
pub struct TenantShardMigrateResponse {}

impl AttachmentService {
    pub fn from_env(env: &LocalEnv) -> Self {
        let path = env.base_data_dir.join("attachments.json");
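The FromStr implementations above let CLI flags be parsed straight into the node-configuration enums. A sketch of how a caller might assemble a NodeConfigureRequest from optional string flags, assuming the types above are in scope; the helper itself is hypothetical:

// Illustrative only: turn optional "--availability"/"--scheduling" strings into a request.
use std::str::FromStr;

fn build_config_request(
    node_id: NodeId,
    availability: Option<&str>,
    scheduling: Option<&str>,
) -> anyhow::Result<NodeConfigureRequest> {
    Ok(NodeConfigureRequest {
        node_id,
        availability: availability.map(NodeAvailability::from_str).transpose()?,
        scheduling: scheduling.map(NodeSchedulingPolicy::from_str).transpose()?,
    })
}
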
@@ -67,31 +188,87 @@ impl AttachmentService {
    pub async fn start(&self) -> anyhow::Result<Child> {
        let path_str = self.path.to_string_lossy();

        background_process::start_process(
        let result = background_process::start_process(
            COMMAND,
            &self.env.base_data_dir,
            &self.env.attachment_service_bin(),
            ["-l", &self.listen, "-p", &path_str],
            [],
            background_process::InitialPidFile::Create(self.pid_file()),
            // TODO: a real status check
            || async move { anyhow::Ok(true) },
            || async {
                match self.status().await {
                    Ok(_) => Ok(true),
                    Err(_) => Ok(false),
                }
            },
        )
        .await
        .await;

        for ps_conf in &self.env.pageservers {
            let (pg_host, pg_port) =
                parse_host_port(&ps_conf.listen_pg_addr).expect("Unable to parse listen_pg_addr");
            let (http_host, http_port) = parse_host_port(&ps_conf.listen_http_addr)
                .expect("Unable to parse listen_http_addr");
            self.node_register(NodeRegisterRequest {
                node_id: ps_conf.id,
                listen_pg_addr: pg_host.to_string(),
                listen_pg_port: pg_port.unwrap_or(5432),
                listen_http_addr: http_host.to_string(),
                listen_http_port: http_port.unwrap_or(80),
            })
            .await?;
        }

        result
    }

    pub fn stop(&self, immediate: bool) -> anyhow::Result<()> {
        background_process::stop_process(immediate, COMMAND, &self.pid_file())
    }

    /// Simple HTTP request wrapper for calling into attachment service
    async fn dispatch<RQ, RS>(
        &self,
        method: hyper::Method,
        path: String,
        body: Option<RQ>,
    ) -> anyhow::Result<RS>
    where
        RQ: Serialize + Sized,
        RS: DeserializeOwned + Sized,
    {
        let url = self
            .env
            .control_plane_api
            .clone()
            .unwrap()
            .join(&path)
            .unwrap();

        let mut builder = self.client.request(method, url);
        if let Some(body) = body {
            builder = builder.json(&body)
        }

        let response = builder.send().await?;
        if response.status() != StatusCode::OK {
            return Err(anyhow!(
                "Unexpected status {} on {}",
                response.status(),
                path
            ));
        }

        Ok(response.json().await?)
    }

    /// Call into the attach_hook API, for use before handing out attachments to pageservers
    #[instrument(skip(self))]
    pub async fn attach_hook(
        &self,
        tenant_id: TenantId,
        tenant_shard_id: TenantShardId,
        pageserver_id: NodeId,
    ) -> anyhow::Result<Option<u32>> {
        use hyper::StatusCode;

        let url = self
            .env
            .control_plane_api
@@ -101,7 +278,7 @@ impl AttachmentService {
            .unwrap();

        let request = AttachHookRequest {
            tenant_id,
            tenant_shard_id,
            node_id: Some(pageserver_id),
        };

@@ -114,9 +291,11 @@ impl AttachmentService {
        Ok(response.gen)
    }

    pub async fn inspect(&self, tenant_id: TenantId) -> anyhow::Result<Option<(u32, NodeId)>> {
        use hyper::StatusCode;

    #[instrument(skip(self))]
    pub async fn inspect(
        &self,
        tenant_shard_id: TenantShardId,
    ) -> anyhow::Result<Option<(u32, NodeId)>> {
        let url = self
            .env
            .control_plane_api
@@ -125,7 +304,7 @@ impl AttachmentService {
            .join("inspect")
            .unwrap();

        let request = InspectRequest { tenant_id };
        let request = InspectRequest { tenant_shard_id };

        let response = self.client.post(url).json(&request).send().await?;
        if response.status() != StatusCode::OK {
@@ -135,4 +314,86 @@ impl AttachmentService {
        let response = response.json::<InspectResponse>().await?;
        Ok(response.attachment)
    }

    #[instrument(skip(self))]
    pub async fn tenant_create(
        &self,
        req: TenantCreateRequest,
    ) -> anyhow::Result<TenantCreateResponse> {
        self.dispatch(Method::POST, "tenant".to_string(), Some(req))
            .await
    }

    #[instrument(skip(self))]
    pub async fn tenant_locate(&self, tenant_id: TenantId) -> anyhow::Result<TenantLocateResponse> {
        self.dispatch::<(), _>(Method::GET, format!("tenant/{tenant_id}/locate"), None)
            .await
    }

    #[instrument(skip(self))]
    pub async fn tenant_migrate(
        &self,
        tenant_shard_id: TenantShardId,
        node_id: NodeId,
    ) -> anyhow::Result<TenantShardMigrateResponse> {
        self.dispatch(
            Method::PUT,
            format!("tenant/{tenant_shard_id}/migrate"),
            Some(TenantShardMigrateRequest {
                tenant_shard_id,
                node_id,
            }),
        )
        .await
    }

    #[instrument(skip(self), fields(%tenant_id, %new_shard_count))]
    pub async fn tenant_split(
        &self,
        tenant_id: TenantId,
        new_shard_count: u8,
    ) -> anyhow::Result<TenantShardSplitResponse> {
        self.dispatch(
            Method::PUT,
            format!("tenant/{tenant_id}/shard_split"),
            Some(TenantShardSplitRequest { new_shard_count }),
        )
        .await
    }

    #[instrument(skip_all, fields(node_id=%req.node_id))]
    pub async fn node_register(&self, req: NodeRegisterRequest) -> anyhow::Result<()> {
        self.dispatch::<_, ()>(Method::POST, "node".to_string(), Some(req))
            .await
    }

    #[instrument(skip_all, fields(node_id=%req.node_id))]
    pub async fn node_configure(&self, req: NodeConfigureRequest) -> anyhow::Result<()> {
        self.dispatch::<_, ()>(
            Method::PUT,
            format!("node/{}/config", req.node_id),
            Some(req),
        )
        .await
    }

    #[instrument(skip(self))]
    pub async fn status(&self) -> anyhow::Result<()> {
        self.dispatch::<(), ()>(Method::GET, "status".to_string(), None)
            .await
    }

    #[instrument(skip_all, fields(%tenant_id, timeline_id=%req.new_timeline_id))]
    pub async fn tenant_timeline_create(
        &self,
        tenant_id: TenantId,
        req: TimelineCreateRequest,
    ) -> anyhow::Result<TimelineInfo> {
        self.dispatch(
            Method::POST,
            format!("tenant/{tenant_id}/timeline"),
            Some(req),
        )
        .await
    }
}

@@ -1,337 +0,0 @@
/// The attachment service mimics the aspects of the control plane API
/// that are required for a pageserver to operate.
///
/// This enables running & testing pageservers without a full-blown
/// deployment of the Neon cloud platform.
///
use anyhow::anyhow;
use clap::Parser;
use hex::FromHex;
use hyper::StatusCode;
use hyper::{Body, Request, Response};
use pageserver_api::shard::TenantShardId;
use serde::{Deserialize, Serialize};
use std::path::{Path, PathBuf};
use std::{collections::HashMap, sync::Arc};
use utils::http::endpoint::request_span;
use utils::logging::{self, LogFormat};
use utils::signals::{ShutdownSignals, Signal};

use utils::{
    http::{
        endpoint::{self},
        error::ApiError,
        json::{json_request, json_response},
        RequestExt, RouterBuilder,
    },
    id::{NodeId, TenantId},
    tcp_listener,
};

use pageserver_api::control_api::{
    ReAttachRequest, ReAttachResponse, ReAttachResponseTenant, ValidateRequest, ValidateResponse,
    ValidateResponseTenant,
};

use control_plane::attachment_service::{
    AttachHookRequest, AttachHookResponse, InspectRequest, InspectResponse,
};

#[derive(Parser)]
#[command(author, version, about, long_about = None)]
#[command(arg_required_else_help(true))]
struct Cli {
    /// Host and port to listen on, like `127.0.0.1:1234`
    #[arg(short, long)]
    listen: std::net::SocketAddr,

    /// Path to the .json file to store state (will be created if it doesn't exist)
    #[arg(short, long)]
    path: PathBuf,
}

// The persistent state of each Tenant
#[derive(Serialize, Deserialize, Clone)]
struct TenantState {
    // Currently attached pageserver
    pageserver: Option<NodeId>,

    // Latest generation number: next time we attach, increment this
    // and use the incremented number when attaching
    generation: u32,
}

fn to_hex_map<S, V>(input: &HashMap<TenantId, V>, serializer: S) -> Result<S::Ok, S::Error>
where
    S: serde::Serializer,
    V: Clone + Serialize,
{
    let transformed = input.iter().map(|(k, v)| (hex::encode(k), v.clone()));

    transformed
        .collect::<HashMap<String, V>>()
        .serialize(serializer)
}

fn from_hex_map<'de, D, V>(deserializer: D) -> Result<HashMap<TenantId, V>, D::Error>
where
    D: serde::de::Deserializer<'de>,
    V: Deserialize<'de>,
{
    let hex_map = HashMap::<String, V>::deserialize(deserializer)?;
    hex_map
        .into_iter()
        .map(|(k, v)| {
            TenantId::from_hex(k)
                .map(|k| (k, v))
                .map_err(serde::de::Error::custom)
        })
        .collect()
}

// Top level state available to all HTTP handlers
#[derive(Serialize, Deserialize)]
struct PersistentState {
    #[serde(serialize_with = "to_hex_map", deserialize_with = "from_hex_map")]
    tenants: HashMap<TenantId, TenantState>,

    #[serde(skip)]
    path: PathBuf,
}

impl PersistentState {
    async fn save(&self) -> anyhow::Result<()> {
        let bytes = serde_json::to_vec(self)?;
        tokio::fs::write(&self.path, &bytes).await?;

        Ok(())
    }

    async fn load(path: &Path) -> anyhow::Result<Self> {
        let bytes = tokio::fs::read(path).await?;
        let mut decoded = serde_json::from_slice::<Self>(&bytes)?;
        decoded.path = path.to_owned();
        Ok(decoded)
    }

    async fn load_or_new(path: &Path) -> Self {
        match Self::load(path).await {
            Ok(s) => {
                tracing::info!("Loaded state file at {}", path.display());
                s
            }
            Err(e)
                if e.downcast_ref::<std::io::Error>()
                    .map(|e| e.kind() == std::io::ErrorKind::NotFound)
                    .unwrap_or(false) =>
            {
                tracing::info!("Will create state file at {}", path.display());
                Self {
                    tenants: HashMap::new(),
                    path: path.to_owned(),
                }
            }
            Err(e) => {
                panic!("Failed to load state from '{}': {e:#} (maybe your .neon/ dir was written by an older version?)", path.display())
            }
        }
    }
}

/// State available to HTTP request handlers
#[derive(Clone)]
struct State {
    inner: Arc<tokio::sync::RwLock<PersistentState>>,
}

impl State {
    fn new(persistent_state: PersistentState) -> State {
        Self {
            inner: Arc::new(tokio::sync::RwLock::new(persistent_state)),
        }
    }
}

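to_hex_map and from_hex_map above keep the (now removed) on-disk state readable by storing tenant IDs as hex strings. A sketch of the resulting JSON layout using serde_json; the tenant id shown is a made-up hex string and the exact field encoding is an assumption:

// Illustrative only: the shape PersistentState serializes to (note `path` is #[serde(skip)]).
use serde_json::json;

fn example_state_json() -> serde_json::Value {
    json!({
        "tenants": {
            // keys are hex-encoded TenantIds, values are TenantState entries
            "0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f": {
                "pageserver": 1,
                "generation": 5
            }
        }
    })
}
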
#[inline(always)]
fn get_state(request: &Request<Body>) -> &State {
    request
        .data::<Arc<State>>()
        .expect("unknown state type")
        .as_ref()
}

/// Pageserver calls into this on startup, to learn which tenants it should attach
async fn handle_re_attach(mut req: Request<Body>) -> Result<Response<Body>, ApiError> {
    let reattach_req = json_request::<ReAttachRequest>(&mut req).await?;

    let state = get_state(&req).inner.clone();
    let mut locked = state.write().await;

    let mut response = ReAttachResponse {
        tenants: Vec::new(),
    };
    for (t, state) in &mut locked.tenants {
        if state.pageserver == Some(reattach_req.node_id) {
            state.generation += 1;
            response.tenants.push(ReAttachResponseTenant {
                // TODO(sharding): make this shard-aware
                id: TenantShardId::unsharded(*t),
                gen: state.generation,
            });
        }
    }

    locked.save().await.map_err(ApiError::InternalServerError)?;

    json_response(StatusCode::OK, response)
}

/// Pageserver calls into this before doing deletions, to confirm that it still
/// holds the latest generation for the tenants with deletions enqueued
async fn handle_validate(mut req: Request<Body>) -> Result<Response<Body>, ApiError> {
    let validate_req = json_request::<ValidateRequest>(&mut req).await?;

    let locked = get_state(&req).inner.read().await;

    let mut response = ValidateResponse {
        tenants: Vec::new(),
    };

    for req_tenant in validate_req.tenants {
        // TODO(sharding): make this shard-aware
        if let Some(tenant_state) = locked.tenants.get(&req_tenant.id.tenant_id) {
            let valid = tenant_state.generation == req_tenant.gen;
            tracing::info!(
                "handle_validate: {}(gen {}): valid={valid} (latest {})",
                req_tenant.id,
                req_tenant.gen,
                tenant_state.generation
            );
            response.tenants.push(ValidateResponseTenant {
                id: req_tenant.id,
                valid,
            });
        }
    }

    json_response(StatusCode::OK, response)
}
/// Call into this before attaching a tenant to a pageserver, to acquire a generation number
/// (in the real control plane this is unnecessary, because the same program is managing
/// generation numbers and doing attachments).
async fn handle_attach_hook(mut req: Request<Body>) -> Result<Response<Body>, ApiError> {
    let attach_req = json_request::<AttachHookRequest>(&mut req).await?;

    let state = get_state(&req).inner.clone();
    let mut locked = state.write().await;

    let tenant_state = locked
        .tenants
        .entry(attach_req.tenant_id)
        .or_insert_with(|| TenantState {
            pageserver: attach_req.node_id,
            generation: 0,
        });

    if let Some(attaching_pageserver) = attach_req.node_id.as_ref() {
        tenant_state.generation += 1;
        tracing::info!(
            tenant_id = %attach_req.tenant_id,
            ps_id = %attaching_pageserver,
            generation = %tenant_state.generation,
            "issuing",
        );
    } else if let Some(ps_id) = tenant_state.pageserver {
        tracing::info!(
            tenant_id = %attach_req.tenant_id,
            %ps_id,
            generation = %tenant_state.generation,
            "dropping",
        );
    } else {
        tracing::info!(
            tenant_id = %attach_req.tenant_id,
            "no-op: tenant already has no pageserver");
    }
    tenant_state.pageserver = attach_req.node_id;
    let generation = tenant_state.generation;

    tracing::info!(
        "handle_attach_hook: tenant {} set generation {}, pageserver {}",
        attach_req.tenant_id,
        tenant_state.generation,
        attach_req.node_id.unwrap_or(utils::id::NodeId(0xfffffff))
    );

    locked.save().await.map_err(ApiError::InternalServerError)?;

    json_response(
        StatusCode::OK,
        AttachHookResponse {
            gen: attach_req.node_id.map(|_| generation),
        },
    )
}

async fn handle_inspect(mut req: Request<Body>) -> Result<Response<Body>, ApiError> {
    let inspect_req = json_request::<InspectRequest>(&mut req).await?;

    let state = get_state(&req).inner.clone();
    let locked = state.write().await;
    let tenant_state = locked.tenants.get(&inspect_req.tenant_id);

    json_response(
        StatusCode::OK,
        InspectResponse {
            attachment: tenant_state.and_then(|s| s.pageserver.map(|ps| (s.generation, ps))),
        },
    )
}

fn make_router(persistent_state: PersistentState) -> RouterBuilder<hyper::Body, ApiError> {
    endpoint::make_router()
        .data(Arc::new(State::new(persistent_state)))
        .post("/re-attach", |r| request_span(r, handle_re_attach))
        .post("/validate", |r| request_span(r, handle_validate))
        .post("/attach-hook", |r| request_span(r, handle_attach_hook))
        .post("/inspect", |r| request_span(r, handle_inspect))
}

#[tokio::main]
async fn main() -> anyhow::Result<()> {
    logging::init(
        LogFormat::Plain,
        logging::TracingErrorLayerEnablement::Disabled,
        logging::Output::Stdout,
    )?;

    let args = Cli::parse();
    tracing::info!(
        "Starting, state at {}, listening on {}",
        args.path.to_string_lossy(),
        args.listen
    );

    let persistent_state = PersistentState::load_or_new(&args.path).await;

    let http_listener = tcp_listener::bind(args.listen)?;
    let router = make_router(persistent_state)
        .build()
        .map_err(|err| anyhow!(err))?;
    let service = utils::http::RouterService::new(router).unwrap();
    let server = hyper::Server::from_tcp(http_listener)?.serve(service);

    tracing::info!("Serving on {0}", args.listen);

    tokio::task::spawn(server);

    ShutdownSignals::handle(|signal| match signal {
        Signal::Interrupt | Signal::Terminate | Signal::Quit => {
            tracing::info!("Got {}. Terminating", signal.name());
            // We're just a test helper: no graceful shutdown.
            std::process::exit(0);
        }
    })?;

    Ok(())
}
@@ -8,14 +8,18 @@
use anyhow::{anyhow, bail, Context, Result};
use clap::{value_parser, Arg, ArgAction, ArgMatches, Command};
use compute_api::spec::ComputeMode;
use control_plane::attachment_service::AttachmentService;
use control_plane::attachment_service::{
    AttachmentService, NodeAvailability, NodeConfigureRequest, NodeSchedulingPolicy,
};
use control_plane::endpoint::ComputeControlPlane;
use control_plane::local_env::LocalEnv;
use control_plane::pageserver::{PageServerNode, PAGESERVER_REMOTE_STORAGE_DIR};
use control_plane::safekeeper::SafekeeperNode;
use control_plane::tenant_migration::migrate_tenant;
use control_plane::{broker, local_env};
use pageserver_api::models::TimelineInfo;
use pageserver_api::models::{
    ShardParameters, TenantCreateRequest, TimelineCreateRequest, TimelineInfo,
};
use pageserver_api::shard::{ShardCount, ShardStripeSize, TenantShardId};
use pageserver_api::{
    DEFAULT_HTTP_LISTEN_PORT as DEFAULT_PAGESERVER_HTTP_PORT,
    DEFAULT_PG_LISTEN_PORT as DEFAULT_PAGESERVER_PG_PORT,
@@ -30,6 +34,7 @@ use std::path::PathBuf;
use std::process::exit;
use std::str::FromStr;
use storage_broker::DEFAULT_LISTEN_ADDR as DEFAULT_BROKER_ADDR;
use url::Host;
use utils::{
    auth::{Claims, Scope},
    id::{NodeId, TenantId, TenantTimelineId, TimelineId},
@@ -276,10 +281,10 @@
/// Connects to the pageserver to query this information.
async fn get_timeline_infos(
    env: &local_env::LocalEnv,
    tenant_id: &TenantId,
    tenant_shard_id: &TenantShardId,
) -> Result<HashMap<TimelineId, TimelineInfo>> {
    Ok(get_default_pageserver(env)
        .timeline_list(tenant_id)
        .timeline_list(tenant_shard_id)
        .await?
        .into_iter()
        .map(|timeline_info| (timeline_info.timeline_id, timeline_info))
@@ -297,6 +302,20 @@ fn get_tenant_id(sub_match: &ArgMatches, env: &local_env::LocalEnv) -> anyhow::R
    }
}

// Helper function to parse the --tenant-id option, for commands that accept a shard suffix
fn get_tenant_shard_id(
    sub_match: &ArgMatches,
    env: &local_env::LocalEnv,
) -> anyhow::Result<TenantShardId> {
    if let Some(tenant_id_from_arguments) = parse_tenant_shard_id(sub_match).transpose() {
        tenant_id_from_arguments
    } else if let Some(default_id) = env.default_tenant_id {
        Ok(TenantShardId::unsharded(default_id))
    } else {
        anyhow::bail!("No tenant shard id. Use --tenant-id, or set a default tenant");
    }
}

fn parse_tenant_id(sub_match: &ArgMatches) -> anyhow::Result<Option<TenantId>> {
    sub_match
        .get_one::<String>("tenant-id")
@@ -305,6 +324,14 @@ fn parse_tenant_id(sub_match: &ArgMatches) -> anyhow::Result<Option<TenantId>> {
        .context("Failed to parse tenant id from the argument string")
}

fn parse_tenant_shard_id(sub_match: &ArgMatches) -> anyhow::Result<Option<TenantShardId>> {
    sub_match
        .get_one::<String>("tenant-id")
        .map(|id_str| TenantShardId::from_str(id_str))
        .transpose()
        .context("Failed to parse tenant shard id from the argument string")
}

fn parse_timeline_id(sub_match: &ArgMatches) -> anyhow::Result<Option<TimelineId>> {
    sub_match
        .get_one::<String>("timeline-id")
@@ -393,47 +420,66 @@ async fn handle_tenant(
        Some(("create", create_match)) => {
            let tenant_conf: HashMap<_, _> = create_match
                .get_many::<String>("config")
                .map(|vals| vals.flat_map(|c| c.split_once(':')).collect())
                .map(|vals: clap::parser::ValuesRef<'_, String>| {
                    vals.flat_map(|c| c.split_once(':')).collect()
                })
                .unwrap_or_default();

            let shard_count: u8 = create_match
                .get_one::<u8>("shard-count")
                .cloned()
                .unwrap_or(0);

            let shard_stripe_size: Option<u32> =
                create_match.get_one::<u32>("shard-stripe-size").cloned();

            let tenant_conf = PageServerNode::parse_config(tenant_conf)?;

            // If tenant ID was not specified, generate one
            let tenant_id = parse_tenant_id(create_match)?.unwrap_or_else(TenantId::generate);

            let generation = if env.control_plane_api.is_some() {
                // We must register the tenant with the attachment service, so
                // that when the pageserver restarts, it will be re-attached.
                let attachment_service = AttachmentService::from_env(env);
                attachment_service
                    .attach_hook(tenant_id, pageserver.conf.id)
                    .await?
            } else {
                None
            };

            pageserver
                .tenant_create(tenant_id, generation, tenant_conf)
            // We must register the tenant with the attachment service, so
            // that when the pageserver restarts, it will be re-attached.
            let attachment_service = AttachmentService::from_env(env);
            attachment_service
                .tenant_create(TenantCreateRequest {
                    // Note that ::unsharded here isn't actually because the tenant is unsharded, it's because the
                    // attachment service expects a shard-naive tenant_id in this attribute, and the TenantCreateRequest
                    // type is used both in the attachment service (for creating tenants) and in the pageserver (for creating shards)
                    new_tenant_id: TenantShardId::unsharded(tenant_id),
                    generation: None,
                    shard_parameters: ShardParameters {
                        count: ShardCount(shard_count),
                        stripe_size: shard_stripe_size.map(ShardStripeSize),
                    },
                    config: tenant_conf,
                })
                .await?;
            println!("tenant {tenant_id} successfully created on the pageserver");

            // Create an initial timeline for the new tenant
            let new_timeline_id = parse_timeline_id(create_match)?;
            let new_timeline_id =
                parse_timeline_id(create_match)?.unwrap_or(TimelineId::generate());
            let pg_version = create_match
                .get_one::<u32>("pg-version")
                .copied()
                .context("Failed to parse postgres version from the argument string")?;

            let timeline_info = pageserver
                .timeline_create(
            // FIXME: passing None for ancestor_start_lsn is not kosher in a sharded world: we can't have
            // different shards picking different start lsns. Maybe we have to teach attachment service
            // to let shard 0 branch first and then propagate the chosen LSN to other shards.
            attachment_service
                .tenant_timeline_create(
                    tenant_id,
                    new_timeline_id,
                    None,
                    None,
                    Some(pg_version),
                    None,
                    TimelineCreateRequest {
                        new_timeline_id,
                        ancestor_timeline_id: None,
                        ancestor_start_lsn: None,
                        existing_initdb_timeline_id: None,
                        pg_version: Some(pg_version),
                    },
                )
                .await?;
            let new_timeline_id = timeline_info.timeline_id;
            let last_record_lsn = timeline_info.last_record_lsn;

            env.register_branch_mapping(
                DEFAULT_BRANCH_NAME.to_string(),
@@ -441,9 +487,7 @@
                new_timeline_id,
            )?;

            println!(
                "Created an initial timeline '{new_timeline_id}' at Lsn {last_record_lsn} for tenant: {tenant_id}",
            );
            println!("Created an initial timeline '{new_timeline_id}' for tenant: {tenant_id}",);

            if create_match.get_flag("set-default") {
                println!("Setting tenant {tenant_id} as a default one");
@@ -470,12 +514,82 @@
            println!("tenant {tenant_id} successfully configured on the pageserver");
        }
        Some(("migrate", matches)) => {
            let tenant_id = get_tenant_id(matches, env)?;
            let tenant_shard_id = get_tenant_shard_id(matches, env)?;
            let new_pageserver = get_pageserver(env, matches)?;
            let new_pageserver_id = new_pageserver.conf.id;

            migrate_tenant(env, tenant_id, new_pageserver).await?;
            println!("tenant {tenant_id} migrated to {}", new_pageserver_id);
            let attachment_service = AttachmentService::from_env(env);
            attachment_service
                .tenant_migrate(tenant_shard_id, new_pageserver_id)
                .await?;

            println!("tenant {tenant_shard_id} migrated to {}", new_pageserver_id);
        }
        Some(("status", matches)) => {
            let tenant_id = get_tenant_id(matches, env)?;

            let mut shard_table = comfy_table::Table::new();
            shard_table.set_header(["Shard", "Pageserver", "Physical Size"]);

            let mut tenant_synthetic_size = None;

            let attachment_service = AttachmentService::from_env(env);
            for shard in attachment_service.tenant_locate(tenant_id).await?.shards {
                let pageserver =
                    PageServerNode::from_env(env, env.get_pageserver_conf(shard.node_id)?);

                let size = pageserver
                    .http_client
                    .tenant_details(shard.shard_id)
                    .await?
                    .tenant_info
                    .current_physical_size
                    .unwrap();

                shard_table.add_row([
                    format!("{}", shard.shard_id.shard_slug()),
                    format!("{}", shard.node_id.0),
                    format!("{} MiB", size / (1024 * 1024)),
                ]);

                if shard.shard_id.is_zero() {
                    tenant_synthetic_size =
                        Some(pageserver.tenant_synthetic_size(shard.shard_id).await?);
                }
            }

            let Some(synthetic_size) = tenant_synthetic_size else {
                bail!("Shard 0 not found")
            };

            let mut tenant_table = comfy_table::Table::new();
            tenant_table.add_row(["Tenant ID".to_string(), tenant_id.to_string()]);
            tenant_table.add_row([
                "Synthetic size".to_string(),
                format!("{} MiB", synthetic_size.size.unwrap_or(0) / (1024 * 1024)),
            ]);

            println!("{tenant_table}");
            println!("{shard_table}");
        }
        Some(("shard-split", matches)) => {
            let tenant_id = get_tenant_id(matches, env)?;
            let shard_count: u8 = matches.get_one::<u8>("shard-count").cloned().unwrap_or(0);

            let attachment_service = AttachmentService::from_env(env);
            let result = attachment_service
                .tenant_split(tenant_id, shard_count)
                .await?;
            println!(
                "Split tenant {} into shards {}",
                tenant_id,
                result
                    .new_shards
                    .iter()
                    .map(|s| format!("{:?}", s))
                    .collect::<Vec<_>>()
                    .join(",")
            );
        }

        Some((sub_name, _)) => bail!("Unexpected tenant subcommand '{}'", sub_name),
@@ -489,8 +603,10 @@ async fn handle_timeline(timeline_match: &ArgMatches, env: &mut local_env::Local
|
||||
|
||||
match timeline_match.subcommand() {
|
||||
Some(("list", list_match)) => {
|
||||
let tenant_id = get_tenant_id(list_match, env)?;
|
||||
let timelines = pageserver.timeline_list(&tenant_id).await?;
|
||||
// TODO(sharding): this command shouldn't have to specify a shard ID: we should ask the attachment service
|
||||
// where shard 0 is attached, and query there.
|
||||
let tenant_shard_id = get_tenant_shard_id(list_match, env)?;
|
||||
let timelines = pageserver.timeline_list(&tenant_shard_id).await?;
|
||||
print_timelines_tree(timelines, env.timeline_name_mappings())?;
|
||||
}
|
||||
Some(("create", create_match)) => {
|
||||
@@ -505,18 +621,19 @@ async fn handle_timeline(timeline_match: &ArgMatches, env: &mut local_env::Local
|
||||
.context("Failed to parse postgres version from the argument string")?;
|
||||
|
||||
let new_timeline_id_opt = parse_timeline_id(create_match)?;
|
||||
let new_timeline_id = new_timeline_id_opt.unwrap_or(TimelineId::generate());
|
||||
|
||||
let timeline_info = pageserver
|
||||
.timeline_create(
|
||||
tenant_id,
|
||||
new_timeline_id_opt,
|
||||
None,
|
||||
None,
|
||||
Some(pg_version),
|
||||
None,
|
||||
)
|
||||
let attachment_service = AttachmentService::from_env(env);
|
||||
let create_req = TimelineCreateRequest {
|
||||
new_timeline_id,
|
||||
ancestor_timeline_id: None,
|
||||
existing_initdb_timeline_id: None,
|
||||
ancestor_start_lsn: None,
|
||||
pg_version: Some(pg_version),
|
||||
};
|
||||
let timeline_info = attachment_service
|
||||
.tenant_timeline_create(tenant_id, create_req)
|
||||
.await?;
|
||||
let new_timeline_id = timeline_info.timeline_id;
|
||||
|
||||
let last_record_lsn = timeline_info.last_record_lsn;
|
||||
env.register_branch_mapping(new_branch_name.to_string(), tenant_id, new_timeline_id)?;
|
||||
@@ -574,7 +691,6 @@ async fn handle_timeline(timeline_match: &ArgMatches, env: &mut local_env::Local
|
||||
None,
|
||||
pg_version,
|
||||
ComputeMode::Primary,
|
||||
DEFAULT_PAGESERVER_ID,
|
||||
)?;
|
||||
println!("Done");
|
||||
}
|
||||
@@ -598,17 +714,18 @@ async fn handle_timeline(timeline_match: &ArgMatches, env: &mut local_env::Local
|
||||
.map(|lsn_str| Lsn::from_str(lsn_str))
|
||||
.transpose()
|
||||
.context("Failed to parse ancestor start Lsn from the request")?;
|
||||
let timeline_info = pageserver
|
||||
.timeline_create(
|
||||
tenant_id,
|
||||
None,
|
||||
start_lsn,
|
||||
Some(ancestor_timeline_id),
|
||||
None,
|
||||
None,
|
||||
)
|
||||
let new_timeline_id = TimelineId::generate();
|
||||
let attachment_service = AttachmentService::from_env(env);
|
||||
let create_req = TimelineCreateRequest {
|
||||
new_timeline_id,
|
||||
ancestor_timeline_id: Some(ancestor_timeline_id),
|
||||
existing_initdb_timeline_id: None,
|
||||
ancestor_start_lsn: start_lsn,
|
||||
pg_version: None,
|
||||
};
|
||||
let timeline_info = attachment_service
|
||||
.tenant_timeline_create(tenant_id, create_req)
|
||||
.await?;
|
||||
let new_timeline_id = timeline_info.timeline_id;
|
||||
|
||||
let last_record_lsn = timeline_info.last_record_lsn;
|
||||
|
||||
@@ -635,8 +752,10 @@ async fn handle_endpoint(ep_match: &ArgMatches, env: &local_env::LocalEnv) -> Re
|
||||
|
||||
match sub_name {
|
||||
"list" => {
|
||||
let tenant_id = get_tenant_id(sub_args, env)?;
|
||||
let timeline_infos = get_timeline_infos(env, &tenant_id)
|
||||
// TODO(sharding): this command shouldn't have to specify a shard ID: we should ask the attachment service
|
||||
// where shard 0 is attached, and query there.
|
||||
let tenant_shard_id = get_tenant_shard_id(sub_args, env)?;
|
||||
let timeline_infos = get_timeline_infos(env, &tenant_shard_id)
|
||||
.await
|
||||
.unwrap_or_else(|e| {
|
||||
eprintln!("Failed to load timeline info: {}", e);
|
||||
@@ -661,7 +780,7 @@ async fn handle_endpoint(ep_match: &ArgMatches, env: &local_env::LocalEnv) -> Re
|
||||
for (endpoint_id, endpoint) in cplane
|
||||
.endpoints
|
||||
.iter()
|
||||
.filter(|(_, endpoint)| endpoint.tenant_id == tenant_id)
|
||||
.filter(|(_, endpoint)| endpoint.tenant_id == tenant_shard_id.tenant_id)
|
||||
{
|
||||
let lsn_str = match endpoint.mode {
|
||||
ComputeMode::Static(lsn) => {
|
||||
@@ -680,7 +799,10 @@ async fn handle_endpoint(ep_match: &ArgMatches, env: &local_env::LocalEnv) -> Re
|
||||
};
|
||||
|
||||
let branch_name = timeline_name_mappings
|
||||
.get(&TenantTimelineId::new(tenant_id, endpoint.timeline_id))
|
||||
.get(&TenantTimelineId::new(
|
||||
tenant_shard_id.tenant_id,
|
||||
endpoint.timeline_id,
|
||||
))
|
||||
.map(|name| name.as_str())
|
||||
.unwrap_or("?");
|
||||
|
||||
@@ -728,13 +850,6 @@ async fn handle_endpoint(ep_match: &ArgMatches, env: &local_env::LocalEnv) -> Re
|
||||
.copied()
|
||||
.unwrap_or(false);
|
||||
|
||||
let pageserver_id =
|
||||
if let Some(id_str) = sub_args.get_one::<String>("endpoint-pageserver-id") {
|
||||
NodeId(id_str.parse().context("while parsing pageserver id")?)
|
||||
} else {
|
||||
DEFAULT_PAGESERVER_ID
|
||||
};
|
||||
|
||||
let mode = match (lsn, hot_standby) {
|
||||
(Some(lsn), false) => ComputeMode::Static(lsn),
|
||||
(None, true) => ComputeMode::Replica,
|
||||
@@ -762,7 +877,6 @@ async fn handle_endpoint(ep_match: &ArgMatches, env: &local_env::LocalEnv) -> Re
|
||||
http_port,
|
||||
pg_version,
|
||||
mode,
|
||||
pageserver_id,
|
||||
)?;
|
||||
}
|
||||
"start" => {
|
||||
@@ -805,6 +919,22 @@ async fn handle_endpoint(ep_match: &ArgMatches, env: &local_env::LocalEnv) -> Re
|
||||
endpoint.timeline_id,
|
||||
)?;
|
||||
|
||||
let attachment_service = AttachmentService::from_env(env);
|
||||
let locate_result = attachment_service.tenant_locate(endpoint.tenant_id).await?;
|
||||
let pageservers = locate_result
|
||||
.shards
|
||||
.into_iter()
|
||||
.map(|shard| {
|
||||
(
|
||||
Host::parse(&shard.listen_pg_addr)
|
||||
.expect("Attachment service reported bad hostname"),
|
||||
shard.listen_pg_port,
|
||||
)
|
||||
})
|
||||
.collect::<Vec<_>>();
|
||||
assert!(!pageservers.is_empty());
|
||||
let stripe_size = locate_result.shard_params.stripe_size.map(|s| s.0 as usize);
|
||||
|
||||
let ps_conf = env.get_pageserver_conf(pageserver_id)?;
|
||||
let auth_token = if matches!(ps_conf.pg_auth_type, AuthType::NeonJWT) {
|
||||
let claims = Claims::new(Some(endpoint.tenant_id), Scope::Tenant);
|
||||
@@ -816,7 +946,13 @@ async fn handle_endpoint(ep_match: &ArgMatches, env: &local_env::LocalEnv) -> Re
|
||||
|
||||
println!("Starting existing endpoint {endpoint_id}...");
|
||||
endpoint
|
||||
.start(&auth_token, safekeepers, remote_ext_config)
|
||||
.start(
|
||||
&auth_token,
|
||||
safekeepers,
|
||||
pageservers,
|
||||
remote_ext_config,
|
||||
stripe_size,
|
||||
)
|
||||
.await?;
|
||||
}
|
||||
"reconfigure" => {
|
||||
@@ -827,15 +963,31 @@ async fn handle_endpoint(ep_match: &ArgMatches, env: &local_env::LocalEnv) -> Re
|
||||
.endpoints
|
||||
.get(endpoint_id.as_str())
|
||||
.with_context(|| format!("postgres endpoint {endpoint_id} is not found"))?;
|
||||
let pageserver_id =
|
||||
let pageservers =
|
||||
if let Some(id_str) = sub_args.get_one::<String>("endpoint-pageserver-id") {
|
||||
Some(NodeId(
|
||||
id_str.parse().context("while parsing pageserver id")?,
|
||||
))
|
||||
let ps_id = NodeId(id_str.parse().context("while parsing pageserver id")?);
|
||||
let pageserver = PageServerNode::from_env(env, env.get_pageserver_conf(ps_id)?);
|
||||
vec![(
|
||||
pageserver.pg_connection_config.host().clone(),
|
||||
pageserver.pg_connection_config.port(),
|
||||
)]
|
||||
} else {
|
||||
None
|
||||
let attachment_service = AttachmentService::from_env(env);
|
||||
attachment_service
|
||||
.tenant_locate(endpoint.tenant_id)
|
||||
.await?
|
||||
.shards
|
||||
.into_iter()
|
||||
.map(|shard| {
|
||||
(
|
||||
Host::parse(&shard.listen_pg_addr)
|
||||
.expect("Attachment service reported malformed host"),
|
||||
shard.listen_pg_port,
|
||||
)
|
||||
})
|
||||
.collect::<Vec<_>>()
|
||||
};
|
||||
endpoint.reconfigure(pageserver_id).await?;
|
||||
endpoint.reconfigure(pageservers).await?;
|
||||
}
|
||||
"stop" => {
|
||||
let endpoint_id = sub_args
|
||||
@@ -959,6 +1111,21 @@ async fn handle_pageserver(sub_match: &ArgMatches, env: &local_env::LocalEnv) ->
|
||||
}
|
||||
}
|
||||
|
||||
Some(("set-state", subcommand_args)) => {
|
||||
let pageserver = get_pageserver(env, subcommand_args)?;
|
||||
let scheduling = subcommand_args.get_one("scheduling");
|
||||
let availability = subcommand_args.get_one("availability");
|
||||
|
||||
let attachment_service = AttachmentService::from_env(env);
|
||||
attachment_service
|
||||
.node_configure(NodeConfigureRequest {
|
||||
node_id: pageserver.conf.id,
|
||||
scheduling: scheduling.cloned(),
|
||||
availability: availability.cloned(),
|
||||
})
|
||||
.await?;
|
||||
}
|
||||
|
||||
Some(("status", subcommand_args)) => {
|
||||
match get_pageserver(env, subcommand_args)?.check_status().await {
|
||||
Ok(_) => println!("Page server is up and running"),
|
||||
@@ -1352,6 +1519,8 @@ fn cli() -> Command {
|
||||
.arg(pg_version_arg.clone())
|
||||
.arg(Arg::new("set-default").long("set-default").action(ArgAction::SetTrue).required(false)
|
||||
.help("Use this tenant in future CLI commands where tenant_id is needed, but not specified"))
|
||||
.arg(Arg::new("shard-count").value_parser(value_parser!(u8)).long("shard-count").action(ArgAction::Set).help("Number of shards in the new tenant (default 1)"))
|
||||
.arg(Arg::new("shard-stripe-size").value_parser(value_parser!(u32)).long("shard-stripe-size").action(ArgAction::Set).help("Sharding stripe size in pages"))
|
||||
)
|
||||
.subcommand(Command::new("set-default").arg(tenant_id_arg.clone().required(true))
|
||||
.about("Set a particular tenant as default in future CLI commands where tenant_id is needed, but not specified"))
|
||||
@@ -1362,6 +1531,14 @@ fn cli() -> Command {
|
||||
.about("Migrate a tenant from one pageserver to another")
|
||||
.arg(tenant_id_arg.clone())
|
||||
.arg(pageserver_id_arg.clone()))
|
||||
.subcommand(Command::new("status")
|
||||
.about("Human readable summary of the tenant's shards and attachment locations")
|
||||
.arg(tenant_id_arg.clone()))
|
||||
.subcommand(Command::new("shard-split")
|
||||
.about("Increase the number of shards in the tenant")
|
||||
.arg(tenant_id_arg.clone())
|
||||
.arg(Arg::new("shard-count").value_parser(value_parser!(u8)).long("shard-count").action(ArgAction::Set).help("Number of shards in the new tenant (default 1)"))
|
||||
)
|
||||
)
|
||||
.subcommand(
|
||||
Command::new("pageserver")
|
||||
@@ -1381,6 +1558,12 @@ fn cli() -> Command {
|
||||
.about("Restart local pageserver")
|
||||
.arg(pageserver_config_args.clone())
|
||||
)
|
||||
.subcommand(Command::new("set-state")
|
||||
.arg(Arg::new("availability").value_parser(value_parser!(NodeAvailability)).long("availability").action(ArgAction::Set).help("Availability state: offline,active"))
|
||||
.arg(Arg::new("scheduling").value_parser(value_parser!(NodeSchedulingPolicy)).long("scheduling").action(ArgAction::Set).help("Scheduling state: draining,pause,filling,active"))
|
||||
.about("Set scheduling or availability state of pageserver node")
|
||||
.arg(pageserver_config_args.clone())
|
||||
)
|
||||
)
|
||||
.subcommand(
|
||||
Command::new("attachment_service")
|
||||

@@ -49,10 +49,11 @@ use compute_api::spec::RemoteExtSpec;
use nix::sys::signal::kill;
use nix::sys::signal::Signal;
use serde::{Deserialize, Serialize};
use url::Host;
use utils::id::{NodeId, TenantId, TimelineId};

use crate::attachment_service::AttachmentService;
use crate::local_env::LocalEnv;
use crate::pageserver::PageServerNode;
use crate::postgresql_conf::PostgresConf;

use compute_api::responses::{ComputeState, ComputeStatus};
@@ -69,7 +70,6 @@ pub struct EndpointConf {
http_port: u16,
pg_version: u32,
skip_pg_catalog_updates: bool,
pageserver_id: NodeId,
}

//
@@ -121,19 +121,14 @@ impl ComputeControlPlane {
http_port: Option<u16>,
pg_version: u32,
mode: ComputeMode,
pageserver_id: NodeId,
) -> Result<Arc<Endpoint>> {
let pg_port = pg_port.unwrap_or_else(|| self.get_port());
let http_port = http_port.unwrap_or_else(|| self.get_port() + 1);
let pageserver =
PageServerNode::from_env(&self.env, self.env.get_pageserver_conf(pageserver_id)?);

let ep = Arc::new(Endpoint {
endpoint_id: endpoint_id.to_owned(),
pg_address: SocketAddr::new("127.0.0.1".parse().unwrap(), pg_port),
http_address: SocketAddr::new("127.0.0.1".parse().unwrap(), http_port),
env: self.env.clone(),
pageserver,
timeline_id,
mode,
tenant_id,
@@ -159,7 +154,6 @@ impl ComputeControlPlane {
pg_port,
pg_version,
skip_pg_catalog_updates: true,
pageserver_id,
})?,
)?;
std::fs::write(
@@ -218,7 +212,6 @@ pub struct Endpoint {
// These are not part of the endpoint as such, but the environment
// the endpoint runs in.
pub env: LocalEnv,
pageserver: PageServerNode,

// Optimizations
skip_pg_catalog_updates: bool,
@@ -241,15 +234,11 @@ impl Endpoint {
let conf: EndpointConf =
serde_json::from_slice(&std::fs::read(entry.path().join("endpoint.json"))?)?;

let pageserver =
PageServerNode::from_env(env, env.get_pageserver_conf(conf.pageserver_id)?);

Ok(Endpoint {
pg_address: SocketAddr::new("127.0.0.1".parse().unwrap(), conf.pg_port),
http_address: SocketAddr::new("127.0.0.1".parse().unwrap(), conf.http_port),
endpoint_id,
env: env.clone(),
pageserver,
timeline_id: conf.timeline_id,
mode: conf.mode,
tenant_id: conf.tenant_id,
@@ -469,11 +458,21 @@ impl Endpoint {
}
}

fn build_pageserver_connstr(pageservers: &[(Host, u16)]) -> String {
pageservers
.iter()
.map(|(host, port)| format!("postgresql://no_user@{host}:{port}"))
.collect::<Vec<_>>()
.join(",")
}

pub async fn start(
&self,
auth_token: &Option<String>,
safekeepers: Vec<NodeId>,
pageservers: Vec<(Host, u16)>,
remote_ext_config: Option<&String>,
shard_stripe_size: Option<usize>,
) -> Result<()> {
if self.status() == "running" {
anyhow::bail!("The endpoint is already running");
@@ -487,13 +486,9 @@ impl Endpoint {
std::fs::remove_dir_all(self.pgdata())?;
}

let pageserver_connstring = {
let config = &self.pageserver.pg_connection_config;
let (host, port) = (config.host(), config.port());
let pageserver_connstring = Self::build_pageserver_connstr(&pageservers);
assert!(!pageserver_connstring.is_empty());

// NOTE: avoid spaces in connection string, because it is less error prone if we forward it somewhere.
format!("postgresql://no_user@{host}:{port}")
};
let mut safekeeper_connstrings = Vec::new();
if self.mode == ComputeMode::Primary {
for sk_id in safekeepers {
@@ -543,6 +538,7 @@ impl Endpoint {
storage_auth_token: auth_token.clone(),
remote_extensions,
pgbouncer_settings: None,
shard_stripe_size,
};
let spec_path = self.endpoint_path().join("spec.json");
std::fs::write(spec_path, serde_json::to_string_pretty(&spec)?)?;
@@ -555,8 +551,11 @@ impl Endpoint {

// Launch compute_ctl
println!("Starting postgres node at '{}'", self.connstr());
let mut cmd = Command::new(self.env.neon_distrib_dir.join("compute_ctl"));
cmd.args(["--http-port", &self.http_address.port().to_string()])
let mut cmd = Command::new("/usr/bin/taskset");

cmd.args(["-c".to_string(), "8-11".to_string()])
.args([self.env.neon_distrib_dir.join("compute_ctl")])
.args(["--http-port", &self.http_address.port().to_string()])
.args(["--pgdata", self.pgdata().to_str().unwrap()])
.args(["--connstr", &self.connstr()])
.args([
@@ -665,7 +664,7 @@ impl Endpoint {
}
}

pub async fn reconfigure(&self, pageserver_id: Option<NodeId>) -> Result<()> {
pub async fn reconfigure(&self, mut pageservers: Vec<(Host, u16)>) -> Result<()> {
let mut spec: ComputeSpec = {
let spec_path = self.endpoint_path().join("spec.json");
let file = std::fs::File::open(spec_path)?;
@@ -675,25 +674,27 @@ impl Endpoint {
let postgresql_conf = self.read_postgresql_conf()?;
spec.cluster.postgresql_conf = Some(postgresql_conf);

if let Some(pageserver_id) = pageserver_id {
let endpoint_config_path = self.endpoint_path().join("endpoint.json");
let mut endpoint_conf: EndpointConf = {
let file = std::fs::File::open(&endpoint_config_path)?;
serde_json::from_reader(file)?
};
endpoint_conf.pageserver_id = pageserver_id;
std::fs::write(
endpoint_config_path,
serde_json::to_string_pretty(&endpoint_conf)?,
)?;

let pageserver =
PageServerNode::from_env(&self.env, self.env.get_pageserver_conf(pageserver_id)?);
let ps_http_conf = &pageserver.pg_connection_config;
let (host, port) = (ps_http_conf.host(), ps_http_conf.port());
spec.pageserver_connstring = Some(format!("postgresql://no_user@{host}:{port}"));
// If we weren't given explicit pageservers, query the attachment service
if pageservers.is_empty() {
let attachment_service = AttachmentService::from_env(&self.env);
let locate_result = attachment_service.tenant_locate(self.tenant_id).await?;
pageservers = locate_result
.shards
.into_iter()
.map(|shard| {
(
Host::parse(&shard.listen_pg_addr)
.expect("Attachment service reported bad hostname"),
shard.listen_pg_port,
)
})
.collect::<Vec<_>>();
}

let pageserver_connstr = Self::build_pageserver_connstr(&pageservers);
assert!(!pageserver_connstr.is_empty());
spec.pageserver_connstring = Some(pageserver_connstr);

let client = reqwest::Client::new();
let response = client
.post(format!(
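The hunks above replace the endpoint's single pageserver connection string with a comma-separated list, one `postgresql://no_user@host:port` entry per shard, built by `build_pageserver_connstr`. The following is a minimal standalone sketch of that joining logic and the string a two-shard tenant would produce; it uses plain `String` hosts instead of `url::Host`, and the addresses and ports are made up for illustration.

// Minimal sketch of the connstring format assembled by build_pageserver_connstr:
// one entry per shard, joined with commas and no spaces (spaces are avoided so
// the string can be forwarded safely, per the NOTE in the original code).
fn build_pageserver_connstr(pageservers: &[(String, u16)]) -> String {
    pageservers
        .iter()
        .map(|(host, port)| format!("postgresql://no_user@{host}:{port}"))
        .collect::<Vec<_>>()
        .join(",")
}

fn main() {
    let shards = vec![
        ("127.0.0.1".to_string(), 64000), // shard 0's attached pageserver (illustrative)
        ("127.0.0.1".to_string(), 64001), // shard 1's attached pageserver (illustrative)
    ];
    assert_eq!(
        build_pageserver_connstr(&shards),
        "postgresql://no_user@127.0.0.1:64000,postgresql://no_user@127.0.0.1:64001"
    );
}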

@@ -14,4 +14,3 @@ pub mod local_env;
pub mod pageserver;
pub mod postgresql_conf;
pub mod safekeeper;
pub mod tenant_migration;

@@ -12,12 +12,15 @@ use std::io::Write;
use std::num::NonZeroU64;
use std::path::PathBuf;
use std::process::{Child, Command};
use std::str::FromStr;
use std::time::Duration;

use anyhow::{bail, Context};
use camino::Utf8PathBuf;
use futures::SinkExt;
use pageserver_api::models::{self, LocationConfig, TenantInfo, TimelineInfo};
use pageserver_api::models::{
self, LocationConfig, ShardParameters, TenantHistorySize, TenantInfo, TimelineInfo,
};
use pageserver_api::shard::TenantShardId;
use pageserver_client::mgmt_api;
use postgres_backend::AuthType;
@@ -216,11 +219,19 @@ impl PageServerNode {
if update_config {
args.push(Cow::Borrowed("--update-config"));
}

let mut taskset_args = vec![
"-c".to_string(),
format!("{}", self.conf.id.0 - 1),
self.env.pageserver_bin().to_string_lossy().into(),
];
taskset_args.extend(args.into_iter().map(|a| a.to_string()));

background_process::start_process(
"pageserver",
&datadir,
&self.env.pageserver_bin(),
args.iter().map(Cow::as_ref),
&PathBuf::from_str("/usr/bin/taskset").unwrap(),
taskset_args,
self.pageserver_env_variables()?,
background_process::InitialPidFile::Expect(self.pid_file()),
|| async {
@@ -301,16 +312,8 @@ impl PageServerNode {
pub async fn tenant_list(&self) -> mgmt_api::Result<Vec<TenantInfo>> {
self.http_client.list_tenants().await
}

pub async fn tenant_create(
&self,
new_tenant_id: TenantId,
generation: Option<u32>,
settings: HashMap<&str, &str>,
) -> anyhow::Result<TenantId> {
let mut settings = settings.clone();

let config = models::TenantConfig {
pub fn parse_config(mut settings: HashMap<&str, &str>) -> anyhow::Result<models::TenantConfig> {
let result = models::TenantConfig {
checkpoint_distance: settings
.remove("checkpoint_distance")
.map(|x| x.parse::<u64>())
@@ -371,11 +374,26 @@ impl PageServerNode {
.context("Failed to parse 'gc_feedback' as bool")?,
heatmap_period: settings.remove("heatmap_period").map(|x| x.to_string()),
};
if !settings.is_empty() {
bail!("Unrecognized tenant settings: {settings:?}")
} else {
Ok(result)
}
}

pub async fn tenant_create(
&self,
new_tenant_id: TenantId,
generation: Option<u32>,
settings: HashMap<&str, &str>,
) -> anyhow::Result<TenantId> {
let config = Self::parse_config(settings.clone())?;

let request = models::TenantCreateRequest {
new_tenant_id: TenantShardId::unsharded(new_tenant_id),
generation,
config,
shard_parameters: ShardParameters::default(),
};
if !settings.is_empty() {
bail!("Unrecognized tenant settings: {settings:?}")
@@ -471,31 +489,32 @@ impl PageServerNode {

pub async fn location_config(
&self,
tenant_id: TenantId,
tenant_shard_id: TenantShardId,
config: LocationConfig,
flush_ms: Option<Duration>,
) -> anyhow::Result<()> {
Ok(self
.http_client
.location_config(tenant_id, config, flush_ms)
.location_config(tenant_shard_id, config, flush_ms)
.await?)
}

pub async fn timeline_list(&self, tenant_id: &TenantId) -> anyhow::Result<Vec<TimelineInfo>> {
Ok(self.http_client.list_timelines(*tenant_id).await?)
pub async fn timeline_list(
&self,
tenant_shard_id: &TenantShardId,
) -> anyhow::Result<Vec<TimelineInfo>> {
Ok(self.http_client.list_timelines(*tenant_shard_id).await?)
}

pub async fn timeline_create(
&self,
tenant_id: TenantId,
new_timeline_id: Option<TimelineId>,
new_timeline_id: TimelineId,
ancestor_start_lsn: Option<Lsn>,
ancestor_timeline_id: Option<TimelineId>,
pg_version: Option<u32>,
existing_initdb_timeline_id: Option<TimelineId>,
) -> anyhow::Result<TimelineInfo> {
// If timeline ID was not specified, generate one
let new_timeline_id = new_timeline_id.unwrap_or(TimelineId::generate());
let req = models::TimelineCreateRequest {
new_timeline_id,
ancestor_start_lsn,
@@ -581,4 +600,14 @@ impl PageServerNode {

Ok(())
}

pub async fn tenant_synthetic_size(
&self,
tenant_shard_id: TenantShardId,
) -> anyhow::Result<TenantHistorySize> {
Ok(self
.http_client
.tenant_synthetic_size(tenant_shard_id)
.await?)
}
}

@@ -1,205 +0,0 @@
//!
//! Functionality for migrating tenants across pageservers: unlike most of neon_local, this code
//! isn't scoped to a particular physical service, as it needs to update compute endpoints to
//! point to the new pageserver.
//!
use crate::local_env::LocalEnv;
use crate::{
attachment_service::AttachmentService, endpoint::ComputeControlPlane,
pageserver::PageServerNode,
};
use pageserver_api::models::{
LocationConfig, LocationConfigMode, LocationConfigSecondary, TenantConfig,
};
use std::collections::HashMap;
use std::time::Duration;
use utils::{
id::{TenantId, TimelineId},
lsn::Lsn,
};

/// Given an attached pageserver, retrieve the LSN for all timelines
async fn get_lsns(
tenant_id: TenantId,
pageserver: &PageServerNode,
) -> anyhow::Result<HashMap<TimelineId, Lsn>> {
let timelines = pageserver.timeline_list(&tenant_id).await?;
Ok(timelines
.into_iter()
.map(|t| (t.timeline_id, t.last_record_lsn))
.collect())
}

/// Wait for the timeline LSNs on `pageserver` to catch up with or overtake
/// `baseline`.
async fn await_lsn(
tenant_id: TenantId,
pageserver: &PageServerNode,
baseline: HashMap<TimelineId, Lsn>,
) -> anyhow::Result<()> {
loop {
let latest = match get_lsns(tenant_id, pageserver).await {
Ok(l) => l,
Err(e) => {
println!(
"🕑 Can't get LSNs on pageserver {} yet, waiting ({e})",
pageserver.conf.id
);
std::thread::sleep(Duration::from_millis(500));
continue;
}
};

let mut any_behind: bool = false;
for (timeline_id, baseline_lsn) in &baseline {
match latest.get(timeline_id) {
Some(latest_lsn) => {
println!("🕑 LSN origin {baseline_lsn} vs destination {latest_lsn}");
if latest_lsn < baseline_lsn {
any_behind = true;
}
}
None => {
// Expected timeline isn't yet visible on migration destination.
// (IRL we would have to account for timeline deletion, but this
// is just test helper)
any_behind = true;
}
}
}

if !any_behind {
println!("✅ LSN caught up. Proceeding...");
break;
} else {
std::thread::sleep(Duration::from_millis(500));
}
}

Ok(())
}

/// This function spans multiple services, to demonstrate live migration of a tenant
/// between pageservers:
/// - Coordinate attach/secondary/detach on pageservers
/// - call into attachment_service for generations
/// - reconfigure compute endpoints to point to new attached pageserver
pub async fn migrate_tenant(
env: &LocalEnv,
tenant_id: TenantId,
dest_ps: PageServerNode,
) -> anyhow::Result<()> {
// Get a new generation
let attachment_service = AttachmentService::from_env(env);

fn build_location_config(
mode: LocationConfigMode,
generation: Option<u32>,
secondary_conf: Option<LocationConfigSecondary>,
) -> LocationConfig {
LocationConfig {
mode,
generation,
secondary_conf,
tenant_conf: TenantConfig::default(),
shard_number: 0,
shard_count: 0,
shard_stripe_size: 0,
}
}

let previous = attachment_service.inspect(tenant_id).await?;
let mut baseline_lsns = None;
if let Some((generation, origin_ps_id)) = &previous {
let origin_ps = PageServerNode::from_env(env, env.get_pageserver_conf(*origin_ps_id)?);

if origin_ps_id == &dest_ps.conf.id {
println!("🔁 Already attached to {origin_ps_id}, freshening...");
let gen = attachment_service
.attach_hook(tenant_id, dest_ps.conf.id)
.await?;
let dest_conf = build_location_config(LocationConfigMode::AttachedSingle, gen, None);
dest_ps.location_config(tenant_id, dest_conf, None).await?;
println!("✅ Migration complete");
return Ok(());
}

println!("🔁 Switching origin pageserver {origin_ps_id} to stale mode");

let stale_conf =
build_location_config(LocationConfigMode::AttachedStale, Some(*generation), None);
origin_ps
.location_config(tenant_id, stale_conf, Some(Duration::from_secs(10)))
.await?;

baseline_lsns = Some(get_lsns(tenant_id, &origin_ps).await?);
}

let gen = attachment_service
.attach_hook(tenant_id, dest_ps.conf.id)
.await?;
let dest_conf = build_location_config(LocationConfigMode::AttachedMulti, gen, None);

println!("🔁 Attaching to pageserver {}", dest_ps.conf.id);
dest_ps.location_config(tenant_id, dest_conf, None).await?;

if let Some(baseline) = baseline_lsns {
println!("🕑 Waiting for LSN to catch up...");
await_lsn(tenant_id, &dest_ps, baseline).await?;
}

let cplane = ComputeControlPlane::load(env.clone())?;
for (endpoint_name, endpoint) in &cplane.endpoints {
if endpoint.tenant_id == tenant_id {
println!(
"🔁 Reconfiguring endpoint {} to use pageserver {}",
endpoint_name, dest_ps.conf.id
);
endpoint.reconfigure(Some(dest_ps.conf.id)).await?;
}
}

for other_ps_conf in &env.pageservers {
if other_ps_conf.id == dest_ps.conf.id {
continue;
}

let other_ps = PageServerNode::from_env(env, other_ps_conf);
let other_ps_tenants = other_ps.tenant_list().await?;

// Check if this tenant is attached
let found = other_ps_tenants
.into_iter()
.map(|t| t.id)
.any(|i| i.tenant_id == tenant_id);
if !found {
continue;
}

// Downgrade to a secondary location
let secondary_conf = build_location_config(
LocationConfigMode::Secondary,
None,
Some(LocationConfigSecondary { warm: true }),
);

println!(
"💤 Switching to secondary mode on pageserver {}",
other_ps.conf.id
);
other_ps
.location_config(tenant_id, secondary_conf, None)
.await?;
}

println!(
"🔁 Switching to AttachedSingle mode on pageserver {}",
dest_ps.conf.id
);
let dest_conf = build_location_config(LocationConfigMode::AttachedSingle, gen, None);
dest_ps.location_config(tenant_id, dest_conf, None).await?;

println!("✅ Migration complete");

Ok(())
}
@@ -75,6 +75,9 @@ pub struct ComputeSpec {
pub remote_extensions: Option<RemoteExtSpec>,

pub pgbouncer_settings: Option<HashMap<String, String>>,

// Stripe size for pageserver sharding, in pages
pub shard_stripe_size: Option<usize>,
}

/// Feature flag to signal `compute_ctl` to enable certain experimental functionality.

@@ -18,7 +18,10 @@ use utils::{
lsn::Lsn,
};

use crate::{reltag::RelTag, shard::TenantShardId};
use crate::{
reltag::RelTag,
shard::{ShardCount, ShardStripeSize, TenantShardId},
};
use anyhow::bail;
use bytes::{Buf, BufMut, Bytes, BytesMut};

@@ -188,6 +191,41 @@ pub struct TimelineCreateRequest {
pub pg_version: Option<u32>,
}

#[derive(Serialize, Deserialize)]
pub struct TenantShardSplitRequest {
pub new_shard_count: u8,
}

#[derive(Serialize, Deserialize)]
pub struct TenantShardSplitResponse {
pub new_shards: Vec<TenantShardId>,
}

/// Parameters that apply to all shards in a tenant. Used during tenant creation.
#[derive(Serialize, Deserialize, Debug)]
#[serde(deny_unknown_fields)]
pub struct ShardParameters {
pub count: ShardCount,
#[serde(default)]
#[serde(skip_serializing_if = "Option::is_none")]
pub stripe_size: Option<ShardStripeSize>,
}

impl ShardParameters {
pub fn is_unsharded(&self) -> bool {
self.count == ShardCount(0)
}
}

impl Default for ShardParameters {
fn default() -> Self {
Self {
count: ShardCount(0),
stripe_size: None,
}
}
}

#[derive(Serialize, Deserialize, Debug)]
#[serde(deny_unknown_fields)]
pub struct TenantCreateRequest {
@@ -195,6 +233,12 @@ pub struct TenantCreateRequest {
#[serde(default)]
#[serde(skip_serializing_if = "Option::is_none")]
pub generation: Option<u32>,

// If omitted, create a single shard with TenantShardId::unsharded()
#[serde(default)]
#[serde(skip_serializing_if = "ShardParameters::is_unsharded")]
pub shard_parameters: ShardParameters,

#[serde(flatten)]
pub config: TenantConfig, // as we have a flattened field, we should reject all unknown fields in it
}
@@ -217,7 +261,7 @@ impl std::ops::Deref for TenantCreateRequest {

/// An alternative representation of `pageserver::tenant::TenantConf` with
/// simpler types.
#[derive(Serialize, Deserialize, Debug, Default)]
#[derive(Serialize, Deserialize, Debug, Default, Clone, Eq, PartialEq)]
pub struct TenantConfig {
pub checkpoint_distance: Option<u64>,
pub checkpoint_timeout: Option<String>,
@@ -246,7 +290,7 @@ pub struct TenantConfig {
/// A flattened analog of a `pagesever::tenant::LocationMode`, which
/// lists out all possible states (and the virtual "Detached" state)
/// in a flat form rather than using rust-style enums.
#[derive(Serialize, Deserialize, Debug)]
#[derive(Serialize, Deserialize, Debug, Clone, Eq, PartialEq)]
pub enum LocationConfigMode {
AttachedSingle,
AttachedMulti,
@@ -255,14 +299,14 @@ pub enum LocationConfigMode {
Detached,
}

#[derive(Serialize, Deserialize, Debug)]
#[derive(Serialize, Deserialize, Debug, Clone, Eq, PartialEq)]
pub struct LocationConfigSecondary {
pub warm: bool,
}

/// An alternative representation of `pageserver::tenant::LocationConf`,
/// for use in external-facing APIs.
#[derive(Serialize, Deserialize, Debug)]
#[derive(Serialize, Deserialize, Debug, Clone, Eq, PartialEq)]
pub struct LocationConfig {
pub mode: LocationConfigMode,
/// If attaching, in what generation?
@@ -297,7 +341,7 @@ pub struct StatusResponse {
#[derive(Serialize, Deserialize, Debug)]
#[serde(deny_unknown_fields)]
pub struct TenantLocationConfigRequest {
pub tenant_id: TenantId,
pub tenant_id: TenantShardId,
#[serde(flatten)]
pub config: LocationConfig, // as we have a flattened field, we should reject all unknown fields in it
}
@@ -658,6 +702,17 @@ pub struct PagestreamDbSizeResponse {
pub db_size: i64,
}

// This is a cut-down version of TenantHistorySize from the pageserver crate, omitting fields
// that require pageserver-internal types. It is sufficient to get the total size.
#[derive(Serialize, Deserialize, Debug)]
pub struct TenantHistorySize {
pub id: TenantId,
/// Size is a mixture of WAL and logical size, so the unit is bytes.
///
/// Will be none if `?inputs_only=true` was given.
pub size: Option<u64>,
}

impl PagestreamFeMessage {
pub fn serialize(&self) -> Bytes {
let mut bytes = BytesMut::new();
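The `TenantCreateRequest` change above adds a `shard_parameters` field that is skipped during serialization whenever the parameters describe an unsharded tenant, so legacy single-shard creation requests keep their old wire shape. The following is a hedged, self-contained sketch of that `skip_serializing_if` behaviour using simplified stand-ins for `ShardCount` and `ShardParameters` (the field names mirror the diff, everything else here is illustrative).

use serde::Serialize;

// Simplified stand-ins, just to show the serialization behaviour.
#[derive(Serialize, Debug, Clone, Copy, PartialEq)]
struct ShardCount(u8);

#[derive(Serialize, Debug)]
struct ShardParameters {
    count: ShardCount,
    #[serde(skip_serializing_if = "Option::is_none")]
    stripe_size: Option<u32>,
}

impl ShardParameters {
    // Same shape as the real predicate referenced by skip_serializing_if.
    fn is_unsharded(&self) -> bool {
        self.count == ShardCount(0)
    }
}

impl Default for ShardParameters {
    fn default() -> Self {
        Self { count: ShardCount(0), stripe_size: None }
    }
}

#[derive(Serialize, Debug)]
struct TenantCreateRequest {
    new_tenant_id: String, // stand-in for TenantShardId
    #[serde(skip_serializing_if = "ShardParameters::is_unsharded")]
    shard_parameters: ShardParameters,
}

fn main() {
    // A legacy, unsharded request serializes without the field at all:
    // {"new_tenant_id":"tenant-a"}
    let unsharded = TenantCreateRequest {
        new_tenant_id: "tenant-a".to_string(), // placeholder id
        shard_parameters: ShardParameters::default(),
    };
    println!("{}", serde_json::to_string(&unsharded).unwrap());

    // A sharded request carries it explicitly:
    // {"new_tenant_id":"tenant-a","shard_parameters":{"count":4,"stripe_size":32768}}
    let sharded = TenantCreateRequest {
        new_tenant_id: "tenant-a".to_string(),
        shard_parameters: ShardParameters { count: ShardCount(4), stripe_size: Some(32768) },
    };
    println!("{}", serde_json::to_string(&sharded).unwrap());
}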

@@ -1,6 +1,9 @@
use std::{ops::RangeInclusive, str::FromStr};

use crate::key::{is_rel_block_key, Key};
use crate::{
key::{is_rel_block_key, Key},
models::ShardParameters,
};
use hex::FromHex;
use serde::{Deserialize, Serialize};
use thiserror;
@@ -85,6 +88,16 @@ impl TenantShardId {
pub fn is_unsharded(&self) -> bool {
self.shard_number == ShardNumber(0) && self.shard_count == ShardCount(0)
}

/// Convenience for dropping the tenant_id and just getting the ShardIndex: this
/// is useful when logging from code that is already in a span that includes tenant ID, to
/// keep messages reasonably terse.
pub fn to_index(&self) -> ShardIndex {
ShardIndex {
shard_number: self.shard_number,
shard_count: self.shard_count,
}
}
}

/// Formatting helper
@@ -333,7 +346,7 @@ const DEFAULT_STRIPE_SIZE: ShardStripeSize = ShardStripeSize(256 * 1024 / 8);
pub struct ShardIdentity {
pub number: ShardNumber,
pub count: ShardCount,
stripe_size: ShardStripeSize,
pub stripe_size: ShardStripeSize,
layout: ShardLayout,
}

@@ -403,6 +416,17 @@ impl ShardIdentity {
}
}

/// For use when creating ShardIdentity instances for new shards, where a creation request
/// specifies the ShardParameters that apply to all shards.
pub fn from_params(number: ShardNumber, params: &ShardParameters) -> Self {
Self {
number,
count: params.count,
layout: LAYOUT_V1,
stripe_size: params.stripe_size.unwrap_or(DEFAULT_STRIPE_SIZE),
}
}

fn is_broken(&self) -> bool {
self.layout == LAYOUT_BROKEN
}
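`ShardIdentity::from_params` above falls back to `DEFAULT_STRIPE_SIZE`, defined in the same hunk as `256 * 1024 / 8` pages. Assuming the usual 8 KiB pageserver page size (an assumption of this sketch, not something the hunk states), that default stripe covers 256 MiB of contiguous keyspace before striping moves to the next shard; a small sanity-check of that arithmetic:

// Sanity-check of the default stripe size shown above (256 * 1024 / 8 pages),
// assuming 8 KiB pages.
fn main() {
    let default_stripe_size_pages: u64 = 256 * 1024 / 8; // 32768 pages
    let page_size_bytes: u64 = 8 * 1024;
    let stripe_bytes = default_stripe_size_pages * page_size_bytes;
    assert_eq!(default_stripe_size_pages, 32768);
    assert_eq!(stripe_bytes, 256 * 1024 * 1024); // 256 MiB per stripe
    println!(
        "default stripe: {default_stripe_size_pages} pages = {} MiB",
        stripe_bytes / (1024 * 1024)
    );
}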

@@ -31,6 +31,9 @@ pub enum ApiError {
#[error("Shutting down")]
ShuttingDown,

#[error("Timeout")]
Timeout(Cow<'static, str>),

#[error(transparent)]
InternalServerError(anyhow::Error),
}
@@ -67,6 +70,10 @@ impl ApiError {
err.to_string(),
StatusCode::SERVICE_UNAVAILABLE,
),
ApiError::Timeout(err) => HttpErrorBody::response_from_msg_and_status(
err.to_string(),
StatusCode::REQUEST_TIMEOUT,
),
ApiError::InternalServerError(err) => HttpErrorBody::response_from_msg_and_status(
err.to_string(),
StatusCode::INTERNAL_SERVER_ERROR,

@@ -1,3 +1,4 @@
use std::num::ParseIntError;
use std::{fmt, str::FromStr};

use anyhow::Context;
@@ -374,6 +375,13 @@ impl fmt::Display for NodeId {
}
}

impl FromStr for NodeId {
type Err = ParseIntError;
fn from_str(s: &str) -> Result<Self, Self::Err> {
Ok(NodeId(u64::from_str(s)?))
}
}

#[cfg(test)]
mod tests {
use serde_assert::{Deserializer, Serializer, Token, Tokens};

@@ -1,4 +1,4 @@
use pageserver_api::models::*;
use pageserver_api::{models::*, shard::TenantShardId};
use reqwest::{IntoUrl, Method};
use utils::{
http::error::HttpErrorBody,
@@ -68,9 +68,9 @@ impl Client {

pub async fn tenant_details(
&self,
tenant_id: TenantId,
tenant_shard_id: TenantShardId,
) -> Result<pageserver_api::models::TenantDetails> {
let uri = format!("{}/v1/tenant/{tenant_id}", self.mgmt_api_endpoint);
let uri = format!("{}/v1/tenant/{tenant_shard_id}", self.mgmt_api_endpoint);
self.get(uri)
.await?
.json()
@@ -80,9 +80,12 @@ impl Client {

pub async fn list_timelines(
&self,
tenant_id: TenantId,
tenant_shard_id: TenantShardId,
) -> Result<Vec<pageserver_api::models::TimelineInfo>> {
let uri = format!("{}/v1/tenant/{tenant_id}/timeline", self.mgmt_api_endpoint);
let uri = format!(
"{}/v1/tenant/{tenant_shard_id}/timeline",
self.mgmt_api_endpoint
);
self.get(&uri)
.await?
.json()
@@ -166,14 +169,17 @@ impl Client {

pub async fn location_config(
&self,
tenant_id: TenantId,
tenant_shard_id: TenantShardId,
config: LocationConfig,
flush_ms: Option<std::time::Duration>,
) -> Result<()> {
let req_body = TenantLocationConfigRequest { tenant_id, config };
let req_body = TenantLocationConfigRequest {
tenant_id: tenant_shard_id,
config,
};
let path = format!(
"{}/v1/tenant/{}/location_config",
self.mgmt_api_endpoint, tenant_id
self.mgmt_api_endpoint, tenant_shard_id
);
let path = if let Some(flush_ms) = flush_ms {
format!("{}?flush_ms={}", path, flush_ms.as_millis())
@@ -199,4 +205,19 @@ impl Client {
.await
.map_err(Error::ReceiveBody)
}

pub async fn tenant_synthetic_size(
&self,
tenant_shard_id: TenantShardId,
) -> Result<TenantHistorySize> {
let uri = format!(
"{}/v1/tenant/{}/synthetic_size",
self.mgmt_api_endpoint, tenant_shard_id
);
self.get(&uri)
.await?
.json()
.await
.map_err(Error::ReceiveBody)
}
}
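With the client changes above, management-API calls address tenants by `TenantShardId` rather than `TenantId`, and a `tenant_synthetic_size` helper is added. Below is a hedged sketch of combining the two calls the way the `neon_local tenant status` command does, reading the physical and synthetic sizes for shard zero of an unsharded tenant; only methods visible in this diff are used, and constructing the `Client` (endpoint, auth) is left out of scope.

use pageserver_api::shard::TenantShardId;
use pageserver_client::mgmt_api;
use utils::id::TenantId;

// Sketch: fetch shard zero's current physical size and the tenant's synthetic
// size via the shard-aware management API client. Both values are Option<u64>
// in the underlying response types.
async fn shard_zero_sizes(
    client: &mgmt_api::Client,
    tenant_id: TenantId,
) -> mgmt_api::Result<(Option<u64>, Option<u64>)> {
    let shard_zero = TenantShardId::unsharded(tenant_id);
    let physical = client
        .tenant_details(shard_zero)
        .await?
        .tenant_info
        .current_physical_size;
    let synthetic = client.tenant_synthetic_size(shard_zero).await?.size;
    Ok((physical, synthetic))
}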

@@ -2,6 +2,7 @@

use std::sync::Arc;

use pageserver_api::shard::TenantShardId;
use tokio::task::JoinSet;
use utils::id::{TenantId, TenantTimelineId};

@@ -31,7 +32,10 @@ pub async fn get_pageserver_tenant_timelines_unsharded(
async move {
(
tenant_id,
mgmt_api_client.tenant_details(tenant_id).await.unwrap(),
mgmt_api_client
.tenant_details(TenantShardId::unsharded(tenant_id))
.await
.unwrap(),
)
}
});

@@ -15,10 +15,13 @@ use hyper::StatusCode;
use hyper::{Body, Request, Response, Uri};
use metrics::launch_timestamp::LaunchTimestamp;
use pageserver_api::models::TenantDetails;
use pageserver_api::models::TenantShardSplitRequest;
use pageserver_api::models::TenantShardSplitResponse;
use pageserver_api::models::{
DownloadRemoteLayersTaskSpawnRequest, LocationConfigMode, TenantAttachRequest,
TenantLoadRequest, TenantLocationConfigRequest,
};
use pageserver_api::shard::ShardCount;
use pageserver_api::shard::TenantShardId;
use remote_storage::GenericRemoteStorage;
use tenant_size_model::{SizeResult, StorageModel};
@@ -257,7 +260,7 @@ impl From<SetNewTenantConfigError> for ApiError {
SetNewTenantConfigError::GetTenant(tid) => {
ApiError::NotFound(anyhow!("tenant {}", tid).into())
}
e @ SetNewTenantConfigError::Persist(_) => {
e @ (SetNewTenantConfigError::Persist(_) | SetNewTenantConfigError::Other(_)) => {
ApiError::InternalServerError(anyhow::Error::new(e))
}
}
@@ -987,6 +990,25 @@ async fn tenant_size_handler(
)
}

async fn tenant_shard_split_handler(
mut request: Request<Body>,
_cancel: CancellationToken,
) -> Result<Response<Body>, ApiError> {
let req: TenantShardSplitRequest = json_request(&mut request).await?;

let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?;
let state = get_state(&request);
let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Warn);

let new_shards = state
.tenant_manager
.shard_split(tenant_shard_id, ShardCount(req.new_shard_count), &ctx)
.await
.map_err(ApiError::InternalServerError)?;

json_response(StatusCode::OK, TenantShardSplitResponse { new_shards })
}

async fn layer_map_info_handler(
request: Request<Body>,
_cancel: CancellationToken,
@@ -1151,6 +1173,7 @@ async fn tenant_create_handler(
state.conf,
tenant_conf,
target_tenant_id,
request_data.shard_parameters,
generation,
state.tenant_resources(),
&ctx,
@@ -1796,6 +1819,9 @@ pub fn make_router(
.put("/v1/tenant/config", |r| {
api_handler(r, update_tenant_config_handler)
})
.put("/v1/tenant/:tenant_shard_id/shard_split", |r| {
api_handler(r, tenant_shard_split_handler)
})
.get("/v1/tenant/:tenant_shard_id/config", |r| {
api_handler(r, get_tenant_config_handler)
})
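The router change above exposes shard splitting as `PUT /v1/tenant/:tenant_shard_id/shard_split`, taking a `TenantShardSplitRequest` body and returning the newly created child shards. A hedged sketch of invoking it with `reqwest` follows; the base URL is a placeholder, auth and retries are omitted, and real callers would normally be the attachment service rather than ad-hoc clients.

use pageserver_api::models::{TenantShardSplitRequest, TenantShardSplitResponse};

// Sketch: call the pageserver's shard_split endpoint for one parent shard.
// `base_url` is the pageserver's HTTP listen address (placeholder here) and
// `tenant_shard_id` is the printed form of the parent TenantShardId.
async fn split_shard(
    base_url: &str,
    tenant_shard_id: &str,
    new_shard_count: u8,
) -> anyhow::Result<TenantShardSplitResponse> {
    let client = reqwest::Client::new();
    let resp = client
        .put(format!("{base_url}/v1/tenant/{tenant_shard_id}/shard_split"))
        .json(&TenantShardSplitRequest { new_shard_count })
        .send()
        .await?
        .error_for_status()?;
    Ok(resp.json().await?)
}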
|
||||
@@ -17,6 +17,7 @@ use enumset::EnumSet;
|
||||
use futures::stream::FuturesUnordered;
|
||||
use futures::FutureExt;
|
||||
use futures::StreamExt;
|
||||
use pageserver_api::models::ShardParameters;
|
||||
use pageserver_api::models::TimelineState;
|
||||
use pageserver_api::shard::ShardIdentity;
|
||||
use pageserver_api::shard::TenantShardId;
|
||||
@@ -50,6 +51,7 @@ use self::metadata::TimelineMetadata;
|
||||
use self::mgr::GetActiveTenantError;
|
||||
use self::mgr::GetTenantError;
|
||||
use self::mgr::TenantsMap;
|
||||
use self::remote_timeline_client::upload::upload_index_part;
|
||||
use self::remote_timeline_client::RemoteTimelineClient;
|
||||
use self::timeline::uninit::TimelineExclusionError;
|
||||
use self::timeline::uninit::TimelineUninitMark;
|
||||
@@ -1530,6 +1532,7 @@ impl Tenant {
|
||||
})?;
|
||||
|
||||
if active_only && !timeline.is_active() {
|
||||
tracing::warn!("Timeline {} is not active", timeline.timeline_id);
|
||||
Err(GetTimelineError::NotActive {
|
||||
tenant_id: self.tenant_shard_id.tenant_id,
|
||||
timeline_id,
|
||||
@@ -2303,6 +2306,66 @@ impl Tenant {
|
||||
pub(crate) fn get_generation(&self) -> Generation {
|
||||
self.generation
|
||||
}
|
||||
|
||||
pub(crate) async fn split_prepare(
|
||||
&self,
|
||||
child_shards: &Vec<TenantShardId>,
|
||||
) -> anyhow::Result<()> {
|
||||
let timelines = self.timelines.lock().unwrap().clone();
|
||||
for timeline in timelines.values() {
|
||||
let Some(tl_client) = &timeline.remote_client else {
|
||||
anyhow::bail!("Remote storage is mandatory");
|
||||
};
|
||||
|
||||
let Some(remote_storage) = &self.remote_storage else {
|
||||
anyhow::bail!("Remote storage is mandatory");
|
||||
};
|
||||
|
||||
// TODO: some higher level should enforce that timeline creation/deletion does not
|
||||
// happen concurrently with splits. This is impossible to safely coordinate locally
|
||||
// within one single pageserver's view of the world.
|
||||
|
||||
// Upload an index from the parent: this is partly to provide freshness for the
|
||||
// child tenants that will copy it, and partly for general ease-of-debugging: there will
|
||||
// always be a parent shard index in the same generation as we wrote the child shard index.
|
||||
tl_client.schedule_index_upload_for_file_changes()?;
|
||||
tl_client.wait_completion().await?;
|
||||
|
||||
// Shut down the timeline's remote client: this means that the indices we write
|
||||
// for child shards will not be invalidated by the parent shard deleting layers.
|
||||
tl_client.shutdown().await?;
|
||||
|
||||
// Download methods can still be used after shutdown, as they don't flow through the remote client's
|
||||
// queue.
|
||||
// TODO: create a way for remote timeline client to give us a copy of the last IndexPart it uploaded
|
||||
// without having to download it again.
|
||||
// TODO: carry a cancellation token in here
|
||||
let result = tl_client
|
||||
.download_index_file(CancellationToken::new())
|
||||
.instrument(info_span!("download_index_file", tenant_id=%self.tenant_shard_id.tenant_id, shard_id=%self.tenant_shard_id.shard_slug(), timeline_id=%timeline.timeline_id))
|
||||
.await?;
|
||||
let index_part = match result {
|
||||
MaybeDeletedIndexPart::Deleted(_) => {
|
||||
anyhow::bail!("Timeline deletion happened concurrently with split")
|
||||
}
|
||||
MaybeDeletedIndexPart::IndexPart(p) => p,
|
||||
};
|
||||
|
||||
for child_shard in child_shards {
|
||||
upload_index_part(
|
||||
remote_storage,
|
||||
child_shard,
|
||||
&timeline.timeline_id,
|
||||
self.generation,
|
||||
&index_part,
|
||||
&self.cancel,
|
||||
)
|
||||
.await?;
|
||||
}
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
/// Given a Vec of timelines and their ancestors (timeline_id, ancestor_id),
|
||||
@@ -2646,10 +2709,11 @@ impl Tenant {
|
||||
}
|
||||
}
|
||||
|
||||
// Legacy configs are implicitly in attached state
|
||||
// Legacy configs are implicitly in attached state, and do not support sharding
|
||||
Ok(LocationConf::attached_single(
|
||||
tenant_conf,
|
||||
Generation::none(),
|
||||
&ShardParameters::default(),
|
||||
))
|
||||
} else {
|
||||
// FIXME If the config file is not found, assume that we're attaching
|
||||
@@ -3222,43 +3286,45 @@ impl Tenant {
|
||||
|
||||
// Upload the created data dir to S3
|
||||
if let Some(storage) = &self.remote_storage {
|
||||
let temp_path = timelines_path.join(format!(
|
||||
"{INITDB_PATH}.upload-{timeline_id}.{TEMP_FILE_SUFFIX}"
|
||||
));
|
||||
if self.tenant_shard_id().is_zero() {
|
||||
let temp_path = timelines_path.join(format!(
|
||||
"{INITDB_PATH}.upload-{timeline_id}.{TEMP_FILE_SUFFIX}"
|
||||
));
|
||||
|
||||
let (pgdata_zstd, tar_zst_size) =
|
||||
import_datadir::create_tar_zst(&pgdata_path, &temp_path).await?;
|
||||
backoff::retry(
|
||||
|| async {
|
||||
self::remote_timeline_client::upload_initdb_dir(
|
||||
storage,
|
||||
&self.tenant_shard_id.tenant_id,
|
||||
&timeline_id,
|
||||
pgdata_zstd.try_clone().await?,
|
||||
tar_zst_size,
|
||||
&self.cancel,
|
||||
)
|
||||
let (pgdata_zstd, tar_zst_size) =
|
||||
import_datadir::create_tar_zst(&pgdata_path, &temp_path).await?;
|
||||
backoff::retry(
|
||||
|| async {
|
||||
self::remote_timeline_client::upload_initdb_dir(
|
||||
storage,
|
||||
&self.tenant_shard_id.tenant_id,
|
||||
&timeline_id,
|
||||
pgdata_zstd.try_clone().await?,
|
||||
tar_zst_size,
|
||||
&self.cancel,
|
||||
)
|
||||
.await
|
||||
},
|
||||
|_| false,
|
||||
3,
|
||||
u32::MAX,
|
||||
"persist_initdb_tar_zst",
|
||||
backoff::Cancel::new(self.cancel.clone(), || anyhow::anyhow!("Cancelled")),
|
||||
)
|
||||
.await?;
|
||||
|
||||
tokio::fs::remove_file(&temp_path)
|
||||
.await
|
||||
},
|
||||
|_| false,
|
||||
3,
|
||||
u32::MAX,
|
||||
"persist_initdb_tar_zst",
|
||||
backoff::Cancel::new(self.cancel.clone(), || anyhow::anyhow!("Cancelled")),
|
||||
)
|
||||
.await?;
|
||||
|
||||
tokio::fs::remove_file(&temp_path)
|
||||
.await
|
||||
.or_else(|e| {
|
||||
if e.kind() == std::io::ErrorKind::NotFound {
|
||||
// If something else already removed the file, ignore the error
|
||||
Ok(())
|
||||
} else {
|
||||
Err(e)
|
||||
}
|
||||
})
|
||||
.with_context(|| format!("tempfile removal {temp_path}"))?;
|
||||
.or_else(|e| {
|
||||
if e.kind() == std::io::ErrorKind::NotFound {
|
||||
// If something else already removed the file, ignore the error
|
||||
Ok(())
|
||||
} else {
|
||||
Err(e)
|
||||
}
|
||||
})
|
||||
.with_context(|| format!("tempfile removal {temp_path}"))?;
|
||||
}
|
||||
}
|
||||
}
|
||||
let pgdata_lsn = import_datadir::get_lsn_from_controlfile(&pgdata_path)?.align();
|
||||
@@ -3616,6 +3682,10 @@ impl Tenant {
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub(crate) fn get_tenant_conf(&self) -> TenantConfOpt {
|
||||
self.tenant_conf.read().unwrap().tenant_conf
|
||||
}
|
||||
}
|
||||
|
||||
fn remove_timeline_and_uninit_mark(
|
||||
@@ -4057,6 +4127,7 @@ pub(crate) mod harness {
|
||||
AttachedTenantConf::try_from(LocationConf::attached_single(
|
||||
TenantConfOpt::from(self.tenant_conf),
|
||||
self.generation,
|
||||
&ShardParameters::default(),
|
||||
))
|
||||
.unwrap(),
|
||||
// This is a legacy/test code path: sharding isn't supported here.
|
||||
|
||||
@@ -9,7 +9,7 @@
|
||||
//! may lead to a data loss.
|
||||
//!
|
||||
use anyhow::bail;
|
||||
use pageserver_api::models;
|
||||
use pageserver_api::models::{self, ShardParameters};
|
||||
use pageserver_api::shard::{ShardCount, ShardIdentity, ShardNumber, ShardStripeSize};
|
||||
use serde::de::IntoDeserializer;
|
||||
use serde::{Deserialize, Serialize};
|
||||
@@ -167,14 +167,17 @@ impl LocationConf {
|
||||
/// For use when loading from a legacy configuration: presence of a tenant
|
||||
/// implies it is in AttachmentMode::Single, which used to be the only
|
||||
/// possible state. This function should eventually be removed.
|
||||
pub(crate) fn attached_single(tenant_conf: TenantConfOpt, generation: Generation) -> Self {
|
||||
pub(crate) fn attached_single(
|
||||
tenant_conf: TenantConfOpt,
|
||||
generation: Generation,
|
||||
shard_params: &ShardParameters,
|
||||
) -> Self {
|
||||
Self {
|
||||
mode: LocationMode::Attached(AttachedLocationConfig {
|
||||
generation,
|
||||
attach_mode: AttachmentMode::Single,
|
||||
}),
|
||||
// Legacy configuration loads are always from tenants created before sharding existed.
|
||||
shard: ShardIdentity::unsharded(),
|
||||
shard: ShardIdentity::from_params(ShardNumber(0), shard_params),
|
||||
tenant_conf,
|
||||
}
|
||||
}
|
||||
|
||||
@@ -2,8 +2,10 @@
|
||||
//! page server.
|
||||
|
||||
use camino::{Utf8DirEntry, Utf8Path, Utf8PathBuf};
|
||||
use itertools::Itertools;
|
||||
use pageserver_api::key::Key;
|
||||
use pageserver_api::shard::{ShardIdentity, ShardNumber, TenantShardId};
|
||||
use pageserver_api::models::ShardParameters;
|
||||
use pageserver_api::shard::{ShardCount, ShardIdentity, ShardNumber, TenantShardId};
|
||||
use rand::{distributions::Alphanumeric, Rng};
|
||||
use std::borrow::Cow;
|
||||
use std::collections::{BTreeMap, HashMap};
|
||||
@@ -20,7 +22,7 @@ use tokio_util::sync::CancellationToken;
|
||||
use tracing::*;
|
||||
|
||||
use remote_storage::GenericRemoteStorage;
|
||||
use utils::crashsafe;
|
||||
use utils::{completion, crashsafe};
|
||||
|
||||
use crate::config::PageServerConf;
|
||||
use crate::context::{DownloadBehavior, RequestContext};
|
||||
@@ -618,8 +620,6 @@ pub(crate) async fn shutdown_all_tenants() {
|
||||
}
|
||||
|
||||
async fn shutdown_all_tenants0(tenants: &std::sync::RwLock<TenantsMap>) {
|
||||
use utils::completion;
|
||||
|
||||
let mut join_set = JoinSet::new();
|
||||
|
||||
// Atomically, 1. create the shutdown tasks and 2. prevent creation of new tenants.
|
||||
@@ -745,13 +745,21 @@ pub(crate) async fn create_tenant(
conf: &'static PageServerConf,
tenant_conf: TenantConfOpt,
tenant_shard_id: TenantShardId,
shard_params: ShardParameters,
generation: Generation,
resources: TenantSharedResources,
ctx: &RequestContext,
) -> Result<Arc<Tenant>, TenantMapInsertError> {
let location_conf = LocationConf::attached_single(tenant_conf, generation);
let location_conf = LocationConf::attached_single(tenant_conf, generation, &shard_params);

info!("Creating tenant at location {location_conf:?}");

if shard_params.count != ShardCount(1) {
return Err(TenantMapInsertError::Other(anyhow::anyhow!(
"Only single-shard tenant creations may be serviced directly by a pageserver"
)));
}

let slot_guard =
tenant_map_acquire_slot(&tenant_shard_id, TenantSlotAcquireMode::MustNotExist)?;
let tenant_path = super::create_tenant_files(conf, &location_conf, &tenant_shard_id).await?;
@@ -786,6 +794,8 @@ pub(crate) enum SetNewTenantConfigError {
GetTenant(#[from] GetTenantError),
#[error(transparent)]
Persist(anyhow::Error),
#[error(transparent)]
Other(anyhow::Error),
}

pub(crate) async fn set_new_tenant_config(
@@ -799,10 +809,21 @@ pub(crate) async fn set_new_tenant_config(
info!("configuring tenant {tenant_id}");
let tenant = get_tenant(tenant_shard_id, true)?;

if tenant.tenant_shard_id().shard_count > ShardCount(0) {
// Note that we use ShardParameters::default below.
return Err(SetNewTenantConfigError::Other(anyhow::anyhow!(
"This API may only be used on single-sharded tenants, use the /location_config API for sharded tenants"
)));
}

// This is a legacy API that only operates on attached tenants: the preferred
// API to use is the location_config/ endpoint, which lets the caller provide
// the full LocationConf.
let location_conf = LocationConf::attached_single(new_tenant_conf, tenant.generation);
let location_conf = LocationConf::attached_single(
new_tenant_conf,
tenant.generation,
&ShardParameters::default(),
);

Tenant::persist_tenant_config(conf, &tenant_shard_id, &location_conf)
.await
@@ -1158,6 +1179,163 @@ impl TenantManager {
slot_guard.revert();
result
}

#[instrument(skip_all, fields(tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(), new_shard_count=%new_shard_count.0))]
pub(crate) async fn shard_split(
&self,
tenant_shard_id: TenantShardId,
new_shard_count: ShardCount,
ctx: &RequestContext,
) -> anyhow::Result<Vec<TenantShardId>> {
let tenant = get_tenant(tenant_shard_id, true)?;

// Plan: identify what the new child shards will be
let effective_old_shard_count = std::cmp::max(tenant_shard_id.shard_count.0, 1);
if new_shard_count <= ShardCount(effective_old_shard_count) {
anyhow::bail!("Requested shard count is not an increase");
}
let expansion_factor = new_shard_count.0 / effective_old_shard_count;
if expansion_factor & (expansion_factor - 1) != 0 {
anyhow::bail!("Requested split is not a power of two");
}

// Key mapping is based on a round robin mapping of key hash modulo shard count,
// so our child shards are the ones which the same keys would map to.
let mut child_shards = Vec::new();
for shard_number in 0..ShardNumber(new_shard_count.0).0 {
if shard_number % effective_old_shard_count == tenant_shard_id.shard_number.0 {
child_shards.push(TenantShardId {
tenant_id: tenant_shard_id.tenant_id,
shard_number: ShardNumber(shard_number),
shard_count: new_shard_count,
})
}
}

let parent_shard_identity = tenant.shard_identity;
let parent_tenant_conf = tenant.get_tenant_conf();
let parent_generation = tenant.generation;

// TODO: write a unit test for this
tracing::info!(
"Shard {} splits into: {}",
tenant_shard_id.to_index(),
child_shards
.iter()
.map(|id| format!("{}", id.to_index()))
.join(",")
);

// Phase 1: Write out child shards' remote index files, in the parent tenant's current generation
tenant.split_prepare(&child_shards).await?;

self.resources.deletion_queue_client.flush_advisory();

// Phase 2: Put the parent shard to InProgress and grab a reference to the parent Tenant
drop(tenant);
let mut parent_slot_guard =
tenant_map_acquire_slot(&tenant_shard_id, TenantSlotAcquireMode::Any)?;
let parent = match parent_slot_guard.get_old_value() {
Some(TenantSlot::Attached(t)) => t,
Some(TenantSlot::Secondary) => anyhow::bail!("Tenant location in secondary mode"),
Some(TenantSlot::InProgress(_)) => {
unreachable!()
}
None => {
// We don't actually need the parent shard to still be attached to do our work, but it's
// a weird enough situation that the caller probably didn't want us to continue working
// if they had detached the tenant they requested the split on.
anyhow::bail!("Detached parent shard in the middle of split!")
}
};

// TODO: hardlink layers from the parent into the child shard directories so that they don't immediately re-download
// TODO: erase the dentries from the parent

// Take a snapshot of where the parent's WAL ingest had got to: we will wait for
// child shards to reach this point.
let mut target_lsns = HashMap::new();
for timeline in parent.timelines.lock().unwrap().clone().values() {
target_lsns.insert(timeline.timeline_id, timeline.get_last_record_lsn());
}

// TODO: we should have the parent shard stop its WAL ingest here, it's a waste of resources
// and could slow down the children trying to catch up.

// Phase 3: Spawn the child shards
for child_shard in &child_shards {
let mut child_shard_identity = parent_shard_identity;
child_shard_identity.count = child_shard.shard_count;
child_shard_identity.number = child_shard.shard_number;

let child_location_conf = LocationConf {
mode: LocationMode::Attached(AttachedLocationConfig {
generation: parent_generation,
attach_mode: AttachmentMode::Single,
}),
shard: child_shard_identity,
tenant_conf: parent_tenant_conf,
};

self.upsert_location(*child_shard, child_location_conf, None, ctx)
.await?;
}

// Phase 4: wait for child chards WAL ingest to catch up to target LSN
for child_shard_id in &child_shards {
let child_shard = {
let locked = TENANTS.read().unwrap();
let peek_slot =
tenant_map_peek_slot(&locked, child_shard_id, TenantSlotPeekMode::Read)?;
peek_slot.and_then(|s| s.get_attached()).cloned()
};
if let Some(t) = child_shard {
let timelines = t.timelines.lock().unwrap().clone();
for timeline in timelines.values() {
let Some(target_lsn) = target_lsns.get(&timeline.timeline_id) else {
continue;
};

tracing::info!(
"Waiting for child shard {}/{} to reach target lsn {}...",
child_shard_id,
timeline.timeline_id,
target_lsn
);
if let Err(e) = timeline.wait_lsn(*target_lsn, ctx).await {
// Failure here might mean shutdown, in any case this part is an optimization
// and we shouldn't hold up the split operation.
tracing::warn!(
"Failed to wait for timeline {} to reach lsn {target_lsn}: {e}",
timeline.timeline_id
);
} else {
tracing::info!(
"Child shard {}/{} reached target lsn {}",
child_shard_id,
timeline.timeline_id,
target_lsn
);
}
}
}
}

// Phase 5: Shut down the parent shard.
let (_guard, progress) = completion::channel();
match parent.shutdown(progress, false).await {
Ok(()) => {}
Err(other) => {
other.wait().await;
}
}
parent_slot_guard.drop_old_value()?;

// Phase 6: Release the InProgress on the parent shard
drop(parent_slot_guard);

Ok(child_shards)
}
}

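The child-shard computation in the hunk above follows from the round-robin key-to-shard mapping: after a power-of-two split, the keys that previously hashed to parent shard `p` under the old count can only land on children whose shard number is congruent to `p` modulo the old count. A minimal standalone sketch of that derivation, assuming the same congruence rule (the function name `child_shard_numbers` and the plain `u8` parameters are hypothetical, not the pageserver's actual types):

```rust
/// Derive the child shard numbers for a power-of-two split.
fn child_shard_numbers(parent_number: u8, old_count: u8, new_count: u8) -> Result<Vec<u8>, String> {
    // Unsharded tenants (count 0) behave like a shard count of 1, as in the diff above.
    let old_count = old_count.max(1);
    if new_count <= old_count {
        return Err("requested shard count is not an increase".into());
    }
    let expansion = new_count / old_count;
    if expansion & (expansion - 1) != 0 {
        return Err("requested split is not a power of two".into());
    }
    // Children are the shard numbers that map back to this parent modulo the old count.
    Ok((0..new_count)
        .filter(|&n| n % old_count == parent_number)
        .collect())
}

fn main() {
    // Splitting shard 1 of 2 into 8 shards yields children 1, 3, 5 and 7.
    assert_eq!(child_shard_numbers(1, 2, 8).unwrap(), vec![1, 3, 5, 7]);
    println!("{:?}", child_shard_numbers(1, 2, 8).unwrap());
}
```

Because the mapping is stable under this rule, a child shard's remote index can be written in the parent's generation (Phase 1) before the child is ever attached.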
#[derive(Debug, thiserror::Error)]
@@ -1564,10 +1742,11 @@ pub(crate) async fn attach_tenant(
) -> Result<(), TenantMapInsertError> {
// This is a legacy API (replaced by `/location_conf`). It does not support sharding
let tenant_shard_id = TenantShardId::unsharded(tenant_id);
let shard_params = ShardParameters::default();

let slot_guard =
tenant_map_acquire_slot(&tenant_shard_id, TenantSlotAcquireMode::MustNotExist)?;
let location_conf = LocationConf::attached_single(tenant_conf, generation);
let location_conf = LocationConf::attached_single(tenant_conf, generation, &shard_params);
let tenant_dir = create_tenant_files(conf, &location_conf, &tenant_shard_id).await?;
// TODO: tenant directory remains on disk if we bail out from here on.
// See https://github.com/neondatabase/neon/issues/4233
@@ -2010,8 +2189,6 @@ async fn remove_tenant_from_memory<V, F>(
where
F: std::future::Future<Output = anyhow::Result<V>>,
{
use utils::completion;

let mut slot_guard =
tenant_map_acquire_slot_impl(&tenant_shard_id, tenants, TenantSlotAcquireMode::MustExist)?;

@@ -182,7 +182,7 @@

pub(crate) mod download;
pub mod index;
mod upload;
pub(crate) mod upload;

use anyhow::Context;
use camino::Utf8Path;
@@ -690,7 +690,10 @@ impl RemoteTimelineClient {
.insert(layer.layer_desc().filename(), metadata.clone());
upload_queue.latest_files_changes_since_metadata_upload_scheduled += 1;

info!("scheduled layer file upload {layer}");
info!(
"scheduled layer file upload {layer} gen={:?} shard={:?}",
metadata.generation, metadata.shard
);
let op = UploadOp::UploadLayer(layer, metadata);
self.calls_unfinished_metric_begin(&op);
upload_queue.queued_operations.push_back(op);

@@ -25,7 +25,7 @@ use super::index::LayerFileMetadata;
use tracing::info;

/// Serializes and uploads the given index part data to the remote storage.
pub(super) async fn upload_index_part<'a>(
pub(crate) async fn upload_index_part<'a>(
storage: &'a GenericRemoteStorage,
tenant_shard_id: &TenantShardId,
timeline_id: &TimelineId,

@@ -308,13 +308,13 @@ lfc_change_limit_hook(int newval, void *extra)
|
||||
Assert(victim->access_count == 0);
|
||||
#ifdef FALLOC_FL_PUNCH_HOLE
|
||||
if (fallocate(lfc_desc, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE, (off_t) victim->offset * BLOCKS_PER_CHUNK * BLCKSZ, BLOCKS_PER_CHUNK * BLCKSZ) < 0)
|
||||
elog(LOG, "Failed to punch hole in file: %m");
|
||||
neon_log(LOG, "Failed to punch hole in file: %m");
|
||||
#endif
|
||||
hash_search_with_hash_value(lfc_hash, &victim->key, victim->hash, HASH_REMOVE, NULL);
|
||||
lfc_ctl->used -= 1;
|
||||
}
|
||||
lfc_ctl->limit = new_size;
|
||||
elog(DEBUG1, "set local file cache limit to %d", new_size);
|
||||
neon_log(DEBUG1, "set local file cache limit to %d", new_size);
|
||||
|
||||
LWLockRelease(lfc_lock);
|
||||
}
|
||||
@@ -327,7 +327,7 @@ lfc_init(void)
|
||||
* shared_preload_libraries.
|
||||
*/
|
||||
if (!process_shared_preload_libraries_in_progress)
|
||||
elog(ERROR, "Neon module should be loaded via shared_preload_libraries");
|
||||
neon_log(ERROR, "Neon module should be loaded via shared_preload_libraries");
|
||||
|
||||
|
||||
DefineCustomIntVariable("neon.max_file_cache_size",
|
||||
@@ -643,7 +643,7 @@ lfc_write(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno, const void
|
||||
Assert(victim->access_count == 0);
|
||||
entry->offset = victim->offset; /* grab victim's chunk */
|
||||
hash_search_with_hash_value(lfc_hash, &victim->key, victim->hash, HASH_REMOVE, NULL);
|
||||
elog(DEBUG2, "Swap file cache page");
|
||||
neon_log(DEBUG2, "Swap file cache page");
|
||||
}
|
||||
else
|
||||
{
|
||||
@@ -846,10 +846,10 @@ local_cache_pages(PG_FUNCTION_ARGS)
|
||||
* wrong) function definition though.
|
||||
*/
|
||||
if (get_call_result_type(fcinfo, NULL, &expected_tupledesc) != TYPEFUNC_COMPOSITE)
|
||||
elog(ERROR, "return type must be a row type");
|
||||
neon_log(ERROR, "return type must be a row type");
|
||||
|
||||
if (expected_tupledesc->natts != NUM_LOCALCACHE_PAGES_ELEM)
|
||||
elog(ERROR, "incorrect number of output arguments");
|
||||
neon_log(ERROR, "incorrect number of output arguments");
|
||||
|
||||
/* Construct a tuple descriptor for the result rows. */
|
||||
tupledesc = CreateTemplateTupleDesc(expected_tupledesc->natts);
|
||||
|
||||
@@ -15,6 +15,7 @@
|
||||
#include "postgres.h"
|
||||
|
||||
#include "access/xlog.h"
|
||||
#include "common/hashfn.h"
|
||||
#include "fmgr.h"
|
||||
#include "libpq-fe.h"
|
||||
#include "libpq/libpq.h"
|
||||
@@ -38,17 +39,6 @@
|
||||
#define MIN_RECONNECT_INTERVAL_USEC 100
|
||||
#define MAX_RECONNECT_INTERVAL_USEC 1000000
|
||||
|
||||
bool connected = false;
|
||||
PGconn *pageserver_conn = NULL;
|
||||
|
||||
/*
|
||||
* WaitEventSet containing:
|
||||
* - WL_SOCKET_READABLE on pageserver_conn,
|
||||
* - WL_LATCH_SET on MyLatch, and
|
||||
* - WL_EXIT_ON_PM_DEATH.
|
||||
*/
|
||||
WaitEventSet *pageserver_conn_wes = NULL;
|
||||
|
||||
/* GUCs */
|
||||
char *neon_timeline;
|
||||
char *neon_tenant;
|
||||
@@ -59,92 +49,203 @@ char *neon_auth_token;
|
||||
int readahead_buffer_size = 128;
|
||||
int flush_every_n_requests = 8;
|
||||
|
||||
static int n_reconnect_attempts = 0;
|
||||
static int max_reconnect_attempts = 60;
|
||||
static int n_reconnect_attempts = 0;
|
||||
static int max_reconnect_attempts = 60;
|
||||
static int stripe_size;
|
||||
|
||||
#define MAX_PAGESERVER_CONNSTRING_SIZE 256
|
||||
bool (*old_redo_read_buffer_filter) (XLogReaderState *record, uint8 block_id) = NULL;
|
||||
|
||||
static bool pageserver_flush(shardno_t shard_no);
|
||||
static void pageserver_disconnect(shardno_t shard_no);
|
||||
static void AssignPageserverConnstring(const char *newval, void *extra);
|
||||
static bool CheckPageserverConnstring(char **newval, void **extra, GucSource source);
|
||||
|
||||
static shmem_startup_hook_type prev_shmem_startup_hook;
|
||||
#if PG_VERSION_NUM>=150000
|
||||
static shmem_request_hook_type prev_shmem_request_hook;
|
||||
#endif
|
||||
|
||||
/*
* ShardMap is kept in shared memory. It contains the connection strings for
* each shard.
*
* There is "neon.pageserver_connstring" GUC with PGC_SIGHUP option, allowing to change it using
* pg_reload_conf(). It is used by control plane to update shards information if page server is crashed,
* relocated or new shards are added. This GUC variable contains comma separated list of connection strings.
* It is copied to shared memory because config can not be loaded during query execution and we need to
* reestablish connection to page server.
*
* Copying connection string to shared memory is done by postmaster. And other backends
* should check update counter to determine of connection URL is changed and connection needs to be reestablished.
* We can not use standard Postgres LW-locks, because postmaster has proc entry and so can not wait
* on this primitive. This is why lockless access algorithm is implemented using two atomic counters to enforce
* consistent reading of connection string value from shared memory.
*/
typedef struct
{
size_t n_shards;
pg_atomic_uint64 begin_update_counter;
pg_atomic_uint64 end_update_counter;
char shard_connstr[MAX_SHARDS][MAX_PS_CONNSTR_LEN];
} ShardMap;

static ShardMap* shard_map;
static uint64 shard_map_update_counter;

typedef struct
{
LWLockId lock;
pg_atomic_uint64 update_counter;
char pageserver_connstring[MAX_PAGESERVER_CONNSTRING_SIZE];
} PagestoreShmemState;
/*
* Connection for each shard
*/
PGconn *conn;
/*
* WaitEventSet containing:
* - WL_SOCKET_READABLE on 'conn'
* - WL_LATCH_SET on MyLatch, and
* - WL_EXIT_ON_PM_DEATH.
*/
WaitEventSet *wes;
} PageServer;

#if PG_VERSION_NUM >= 150000
|
||||
static shmem_request_hook_type prev_shmem_request_hook = NULL;
|
||||
static void walproposer_shmem_request(void);
|
||||
static PageServer page_servers[MAX_SHARDS];
|
||||
|
||||
static void
|
||||
psm_shmem_startup(void)
|
||||
{
|
||||
bool found;
|
||||
if (prev_shmem_startup_hook)
|
||||
{
|
||||
prev_shmem_startup_hook();
|
||||
}
|
||||
|
||||
LWLockAcquire(AddinShmemInitLock, LW_EXCLUSIVE);
|
||||
|
||||
shard_map = (ShardMap*)ShmemInitStruct("shard_map", sizeof(ShardMap), &found);
|
||||
if (!found)
|
||||
{
|
||||
shard_map->n_shards = 0;
|
||||
pg_atomic_init_u64(&shard_map->begin_update_counter, 0);
|
||||
pg_atomic_init_u64(&shard_map->end_update_counter, 0);
|
||||
AssignPageserverConnstring(page_server_connstring, NULL);
|
||||
}
|
||||
LWLockRelease(AddinShmemInitLock);
|
||||
}
|
||||
|
||||
static void
|
||||
psm_shmem_request(void)
|
||||
{
|
||||
#if PG_VERSION_NUM>=150000
|
||||
if (prev_shmem_request_hook)
|
||||
prev_shmem_request_hook();
|
||||
#endif
|
||||
static shmem_startup_hook_type prev_shmem_startup_hook;
|
||||
static PagestoreShmemState *pagestore_shared;
|
||||
static uint64 pagestore_local_counter = 0;
|
||||
static char local_pageserver_connstring[MAX_PAGESERVER_CONNSTRING_SIZE];
|
||||
|
||||
static bool pageserver_flush(void);
|
||||
static void pageserver_disconnect(void);
|
||||
|
||||
static bool
|
||||
PagestoreShmemIsValid()
|
||||
{
|
||||
return pagestore_shared && UsedShmemSegAddr;
|
||||
}
|
||||
|
||||
static bool
|
||||
CheckPageserverConnstring(char **newval, void **extra, GucSource source)
|
||||
{
|
||||
return strlen(*newval) < MAX_PAGESERVER_CONNSTRING_SIZE;
|
||||
RequestAddinShmemSpace(sizeof(ShardMap));
|
||||
}
|
||||
|
||||
static void
|
||||
AssignPageserverConnstring(const char *newval, void *extra)
|
||||
psm_init(void)
|
||||
{
|
||||
if (!PagestoreShmemIsValid())
|
||||
return;
|
||||
LWLockAcquire(pagestore_shared->lock, LW_EXCLUSIVE);
|
||||
strlcpy(pagestore_shared->pageserver_connstring, newval, MAX_PAGESERVER_CONNSTRING_SIZE);
|
||||
pg_atomic_fetch_add_u64(&pagestore_shared->update_counter, 1);
|
||||
LWLockRelease(pagestore_shared->lock);
|
||||
prev_shmem_startup_hook = shmem_startup_hook;
|
||||
shmem_startup_hook = psm_shmem_startup;
|
||||
#if PG_VERSION_NUM>=150000
|
||||
prev_shmem_request_hook = shmem_request_hook;
|
||||
shmem_request_hook = psm_shmem_request;
|
||||
#else
|
||||
psm_shmem_request();
|
||||
#endif
|
||||
}
|
||||
|
||||
/*
* Reload shard map if needed and return number of shards and connection string for the specified shard
* 'connstr' is an output buffer. If not NULL, it must point to a buffer at least MAX_PS_CONNSTR_LEN bytes
* long. The connection string for the gven shard is copied to it.
*/
static shardno_t
load_shard_map(shardno_t shard_no, char* connstr)
{
shardno_t n_shards;
uint64 begin_update_counter;
uint64 end_update_counter;

/*
* There is race condition here between backend and postmaster which can update shard map.
* We recheck update counter after copying shard map to check that configuration was not changed.
*/
do
{
begin_update_counter = pg_atomic_read_u64(&shard_map->begin_update_counter);
end_update_counter = pg_atomic_read_u64(&shard_map->end_update_counter);

n_shards = shard_map->n_shards;
if (shard_no >= n_shards)
neon_log(ERROR, "Shard %d is greater or equal than number of shards %d", shard_no, n_shards);

if (connstr)
{
strlcpy(connstr, shard_map->shard_connstr[shard_no], MAX_PS_CONNSTR_LEN);
pg_memory_barrier();
}

}
while (begin_update_counter != end_update_counter
|| begin_update_counter != pg_atomic_read_u64(&shard_map->begin_update_counter)
|| end_update_counter != pg_atomic_read_u64(&shard_map->end_update_counter));

if (shard_map_update_counter != end_update_counter)
{
/* Reset all connections if connection strings are changed */
for (shardno_t i = 0; i < MAX_SHARDS; i++)
{
if (page_servers[i].conn)
pageserver_disconnect(i);
}
shard_map_update_counter = end_update_counter;
}

return n_shards;
}

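The double-counter loop in load_shard_map() above is essentially a seqlock: the writer bumps begin_update_counter before touching the map and end_update_counter after it, and a reader retries until both counters are equal and unchanged across its copy. A rough Rust rendition of the same counter discipline, as an illustration only (the extension does this in C over Postgres shared memory; orderings here are simplified to SeqCst and the payload is abstracted behind a closure):

```rust
use std::sync::atomic::{AtomicU64, Ordering};

/// Reader side: copy the data, then retry if a writer was active or ran
/// while we were copying (any counter mismatch or movement).
fn read_consistent<T>(begin: &AtomicU64, end: &AtomicU64, copy_data: impl Fn() -> T) -> T {
    loop {
        let b = begin.load(Ordering::SeqCst);
        let e = end.load(Ordering::SeqCst);
        let snapshot = copy_data();
        if b == e
            && b == begin.load(Ordering::SeqCst)
            && e == end.load(Ordering::SeqCst)
        {
            return snapshot;
        }
    }
}

/// Writer side: bump `begin`, mutate, then bump `end` so readers can detect
/// a torn copy. In the extension only the postmaster plays this role.
fn write_update(begin: &AtomicU64, end: &AtomicU64, mutate: impl FnOnce()) {
    begin.fetch_add(1, Ordering::SeqCst);
    mutate();
    end.fetch_add(1, Ordering::SeqCst);
}

fn main() {
    let (begin, end) = (AtomicU64::new(0), AtomicU64::new(0));
    let payload = AtomicU64::new(42); // stands in for the connstring array
    write_update(&begin, &end, || payload.store(43, Ordering::SeqCst));
    let seen = read_consistent(&begin, &end, || payload.load(Ordering::SeqCst));
    assert_eq!(seen, 43);
}
```

The extra per-backend shard_map_update_counter check then lets a backend drop all of its per-shard connections exactly once per observed map change.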
#define MB (1024*1024)

shardno_t
get_shard_number(BufferTag* tag)
{
shardno_t n_shards = load_shard_map(0, NULL);
uint32 hash;

#if PG_MAJORVERSION_NUM < 16
hash = murmurhash32(tag->rnode.relNode);
hash = hash_combine(hash, murmurhash32(tag->blockNum/stripe_size));
#else
hash = murmurhash32(tag->relNumber);
hash = hash_combine(hash, murmurhash32(tag->blockNum/stripe_size));
#endif

return hash % n_shards;
}

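get_shard_number() routes a page by hashing the relation number together with the block's stripe index (blockNum / stripe_size) and taking the result modulo the shard count, so whole stripes of a relation stay on one shard. A hedged Rust sketch of that routing rule, with a stand-in hash in place of Postgres' murmurhash32/hash_combine (a real client must use the same hash as the pageserver's ShardIdentity, which this sketch does not attempt):

```rust
use std::collections::hash_map::DefaultHasher;
use std::hash::{Hash, Hasher};

/// Stand-in hash combiner; any stable hash illustrates the idea.
fn hash_u32(value: u32, seed: u64) -> u64 {
    let mut h = DefaultHasher::new();
    seed.hash(&mut h);
    value.hash(&mut h);
    h.finish()
}

/// Map a (relation, block) pair to a shard: blocks in the same stripe of the
/// same relation land on the same shard.
fn shard_for_block(rel_number: u32, block_num: u32, stripe_size: u32, n_shards: u32) -> u32 {
    let h = hash_u32(rel_number, 0);
    let h = hash_u32(block_num / stripe_size, h);
    (h % n_shards as u64) as u32
}

fn main() {
    // Two blocks in the same 32768-block stripe route to the same shard.
    let a = shard_for_block(16384, 10, 32768, 4);
    let b = shard_for_block(16384, 20, 32768, 4);
    assert_eq!(a, b);
    println!("shard {a}");
}
```

The stripe size is a GUC (neon.stripe_size, defined further down), so the compute and the pageserver must agree on it for this mapping to be consistent.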
static bool
|
||||
CheckConnstringUpdated()
|
||||
{
|
||||
if (!PagestoreShmemIsValid())
|
||||
return false;
|
||||
return pagestore_local_counter < pg_atomic_read_u64(&pagestore_shared->update_counter);
|
||||
}
|
||||
|
||||
static void
|
||||
ReloadConnstring()
|
||||
{
|
||||
if (!PagestoreShmemIsValid())
|
||||
return;
|
||||
LWLockAcquire(pagestore_shared->lock, LW_SHARED);
|
||||
strlcpy(local_pageserver_connstring, pagestore_shared->pageserver_connstring, sizeof(local_pageserver_connstring));
|
||||
pagestore_local_counter = pg_atomic_read_u64(&pagestore_shared->update_counter);
|
||||
LWLockRelease(pagestore_shared->lock);
|
||||
}
|
||||
|
||||
static bool
|
||||
pageserver_connect(int elevel)
|
||||
pageserver_connect(shardno_t shard_no, int elevel)
|
||||
{
|
||||
char *query;
|
||||
int ret;
|
||||
const char *keywords[3];
|
||||
const char *values[3];
|
||||
int n;
|
||||
PGconn* conn;
|
||||
WaitEventSet *wes;
|
||||
char connstr[MAX_PS_CONNSTR_LEN];
|
||||
|
||||
static TimestampTz last_connect_time = 0;
|
||||
static uint64_t delay_us = MIN_RECONNECT_INTERVAL_USEC;
|
||||
TimestampTz now;
|
||||
uint64_t us_since_last_connect;
|
||||
|
||||
Assert(!connected);
|
||||
Assert(page_servers[shard_no].conn == NULL);
|
||||
|
||||
if (CheckConnstringUpdated())
|
||||
{
|
||||
ReloadConnstring();
|
||||
}
|
||||
(void)load_shard_map(shard_no, connstr); /* refresh shard map if needed */
|
||||
|
||||
now = GetCurrentTimestamp();
|
||||
us_since_last_connect = now - last_connect_time;
|
||||
@@ -180,76 +281,84 @@ pageserver_connect(int elevel)
|
||||
n++;
|
||||
}
|
||||
keywords[n] = "dbname";
|
||||
values[n] = local_pageserver_connstring;
|
||||
values[n] = connstr;
|
||||
n++;
|
||||
keywords[n] = NULL;
|
||||
values[n] = NULL;
|
||||
n++;
|
||||
pageserver_conn = PQconnectdbParams(keywords, values, 1);
|
||||
conn = PQconnectdbParams(keywords, values, 1);
|
||||
|
||||
if (PQstatus(pageserver_conn) == CONNECTION_BAD)
|
||||
if (PQstatus(conn) == CONNECTION_BAD)
|
||||
{
|
||||
char *msg = pchomp(PQerrorMessage(pageserver_conn));
|
||||
char *msg = pchomp(PQerrorMessage(conn));
|
||||
|
||||
PQfinish(pageserver_conn);
|
||||
pageserver_conn = NULL;
|
||||
PQfinish(conn);
|
||||
|
||||
ereport(elevel,
|
||||
(errcode(ERRCODE_SQLCLIENT_UNABLE_TO_ESTABLISH_SQLCONNECTION),
|
||||
errmsg(NEON_TAG "could not establish connection to pageserver"),
|
||||
errmsg(NEON_TAG "[shard %d] could not establish connection to pageserver", shard_no),
|
||||
errdetail_internal("%s", msg)));
|
||||
pfree(msg);
|
||||
return false;
|
||||
}
|
||||
|
||||
query = psprintf("pagestream %s %s", neon_tenant, neon_timeline);
|
||||
ret = PQsendQuery(pageserver_conn, query);
|
||||
ret = PQsendQuery(conn, query);
|
||||
pfree(query);
|
||||
if (ret != 1)
|
||||
{
|
||||
PQfinish(pageserver_conn);
|
||||
pageserver_conn = NULL;
|
||||
neon_log(elevel, "could not send pagestream command to pageserver");
|
||||
PQfinish(conn);
|
||||
neon_shard_log(shard_no, elevel, "could not send pagestream command to pageserver");
|
||||
return false;
|
||||
}
|
||||
|
||||
pageserver_conn_wes = CreateWaitEventSet(TopMemoryContext, 3);
|
||||
AddWaitEventToSet(pageserver_conn_wes, WL_LATCH_SET, PGINVALID_SOCKET,
|
||||
wes = CreateWaitEventSet(TopMemoryContext, 3);
|
||||
AddWaitEventToSet(wes, WL_LATCH_SET, PGINVALID_SOCKET,
|
||||
MyLatch, NULL);
|
||||
AddWaitEventToSet(pageserver_conn_wes, WL_EXIT_ON_PM_DEATH, PGINVALID_SOCKET,
|
||||
AddWaitEventToSet(wes, WL_EXIT_ON_PM_DEATH, PGINVALID_SOCKET,
|
||||
NULL, NULL);
|
||||
AddWaitEventToSet(pageserver_conn_wes, WL_SOCKET_READABLE, PQsocket(pageserver_conn), NULL, NULL);
|
||||
AddWaitEventToSet(wes, WL_SOCKET_READABLE, PQsocket(conn), NULL, NULL);
|
||||
|
||||
while (PQisBusy(pageserver_conn))
|
||||
PG_TRY();
|
||||
{
|
||||
WaitEvent event;
|
||||
|
||||
/* Sleep until there's something to do */
|
||||
(void) WaitEventSetWait(pageserver_conn_wes, -1L, &event, 1, PG_WAIT_EXTENSION);
|
||||
ResetLatch(MyLatch);
|
||||
|
||||
CHECK_FOR_INTERRUPTS();
|
||||
|
||||
/* Data available in socket? */
|
||||
if (event.events & WL_SOCKET_READABLE)
|
||||
while (PQisBusy(conn))
|
||||
{
|
||||
if (!PQconsumeInput(pageserver_conn))
|
||||
WaitEvent event;
|
||||
|
||||
/* Sleep until there's something to do */
|
||||
(void) WaitEventSetWait(wes, -1L, &event, 1, PG_WAIT_EXTENSION);
|
||||
ResetLatch(MyLatch);
|
||||
|
||||
CHECK_FOR_INTERRUPTS();
|
||||
|
||||
/* Data available in socket? */
|
||||
if (event.events & WL_SOCKET_READABLE)
|
||||
{
|
||||
char *msg = pchomp(PQerrorMessage(pageserver_conn));
|
||||
if (!PQconsumeInput(conn))
|
||||
{
|
||||
char *msg = pchomp(PQerrorMessage(conn));
|
||||
|
||||
PQfinish(pageserver_conn);
|
||||
pageserver_conn = NULL;
|
||||
FreeWaitEventSet(pageserver_conn_wes);
|
||||
pageserver_conn_wes = NULL;
|
||||
PQfinish(conn);
|
||||
FreeWaitEventSet(wes);
|
||||
|
||||
neon_log(elevel, "could not complete handshake with pageserver: %s",
|
||||
msg);
|
||||
return false;
|
||||
neon_shard_log(shard_no, elevel, "could not complete handshake with pageserver: %s",
|
||||
msg);
|
||||
return false;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
PG_CATCH();
|
||||
{
|
||||
PQfinish(conn);
|
||||
FreeWaitEventSet(wes);
|
||||
PG_RE_THROW();
|
||||
}
|
||||
PG_END_TRY();
|
||||
|
||||
neon_log(LOG, "libpagestore: connected to '%s'", page_server_connstring);
|
||||
neon_shard_log(shard_no, LOG, "libpagestore: connected to '%s'", connstr);
|
||||
page_servers[shard_no].conn = conn;
|
||||
page_servers[shard_no].wes = wes;
|
||||
|
||||
connected = true;
|
||||
return true;
|
||||
}
|
||||
|
||||
@@ -257,10 +366,10 @@ pageserver_connect(int elevel)
|
||||
* A wrapper around PQgetCopyData that checks for interrupts while sleeping.
|
||||
*/
|
||||
static int
|
||||
call_PQgetCopyData(char **buffer)
|
||||
call_PQgetCopyData(shardno_t shard_no, char **buffer)
|
||||
{
|
||||
int ret;
|
||||
|
||||
PGconn* pageserver_conn = page_servers[shard_no].conn;
|
||||
retry:
|
||||
ret = PQgetCopyData(pageserver_conn, buffer, 1 /* async */ );
|
||||
|
||||
@@ -269,7 +378,7 @@ retry:
|
||||
WaitEvent event;
|
||||
|
||||
/* Sleep until there's something to do */
|
||||
(void) WaitEventSetWait(pageserver_conn_wes, -1L, &event, 1, PG_WAIT_EXTENSION);
|
||||
(void) WaitEventSetWait(page_servers[shard_no].wes, -1L, &event, 1, PG_WAIT_EXTENSION);
|
||||
ResetLatch(MyLatch);
|
||||
|
||||
CHECK_FOR_INTERRUPTS();
|
||||
@@ -281,7 +390,7 @@ retry:
|
||||
{
|
||||
char *msg = pchomp(PQerrorMessage(pageserver_conn));
|
||||
|
||||
neon_log(LOG, "could not get response from pageserver: %s", msg);
|
||||
neon_shard_log(shard_no, LOG, "could not get response from pageserver: %s", msg);
|
||||
pfree(msg);
|
||||
return -1;
|
||||
}
|
||||
@@ -295,7 +404,7 @@ retry:
|
||||
|
||||
|
||||
static void
|
||||
pageserver_disconnect(void)
|
||||
pageserver_disconnect(shardno_t shard_no)
|
||||
{
|
||||
/*
|
||||
* If anything goes wrong while we were sending a request, it's not clear
|
||||
@@ -304,38 +413,36 @@ pageserver_disconnect(void)
|
||||
* time later after we have already sent a new unrelated request. Close
|
||||
* the connection to avoid getting confused.
|
||||
*/
|
||||
if (connected)
|
||||
if (page_servers[shard_no].conn)
|
||||
{
|
||||
neon_log(LOG, "dropping connection to page server due to error");
|
||||
PQfinish(pageserver_conn);
|
||||
pageserver_conn = NULL;
|
||||
connected = false;
|
||||
neon_shard_log(shard_no, LOG, "dropping connection to page server due to error");
|
||||
PQfinish(page_servers[shard_no].conn);
|
||||
page_servers[shard_no].conn = NULL;
|
||||
|
||||
/*
|
||||
* If the connection to any pageserver is lost, we throw away the whole prefetch queue, even for other pageservers.
|
||||
* It should not cause big problems, because connection loss is supposed to be a rare event.
|
||||
*/
|
||||
prefetch_on_ps_disconnect();
|
||||
}
|
||||
if (pageserver_conn_wes != NULL)
|
||||
if (page_servers[shard_no].wes != NULL)
|
||||
{
|
||||
FreeWaitEventSet(pageserver_conn_wes);
|
||||
pageserver_conn_wes = NULL;
|
||||
FreeWaitEventSet(page_servers[shard_no].wes);
|
||||
page_servers[shard_no].wes = NULL;
|
||||
}
|
||||
}
|
||||
|
||||
static bool
|
||||
pageserver_send(NeonRequest *request)
|
||||
pageserver_send(shardno_t shard_no, NeonRequest *request)
|
||||
{
|
||||
StringInfoData req_buff;
|
||||
|
||||
if (CheckConnstringUpdated())
|
||||
{
|
||||
pageserver_disconnect();
|
||||
ReloadConnstring();
|
||||
}
|
||||
PGconn* pageserver_conn = page_servers[shard_no].conn;
|
||||
|
||||
/* If the connection was lost for some reason, reconnect */
|
||||
if (connected && PQstatus(pageserver_conn) == CONNECTION_BAD)
|
||||
if (pageserver_conn && PQstatus(pageserver_conn) == CONNECTION_BAD)
|
||||
{
|
||||
neon_log(LOG, "pageserver_send disconnect bad connection");
|
||||
pageserver_disconnect();
|
||||
neon_shard_log(shard_no, LOG, "pageserver_send disconnect bad connection");
|
||||
pageserver_disconnect(shard_no);
|
||||
}
|
||||
|
||||
req_buff = nm_pack_request(request);
|
||||
@@ -349,9 +456,9 @@ pageserver_send(NeonRequest *request)
|
||||
* https://github.com/neondatabase/neon/issues/1138 So try to reestablish
|
||||
* connection in case of failure.
|
||||
*/
|
||||
if (!connected)
|
||||
if (!page_servers[shard_no].conn)
|
||||
{
|
||||
while (!pageserver_connect(n_reconnect_attempts < max_reconnect_attempts ? LOG : ERROR))
|
||||
while (!pageserver_connect(shard_no, n_reconnect_attempts < max_reconnect_attempts ? LOG : ERROR))
|
||||
{
|
||||
HandleMainLoopInterrupts();
|
||||
n_reconnect_attempts += 1;
|
||||
@@ -359,7 +466,9 @@ pageserver_send(NeonRequest *request)
|
||||
n_reconnect_attempts = 0;
|
||||
}
|
||||
|
||||
/*
|
||||
pageserver_conn = page_servers[shard_no].conn;
|
||||
|
||||
/*
|
||||
* Send request.
|
||||
*
|
||||
* In principle, this could block if the output buffer is full, and we
|
||||
@@ -370,9 +479,8 @@ pageserver_send(NeonRequest *request)
|
||||
if (PQputCopyData(pageserver_conn, req_buff.data, req_buff.len) <= 0)
|
||||
{
|
||||
char *msg = pchomp(PQerrorMessage(pageserver_conn));
|
||||
|
||||
pageserver_disconnect();
|
||||
neon_log(LOG, "pageserver_send disconnect because failed to send page request (try to reconnect): %s", msg);
|
||||
pageserver_disconnect(shard_no);
|
||||
neon_shard_log(shard_no, LOG, "pageserver_send disconnect because failed to send page request (try to reconnect): %s", msg);
|
||||
pfree(msg);
|
||||
pfree(req_buff.data);
|
||||
return false;
|
||||
@@ -384,19 +492,19 @@ pageserver_send(NeonRequest *request)
|
||||
{
|
||||
char *msg = nm_to_string((NeonMessage *) request);
|
||||
|
||||
neon_log(PageStoreTrace, "sent request: %s", msg);
|
||||
neon_shard_log(shard_no, PageStoreTrace, "sent request: %s", msg);
|
||||
pfree(msg);
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
static NeonResponse *
|
||||
pageserver_receive(void)
|
||||
pageserver_receive(shardno_t shard_no)
|
||||
{
|
||||
StringInfoData resp_buff;
|
||||
NeonResponse *resp;
|
||||
|
||||
if (!connected)
|
||||
PGconn* pageserver_conn = page_servers[shard_no].conn;
|
||||
if (!pageserver_conn)
|
||||
return NULL;
|
||||
|
||||
PG_TRY();
|
||||
@@ -404,7 +512,7 @@ pageserver_receive(void)
|
||||
/* read response */
|
||||
int rc;
|
||||
|
||||
rc = call_PQgetCopyData(&resp_buff.data);
|
||||
rc = call_PQgetCopyData(shard_no, &resp_buff.data);
|
||||
if (rc >= 0)
|
||||
{
|
||||
resp_buff.len = rc;
|
||||
@@ -416,33 +524,33 @@ pageserver_receive(void)
|
||||
{
|
||||
char *msg = nm_to_string((NeonMessage *) resp);
|
||||
|
||||
neon_log(PageStoreTrace, "got response: %s", msg);
|
||||
neon_shard_log(shard_no, PageStoreTrace, "got response: %s", msg);
|
||||
pfree(msg);
|
||||
}
|
||||
}
|
||||
else if (rc == -1)
|
||||
{
|
||||
neon_log(LOG, "pageserver_receive disconnect because call_PQgetCopyData returns -1: %s", pchomp(PQerrorMessage(pageserver_conn)));
|
||||
pageserver_disconnect();
|
||||
neon_shard_log(shard_no, LOG, "pageserver_receive disconnect because call_PQgetCopyData returns -1: %s", pchomp(PQerrorMessage(pageserver_conn)));
|
||||
pageserver_disconnect(shard_no);
|
||||
resp = NULL;
|
||||
}
|
||||
else if (rc == -2)
|
||||
{
|
||||
char *msg = pchomp(PQerrorMessage(pageserver_conn));
|
||||
|
||||
pageserver_disconnect();
|
||||
neon_log(ERROR, "pageserver_receive disconnect because could not read COPY data: %s", msg);
|
||||
pageserver_disconnect(shard_no);
|
||||
neon_shard_log(shard_no, ERROR, "pageserver_receive disconnect because could not read COPY data: %s", msg);
|
||||
}
|
||||
else
|
||||
{
|
||||
pageserver_disconnect();
|
||||
neon_log(ERROR, "pageserver_receive disconnect because unexpected PQgetCopyData return value: %d", rc);
|
||||
pageserver_disconnect(shard_no);
|
||||
neon_shard_log(shard_no, ERROR, "pageserver_receive disconnect because unexpected PQgetCopyData return value: %d", rc);
|
||||
}
|
||||
}
|
||||
PG_CATCH();
|
||||
{
|
||||
neon_log(LOG, "pageserver_receive disconnect due to caught exception");
|
||||
pageserver_disconnect();
|
||||
neon_shard_log(shard_no, LOG, "pageserver_receive disconnect due to caught exception");
|
||||
pageserver_disconnect(shard_no);
|
||||
PG_RE_THROW();
|
||||
}
|
||||
PG_END_TRY();
|
||||
@@ -452,11 +560,12 @@ pageserver_receive(void)
|
||||
|
||||
|
||||
static bool
|
||||
pageserver_flush(void)
|
||||
pageserver_flush(shardno_t shard_no)
|
||||
{
|
||||
if (!connected)
|
||||
PGconn* pageserver_conn = page_servers[shard_no].conn;
|
||||
if (!pageserver_conn)
|
||||
{
|
||||
neon_log(WARNING, "Tried to flush while disconnected");
|
||||
neon_shard_log(shard_no, WARNING, "Tried to flush while disconnected");
|
||||
}
|
||||
else
|
||||
{
|
||||
@@ -464,8 +573,8 @@ pageserver_flush(void)
|
||||
{
|
||||
char *msg = pchomp(PQerrorMessage(pageserver_conn));
|
||||
|
||||
pageserver_disconnect();
|
||||
neon_log(LOG, "pageserver_flush disconnect because failed to flush page requests: %s", msg);
|
||||
pageserver_disconnect(shard_no);
|
||||
neon_shard_log(shard_no, LOG, "pageserver_flush disconnect because failed to flush page requests: %s", msg);
|
||||
pfree(msg);
|
||||
return false;
|
||||
}
|
||||
@@ -488,63 +597,82 @@ check_neon_id(char **newval, void **extra, GucSource source)
|
||||
return **newval == '\0' || HexDecodeString(id, *newval, 16);
|
||||
}
|
||||
|
||||
static Size
|
||||
PagestoreShmemSize(void)
|
||||
{
|
||||
return sizeof(PagestoreShmemState);
|
||||
}
|
||||
|
||||
static bool
|
||||
PagestoreShmemInit(void)
|
||||
CheckPageserverConnstring(char **newval, void **extra, GucSource source)
|
||||
{
|
||||
bool found;
|
||||
|
||||
LWLockAcquire(AddinShmemInitLock, LW_EXCLUSIVE);
|
||||
pagestore_shared = ShmemInitStruct("libpagestore shared state",
|
||||
PagestoreShmemSize(),
|
||||
&found);
|
||||
if (!found)
|
||||
const char* shard_connstr = *newval;
|
||||
const char* sep;
|
||||
size_t connstr_len;
|
||||
int i = 0;
|
||||
do
|
||||
{
|
||||
pagestore_shared->lock = &(GetNamedLWLockTranche("neon_libpagestore")->lock);
|
||||
pg_atomic_init_u64(&pagestore_shared->update_counter, 0);
|
||||
AssignPageserverConnstring(page_server_connstring, NULL);
|
||||
sep = strchr(shard_connstr, ',');
|
||||
connstr_len = sep != NULL ? sep - shard_connstr : strlen(shard_connstr);
|
||||
if (connstr_len == 0 && sep == NULL)
|
||||
break; /* trailing comma */
|
||||
if (i >= MAX_SHARDS)
|
||||
{
|
||||
neon_log(LOG, "Too many shards");
|
||||
return false;
|
||||
}
|
||||
if (connstr_len >= MAX_PS_CONNSTR_LEN)
|
||||
{
|
||||
neon_log(LOG, "Connection string too long");
|
||||
return false;
|
||||
}
|
||||
shard_connstr = sep + 1;
|
||||
i += 1;
|
||||
} while (sep != NULL);
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
static void
|
||||
AssignPageserverConnstring(const char *newval, void *extra)
|
||||
{
|
||||
/*
|
||||
* Load shard map only at Postmaster.
|
||||
* If old page server is not available, then backends can be blocked in attempts to reconnect to it and do not reload config in this loop
|
||||
*
|
||||
* Copying GUC value to shared memory is usually performed by postmaster.
|
||||
*/
|
||||
if (shard_map != NULL && UsedShmemSegAddr != NULL && MyProcPid == PostmasterPid)
|
||||
{
|
||||
const char* shard_connstr = newval;
|
||||
const char* sep;
|
||||
size_t connstr_len;
|
||||
int i = 0;
|
||||
bool shard_map_changed = false;
|
||||
do
|
||||
{
|
||||
sep = strchr(shard_connstr, ',');
|
||||
connstr_len = sep != NULL ? sep - shard_connstr : strlen(shard_connstr);
|
||||
if (connstr_len == 0)
|
||||
break; /* trailing comma */
|
||||
Assert(i < MAX_SHARDS);
|
||||
Assert(connstr_len < MAX_PS_CONNSTR_LEN);
|
||||
if (i >= shard_map->n_shards ||
|
||||
strcmp(shard_map->shard_connstr[i], shard_connstr) != 0)
|
||||
{
|
||||
if (!shard_map_changed)
|
||||
{
|
||||
pg_atomic_add_fetch_u64(&shard_map->begin_update_counter, 1);
|
||||
shard_map_changed = true;
|
||||
}
|
||||
memcpy(shard_map->shard_connstr[i], shard_connstr, connstr_len);
|
||||
shard_map->shard_connstr[i][connstr_len] = '\0';
|
||||
}
|
||||
shard_connstr = sep + 1;
|
||||
i += 1;
|
||||
} while (sep != NULL);
|
||||
|
||||
if (shard_map_changed)
|
||||
{
|
||||
shard_map->n_shards = i;
|
||||
pg_memory_barrier();
|
||||
pg_atomic_add_fetch_u64(&shard_map->end_update_counter, 1);
|
||||
}
|
||||
}
|
||||
LWLockRelease(AddinShmemInitLock);
|
||||
return found;
|
||||
}
|
||||
|
||||
static void
|
||||
pagestore_shmem_startup_hook(void)
|
||||
{
|
||||
if (prev_shmem_startup_hook)
|
||||
prev_shmem_startup_hook();
|
||||
|
||||
PagestoreShmemInit();
|
||||
}
|
||||
|
||||
static void
|
||||
pagestore_shmem_request(void)
|
||||
{
|
||||
#if PG_VERSION_NUM >= 150000
|
||||
if (prev_shmem_request_hook)
|
||||
prev_shmem_request_hook();
|
||||
#endif
|
||||
|
||||
RequestAddinShmemSpace(PagestoreShmemSize());
|
||||
RequestNamedLWLockTranche("neon_libpagestore", 1);
|
||||
}
|
||||
|
||||
static void
|
||||
pagestore_prepare_shmem(void)
|
||||
{
|
||||
#if PG_VERSION_NUM >= 150000
|
||||
prev_shmem_request_hook = shmem_request_hook;
|
||||
shmem_request_hook = pagestore_shmem_request;
|
||||
#else
|
||||
pagestore_shmem_request();
|
||||
#endif
|
||||
prev_shmem_startup_hook = shmem_startup_hook;
|
||||
shmem_startup_hook = pagestore_shmem_startup_hook;
|
||||
}
|
||||
|
||||
/*
|
||||
@@ -553,8 +681,6 @@ pagestore_prepare_shmem(void)
|
||||
void
|
||||
pg_init_libpagestore(void)
|
||||
{
|
||||
pagestore_prepare_shmem();
|
||||
|
||||
DefineCustomStringVariable("neon.pageserver_connstring",
|
||||
"connection string to the page server",
|
||||
NULL,
|
||||
@@ -582,6 +708,15 @@ pg_init_libpagestore(void)
|
||||
0, /* no flags required */
|
||||
check_neon_id, NULL, NULL);
|
||||
|
||||
DefineCustomIntVariable("neon.stripe_size",
|
||||
"sharding stripe size",
|
||||
NULL,
|
||||
&stripe_size,
|
||||
32768, 1, INT_MAX,
|
||||
PGC_SIGHUP,
|
||||
GUC_UNIT_BLOCKS,
|
||||
NULL, NULL, NULL);
|
||||
|
||||
DefineCustomIntVariable("neon.max_cluster_size",
|
||||
"cluster size limit",
|
||||
NULL,
|
||||
@@ -645,4 +780,5 @@ pg_init_libpagestore(void)
|
||||
}
|
||||
|
||||
lfc_init();
|
||||
psm_init();
|
||||
}
|
||||
|
||||
@@ -17,12 +17,20 @@
|
||||
|
||||
#include "access/xlogdefs.h"
|
||||
#include RELFILEINFO_HDR
|
||||
#include "storage/block.h"
|
||||
#include "storage/smgr.h"
|
||||
#include "storage/buf_internals.h"
|
||||
#include "lib/stringinfo.h"
|
||||
#include "libpq/pqformat.h"
|
||||
#include "storage/block.h"
|
||||
#include "storage/smgr.h"
|
||||
#include "utils/memutils.h"
|
||||
|
||||
#include "pg_config.h"
|
||||
|
||||
#define MAX_SHARDS 128
|
||||
#define MAX_PS_CONNSTR_LEN 128
|
||||
|
||||
typedef enum
|
||||
{
|
||||
/* pagestore_client -> pagestore */
|
||||
@@ -51,6 +59,9 @@ typedef struct
|
||||
#define neon_log(tag, fmt, ...) ereport(tag, \
|
||||
(errmsg(NEON_TAG fmt, ##__VA_ARGS__), \
|
||||
errhidestmt(true), errhidecontext(true), errposition(0), internalerrposition(0)))
|
||||
#define neon_shard_log(shard_no, tag, fmt, ...) ereport(tag, \
|
||||
(errmsg(NEON_TAG "[shard %d] " fmt, shard_no, ##__VA_ARGS__), \
|
||||
errhidestmt(true), errhidecontext(true), errposition(0), internalerrposition(0)))
|
||||
|
||||
/*
|
||||
* supertype of all the Neon*Request structs below
|
||||
@@ -141,11 +152,13 @@ extern char *nm_to_string(NeonMessage *msg);
|
||||
* API
|
||||
*/
|
||||
|
||||
typedef unsigned shardno_t;
|
||||
|
||||
typedef struct
|
||||
{
|
||||
bool (*send) (NeonRequest *request);
|
||||
NeonResponse *(*receive) (void);
|
||||
bool (*flush) (void);
|
||||
bool (*send) (shardno_t shard_no, NeonRequest * request);
|
||||
NeonResponse *(*receive) (shardno_t shard_no);
|
||||
bool (*flush) (shardno_t shard_no);
|
||||
} page_server_api;
|
||||
|
||||
extern void prefetch_on_ps_disconnect(void);
|
||||
@@ -159,6 +172,8 @@ extern char *neon_timeline;
|
||||
extern char *neon_tenant;
|
||||
extern int32 max_cluster_size;
|
||||
|
||||
extern shardno_t get_shard_number(BufferTag* tag);
|
||||
|
||||
extern const f_smgr *smgr_neon(BackendId backend, NRelFileInfo rinfo);
|
||||
extern void smgr_init_neon(void);
|
||||
extern void readahead_buffer_resize(int newsize, void *extra);
|
||||
|
||||
@@ -172,6 +172,7 @@ typedef struct PrefetchRequest
|
||||
XLogRecPtr actual_request_lsn;
|
||||
NeonResponse *response; /* may be null */
|
||||
PrefetchStatus status;
|
||||
shardno_t shard_no;
|
||||
uint64 my_ring_index;
|
||||
} PrefetchRequest;
|
||||
|
||||
@@ -239,7 +240,9 @@ typedef struct PrefetchState
|
||||
* also unused */
|
||||
|
||||
/* the buffers */
|
||||
prfh_hash *prf_hash;
|
||||
prfh_hash *prf_hash;
|
||||
int max_shard_no;
|
||||
uint8 shard_bitmap[(MAX_SHARDS + 7)/8];
|
||||
PrefetchRequest prf_buffer[]; /* prefetch buffers */
|
||||
} PrefetchState;
|
||||
|
||||
@@ -327,6 +330,7 @@ compact_prefetch_buffers(void)
|
||||
Assert(target_slot->status == PRFS_UNUSED);
|
||||
|
||||
target_slot->buftag = source_slot->buftag;
|
||||
target_slot->shard_no = source_slot->shard_no;
|
||||
target_slot->status = source_slot->status;
|
||||
target_slot->response = source_slot->response;
|
||||
target_slot->effective_request_lsn = source_slot->effective_request_lsn;
|
||||
@@ -494,6 +498,23 @@ prefetch_cleanup_trailing_unused(void)
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
static bool
|
||||
prefetch_flush_requests(void)
|
||||
{
|
||||
for (shardno_t shard_no = 0; shard_no < MyPState->max_shard_no; shard_no++)
|
||||
{
|
||||
if (MyPState->shard_bitmap[shard_no >> 3] & (1 << (shard_no & 7)))
|
||||
{
|
||||
if (!page_server->flush(shard_no))
|
||||
return false;
|
||||
MyPState->shard_bitmap[shard_no >> 3] &= ~(1 << (shard_no & 7));
|
||||
}
|
||||
}
|
||||
MyPState->max_shard_no = 0;
|
||||
return true;
|
||||
}
|
||||
|
||||
/*
|
||||
* Wait for slot of ring_index to have received its response.
|
||||
* The caller is responsible for making sure the request buffer is flushed.
|
||||
@@ -509,7 +530,7 @@ prefetch_wait_for(uint64 ring_index)
|
||||
if (MyPState->ring_flush <= ring_index &&
|
||||
MyPState->ring_unused > MyPState->ring_flush)
|
||||
{
|
||||
if (!page_server->flush())
|
||||
if (!prefetch_flush_requests())
|
||||
return false;
|
||||
MyPState->ring_flush = MyPState->ring_unused;
|
||||
}
|
||||
@@ -547,7 +568,7 @@ prefetch_read(PrefetchRequest *slot)
|
||||
Assert(slot->my_ring_index == MyPState->ring_receive);
|
||||
|
||||
old = MemoryContextSwitchTo(MyPState->errctx);
|
||||
response = (NeonResponse *) page_server->receive();
|
||||
response = (NeonResponse *) page_server->receive(slot->shard_no);
|
||||
MemoryContextSwitchTo(old);
|
||||
if (response)
|
||||
{
|
||||
@@ -704,12 +725,14 @@ prefetch_do_request(PrefetchRequest *slot, bool *force_latest, XLogRecPtr *force
|
||||
Assert(slot->response == NULL);
|
||||
Assert(slot->my_ring_index == MyPState->ring_unused);
|
||||
|
||||
while (!page_server->send((NeonRequest *) &request));
|
||||
while (!page_server->send(slot->shard_no, (NeonRequest *) &request));
|
||||
|
||||
/* update prefetch state */
|
||||
MyPState->n_requests_inflight += 1;
|
||||
MyPState->n_unused -= 1;
|
||||
MyPState->ring_unused += 1;
|
||||
MyPState->shard_bitmap[slot->shard_no >> 3] |= 1 << (slot->shard_no & 7);
|
||||
MyPState->max_shard_no = Max(slot->shard_no+1, MyPState->max_shard_no);
|
||||
|
||||
/* update slot state */
|
||||
slot->status = PRFS_REQUESTED;
|
||||
@@ -880,6 +903,7 @@ Retry:
|
||||
* function reads the buffer tag from the slot.
|
||||
*/
|
||||
slot->buftag = tag;
|
||||
slot->shard_no = get_shard_number(&tag);
|
||||
slot->my_ring_index = ring_index;
|
||||
|
||||
prefetch_do_request(slot, force_latest, force_lsn);
|
||||
@@ -890,7 +914,7 @@ Retry:
|
||||
if (flush_every_n_requests > 0 &&
|
||||
MyPState->ring_unused - MyPState->ring_flush >= flush_every_n_requests)
|
||||
{
|
||||
if (!page_server->flush())
|
||||
if (!prefetch_flush_requests())
|
||||
{
|
||||
/*
|
||||
* Prefetch set is reset in case of error, so we should try to
|
||||
@@ -908,13 +932,44 @@ static NeonResponse *
|
||||
page_server_request(void const *req)
|
||||
{
|
||||
NeonResponse *resp;
|
||||
BufferTag tag = {0};
|
||||
shardno_t shard_no;
|
||||
|
||||
switch (((NeonRequest *) req)->tag)
|
||||
{
|
||||
case T_NeonExistsRequest:
|
||||
CopyNRelFileInfoToBufTag(tag, ((NeonExistsRequest *) req)->rinfo);
|
||||
break;
|
||||
case T_NeonNblocksRequest:
|
||||
CopyNRelFileInfoToBufTag(tag, ((NeonNblocksRequest *) req)->rinfo);
|
||||
break;
|
||||
case T_NeonDbSizeRequest:
|
||||
NInfoGetDbOid(BufTagGetNRelFileInfo(tag)) = ((NeonDbSizeRequest *) req)->dbNode;
|
||||
break;
|
||||
case T_NeonGetPageRequest:
|
||||
CopyNRelFileInfoToBufTag(tag, ((NeonGetPageRequest *) req)->rinfo);
|
||||
tag.blockNum = ((NeonGetPageRequest *) req)->blkno;
|
||||
break;
|
||||
default:
|
||||
neon_log(ERROR, "Unexpected request tag: %d", ((NeonRequest *) req)->tag);
|
||||
}
|
||||
shard_no = get_shard_number(&tag);
|
||||
|
||||
|
||||
/*
|
||||
* Current sharding model assumes that all metadata is present only at shard 0.
|
||||
* We still need to call get_shard_no() to check if shard map is up-to-date.
|
||||
*/
|
||||
if (((NeonRequest *) req)->tag != T_NeonGetPageRequest || ((NeonGetPageRequest *) req)->forknum != MAIN_FORKNUM)
|
||||
{
|
||||
shard_no = 0;
|
||||
}
|
||||
|
||||
do
|
||||
{
|
||||
while (!page_server->send((NeonRequest *) req) || !page_server->flush());
|
||||
MyPState->ring_flush = MyPState->ring_unused;
|
||||
while (!page_server->send(shard_no, (NeonRequest *) req) || !page_server->flush(shard_no));
|
||||
consume_prefetch_responses();
|
||||
resp = page_server->receive();
|
||||
resp = page_server->receive(shard_no);
|
||||
} while (resp == NULL);
|
||||
return resp;
|
||||
|
||||
@@ -990,7 +1045,7 @@ nm_pack_request(NeonRequest *msg)
|
||||
case T_NeonErrorResponse:
|
||||
case T_NeonDbSizeResponse:
|
||||
default:
|
||||
elog(ERROR, "unexpected neon message tag 0x%02x", msg->tag);
|
||||
neon_log(ERROR, "unexpected neon message tag 0x%02x", msg->tag);
|
||||
break;
|
||||
}
|
||||
return s;
|
||||
@@ -1085,7 +1140,7 @@ nm_unpack_response(StringInfo s)
|
||||
case T_NeonGetPageRequest:
|
||||
case T_NeonDbSizeRequest:
|
||||
default:
|
||||
elog(ERROR, "unexpected neon message tag 0x%02x", tag);
|
||||
neon_log(ERROR, "unexpected neon message tag 0x%02x", tag);
|
||||
break;
|
||||
}
|
||||
|
||||
@@ -1277,7 +1332,7 @@ neon_wallog_page(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, co
|
||||
XLogFlush(recptr);
|
||||
lsn = recptr;
|
||||
ereport(SmgrTrace,
|
||||
(errmsg("Page %u of relation %u/%u/%u.%u was force logged. Evicted at lsn=%X/%X",
|
||||
(errmsg(NEON_TAG "Page %u of relation %u/%u/%u.%u was force logged. Evicted at lsn=%X/%X",
|
||||
blocknum,
|
||||
RelFileInfoFmt(InfoFromSMgrRel(reln)),
|
||||
forknum, LSN_FORMAT_ARGS(lsn))));
|
||||
@@ -1305,7 +1360,7 @@ neon_wallog_page(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, co
|
||||
if (PageIsNew((Page) buffer))
|
||||
{
|
||||
ereport(SmgrTrace,
|
||||
(errmsg("Page %u of relation %u/%u/%u.%u is all-zeros",
|
||||
(errmsg(NEON_TAG "Page %u of relation %u/%u/%u.%u is all-zeros",
|
||||
blocknum,
|
||||
RelFileInfoFmt(InfoFromSMgrRel(reln)),
|
||||
forknum)));
|
||||
@@ -1313,7 +1368,7 @@ neon_wallog_page(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, co
|
||||
else if (PageIsEmptyHeapPage((Page) buffer))
|
||||
{
|
||||
ereport(SmgrTrace,
|
||||
(errmsg("Page %u of relation %u/%u/%u.%u is an empty heap page with no LSN",
|
||||
(errmsg(NEON_TAG "Page %u of relation %u/%u/%u.%u is an empty heap page with no LSN",
|
||||
blocknum,
|
||||
RelFileInfoFmt(InfoFromSMgrRel(reln)),
|
||||
forknum)));
|
||||
@@ -1321,7 +1376,7 @@ neon_wallog_page(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, co
|
||||
else
|
||||
{
|
||||
ereport(PANIC,
|
||||
(errmsg("Page %u of relation %u/%u/%u.%u is evicted with zero LSN",
|
||||
(errmsg(NEON_TAG "Page %u of relation %u/%u/%u.%u is evicted with zero LSN",
|
||||
blocknum,
|
||||
RelFileInfoFmt(InfoFromSMgrRel(reln)),
|
||||
forknum)));
|
||||
@@ -1330,7 +1385,7 @@ neon_wallog_page(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, co
|
||||
else
|
||||
{
|
||||
ereport(SmgrTrace,
|
||||
(errmsg("Page %u of relation %u/%u/%u.%u is already wal logged at lsn=%X/%X",
|
||||
(errmsg(NEON_TAG "Page %u of relation %u/%u/%u.%u is already wal logged at lsn=%X/%X",
|
||||
blocknum,
|
||||
RelFileInfoFmt(InfoFromSMgrRel(reln)),
|
||||
forknum, LSN_FORMAT_ARGS(lsn))));
|
||||
@@ -1430,7 +1485,7 @@ neon_get_request_lsn(bool *latest, NRelFileInfo rinfo, ForkNumber forknum, Block
|
||||
lsn = GetLastWrittenLSN(rinfo, forknum, blkno);
|
||||
lsn = nm_adjust_lsn(lsn);
|
||||
|
||||
elog(DEBUG1, "neon_get_request_lsn GetXLogReplayRecPtr %X/%X request lsn 0 ",
|
||||
neon_log(DEBUG1, "neon_get_request_lsn GetXLogReplayRecPtr %X/%X request lsn 0 ",
|
||||
(uint32) ((lsn) >> 32), (uint32) (lsn));
|
||||
}
|
||||
else
|
||||
@@ -1445,7 +1500,7 @@ neon_get_request_lsn(bool *latest, NRelFileInfo rinfo, ForkNumber forknum, Block
|
||||
*latest = true;
|
||||
lsn = GetLastWrittenLSN(rinfo, forknum, blkno);
|
||||
Assert(lsn != InvalidXLogRecPtr);
|
||||
elog(DEBUG1, "neon_get_request_lsn GetLastWrittenLSN lsn %X/%X ",
|
||||
neon_log(DEBUG1, "neon_get_request_lsn GetLastWrittenLSN lsn %X/%X ",
|
||||
(uint32) ((lsn) >> 32), (uint32) (lsn));
|
||||
|
||||
lsn = nm_adjust_lsn(lsn);
|
||||
@@ -1465,7 +1520,7 @@ neon_get_request_lsn(bool *latest, NRelFileInfo rinfo, ForkNumber forknum, Block
|
||||
#endif
|
||||
if (lsn > flushlsn)
|
||||
{
|
||||
elog(DEBUG5, "last-written LSN %X/%X is ahead of last flushed LSN %X/%X",
|
||||
neon_log(DEBUG5, "last-written LSN %X/%X is ahead of last flushed LSN %X/%X",
|
||||
(uint32) (lsn >> 32), (uint32) lsn,
|
||||
(uint32) (flushlsn >> 32), (uint32) flushlsn);
|
||||
XLogFlush(lsn);
|
||||
@@ -1509,7 +1564,7 @@ neon_exists(SMgrRelation reln, ForkNumber forkNum)
|
||||
return mdexists(reln, forkNum);
|
||||
|
||||
default:
|
||||
elog(ERROR, "unknown relpersistence '%c'", reln->smgr_relpersistence);
|
||||
neon_log(ERROR, "unknown relpersistence '%c'", reln->smgr_relpersistence);
|
||||
}
|
||||
|
||||
if (get_cached_relsize(InfoFromSMgrRel(reln), forkNum, &n_blocks))
|
||||
@@ -1561,7 +1616,7 @@ neon_exists(SMgrRelation reln, ForkNumber forkNum)
|
||||
case T_NeonErrorResponse:
|
||||
ereport(ERROR,
|
||||
(errcode(ERRCODE_IO_ERROR),
|
||||
errmsg("could not read relation existence of rel %u/%u/%u.%u from page server at lsn %X/%08X",
|
||||
errmsg(NEON_TAG "could not read relation existence of rel %u/%u/%u.%u from page server at lsn %X/%08X",
|
||||
RelFileInfoFmt(InfoFromSMgrRel(reln)),
|
||||
forkNum,
|
||||
(uint32) (request_lsn >> 32), (uint32) request_lsn),
|
||||
@@ -1570,7 +1625,7 @@ neon_exists(SMgrRelation reln, ForkNumber forkNum)
|
||||
break;
|
||||
|
||||
default:
|
||||
elog(ERROR, "unexpected response from page server with tag 0x%02x", resp->tag);
|
||||
neon_log(ERROR, "unexpected response from page server with tag 0x%02x", resp->tag);
|
||||
}
|
||||
pfree(resp);
|
||||
return exists;
|
||||
@@ -1587,7 +1642,7 @@ neon_create(SMgrRelation reln, ForkNumber forkNum, bool isRedo)
|
||||
switch (reln->smgr_relpersistence)
|
||||
{
|
||||
case 0:
|
||||
elog(ERROR, "cannot call smgrcreate() on rel with unknown persistence");
|
||||
neon_log(ERROR, "cannot call smgrcreate() on rel with unknown persistence");
|
||||
|
||||
case RELPERSISTENCE_PERMANENT:
|
||||
break;
|
||||
@@ -1598,10 +1653,10 @@ neon_create(SMgrRelation reln, ForkNumber forkNum, bool isRedo)
|
||||
return;
|
||||
|
||||
default:
|
||||
elog(ERROR, "unknown relpersistence '%c'", reln->smgr_relpersistence);
|
||||
neon_log(ERROR, "unknown relpersistence '%c'", reln->smgr_relpersistence);
|
||||
}
|
||||
|
||||
elog(SmgrTrace, "Create relation %u/%u/%u.%u",
|
||||
neon_log(SmgrTrace, "Create relation %u/%u/%u.%u",
|
||||
RelFileInfoFmt(InfoFromSMgrRel(reln)),
|
||||
forkNum);
|
||||
|
||||
@@ -1696,7 +1751,7 @@ neon_extend(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno,
|
||||
switch (reln->smgr_relpersistence)
|
||||
{
|
||||
case 0:
|
||||
elog(ERROR, "cannot call smgrextend() on rel with unknown persistence");
|
||||
neon_log(ERROR, "cannot call smgrextend() on rel with unknown persistence");
|
||||
|
||||
case RELPERSISTENCE_PERMANENT:
|
||||
break;
|
||||
@@ -1707,7 +1762,7 @@ neon_extend(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno,
|
||||
return;
|
||||
|
||||
default:
|
||||
elog(ERROR, "unknown relpersistence '%c'", reln->smgr_relpersistence);
|
||||
neon_log(ERROR, "unknown relpersistence '%c'", reln->smgr_relpersistence);
|
||||
}
|
||||
|
||||
/*
|
||||
@@ -1726,7 +1781,7 @@ neon_extend(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno,
|
||||
if (current_size >= ((uint64) max_cluster_size) * 1024 * 1024)
|
||||
ereport(ERROR,
|
||||
(errcode(ERRCODE_DISK_FULL),
|
||||
errmsg("could not extend file because project size limit (%d MB) has been exceeded",
|
||||
errmsg(NEON_TAG "could not extend file because project size limit (%d MB) has been exceeded",
|
||||
max_cluster_size),
|
||||
errhint("This limit is defined externally by the project size limit, and internally by neon.max_cluster_size GUC")));
|
||||
}
|
||||
@@ -1745,7 +1800,7 @@ neon_extend(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno,
|
||||
set_cached_relsize(InfoFromSMgrRel(reln), forkNum, blkno + 1);
|
||||
|
||||
lsn = PageGetLSN((Page) buffer);
|
||||
elog(SmgrTrace, "smgrextend called for %u/%u/%u.%u blk %u, page LSN: %X/%08X",
|
||||
neon_log(SmgrTrace, "smgrextend called for %u/%u/%u.%u blk %u, page LSN: %X/%08X",
|
||||
RelFileInfoFmt(InfoFromSMgrRel(reln)),
|
||||
forkNum, blkno,
|
||||
(uint32) (lsn >> 32), (uint32) lsn);
|
||||
@@ -1785,7 +1840,7 @@ neon_zeroextend(SMgrRelation reln, ForkNumber forkNum, BlockNumber blocknum,
|
||||
switch (reln->smgr_relpersistence)
|
||||
{
|
||||
case 0:
|
||||
elog(ERROR, "cannot call smgrextend() on rel with unknown persistence");
|
||||
neon_log(ERROR, "cannot call smgrextend() on rel with unknown persistence");
|
||||
|
||||
case RELPERSISTENCE_PERMANENT:
|
||||
break;
|
||||
@@ -1796,7 +1851,7 @@ neon_zeroextend(SMgrRelation reln, ForkNumber forkNum, BlockNumber blocknum,
|
||||
return;
|
||||
|
||||
default:
|
||||
elog(ERROR, "unknown relpersistence '%c'", reln->smgr_relpersistence);
|
||||
neon_log(ERROR, "unknown relpersistence '%c'", reln->smgr_relpersistence);
|
||||
}
|
||||
|
||||
if (max_cluster_size > 0 &&
|
||||
@@ -1808,7 +1863,7 @@ neon_zeroextend(SMgrRelation reln, ForkNumber forkNum, BlockNumber blocknum,
|
||||
if (current_size >= ((uint64) max_cluster_size) * 1024 * 1024)
|
||||
ereport(ERROR,
|
||||
(errcode(ERRCODE_DISK_FULL),
|
||||
errmsg("could not extend file because cluster size limit (%d MB) has been exceeded",
|
||||
errmsg(NEON_TAG "could not extend file because cluster size limit (%d MB) has been exceeded",
|
||||
max_cluster_size),
|
||||
errhint("This limit is defined by neon.max_cluster_size GUC")));
|
||||
}
|
||||
@@ -1821,7 +1876,7 @@ neon_zeroextend(SMgrRelation reln, ForkNumber forkNum, BlockNumber blocknum,
|
||||
if ((uint64) blocknum + nblocks >= (uint64) InvalidBlockNumber)
|
||||
ereport(ERROR,
|
||||
(errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
|
||||
errmsg("cannot extend file \"%s\" beyond %u blocks",
|
||||
errmsg(NEON_TAG "cannot extend file \"%s\" beyond %u blocks",
|
||||
relpath(reln->smgr_rlocator, forkNum),
|
||||
InvalidBlockNumber)));
|
||||
|
||||
@@ -1882,7 +1937,7 @@ neon_open(SMgrRelation reln)
|
||||
mdopen(reln);
|
||||
|
||||
/* no work */
|
||||
elog(SmgrTrace, "[NEON_SMGR] open noop");
|
||||
neon_log(SmgrTrace, "open noop");
|
||||
}
|
||||
|
||||
/*
|
||||
@@ -1919,7 +1974,7 @@ neon_prefetch(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum)
|
||||
return mdprefetch(reln, forknum, blocknum);
|
||||
|
||||
default:
|
||||
elog(ERROR, "unknown relpersistence '%c'", reln->smgr_relpersistence);
|
||||
neon_log(ERROR, "unknown relpersistence '%c'", reln->smgr_relpersistence);
|
||||
}
|
||||
|
||||
if (lfc_cache_contains(InfoFromSMgrRel(reln), forknum, blocknum))
|
||||
@@ -1964,11 +2019,11 @@ neon_writeback(SMgrRelation reln, ForkNumber forknum,
|
||||
return;
|
||||
|
||||
default:
|
||||
elog(ERROR, "unknown relpersistence '%c'", reln->smgr_relpersistence);
|
||||
neon_log(ERROR, "unknown relpersistence '%c'", reln->smgr_relpersistence);
|
||||
}
|
||||
|
||||
/* not implemented */
|
||||
elog(SmgrTrace, "[NEON_SMGR] writeback noop");
|
||||
neon_log(SmgrTrace, "writeback noop");
|
||||
|
||||
#ifdef DEBUG_COMPARE_LOCAL
|
||||
if (IS_LOCAL_REL(reln))
|
||||
@@ -2098,8 +2153,8 @@ neon_read_at_lsn(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno,
|
||||
case T_NeonErrorResponse:
|
||||
ereport(ERROR,
|
||||
(errcode(ERRCODE_IO_ERROR),
|
||||
errmsg("could not read block %u in rel %u/%u/%u.%u from page server at lsn %X/%08X",
|
||||
blkno,
|
||||
errmsg(NEON_TAG "[shard %d] could not read block %u in rel %u/%u/%u.%u from page server at lsn %X/%08X",
|
||||
slot->shard_no, blkno,
|
||||
RelFileInfoFmt(rinfo),
|
||||
forkNum,
|
||||
(uint32) (request_lsn >> 32), (uint32) request_lsn),
|
||||
@@ -2107,7 +2162,7 @@ neon_read_at_lsn(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno,
|
||||
((NeonErrorResponse *) resp)->message)));
|
||||
break;
|
||||
default:
|
||||
elog(ERROR, "unexpected response from page server with tag 0x%02x", resp->tag);
|
||||
neon_log(ERROR, "unexpected response from page server with tag 0x%02x", resp->tag);
|
||||
}
|
||||
|
||||
/* buffer was used, clean up for later reuse */
|
||||
@@ -2131,7 +2186,7 @@ neon_read(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno, void *buffer
|
||||
switch (reln->smgr_relpersistence)
|
||||
{
|
||||
case 0:
|
||||
elog(ERROR, "cannot call smgrread() on rel with unknown persistence");
|
||||
neon_log(ERROR, "cannot call smgrread() on rel with unknown persistence");
|
||||
|
||||
case RELPERSISTENCE_PERMANENT:
|
||||
break;
|
||||
@@ -2142,7 +2197,7 @@ neon_read(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno, void *buffer
|
||||
return;
|
||||
|
||||
default:
|
||||
elog(ERROR, "unknown relpersistence '%c'", reln->smgr_relpersistence);
|
||||
neon_log(ERROR, "unknown relpersistence '%c'", reln->smgr_relpersistence);
|
||||
}
|
||||
|
||||
/* Try to read from local file cache */
|
||||
@@ -2170,7 +2225,7 @@ neon_read(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno, void *buffer
|
||||
{
|
||||
if (!PageIsNew((Page) pageserver_masked))
|
||||
{
|
||||
elog(PANIC, "page is new in MD but not in Page Server at blk %u in rel %u/%u/%u fork %u (request LSN %X/%08X):\n%s\n",
|
||||
neon_log(PANIC, "page is new in MD but not in Page Server at blk %u in rel %u/%u/%u fork %u (request LSN %X/%08X):\n%s\n",
|
||||
blkno,
|
||||
RelFileInfoFmt(InfoFromSMgrRel(reln)),
|
||||
forkNum,
|
||||
@@ -2180,7 +2235,7 @@ neon_read(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno, void *buffer
|
||||
}
|
||||
else if (PageIsNew((Page) buffer))
|
||||
{
|
||||
elog(PANIC, "page is new in Page Server but not in MD at blk %u in rel %u/%u/%u fork %u (request LSN %X/%08X):\n%s\n",
|
||||
neon_log(PANIC, "page is new in Page Server but not in MD at blk %u in rel %u/%u/%u fork %u (request LSN %X/%08X):\n%s\n",
|
||||
blkno,
|
||||
RelFileInfoFmt(InfoFromSMgrRel(reln)),
|
||||
forkNum,
|
||||
@@ -2195,7 +2250,7 @@ neon_read(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno, void *buffer
|
||||
|
||||
if (memcmp(mdbuf_masked, pageserver_masked, BLCKSZ) != 0)
|
||||
{
|
||||
elog(PANIC, "heap buffers differ at blk %u in rel %u/%u/%u fork %u (request LSN %X/%08X):\n------ MD ------\n%s\n------ Page Server ------\n%s\n",
|
||||
neon_log(PANIC, "heap buffers differ at blk %u in rel %u/%u/%u fork %u (request LSN %X/%08X):\n------ MD ------\n%s\n------ Page Server ------\n%s\n",
|
||||
blkno,
|
||||
RelFileInfoFmt(InfoFromSMgrRel(reln)),
|
||||
forkNum,
|
||||
@@ -2214,7 +2269,7 @@ neon_read(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno, void *buffer
|
||||
|
||||
if (memcmp(mdbuf_masked, pageserver_masked, BLCKSZ) != 0)
|
||||
{
|
||||
elog(PANIC, "btree buffers differ at blk %u in rel %u/%u/%u fork %u (request LSN %X/%08X):\n------ MD ------\n%s\n------ Page Server ------\n%s\n",
|
||||
neon_log(PANIC, "btree buffers differ at blk %u in rel %u/%u/%u fork %u (request LSN %X/%08X):\n------ MD ------\n%s\n------ Page Server ------\n%s\n",
|
||||
blkno,
|
||||
RelFileInfoFmt(InfoFromSMgrRel(reln)),
|
||||
forkNum,
|
||||
@@ -2294,13 +2349,13 @@ neon_write(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, const vo
|
||||
return;
|
||||
|
||||
default:
|
||||
elog(ERROR, "unknown relpersistence '%c'", reln->smgr_relpersistence);
|
||||
neon_log(ERROR, "unknown relpersistence '%c'", reln->smgr_relpersistence);
|
||||
}
|
||||
|
||||
neon_wallog_page(reln, forknum, blocknum, buffer, false);
|
||||
|
||||
lsn = PageGetLSN((Page) buffer);
|
||||
elog(SmgrTrace, "smgrwrite called for %u/%u/%u.%u blk %u, page LSN: %X/%08X",
|
||||
neon_log(SmgrTrace, "smgrwrite called for %u/%u/%u.%u blk %u, page LSN: %X/%08X",
|
||||
RelFileInfoFmt(InfoFromSMgrRel(reln)),
|
||||
forknum, blocknum,
|
||||
(uint32) (lsn >> 32), (uint32) lsn);
|
||||
@@ -2327,7 +2382,7 @@ neon_nblocks(SMgrRelation reln, ForkNumber forknum)
|
||||
switch (reln->smgr_relpersistence)
|
||||
{
|
||||
case 0:
|
||||
elog(ERROR, "cannot call smgrnblocks() on rel with unknown persistence");
|
||||
neon_log(ERROR, "cannot call smgrnblocks() on rel with unknown persistence");
|
||||
break;
|
||||
|
||||
case RELPERSISTENCE_PERMANENT:
|
||||
@@ -2338,12 +2393,12 @@ neon_nblocks(SMgrRelation reln, ForkNumber forknum)
|
||||
return mdnblocks(reln, forknum);
|
||||
|
||||
default:
|
||||
elog(ERROR, "unknown relpersistence '%c'", reln->smgr_relpersistence);
|
||||
neon_log(ERROR, "unknown relpersistence '%c'", reln->smgr_relpersistence);
|
||||
}
|
||||
|
||||
if (get_cached_relsize(InfoFromSMgrRel(reln), forknum, &n_blocks))
|
||||
{
|
||||
elog(SmgrTrace, "cached nblocks for %u/%u/%u.%u: %u blocks",
|
||||
neon_log(SmgrTrace, "cached nblocks for %u/%u/%u.%u: %u blocks",
|
||||
RelFileInfoFmt(InfoFromSMgrRel(reln)),
|
||||
forknum, n_blocks);
|
||||
return n_blocks;
|
||||
@@ -2371,7 +2426,7 @@ neon_nblocks(SMgrRelation reln, ForkNumber forknum)
|
||||
case T_NeonErrorResponse:
|
||||
ereport(ERROR,
|
||||
(errcode(ERRCODE_IO_ERROR),
|
||||
errmsg("could not read relation size of rel %u/%u/%u.%u from page server at lsn %X/%08X",
|
||||
errmsg(NEON_TAG "could not read relation size of rel %u/%u/%u.%u from page server at lsn %X/%08X",
|
||||
RelFileInfoFmt(InfoFromSMgrRel(reln)),
|
||||
forknum,
|
||||
(uint32) (request_lsn >> 32), (uint32) request_lsn),
|
||||
@@ -2380,11 +2435,11 @@ neon_nblocks(SMgrRelation reln, ForkNumber forknum)
|
||||
break;
|
||||
|
||||
default:
|
||||
elog(ERROR, "unexpected response from page server with tag 0x%02x", resp->tag);
|
||||
neon_log(ERROR, "unexpected response from page server with tag 0x%02x", resp->tag);
|
||||
}
|
||||
update_cached_relsize(InfoFromSMgrRel(reln), forknum, n_blocks);
|
||||
|
||||
elog(SmgrTrace, "neon_nblocks: rel %u/%u/%u fork %u (request LSN %X/%08X): %u blocks",
|
||||
neon_log(SmgrTrace, "neon_nblocks: rel %u/%u/%u fork %u (request LSN %X/%08X): %u blocks",
|
||||
RelFileInfoFmt(InfoFromSMgrRel(reln)),
|
||||
forknum,
|
||||
(uint32) (request_lsn >> 32), (uint32) request_lsn,
|
||||
@@ -2427,7 +2482,7 @@ neon_dbsize(Oid dbNode)
|
||||
case T_NeonErrorResponse:
|
||||
ereport(ERROR,
|
||||
(errcode(ERRCODE_IO_ERROR),
|
||||
errmsg("could not read db size of db %u from page server at lsn %X/%08X",
|
||||
errmsg(NEON_TAG "could not read db size of db %u from page server at lsn %X/%08X",
|
||||
dbNode,
|
||||
(uint32) (request_lsn >> 32), (uint32) request_lsn),
|
||||
errdetail("page server returned error: %s",
|
||||
@@ -2435,10 +2490,10 @@ neon_dbsize(Oid dbNode)
|
||||
break;
|
||||
|
||||
default:
|
||||
elog(ERROR, "unexpected response from page server with tag 0x%02x", resp->tag);
|
||||
neon_log(ERROR, "unexpected response from page server with tag 0x%02x", resp->tag);
|
||||
}
|
||||
|
||||
elog(SmgrTrace, "neon_dbsize: db %u (request LSN %X/%08X): %ld bytes",
|
||||
neon_log(SmgrTrace, "neon_dbsize: db %u (request LSN %X/%08X): %ld bytes",
|
||||
dbNode,
|
||||
(uint32) (request_lsn >> 32), (uint32) request_lsn,
|
||||
db_size);
|
||||
@@ -2458,7 +2513,7 @@ neon_truncate(SMgrRelation reln, ForkNumber forknum, BlockNumber nblocks)
|
||||
switch (reln->smgr_relpersistence)
|
||||
{
|
||||
case 0:
|
||||
elog(ERROR, "cannot call smgrtruncate() on rel with unknown persistence");
|
||||
neon_log(ERROR, "cannot call smgrtruncate() on rel with unknown persistence");
|
||||
break;
|
||||
|
||||
case RELPERSISTENCE_PERMANENT:
|
||||
@@ -2470,7 +2525,7 @@ neon_truncate(SMgrRelation reln, ForkNumber forknum, BlockNumber nblocks)
|
||||
return;
|
||||
|
||||
default:
|
||||
elog(ERROR, "unknown relpersistence '%c'", reln->smgr_relpersistence);
|
||||
neon_log(ERROR, "unknown relpersistence '%c'", reln->smgr_relpersistence);
|
||||
}
|
||||
|
||||
set_cached_relsize(InfoFromSMgrRel(reln), forknum, nblocks);
|
||||
@@ -2526,7 +2581,7 @@ neon_immedsync(SMgrRelation reln, ForkNumber forknum)
|
||||
switch (reln->smgr_relpersistence)
|
||||
{
|
||||
case 0:
|
||||
elog(ERROR, "cannot call smgrimmedsync() on rel with unknown persistence");
|
||||
neon_log(ERROR, "cannot call smgrimmedsync() on rel with unknown persistence");
|
||||
break;
|
||||
|
||||
case RELPERSISTENCE_PERMANENT:
|
||||
@@ -2538,10 +2593,10 @@ neon_immedsync(SMgrRelation reln, ForkNumber forknum)
|
||||
return;
|
||||
|
||||
default:
|
||||
elog(ERROR, "unknown relpersistence '%c'", reln->smgr_relpersistence);
|
||||
neon_log(ERROR, "unknown relpersistence '%c'", reln->smgr_relpersistence);
|
||||
}
|
||||
|
||||
elog(SmgrTrace, "[NEON_SMGR] immedsync noop");
|
||||
neon_log(SmgrTrace, "[NEON_SMGR] immedsync noop");
|
||||
|
||||
#ifdef DEBUG_COMPARE_LOCAL
|
||||
if (IS_LOCAL_REL(reln))
|
||||
@@ -2566,17 +2621,17 @@ neon_start_unlogged_build(SMgrRelation reln)
|
||||
* progress at a time. That's enough for the current usage.
|
||||
*/
|
||||
if (unlogged_build_phase != UNLOGGED_BUILD_NOT_IN_PROGRESS)
|
||||
elog(ERROR, "unlogged relation build is already in progress");
|
||||
neon_log(ERROR, "unlogged relation build is already in progress");
|
||||
Assert(unlogged_build_rel == NULL);
|
||||
|
||||
ereport(SmgrTrace,
|
||||
(errmsg("starting unlogged build of relation %u/%u/%u",
|
||||
(errmsg(NEON_TAG "starting unlogged build of relation %u/%u/%u",
|
||||
RelFileInfoFmt(InfoFromSMgrRel(reln)))));
|
||||
|
||||
switch (reln->smgr_relpersistence)
|
||||
{
|
||||
case 0:
|
||||
elog(ERROR, "cannot call smgr_start_unlogged_build() on rel with unknown persistence");
|
||||
neon_log(ERROR, "cannot call smgr_start_unlogged_build() on rel with unknown persistence");
|
||||
break;
|
||||
|
||||
case RELPERSISTENCE_PERMANENT:
|
||||
@@ -2589,11 +2644,11 @@ neon_start_unlogged_build(SMgrRelation reln)
|
||||
return;
|
||||
|
||||
default:
|
||||
elog(ERROR, "unknown relpersistence '%c'", reln->smgr_relpersistence);
|
||||
neon_log(ERROR, "unknown relpersistence '%c'", reln->smgr_relpersistence);
|
||||
}
|
||||
|
||||
if (smgrnblocks(reln, MAIN_FORKNUM) != 0)
|
||||
elog(ERROR, "cannot perform unlogged index build, index is not empty ");
|
||||
neon_log(ERROR, "cannot perform unlogged index build, index is not empty ");
|
||||
|
||||
unlogged_build_rel = reln;
|
||||
unlogged_build_phase = UNLOGGED_BUILD_PHASE_1;
|
||||
@@ -2620,7 +2675,7 @@ neon_finish_unlogged_build_phase_1(SMgrRelation reln)
|
||||
Assert(unlogged_build_rel == reln);
|
||||
|
||||
ereport(SmgrTrace,
|
||||
(errmsg("finishing phase 1 of unlogged build of relation %u/%u/%u",
|
||||
(errmsg(NEON_TAG "finishing phase 1 of unlogged build of relation %u/%u/%u",
|
||||
RelFileInfoFmt(InfoFromSMgrRel(reln)))));
|
||||
|
||||
if (unlogged_build_phase == UNLOGGED_BUILD_NOT_PERMANENT)
|
||||
@@ -2649,7 +2704,7 @@ neon_end_unlogged_build(SMgrRelation reln)
|
||||
Assert(unlogged_build_rel == reln);
|
||||
|
||||
ereport(SmgrTrace,
|
||||
(errmsg("ending unlogged build of relation %u/%u/%u",
|
||||
(errmsg(NEON_TAG "ending unlogged build of relation %u/%u/%u",
|
||||
RelFileInfoFmt(InfoFromNInfoB(rinfob)))));
|
||||
|
||||
if (unlogged_build_phase != UNLOGGED_BUILD_NOT_PERMANENT)
|
||||
@@ -2664,7 +2719,7 @@ neon_end_unlogged_build(SMgrRelation reln)
|
||||
rinfob = InfoBFromSMgrRel(reln);
|
||||
for (int forknum = 0; forknum <= MAX_FORKNUM; forknum++)
|
||||
{
|
||||
elog(SmgrTrace, "forgetting cached relsize for %u/%u/%u.%u",
|
||||
neon_log(SmgrTrace, "forgetting cached relsize for %u/%u/%u.%u",
|
||||
RelFileInfoFmt(InfoFromNInfoB(rinfob)),
|
||||
forknum);
|
||||
|
||||
@@ -2707,7 +2762,7 @@ AtEOXact_neon(XactEvent event, void *arg)
|
||||
unlogged_build_phase = UNLOGGED_BUILD_NOT_IN_PROGRESS;
|
||||
ereport(ERROR,
|
||||
(errcode(ERRCODE_INTERNAL_ERROR),
|
||||
(errmsg("unlogged index build was not properly finished"))));
|
||||
(errmsg(NEON_TAG "unlogged index build was not properly finished"))));
|
||||
}
|
||||
break;
|
||||
}
|
||||
@@ -2806,14 +2861,14 @@ neon_extend_rel_size(NRelFileInfo rinfo, ForkNumber forknum, BlockNumber blkno,
|
||||
set_cached_relsize(rinfo, forknum, relsize);
|
||||
SetLastWrittenLSNForRelation(end_recptr, rinfo, forknum);
|
||||
|
||||
elog(SmgrTrace, "Set length to %d", relsize);
|
||||
neon_log(SmgrTrace, "Set length to %d", relsize);
|
||||
}
|
||||
}
|
||||
|
||||
#define FSM_TREE_DEPTH ((SlotsPerFSMPage >= 1626) ? 3 : 4)
|
||||
|
||||
/*
|
||||
* TODO: May be it is better to make correspondent fgunctio from freespace.c public?
|
||||
* TODO: May be it is better to make correspondent function from freespace.c public?
|
||||
*/
|
||||
static BlockNumber
|
||||
get_fsm_physical_block(BlockNumber heapblk)
|
||||
@@ -2894,7 +2949,7 @@ neon_redo_read_buffer_filter(XLogReaderState *record, uint8 block_id)
|
||||
|
||||
#if PG_VERSION_NUM < 150000
|
||||
if (!XLogRecGetBlockTag(record, block_id, &rinfo, &forknum, &blkno))
|
||||
elog(PANIC, "failed to locate backup block with ID %d", block_id);
|
||||
neon_log(PANIC, "failed to locate backup block with ID %d", block_id);
|
||||
#else
|
||||
XLogRecGetBlockTag(record, block_id, &rinfo, &forknum, &blkno);
|
||||
#endif
|
||||
|
||||
@@ -16,6 +16,7 @@ class Metrics:
|
||||
def query_all(self, name: str, filter: Optional[Dict[str, str]] = None) -> List[Sample]:
|
||||
filter = filter or {}
|
||||
res = []
|
||||
|
||||
for sample in self.metrics[name]:
|
||||
try:
|
||||
if all(sample.labels[k] == v for k, v in filter.items()):
|
||||
|
||||
@@ -60,7 +60,7 @@ from fixtures.remote_storage import (
|
||||
default_remote_storage,
|
||||
remote_storage_to_toml_inline_table,
|
||||
)
|
||||
from fixtures.types import Lsn, TenantId, TimelineId
|
||||
from fixtures.types import Lsn, TenantId, TenantShardId, TimelineId
|
||||
from fixtures.utils import (
|
||||
ATTACHMENT_NAME_REGEX,
|
||||
allure_add_grafana_links,
|
||||
@@ -490,6 +490,8 @@ class NeonEnvBuilder:
|
||||
self,
|
||||
initial_tenant_conf: Optional[Dict[str, str]] = None,
|
||||
default_remote_storage_if_missing: bool = True,
|
||||
initial_tenant_shard_count: Optional[int] = None,
|
||||
initial_tenant_shard_stripe_size: Optional[int] = None,
|
||||
) -> NeonEnv:
|
||||
"""
|
||||
Default way to create and start NeonEnv. Also creates the initial_tenant with root initial_timeline.
|
||||
@@ -507,7 +509,11 @@ class NeonEnvBuilder:
|
||||
f"Services started, creating initial tenant {env.initial_tenant} and its initial timeline"
|
||||
)
|
||||
initial_tenant, initial_timeline = env.neon_cli.create_tenant(
|
||||
tenant_id=env.initial_tenant, conf=initial_tenant_conf, timeline_id=env.initial_timeline
|
||||
tenant_id=env.initial_tenant,
|
||||
conf=initial_tenant_conf,
|
||||
timeline_id=env.initial_timeline,
|
||||
shard_count=initial_tenant_shard_count,
|
||||
shard_stripe_size=initial_tenant_shard_stripe_size,
|
||||
)
|
||||
assert env.initial_tenant == initial_tenant
|
||||
assert env.initial_timeline == initial_timeline
|
||||
@@ -1130,15 +1136,29 @@ class AbstractNeonCli(abc.ABC):
|
||||
env_vars[var] = val
|
||||
|
||||
# Intercept CalledProcessError and print more info
|
||||
res = subprocess.run(
|
||||
args,
|
||||
env=env_vars,
|
||||
check=False,
|
||||
universal_newlines=True,
|
||||
stdout=subprocess.PIPE,
|
||||
stderr=subprocess.PIPE,
|
||||
timeout=timeout,
|
||||
)
|
||||
try:
|
||||
res = subprocess.run(
|
||||
args,
|
||||
env=env_vars,
|
||||
check=False,
|
||||
universal_newlines=True,
|
||||
stdout=subprocess.PIPE,
|
||||
stderr=subprocess.PIPE,
|
||||
timeout=timeout,
|
||||
)
|
||||
except subprocess.TimeoutExpired as e:
|
||||
if e.stderr:
|
||||
stderr = e.stderr.decode(errors="replace")
|
||||
else:
|
||||
stderr = ""
|
||||
|
||||
if e.stdout:
|
||||
stdout = e.stdout.decode(errors="replace")
|
||||
else:
|
||||
stdout = ""
|
||||
|
||||
log.warn(f"CLI timeout: stderr={stderr}, stdout={stdout}")
|
||||
raise
|
||||
|
||||
indent = " "
|
||||
if not res.returncode:
|
||||
@@ -1189,6 +1209,8 @@ class NeonCli(AbstractNeonCli):
|
||||
tenant_id: Optional[TenantId] = None,
|
||||
timeline_id: Optional[TimelineId] = None,
|
||||
conf: Optional[Dict[str, str]] = None,
|
||||
shard_count: Optional[int] = None,
|
||||
shard_stripe_size: Optional[int] = None,
|
||||
set_default: bool = False,
|
||||
) -> Tuple[TenantId, TimelineId]:
|
||||
"""
|
||||
@@ -1216,6 +1238,12 @@ class NeonCli(AbstractNeonCli):
|
||||
if set_default:
|
||||
args.append("--set-default")
|
||||
|
||||
if shard_count is not None:
|
||||
args.extend(["--shard-count", str(shard_count)])
|
||||
|
||||
if shard_stripe_size is not None:
|
||||
args.extend(["--shard-stripe-size", str(shard_stripe_size)])
|
||||
|
||||
res = self.raw_cli(args)
|
||||
res.check_returncode()
|
||||
return tenant_id, timeline_id
|
||||
@@ -1536,6 +1564,19 @@ class NeonCli(AbstractNeonCli):
|
||||
|
||||
return self.raw_cli(args, check_return_code=True)
|
||||
|
||||
def tenant_migrate(
|
||||
self, tenant_shard_id: TenantShardId, new_pageserver: int, timeout_secs: Optional[int]
|
||||
):
|
||||
args = [
|
||||
"tenant",
|
||||
"migrate",
|
||||
"--tenant-id",
|
||||
str(tenant_shard_id),
|
||||
"--id",
|
||||
str(new_pageserver),
|
||||
]
|
||||
return self.raw_cli(args, check_return_code=True, timeout=timeout_secs)
|
||||
|
||||
def start(self, check_return_code=True) -> "subprocess.CompletedProcess[str]":
|
||||
return self.raw_cli(["start"], check_return_code=check_return_code)
|
||||
|
||||
@@ -1631,6 +1672,66 @@ class NeonAttachmentService:
|
||||
else:
|
||||
return None
|
||||
|
||||
def node_register(self, node: NeonPageserver):
|
||||
body = {
|
||||
"node_id": int(node.id),
|
||||
"listen_http_addr": "localhost",
|
||||
"listen_http_port": node.service_port.http,
|
||||
}
|
||||
log.info(f"node_register({body})")
|
||||
requests.post(f"{self.env.control_plane_api}/node", json=body).raise_for_status()
|
||||
|
||||
def tenant_create(
|
||||
self,
|
||||
tenant_id: TenantId,
|
||||
shard_count: Optional[int] = None,
|
||||
shard_stripe_size: Optional[int] = None,
|
||||
tenant_config: Optional[Dict[Any, Any]] = None,
|
||||
):
|
||||
body: Dict[str, Any] = {"new_tenant_id": str(tenant_id)}
|
||||
|
||||
if shard_count is not None:
|
||||
shard_params = {"count": shard_count}
|
||||
if shard_stripe_size is not None:
|
||||
shard_params["stripe_size"] = shard_stripe_size
|
||||
|
||||
body["shard_parameters"] = shard_params
|
||||
|
||||
if tenant_config is not None:
|
||||
for k, v in tenant_config.items():
|
||||
body[k] = v
|
||||
|
||||
response = requests.post(f"{self.env.control_plane_api}/tenant", json=body)
|
||||
response.raise_for_status()
|
||||
log.info(f"tenant_create success: {response.json()}")
|
||||
|
||||
def tenant_timeline_create(self, tenant_id: TenantId, timeline_id: TimelineId):
|
||||
body: Dict[str, Any] = {"new_timeline_id": str(timeline_id)}
|
||||
|
||||
response = requests.post(
|
||||
f"{self.env.control_plane_api}/tenant/{tenant_id}/timeline", json=body
|
||||
)
|
||||
response.raise_for_status()
|
||||
log.info(f"tenant_timeline_create success: {response.json()}")
|
||||
|
||||
def locate(self, tenant_id: TenantId) -> list[dict[str, Any]]:
|
||||
response = requests.get(f"{self.env.control_plane_api}/tenant/{tenant_id}/locate")
|
||||
response.raise_for_status()
|
||||
body = response.json()
|
||||
shards: list[dict[str, Any]] = body["shards"]
|
||||
return shards
|
||||
|
||||
def tenant_shard_split(self, tenant_id: TenantId, shard_count: int) -> list[TenantShardId]:
|
||||
response = requests.put(
|
||||
f"{self.env.control_plane_api}/tenant/{tenant_id}/shard_split",
|
||||
json={"new_shard_count": shard_count},
|
||||
)
|
||||
response.raise_for_status()
|
||||
body = response.json()
|
||||
log.info(f"tenant_shard_split success: {body}")
|
||||
shards: list[TenantShardId] = body["new_shards"]
|
||||
return shards
|
||||
|
||||
def __enter__(self) -> "NeonAttachmentService":
|
||||
return self
|
||||
|
||||
@@ -3201,7 +3302,7 @@ def pytest_addoption(parser: Parser):
|
||||
|
||||
|
||||
SMALL_DB_FILE_NAME_REGEX: re.Pattern = re.compile( # type: ignore[type-arg]
|
||||
r"config|config-v1|heatmap-v1|metadata|.+\.(?:toml|pid|json|sql)"
|
||||
r"config|config-v1|heatmap-v1|metadata|.+\.(?:toml|pid|json|sql|conf)"
|
||||
)
|
||||
|
||||
|
||||
@@ -3297,9 +3398,7 @@ def list_files_to_compare(pgdata_dir: Path) -> List[str]:
|
||||
|
||||
|
||||
# pg is the existing and running compute node, that we want to compare with a basebackup
|
||||
def check_restored_datadir_content(
|
||||
test_output_dir: Path, env: NeonEnv, endpoint: Endpoint, pageserver_id: Optional[int] = None
|
||||
):
|
||||
def check_restored_datadir_content(test_output_dir: Path, env: NeonEnv, endpoint: Endpoint):
|
||||
# Get the timeline ID. We need it for the 'basebackup' command
|
||||
timeline_id = TimelineId(endpoint.safe_psql("SHOW neon.timeline_id")[0][0])
|
||||
|
||||
@@ -3320,6 +3419,7 @@ def check_restored_datadir_content(
|
||||
pg_bin = PgBin(test_output_dir, env.pg_distrib_dir, env.pg_version)
|
||||
psql_path = os.path.join(pg_bin.pg_bin_path, "psql")
|
||||
|
||||
pageserver_id = env.attachment_service.locate(endpoint.tenant_id)[0]["node_id"]
|
||||
cmd = rf"""
|
||||
{psql_path} \
|
||||
--no-psqlrc \
|
||||
@@ -3388,6 +3488,27 @@ def logical_replication_sync(subscriber: VanillaPostgres, publisher: Endpoint) -
|
||||
time.sleep(0.5)
|
||||
|
||||
|
||||
def tenant_get_shards(
|
||||
env: NeonEnv, tenant_id: TenantId, pageserver_id: Optional[int] = None
|
||||
) -> list[tuple[TenantShardId, NeonPageserver]]:
|
||||
"""
|
||||
Helper for when you want to talk to one or more pageservers, and the
|
||||
caller _might_ have specified a pageserver, or they might leave it to
|
||||
us to figure out the shards for a tenant.
|
||||
|
||||
Caller should over the response to apply their per-pageserver action to
|
||||
each shard
|
||||
"""
|
||||
if len(env.pageservers) > 1:
|
||||
return [
|
||||
(TenantShardId.parse(s["shard_id"]), env.get_pageserver(s["node_id"]))
|
||||
for s in env.attachment_service.locate(tenant_id)
|
||||
]
|
||||
else:
|
||||
# Assume an unsharded tenant
|
||||
return [(TenantShardId(tenant_id, 0, 0), env.pageserver)]
|
||||
|
||||
|
||||
def wait_for_last_flush_lsn(
|
||||
env: NeonEnv,
|
||||
endpoint: Endpoint,
|
||||
@@ -3397,10 +3518,22 @@ def wait_for_last_flush_lsn(
|
||||
) -> Lsn:
|
||||
"""Wait for pageserver to catch up the latest flush LSN, returns the last observed lsn."""
|
||||
|
||||
shards = tenant_get_shards(env, tenant, pageserver_id)
|
||||
|
||||
last_flush_lsn = Lsn(endpoint.safe_psql("SELECT pg_current_wal_flush_lsn()")[0][0])
|
||||
return wait_for_last_record_lsn(
|
||||
env.get_pageserver(pageserver_id).http_client(), tenant, timeline, last_flush_lsn
|
||||
)
|
||||
|
||||
results = []
|
||||
for tenant_shard_id, pageserver in shards:
|
||||
log.info(f"wait_for_last_flush_lsn: shard {tenant_shard_id}")
|
||||
waited = wait_for_last_record_lsn(
|
||||
pageserver.http_client(), tenant_shard_id, timeline, last_flush_lsn
|
||||
)
|
||||
|
||||
assert waited >= last_flush_lsn
|
||||
results.append(waited)
|
||||
|
||||
# Return the lowest LSN that has been ingested by all shards
|
||||
return min(results)
|
||||
|
||||
|
||||
def wait_for_wal_insert_lsn(
|
||||
@@ -3412,9 +3545,16 @@ def wait_for_wal_insert_lsn(
|
||||
) -> Lsn:
|
||||
"""Wait for pageserver to catch up the latest flush LSN, returns the last observed lsn."""
|
||||
last_flush_lsn = Lsn(endpoint.safe_psql("SELECT pg_current_wal_insert_lsn()")[0][0])
|
||||
return wait_for_last_record_lsn(
|
||||
env.get_pageserver(pageserver_id).http_client(), tenant, timeline, last_flush_lsn
|
||||
)
|
||||
result = None
|
||||
for tenant_shard_id, pageserver in tenant_get_shards(env, tenant, pageserver_id):
|
||||
shard_r = wait_for_last_record_lsn(
|
||||
pageserver.http_client(), tenant_shard_id, timeline, last_flush_lsn
|
||||
)
|
||||
if result is None:
|
||||
result = shard_r
|
||||
|
||||
assert result is not None
|
||||
return result
|
||||
|
||||
|
||||
def fork_at_current_lsn(
|
||||
@@ -3448,11 +3588,13 @@ def last_flush_lsn_upload(
|
||||
last_flush_lsn = wait_for_last_flush_lsn(
|
||||
env, endpoint, tenant_id, timeline_id, pageserver_id=pageserver_id
|
||||
)
|
||||
ps_http = env.get_pageserver(pageserver_id).http_client()
|
||||
wait_for_last_record_lsn(ps_http, tenant_id, timeline_id, last_flush_lsn)
|
||||
# force a checkpoint to trigger upload
|
||||
ps_http.timeline_checkpoint(tenant_id, timeline_id)
|
||||
wait_for_upload(ps_http, tenant_id, timeline_id, last_flush_lsn)
|
||||
shards = tenant_get_shards(env, tenant_id, pageserver_id)
|
||||
for tenant_shard_id, pageserver in shards:
|
||||
ps_http = pageserver.http_client()
|
||||
wait_for_last_record_lsn(ps_http, tenant_shard_id, timeline_id, last_flush_lsn)
|
||||
# force a checkpoint to trigger upload
|
||||
ps_http.timeline_checkpoint(tenant_shard_id, timeline_id)
|
||||
wait_for_upload(ps_http, tenant_shard_id, timeline_id, last_flush_lsn)
|
||||
return last_flush_lsn
|
||||
|
||||
|
||||
|
||||
@@ -13,7 +13,7 @@ from urllib3.util.retry import Retry
|
||||
from fixtures.log_helper import log
|
||||
from fixtures.metrics import Metrics, parse_metrics
|
||||
from fixtures.pg_version import PgVersion
|
||||
from fixtures.types import Lsn, TenantId, TimelineId
|
||||
from fixtures.types import Lsn, TenantId, TenantShardId, TimelineId
|
||||
from fixtures.utils import Fn
|
||||
|
||||
|
||||
@@ -211,7 +211,7 @@ class PageserverHttpClient(requests.Session):
|
||||
|
||||
def tenant_create(
|
||||
self,
|
||||
new_tenant_id: TenantId,
|
||||
new_tenant_id: TenantId | TenantShardId,
|
||||
conf: Optional[Dict[str, Any]] = None,
|
||||
generation: Optional[int] = None,
|
||||
) -> TenantId:
|
||||
@@ -239,7 +239,7 @@ class PageserverHttpClient(requests.Session):
|
||||
|
||||
def tenant_attach(
|
||||
self,
|
||||
tenant_id: TenantId,
|
||||
tenant_id: TenantId | TenantShardId,
|
||||
config: None | Dict[str, Any] = None,
|
||||
config_null: bool = False,
|
||||
generation: Optional[int] = None,
|
||||
@@ -269,7 +269,7 @@ class PageserverHttpClient(requests.Session):
|
||||
res = self.post(f"http://localhost:{self.port}/v1/tenant/{tenant_id}/detach", params=params)
|
||||
self.verbose_error(res)
|
||||
|
||||
def tenant_reset(self, tenant_id: TenantId, drop_cache: bool):
|
||||
def tenant_reset(self, tenant_id: TenantId | TenantShardId, drop_cache: bool):
|
||||
params = {}
|
||||
if drop_cache:
|
||||
params["drop_cache"] = "true"
|
||||
@@ -278,7 +278,7 @@ class PageserverHttpClient(requests.Session):
|
||||
self.verbose_error(res)
|
||||
|
||||
def tenant_location_conf(
|
||||
self, tenant_id: TenantId, location_conf=dict[str, Any], flush_ms=None
|
||||
self, tenant_id: TenantId | TenantShardId, location_conf=dict[str, Any], flush_ms=None
|
||||
):
|
||||
body = location_conf.copy()
|
||||
body["tenant_id"] = str(tenant_id)
|
||||
@@ -294,7 +294,7 @@ class PageserverHttpClient(requests.Session):
|
||||
)
|
||||
self.verbose_error(res)
|
||||
|
||||
def tenant_delete(self, tenant_id: TenantId):
|
||||
def tenant_delete(self, tenant_id: TenantId | TenantShardId):
|
||||
res = self.delete(f"http://localhost:{self.port}/v1/tenant/{tenant_id}")
|
||||
self.verbose_error(res)
|
||||
return res
|
||||
@@ -310,23 +310,23 @@ class PageserverHttpClient(requests.Session):
|
||||
res = self.post(f"http://localhost:{self.port}/v1/tenant/{tenant_id}/ignore")
|
||||
self.verbose_error(res)
|
||||
|
||||
def tenant_status(self, tenant_id: TenantId) -> Dict[Any, Any]:
|
||||
def tenant_status(self, tenant_id: TenantId | TenantShardId) -> Dict[Any, Any]:
|
||||
res = self.get(f"http://localhost:{self.port}/v1/tenant/{tenant_id}")
|
||||
self.verbose_error(res)
|
||||
res_json = res.json()
|
||||
assert isinstance(res_json, dict)
|
||||
return res_json
|
||||
|
||||
def tenant_config(self, tenant_id: TenantId) -> TenantConfig:
|
||||
def tenant_config(self, tenant_id: TenantId | TenantShardId) -> TenantConfig:
|
||||
res = self.get(f"http://localhost:{self.port}/v1/tenant/{tenant_id}/config")
|
||||
self.verbose_error(res)
|
||||
return TenantConfig.from_json(res.json())
|
||||
|
||||
def tenant_heatmap_upload(self, tenant_id: TenantId):
|
||||
def tenant_heatmap_upload(self, tenant_id: TenantId | TenantShardId):
|
||||
res = self.post(f"http://localhost:{self.port}/v1/tenant/{tenant_id}/heatmap_upload")
|
||||
self.verbose_error(res)
|
||||
|
||||
def set_tenant_config(self, tenant_id: TenantId, config: dict[str, Any]):
|
||||
def set_tenant_config(self, tenant_id: TenantId | TenantShardId, config: dict[str, Any]):
|
||||
assert "tenant_id" not in config.keys()
|
||||
res = self.put(
|
||||
f"http://localhost:{self.port}/v1/tenant/config",
|
||||
@@ -348,10 +348,12 @@ class PageserverHttpClient(requests.Session):
|
||||
del current[key]
|
||||
self.set_tenant_config(tenant_id, current)
|
||||
|
||||
def tenant_size(self, tenant_id: TenantId) -> int:
|
||||
def tenant_size(self, tenant_id: TenantId | TenantShardId) -> int:
|
||||
return self.tenant_size_and_modelinputs(tenant_id)[0]
|
||||
|
||||
def tenant_size_and_modelinputs(self, tenant_id: TenantId) -> Tuple[int, Dict[str, Any]]:
|
||||
def tenant_size_and_modelinputs(
|
||||
self, tenant_id: TenantId | TenantShardId
|
||||
) -> Tuple[int, Dict[str, Any]]:
|
||||
"""
|
||||
Returns the tenant size, together with the model inputs as the second tuple item.
|
||||
"""
|
||||
@@ -366,7 +368,7 @@ class PageserverHttpClient(requests.Session):
|
||||
assert type(inputs) is dict
|
||||
return (size, inputs)
|
||||
|
||||
def tenant_size_debug(self, tenant_id: TenantId) -> str:
|
||||
def tenant_size_debug(self, tenant_id: TenantId | TenantShardId) -> str:
|
||||
"""
|
||||
Returns the tenant size debug info, as an HTML string
|
||||
"""
|
||||
@@ -378,7 +380,7 @@ class PageserverHttpClient(requests.Session):
|
||||
|
||||
def timeline_list(
|
||||
self,
|
||||
tenant_id: TenantId,
|
||||
tenant_id: TenantId | TenantShardId,
|
||||
include_non_incremental_logical_size: bool = False,
|
||||
include_timeline_dir_layer_file_size_sum: bool = False,
|
||||
) -> List[Dict[str, Any]]:
|
||||
@@ -399,7 +401,7 @@ class PageserverHttpClient(requests.Session):
|
||||
def timeline_create(
|
||||
self,
|
||||
pg_version: PgVersion,
|
||||
tenant_id: TenantId,
|
||||
tenant_id: TenantId | TenantShardId,
|
||||
new_timeline_id: TimelineId,
|
||||
ancestor_timeline_id: Optional[TimelineId] = None,
|
||||
ancestor_start_lsn: Optional[Lsn] = None,
|
||||
@@ -433,7 +435,7 @@ class PageserverHttpClient(requests.Session):
|
||||
|
||||
def timeline_detail(
|
||||
self,
|
||||
tenant_id: TenantId,
|
||||
tenant_id: TenantId | TenantShardId,
|
||||
timeline_id: TimelineId,
|
||||
include_non_incremental_logical_size: bool = False,
|
||||
include_timeline_dir_layer_file_size_sum: bool = False,
|
||||
@@ -455,7 +457,9 @@ class PageserverHttpClient(requests.Session):
|
||||
assert isinstance(res_json, dict)
|
||||
return res_json
|
||||
|
||||
def timeline_delete(self, tenant_id: TenantId, timeline_id: TimelineId, **kwargs):
|
||||
def timeline_delete(
|
||||
self, tenant_id: TenantId | TenantShardId, timeline_id: TimelineId, **kwargs
|
||||
):
|
||||
"""
|
||||
Note that deletion is not instant, it is scheduled and performed mostly in the background.
|
||||
So if you need to wait for it to complete use `timeline_delete_wait_completed`.
|
||||
@@ -469,7 +473,10 @@ class PageserverHttpClient(requests.Session):
|
||||
assert res_json is None
|
||||
|
||||
def timeline_gc(
|
||||
self, tenant_id: TenantId, timeline_id: TimelineId, gc_horizon: Optional[int]
|
||||
self,
|
||||
tenant_id: TenantId | TenantShardId,
|
||||
timeline_id: TimelineId,
|
||||
gc_horizon: Optional[int],
|
||||
) -> dict[str, Any]:
|
||||
"""
|
||||
Unlike most handlers, this will wait for the layers to be actually
|
||||
@@ -492,7 +499,7 @@ class PageserverHttpClient(requests.Session):
|
||||
return res_json
|
||||
|
||||
def timeline_compact(
|
||||
self, tenant_id: TenantId, timeline_id: TimelineId, force_repartition=False
|
||||
self, tenant_id: TenantId | TenantShardId, timeline_id: TimelineId, force_repartition=False
|
||||
):
|
||||
self.is_testing_enabled_or_skip()
|
||||
query = {}
|
||||
@@ -511,7 +518,7 @@ class PageserverHttpClient(requests.Session):
|
||||
|
||||
def timeline_get_lsn_by_timestamp(
|
||||
self,
|
||||
tenant_id: TenantId,
|
||||
tenant_id: TenantId | TenantShardId,
|
||||
timeline_id: TimelineId,
|
||||
timestamp,
|
||||
version: Optional[int] = None,
|
||||
@@ -530,7 +537,9 @@ class PageserverHttpClient(requests.Session):
|
||||
res_json = res.json()
|
||||
return res_json
|
||||
|
||||
def timeline_get_timestamp_of_lsn(self, tenant_id: TenantId, timeline_id: TimelineId, lsn: Lsn):
|
||||
def timeline_get_timestamp_of_lsn(
|
||||
self, tenant_id: TenantId | TenantShardId, timeline_id: TimelineId, lsn: Lsn
|
||||
):
|
||||
log.info(f"Requesting time range of lsn {lsn}, tenant {tenant_id}, timeline {timeline_id}")
|
||||
res = self.get(
|
||||
f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}/get_timestamp_of_lsn?lsn={lsn}",
|
||||
@@ -540,7 +549,7 @@ class PageserverHttpClient(requests.Session):
|
||||
return res_json
|
||||
|
||||
def timeline_checkpoint(
|
||||
self, tenant_id: TenantId, timeline_id: TimelineId, force_repartition=False
|
||||
self, tenant_id: TenantId | TenantShardId, timeline_id: TimelineId, force_repartition=False
|
||||
):
|
||||
self.is_testing_enabled_or_skip()
|
||||
query = {}
|
||||
@@ -559,7 +568,7 @@ class PageserverHttpClient(requests.Session):
|
||||
|
||||
def timeline_spawn_download_remote_layers(
|
||||
self,
|
||||
tenant_id: TenantId,
|
||||
tenant_id: TenantId | TenantShardId,
|
||||
timeline_id: TimelineId,
|
||||
max_concurrent_downloads: int,
|
||||
) -> dict[str, Any]:
|
||||
@@ -578,7 +587,7 @@ class PageserverHttpClient(requests.Session):
|
||||
|
||||
def timeline_poll_download_remote_layers_status(
|
||||
self,
|
||||
tenant_id: TenantId,
|
||||
tenant_id: TenantId | TenantShardId,
|
||||
timeline_id: TimelineId,
|
||||
spawn_response: dict[str, Any],
|
||||
poll_state=None,
|
||||
@@ -600,7 +609,7 @@ class PageserverHttpClient(requests.Session):
|
||||
|
||||
def timeline_download_remote_layers(
|
||||
self,
|
||||
tenant_id: TenantId,
|
||||
tenant_id: TenantId | TenantShardId,
|
||||
timeline_id: TimelineId,
|
||||
max_concurrent_downloads: int,
|
||||
errors_ok=False,
|
||||
@@ -682,9 +691,37 @@ class PageserverHttpClient(requests.Session):
|
||||
assert len(results) == 1, f"metric {name} with given filters is not unique, got: {results}"
|
||||
return results[0].value
|
||||
|
||||
def get_metrics_values(
|
||||
self, names: list[str], filter: Optional[Dict[str, str]] = None
|
||||
) -> Dict[str, float]:
|
||||
"""
|
||||
When fetching multiple named metrics, it is more efficient to use this
|
||||
than to call `get_metric_value` repeatedly.
|
||||
|
||||
Throws RuntimeError if no metrics matching `names` are found, or if
|
||||
not all of `names` are found: this method is intended for loading sets
|
||||
of metrics whose existence is coupled.
|
||||
"""
|
||||
metrics = self.get_metrics()
|
||||
samples = []
|
||||
for name in names:
|
||||
samples.extend(metrics.query_all(name, filter=filter))
|
||||
|
||||
result = {}
|
||||
for sample in samples:
|
||||
if sample.name in result:
|
||||
raise RuntimeError(f"Multiple values found for {sample.name}")
|
||||
result[sample.name] = sample.value
|
||||
|
||||
if len(result) != len(names):
|
||||
log.info(f"Metrics found: {metrics.metrics}")
|
||||
raise RuntimeError(f"could not find all metrics {' '.join(names)}")
|
||||
|
||||
return result
|
||||
|
||||
def layer_map_info(
|
||||
self,
|
||||
tenant_id: TenantId,
|
||||
tenant_id: TenantId | TenantShardId,
|
||||
timeline_id: TimelineId,
|
||||
) -> LayerMapInfo:
|
||||
res = self.get(
|
||||
@@ -693,7 +730,9 @@ class PageserverHttpClient(requests.Session):
|
||||
self.verbose_error(res)
|
||||
return LayerMapInfo.from_json(res.json())
|
||||
|
||||
def download_layer(self, tenant_id: TenantId, timeline_id: TimelineId, layer_name: str):
|
||||
def download_layer(
|
||||
self, tenant_id: TenantId | TenantShardId, timeline_id: TimelineId, layer_name: str
|
||||
):
|
||||
res = self.get(
|
||||
f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}/layer/{layer_name}",
|
||||
)
|
||||
@@ -701,14 +740,16 @@ class PageserverHttpClient(requests.Session):
|
||||
|
||||
assert res.status_code == 200
|
||||
|
||||
def download_all_layers(self, tenant_id: TenantId, timeline_id: TimelineId):
|
||||
def download_all_layers(self, tenant_id: TenantId | TenantShardId, timeline_id: TimelineId):
|
||||
info = self.layer_map_info(tenant_id, timeline_id)
|
||||
for layer in info.historic_layers:
|
||||
if not layer.remote:
|
||||
continue
|
||||
self.download_layer(tenant_id, timeline_id, layer.layer_file_name)
|
||||
|
||||
def evict_layer(self, tenant_id: TenantId, timeline_id: TimelineId, layer_name: str):
|
||||
def evict_layer(
|
||||
self, tenant_id: TenantId | TenantShardId, timeline_id: TimelineId, layer_name: str
|
||||
):
|
||||
res = self.delete(
|
||||
f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}/layer/{layer_name}",
|
||||
)
|
||||
@@ -716,7 +757,7 @@ class PageserverHttpClient(requests.Session):
|
||||
|
||||
assert res.status_code in (200, 304)
|
||||
|
||||
def evict_all_layers(self, tenant_id: TenantId, timeline_id: TimelineId):
|
||||
def evict_all_layers(self, tenant_id: TenantId | TenantShardId, timeline_id: TimelineId):
|
||||
info = self.layer_map_info(tenant_id, timeline_id)
|
||||
for layer in info.historic_layers:
|
||||
self.evict_layer(tenant_id, timeline_id, layer.layer_file_name)
|
||||
@@ -729,7 +770,7 @@ class PageserverHttpClient(requests.Session):
|
||||
self.verbose_error(res)
|
||||
return res.json()
|
||||
|
||||
def tenant_break(self, tenant_id: TenantId):
|
||||
def tenant_break(self, tenant_id: TenantId | TenantShardId):
|
||||
res = self.put(f"http://localhost:{self.port}/v1/tenant/{tenant_id}/break")
|
||||
self.verbose_error(res)
|
||||
|
||||
|
||||
@@ -6,7 +6,7 @@ from mypy_boto3_s3.type_defs import ListObjectsV2OutputTypeDef, ObjectTypeDef
|
||||
from fixtures.log_helper import log
|
||||
from fixtures.pageserver.http import PageserverApiException, PageserverHttpClient
|
||||
from fixtures.remote_storage import RemoteStorageKind, S3Storage
|
||||
from fixtures.types import Lsn, TenantId, TimelineId
|
||||
from fixtures.types import Lsn, TenantId, TenantShardId, TimelineId
|
||||
from fixtures.utils import wait_until
|
||||
|
||||
|
||||
@@ -22,7 +22,7 @@ def assert_tenant_state(
|
||||
|
||||
|
||||
def remote_consistent_lsn(
|
||||
pageserver_http: PageserverHttpClient, tenant: TenantId, timeline: TimelineId
|
||||
pageserver_http: PageserverHttpClient, tenant: TenantId | TenantShardId, timeline: TimelineId
|
||||
) -> Lsn:
|
||||
detail = pageserver_http.timeline_detail(tenant, timeline)
|
||||
|
||||
@@ -39,7 +39,7 @@ def remote_consistent_lsn(
|
||||
|
||||
def wait_for_upload(
|
||||
pageserver_http: PageserverHttpClient,
|
||||
tenant: TenantId,
|
||||
tenant: TenantId | TenantShardId,
|
||||
timeline: TimelineId,
|
||||
lsn: Lsn,
|
||||
):
|
||||
@@ -92,7 +92,7 @@ def wait_until_tenant_state(
|
||||
|
||||
def wait_until_timeline_state(
|
||||
pageserver_http: PageserverHttpClient,
|
||||
tenant_id: TenantId,
|
||||
tenant_id: TenantId | TenantShardId,
|
||||
timeline_id: TimelineId,
|
||||
expected_state: str,
|
||||
iterations: int,
|
||||
@@ -141,7 +141,9 @@ def wait_until_tenant_active(
|
||||
|
||||
|
||||
def last_record_lsn(
|
||||
pageserver_http_client: PageserverHttpClient, tenant: TenantId, timeline: TimelineId
|
||||
pageserver_http_client: PageserverHttpClient,
|
||||
tenant: TenantId | TenantShardId,
|
||||
timeline: TimelineId,
|
||||
) -> Lsn:
|
||||
detail = pageserver_http_client.timeline_detail(tenant, timeline)
|
||||
|
||||
@@ -152,7 +154,7 @@ def last_record_lsn(
|
||||
|
||||
def wait_for_last_record_lsn(
|
||||
pageserver_http: PageserverHttpClient,
|
||||
tenant: TenantId,
|
||||
tenant: TenantId | TenantShardId,
|
||||
timeline: TimelineId,
|
||||
lsn: Lsn,
|
||||
) -> Lsn:
|
||||
@@ -194,7 +196,7 @@ def wait_for_upload_queue_empty(
|
||||
|
||||
def wait_timeline_detail_404(
|
||||
pageserver_http: PageserverHttpClient,
|
||||
tenant_id: TenantId,
|
||||
tenant_id: TenantId | TenantShardId,
|
||||
timeline_id: TimelineId,
|
||||
iterations: int,
|
||||
interval: Optional[float] = None,
|
||||
@@ -219,7 +221,7 @@ def wait_timeline_detail_404(
|
||||
|
||||
def timeline_delete_wait_completed(
|
||||
pageserver_http: PageserverHttpClient,
|
||||
tenant_id: TenantId,
|
||||
tenant_id: TenantId | TenantShardId,
|
||||
timeline_id: TimelineId,
|
||||
iterations: int = 20,
|
||||
interval: Optional[float] = None,
|
||||
|
||||
@@ -5,6 +5,7 @@ from fixtures.neon_fixtures import (
|
||||
Endpoint,
|
||||
NeonEnv,
|
||||
last_flush_lsn_upload,
|
||||
tenant_get_shards,
|
||||
wait_for_last_flush_lsn,
|
||||
)
|
||||
from fixtures.pageserver.utils import wait_for_last_record_lsn, wait_for_upload
|
||||
@@ -31,7 +32,7 @@ class Workload:
|
||||
|
||||
self._endpoint: Optional[Endpoint] = None
|
||||
|
||||
def endpoint(self, pageserver_id: int) -> Endpoint:
|
||||
def endpoint(self, pageserver_id: Optional[int] = None) -> Endpoint:
|
||||
if self._endpoint is None:
|
||||
self._endpoint = self.env.endpoints.create(
|
||||
"main",
|
||||
@@ -54,7 +55,7 @@ class Workload:
|
||||
if self._endpoint is not None:
|
||||
self._endpoint.stop()
|
||||
|
||||
def init(self, pageserver_id: int):
|
||||
def init(self, pageserver_id: Optional[int] = None):
|
||||
endpoint = self.endpoint(pageserver_id)
|
||||
|
||||
endpoint.safe_psql(f"CREATE TABLE {self.table} (id INTEGER PRIMARY KEY, val text);")
|
||||
@@ -63,7 +64,7 @@ class Workload:
|
||||
self.env, endpoint, self.tenant_id, self.timeline_id, pageserver_id=pageserver_id
|
||||
)
|
||||
|
||||
def write_rows(self, n, pageserver_id):
|
||||
def write_rows(self, n, pageserver_id: Optional[int] = None):
|
||||
endpoint = self.endpoint(pageserver_id)
|
||||
start = self.expect_rows
|
||||
end = start + n - 1
|
||||
@@ -81,7 +82,7 @@ class Workload:
|
||||
self.env, endpoint, self.tenant_id, self.timeline_id, pageserver_id=pageserver_id
|
||||
)
|
||||
|
||||
def churn_rows(self, n, pageserver_id, upload=True):
|
||||
def churn_rows(self, n, pageserver_id: Optional[int] = None, upload=True):
|
||||
assert self.expect_rows >= n
|
||||
|
||||
max_iters = 10
|
||||
@@ -119,21 +120,24 @@ class Workload:
|
||||
]
|
||||
)
|
||||
|
||||
last_flush_lsn = wait_for_last_flush_lsn(
|
||||
self.env, endpoint, self.tenant_id, self.timeline_id, pageserver_id=pageserver_id
|
||||
)
|
||||
ps_http = self.env.get_pageserver(pageserver_id).http_client()
|
||||
wait_for_last_record_lsn(ps_http, self.tenant_id, self.timeline_id, last_flush_lsn)
|
||||
for tenant_shard_id, pageserver in tenant_get_shards(
|
||||
self.env, self.tenant_id, pageserver_id
|
||||
):
|
||||
last_flush_lsn = wait_for_last_flush_lsn(
|
||||
self.env, endpoint, self.tenant_id, self.timeline_id, pageserver_id=pageserver_id
|
||||
)
|
||||
ps_http = pageserver.http_client()
|
||||
wait_for_last_record_lsn(ps_http, tenant_shard_id, self.timeline_id, last_flush_lsn)
|
||||
|
||||
if upload:
|
||||
# force a checkpoint to trigger upload
|
||||
ps_http.timeline_checkpoint(self.tenant_id, self.timeline_id)
|
||||
wait_for_upload(ps_http, self.tenant_id, self.timeline_id, last_flush_lsn)
|
||||
log.info(f"Churn: waiting for remote LSN {last_flush_lsn}")
|
||||
else:
|
||||
log.info(f"Churn: not waiting for upload, disk LSN {last_flush_lsn}")
|
||||
if upload:
|
||||
# force a checkpoint to trigger upload
|
||||
ps_http.timeline_checkpoint(tenant_shard_id, self.timeline_id)
|
||||
wait_for_upload(ps_http, tenant_shard_id, self.timeline_id, last_flush_lsn)
|
||||
log.info(f"Churn: waiting for remote LSN {last_flush_lsn}")
|
||||
else:
|
||||
log.info(f"Churn: not waiting for upload, disk LSN {last_flush_lsn}")
|
||||
|
||||
def validate(self, pageserver_id):
|
||||
def validate(self, pageserver_id: Optional[int] = None):
|
||||
endpoint = self.endpoint(pageserver_id)
|
||||
result = endpoint.safe_psql_many(
|
||||
[
|
||||
|
||||
@@ -1,4 +1,6 @@
|
||||
import random
|
||||
from contextlib import closing
|
||||
from typing import Optional
|
||||
|
||||
import pytest
|
||||
from fixtures.log_helper import log
|
||||
@@ -141,18 +143,24 @@ def test_pageserver_restart(neon_env_builder: NeonEnvBuilder):
|
||||
# Test that repeatedly kills and restarts the page server, while the
|
||||
# safekeeper and compute node keep running.
|
||||
@pytest.mark.timeout(540)
|
||||
def test_pageserver_chaos(neon_env_builder: NeonEnvBuilder, build_type: str):
|
||||
@pytest.mark.parametrize("shard_count", [None, 4])
|
||||
def test_pageserver_chaos(
|
||||
neon_env_builder: NeonEnvBuilder, build_type: str, shard_count: Optional[int]
|
||||
):
|
||||
if build_type == "debug":
|
||||
pytest.skip("times out in debug builds")
|
||||
|
||||
neon_env_builder.enable_pageserver_remote_storage(s3_storage())
|
||||
neon_env_builder.enable_scrub_on_exit()
|
||||
if shard_count is not None:
|
||||
neon_env_builder.num_pageservers = shard_count
|
||||
|
||||
env = neon_env_builder.init_start()
|
||||
env = neon_env_builder.init_start(initial_tenant_shard_count=shard_count)
|
||||
|
||||
# these can happen, if we shutdown at a good time. to be fixed as part of #5172.
|
||||
message = ".*duplicated L1 layer layer=.*"
|
||||
env.pageserver.allowed_errors.append(message)
|
||||
for ps in env.pageservers:
|
||||
ps.allowed_errors.append(message)
|
||||
|
||||
# Use a tiny checkpoint distance, to create a lot of layers quickly.
|
||||
# That allows us to stress the compaction and layer flushing logic more.
|
||||
@@ -192,13 +200,19 @@ def test_pageserver_chaos(neon_env_builder: NeonEnvBuilder, build_type: str):
|
||||
log.info(f"shared_buffers is {row[0]}, table size {row[1]}")
|
||||
assert int(row[0]) < int(row[1])
|
||||
|
||||
# We run "random" kills using a fixed seed, to improve reproducibility if a test
|
||||
# failure is related to a particular order of operations.
|
||||
seed = 0xDEADBEEF
|
||||
rng = random.Random(seed)
|
||||
|
||||
# Update the whole table, then immediately kill and restart the pageserver
|
||||
for i in range(1, 15):
|
||||
endpoint.safe_psql("UPDATE foo set updates = updates + 1")
|
||||
|
||||
# This kills the pageserver immediately, to simulate a crash
|
||||
env.pageserver.stop(immediate=True)
|
||||
env.pageserver.start()
|
||||
to_kill = rng.choice(env.pageservers)
|
||||
to_kill.stop(immediate=True)
|
||||
to_kill.start()
|
||||
|
||||
# Check that all the updates are visible
|
||||
num_updates = endpoint.safe_psql("SELECT sum(updates) FROM foo")[0][0]
|
||||
|
||||
@@ -2,25 +2,40 @@
|
||||
# This file runs pg_regress-based tests.
|
||||
#
|
||||
from pathlib import Path
|
||||
from typing import Optional
|
||||
|
||||
from fixtures.neon_fixtures import NeonEnv, check_restored_datadir_content
|
||||
import pytest
|
||||
from fixtures.neon_fixtures import (
|
||||
NeonEnvBuilder,
|
||||
check_restored_datadir_content,
|
||||
)
|
||||
from fixtures.remote_storage import s3_storage
|
||||
|
||||
|
||||
# Run the main PostgreSQL regression tests, in src/test/regress.
|
||||
#
|
||||
@pytest.mark.parametrize("shard_count", [None, 4])
|
||||
def test_pg_regress(
|
||||
neon_simple_env: NeonEnv,
|
||||
neon_env_builder: NeonEnvBuilder,
|
||||
test_output_dir: Path,
|
||||
pg_bin,
|
||||
capsys,
|
||||
base_dir: Path,
|
||||
pg_distrib_dir: Path,
|
||||
shard_count: Optional[int],
|
||||
):
|
||||
env = neon_simple_env
|
||||
"""
|
||||
:param shard_count: if None, create an unsharded tenant. Otherwise create a tenant with this
|
||||
many shards.
|
||||
"""
|
||||
if shard_count is not None:
|
||||
neon_env_builder.num_pageservers = shard_count
|
||||
neon_env_builder.enable_pageserver_remote_storage(s3_storage())
|
||||
neon_env_builder.enable_scrub_on_exit()
|
||||
env = neon_env_builder.init_start(initial_tenant_shard_count=shard_count)
|
||||
|
||||
env.neon_cli.create_branch("test_pg_regress", "empty")
|
||||
# Connect to postgres and create a database called "regression".
|
||||
endpoint = env.endpoints.create_start("test_pg_regress")
|
||||
endpoint = env.endpoints.create_start("main")
|
||||
endpoint.safe_psql("CREATE DATABASE regression")
|
||||
|
||||
# Create some local directories for pg_regress to run in.
|
||||
@@ -61,22 +76,25 @@ def test_pg_regress(
|
||||
|
||||
# Run the PostgreSQL "isolation" tests, in src/test/isolation.
|
||||
#
|
||||
@pytest.mark.parametrize("shard_count", [None, 4])
|
||||
def test_isolation(
|
||||
neon_simple_env: NeonEnv,
|
||||
neon_env_builder: NeonEnvBuilder,
|
||||
test_output_dir: Path,
|
||||
pg_bin,
|
||||
capsys,
|
||||
base_dir: Path,
|
||||
pg_distrib_dir: Path,
|
||||
shard_count: Optional[int],
|
||||
):
|
||||
env = neon_simple_env
|
||||
if shard_count is not None:
|
||||
neon_env_builder.num_pageservers = shard_count
|
||||
neon_env_builder.enable_pageserver_remote_storage(s3_storage())
|
||||
neon_env_builder.enable_scrub_on_exit()
|
||||
env = neon_env_builder.init_start(initial_tenant_shard_count=shard_count)
|
||||
|
||||
env.neon_cli.create_branch("test_isolation", "empty")
|
||||
# Connect to postgres and create a database called "regression".
|
||||
# isolation tests use prepared transactions, so enable them
|
||||
endpoint = env.endpoints.create_start(
|
||||
"test_isolation", config_lines=["max_prepared_transactions=100"]
|
||||
)
|
||||
endpoint = env.endpoints.create_start("main", config_lines=["max_prepared_transactions=100"])
|
||||
endpoint.safe_psql("CREATE DATABASE isolation_regression")
|
||||
|
||||
# Create some local directories for pg_isolation_regress to run in.
|
||||
@@ -114,19 +132,24 @@ def test_isolation(
|
||||
|
||||
# Run extra Neon-specific pg_regress-based tests. The tests and their
|
||||
# schedule file are in the sql_regress/ directory.
|
||||
@pytest.mark.parametrize("shard_count", [None, 4])
|
||||
def test_sql_regress(
|
||||
neon_simple_env: NeonEnv,
|
||||
neon_env_builder: NeonEnvBuilder,
|
||||
test_output_dir: Path,
|
||||
pg_bin,
|
||||
capsys,
|
||||
base_dir: Path,
|
||||
pg_distrib_dir: Path,
|
||||
shard_count: Optional[int],
|
||||
):
|
||||
env = neon_simple_env
|
||||
if shard_count is not None:
|
||||
neon_env_builder.num_pageservers = shard_count
|
||||
neon_env_builder.enable_pageserver_remote_storage(s3_storage())
|
||||
neon_env_builder.enable_scrub_on_exit()
|
||||
env = neon_env_builder.init_start(initial_tenant_shard_count=shard_count)
|
||||
|
||||
env.neon_cli.create_branch("test_sql_regress", "empty")
|
||||
# Connect to postgres and create a database called "regression".
|
||||
endpoint = env.endpoints.create_start("test_sql_regress")
|
||||
endpoint = env.endpoints.create_start("main")
|
||||
endpoint.safe_psql("CREATE DATABASE regression")
|
||||
|
||||
# Create some local directories for pg_regress to run in.
|
||||
|
||||
144
test_runner/regress/test_sharding.py
Normal file
144
test_runner/regress/test_sharding.py
Normal file
@@ -0,0 +1,144 @@
|
||||
from fixtures.log_helper import log
|
||||
from fixtures.neon_fixtures import (
|
||||
NeonEnvBuilder,
|
||||
tenant_get_shards,
|
||||
)
|
||||
from fixtures.remote_storage import s3_storage
|
||||
from fixtures.workload import Workload
|
||||
|
||||
|
||||
def test_sharding_smoke(
|
||||
neon_env_builder: NeonEnvBuilder,
|
||||
):
|
||||
"""
|
||||
Test the basic lifecycle of a sharded tenant:
|
||||
- ingested data gets split up
|
||||
- page service reads
|
||||
- timeline creation and deletion
|
||||
- splits
|
||||
"""
|
||||
|
||||
# We will start with 4 shards and split into 8, then migrate all those
|
||||
# 8 shards onto separate pageservers
|
||||
shard_count = 4
|
||||
split_shard_count = 8
|
||||
neon_env_builder.num_pageservers = split_shard_count
|
||||
|
||||
# 1MiB stripes: enable getting some meaningful data distribution without
|
||||
# writing large quantities of data in this test.
|
||||
stripe_size = 128
|
||||
|
||||
neon_env_builder.enable_pageserver_remote_storage(s3_storage())
|
||||
neon_env_builder.enable_scrub_on_exit()
|
||||
neon_env_builder.preserve_database_files = True
|
||||
|
||||
env = neon_env_builder.init_start(
|
||||
initial_tenant_shard_count=shard_count, initial_tenant_shard_stripe_size=stripe_size
|
||||
)
|
||||
tenant_id = env.initial_tenant
|
||||
timeline_id = env.initial_timeline
|
||||
|
||||
for ps in env.pageservers:
|
||||
ps.allowed_errors.extend(
|
||||
[
|
||||
# FIXME: during a split, control plane should respond affirmatively to validation requests
|
||||
# that refer to a shard that no longer exists, but has a child shard.
|
||||
".*Dropped remote consistent LSN updates.*",
|
||||
# FIXME: improve logging in the pageserver so that this isn't considered an erorr, or
|
||||
# figure out how to make the migration even more seamless.
|
||||
".*Tenant.*is not active.*",
|
||||
]
|
||||
)
|
||||
|
||||
# TODO: do some timeline creations & deletions on the sharded tenant
|
||||
# TODO: validate that timeline APIs show the created timelines on all shards
|
||||
|
||||
workload = Workload(env, tenant_id, timeline_id)
|
||||
workload.init()
|
||||
workload.write_rows(1000)
|
||||
|
||||
# Note which pageservers initially hold a shard after tenant creation
|
||||
pre_split_pageserver_ids = [loc["node_id"] for loc in env.attachment_service.locate(tenant_id)]
|
||||
|
||||
# For pageservers holding a shard, validate their ingest statistics
|
||||
# reflect a proper splitting of the WAL.
|
||||
for pageserver in env.pageservers:
|
||||
if pageserver.id not in pre_split_pageserver_ids:
|
||||
continue
|
||||
|
||||
metrics = pageserver.http_client().get_metrics_values(
|
||||
[
|
||||
"pageserver_wal_ingest_records_received_total",
|
||||
"pageserver_wal_ingest_records_committed_total",
|
||||
"pageserver_wal_ingest_records_filtered_total",
|
||||
]
|
||||
)
|
||||
|
||||
log.info(f"Pageserver {pageserver.id} metrics: {metrics}")
|
||||
|
||||
# Not everything received was committed
|
||||
assert (
|
||||
metrics["pageserver_wal_ingest_records_received_total"]
|
||||
> metrics["pageserver_wal_ingest_records_committed_total"]
|
||||
)
|
||||
|
||||
# Something was committed
|
||||
assert metrics["pageserver_wal_ingest_records_committed_total"] > 0
|
||||
|
||||
# Counts are self consistent
|
||||
assert (
|
||||
metrics["pageserver_wal_ingest_records_received_total"]
|
||||
== metrics["pageserver_wal_ingest_records_committed_total"]
|
||||
+ metrics["pageserver_wal_ingest_records_filtered_total"]
|
||||
)
|
||||
|
||||
# TODO: validate that shards have different sizes
|
||||
|
||||
workload.validate()
|
||||
|
||||
assert len(pre_split_pageserver_ids) == 4
|
||||
|
||||
env.attachment_service.tenant_shard_split(tenant_id, shard_count=split_shard_count)
|
||||
|
||||
post_split_pageserver_ids = [loc["node_id"] for loc in env.attachment_service.locate(tenant_id)]
|
||||
# We should have split into 8 shards, on the same 4 pageservers we started on.
|
||||
assert len(post_split_pageserver_ids) == split_shard_count
|
||||
assert len(set(post_split_pageserver_ids)) == shard_count
|
||||
assert set(post_split_pageserver_ids) == set(pre_split_pageserver_ids)
|
||||
|
||||
workload.validate()
|
||||
|
||||
workload.churn_rows(1000)
|
||||
|
||||
workload.validate()
|
||||
|
||||
# Run GC on all new shards, to check they don't barf or delete anything that breaks reads
|
||||
# (compaction was already run as part of churn_rows)
|
||||
all_shards = tenant_get_shards(env, tenant_id)
|
||||
for tenant_shard_id, pageserver in all_shards:
|
||||
pageserver.http_client().timeline_gc(tenant_shard_id, timeline_id, None)
|
||||
|
||||
# Restart all nodes, to check that the newly created shards are durable
|
||||
for ps in env.pageservers:
|
||||
ps.restart()
|
||||
|
||||
workload.validate()
|
||||
|
||||
migrate_to_pageserver_ids = list(
|
||||
set(p.id for p in env.pageservers) - set(pre_split_pageserver_ids)
|
||||
)
|
||||
assert len(migrate_to_pageserver_ids) == split_shard_count - shard_count
|
||||
|
||||
# Migrate shards away from the node where the split happened
|
||||
for ps_id in pre_split_pageserver_ids:
|
||||
shards_here = [
|
||||
tenant_shard_id
|
||||
for (tenant_shard_id, pageserver) in all_shards
|
||||
if pageserver.id == ps_id
|
||||
]
|
||||
assert len(shards_here) == 2
|
||||
migrate_shard = shards_here[0]
|
||||
destination = migrate_to_pageserver_ids.pop()
|
||||
|
||||
log.info(f"Migrating shard {migrate_shard} from {ps_id} to {destination}")
|
||||
env.neon_cli.tenant_migrate(migrate_shard, destination, timeout_secs=10)
|
||||
Reference in New Issue
Block a user