Mirror of https://github.com/neondatabase/neon.git, synced 2026-01-20 03:42:55 +00:00

Compare commits: 3 commits, release-64...proxy-http

| Author | SHA1 | Date |
|---|---|---|
| | 0e551edb06 | |
| | 484cdccbf2 | |
| | 39d1b78817 | |

@@ -379,7 +379,7 @@ where
    }
}

pub(crate) fn process_has_stopped(pid: Pid) -> anyhow::Result<bool> {
fn process_has_stopped(pid: Pid) -> anyhow::Result<bool> {
    match kill(pid, None) {
        // Process exists, keep waiting
        Ok(_) => Ok(false),

@@ -15,9 +15,7 @@ use control_plane::local_env::{
|
||||
};
|
||||
use control_plane::pageserver::PageServerNode;
|
||||
use control_plane::safekeeper::SafekeeperNode;
|
||||
use control_plane::storage_controller::{
|
||||
NeonStorageControllerStartArgs, NeonStorageControllerStopArgs, StorageController,
|
||||
};
|
||||
use control_plane::storage_controller::StorageController;
|
||||
use control_plane::{broker, local_env};
|
||||
use pageserver_api::config::{
|
||||
DEFAULT_HTTP_LISTEN_PORT as DEFAULT_PAGESERVER_HTTP_PORT,
|
||||
@@ -1054,36 +1052,6 @@ fn get_start_timeout(args: &ArgMatches) -> &Duration {
|
||||
humantime_duration.as_ref()
|
||||
}
|
||||
|
||||
fn storage_controller_start_args(args: &ArgMatches) -> NeonStorageControllerStartArgs {
|
||||
let maybe_instance_id = args.get_one::<u8>("instance-id");
|
||||
|
||||
let base_port = args.get_one::<u16>("base-port");
|
||||
|
||||
if maybe_instance_id.is_some() && base_port.is_none() {
|
||||
panic!("storage-controller start specificied instance-id but did not provide base-port");
|
||||
}
|
||||
|
||||
let start_timeout = args
|
||||
.get_one::<humantime::Duration>("start-timeout")
|
||||
.expect("invalid value for start-timeout");
|
||||
|
||||
NeonStorageControllerStartArgs {
|
||||
instance_id: maybe_instance_id.copied().unwrap_or(1),
|
||||
base_port: base_port.copied(),
|
||||
start_timeout: *start_timeout,
|
||||
}
|
||||
}
|
||||
|
||||
fn storage_controller_stop_args(args: &ArgMatches) -> NeonStorageControllerStopArgs {
|
||||
let maybe_instance_id = args.get_one::<u8>("instance-id");
|
||||
let immediate = args.get_one::<String>("stop-mode").map(|s| s.as_str()) == Some("immediate");
|
||||
|
||||
NeonStorageControllerStopArgs {
|
||||
instance_id: maybe_instance_id.copied().unwrap_or(1),
|
||||
immediate,
|
||||
}
|
||||
}
|
||||
|
||||
async fn handle_pageserver(sub_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<()> {
|
||||
match sub_match.subcommand() {
|
||||
Some(("start", subcommand_args)) => {
|
||||
@@ -1145,14 +1113,19 @@ async fn handle_storage_controller(
|
||||
let svc = StorageController::from_env(env);
|
||||
match sub_match.subcommand() {
|
||||
Some(("start", start_match)) => {
|
||||
if let Err(e) = svc.start(storage_controller_start_args(start_match)).await {
|
||||
if let Err(e) = svc.start(get_start_timeout(start_match)).await {
|
||||
eprintln!("start failed: {e}");
|
||||
exit(1);
|
||||
}
|
||||
}
|
||||
|
||||
Some(("stop", stop_match)) => {
|
||||
if let Err(e) = svc.stop(storage_controller_stop_args(stop_match)).await {
|
||||
let immediate = stop_match
|
||||
.get_one::<String>("stop-mode")
|
||||
.map(|s| s.as_str())
|
||||
== Some("immediate");
|
||||
|
||||
if let Err(e) = svc.stop(immediate).await {
|
||||
eprintln!("stop failed: {}", e);
|
||||
exit(1);
|
||||
}
|
||||
@@ -1255,12 +1228,7 @@ async fn handle_start_all(
|
||||
// Only start the storage controller if the pageserver is configured to need it
|
||||
if env.control_plane_api.is_some() {
|
||||
let storage_controller = StorageController::from_env(env);
|
||||
if let Err(e) = storage_controller
|
||||
.start(NeonStorageControllerStartArgs::with_default_instance_id(
|
||||
(*retry_timeout).into(),
|
||||
))
|
||||
.await
|
||||
{
|
||||
if let Err(e) = storage_controller.start(retry_timeout).await {
|
||||
eprintln!("storage_controller start failed: {:#}", e);
|
||||
try_stop_all(env, true).await;
|
||||
exit(1);
|
||||
@@ -1390,21 +1358,10 @@ async fn try_stop_all(env: &local_env::LocalEnv, immediate: bool) {
|
||||
eprintln!("neon broker stop failed: {e:#}");
|
||||
}
|
||||
|
||||
// Stop all storage controller instances. In the most common case there's only one,
|
||||
// but iterate through the base data directory in order to discover the instances.
|
||||
let storcon_instances = env
|
||||
.storage_controller_instances()
|
||||
.await
|
||||
.expect("Must inspect data dir");
|
||||
for (instance_id, _instance_dir_path) in storcon_instances {
|
||||
if env.control_plane_api.is_some() {
|
||||
let storage_controller = StorageController::from_env(env);
|
||||
let stop_args = NeonStorageControllerStopArgs {
|
||||
instance_id,
|
||||
immediate,
|
||||
};
|
||||
|
||||
if let Err(e) = storage_controller.stop(stop_args).await {
|
||||
eprintln!("Storage controller instance {instance_id} stop failed: {e:#}");
|
||||
if let Err(e) = storage_controller.stop(immediate).await {
|
||||
eprintln!("storage controller stop failed: {e:#}");
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -1544,18 +1501,6 @@ fn cli() -> Command {
|
||||
.action(ArgAction::SetTrue)
|
||||
.required(false);
|
||||
|
||||
let instance_id = Arg::new("instance-id")
|
||||
.long("instance-id")
|
||||
.help("Identifier used to distinguish storage controller instances (default 1)")
|
||||
.value_parser(value_parser!(u8))
|
||||
.required(false);
|
||||
|
||||
let base_port = Arg::new("base-port")
|
||||
.long("base-port")
|
||||
.help("Base port for the storage controller instance idenfified by instance-id (defaults to pagserver cplane api)")
|
||||
.value_parser(value_parser!(u16))
|
||||
.required(false);
|
||||
|
||||
Command::new("Neon CLI")
|
||||
.arg_required_else_help(true)
|
||||
.version(GIT_VERSION)
|
||||
@@ -1664,12 +1609,9 @@ fn cli() -> Command {
|
||||
.arg_required_else_help(true)
|
||||
.about("Manage storage_controller")
|
||||
.subcommand(Command::new("start").about("Start storage controller")
|
||||
.arg(timeout_arg.clone())
|
||||
.arg(instance_id.clone())
|
||||
.arg(base_port))
|
||||
.arg(timeout_arg.clone()))
|
||||
.subcommand(Command::new("stop").about("Stop storage controller")
|
||||
.arg(stop_mode_arg.clone())
|
||||
.arg(instance_id))
|
||||
.arg(stop_mode_arg.clone()))
|
||||
)
|
||||
.subcommand(
|
||||
Command::new("safekeeper")
|
||||
|
||||
@@ -156,11 +156,6 @@ pub struct NeonStorageControllerConf {
|
||||
#[serde(with = "humantime_serde")]
|
||||
pub max_warming_up: Duration,
|
||||
|
||||
pub start_as_candidate: bool,
|
||||
|
||||
/// Database url used when running multiple storage controller instances
|
||||
pub database_url: Option<SocketAddr>,
|
||||
|
||||
/// Threshold for auto-splitting a tenant into shards
|
||||
pub split_threshold: Option<u64>,
|
||||
|
||||
@@ -179,8 +174,6 @@ impl Default for NeonStorageControllerConf {
|
||||
Self {
|
||||
max_offline: Self::DEFAULT_MAX_OFFLINE_INTERVAL,
|
||||
max_warming_up: Self::DEFAULT_MAX_WARMING_UP_INTERVAL,
|
||||
start_as_candidate: false,
|
||||
database_url: None,
|
||||
split_threshold: None,
|
||||
max_secondary_lag_bytes: None,
|
||||
}
|
||||
@@ -399,36 +392,6 @@ impl LocalEnv {
|
||||
}
|
||||
}
|
||||
|
||||
/// Inspect the base data directory and extract the instance id and instance directory path
|
||||
/// for all storage controller instances
|
||||
pub async fn storage_controller_instances(&self) -> std::io::Result<Vec<(u8, PathBuf)>> {
|
||||
let mut instances = Vec::default();
|
||||
|
||||
let dir = std::fs::read_dir(self.base_data_dir.clone())?;
|
||||
for dentry in dir {
|
||||
let dentry = dentry?;
|
||||
let is_dir = dentry.metadata()?.is_dir();
|
||||
let filename = dentry.file_name().into_string().unwrap();
|
||||
let parsed_instance_id = match filename.strip_prefix("storage_controller_") {
|
||||
Some(suffix) => suffix.parse::<u8>().ok(),
|
||||
None => None,
|
||||
};
|
||||
|
||||
let is_instance_dir = is_dir && parsed_instance_id.is_some();
|
||||
|
||||
if !is_instance_dir {
|
||||
continue;
|
||||
}
|
||||
|
||||
instances.push((
|
||||
parsed_instance_id.expect("Checked previously"),
|
||||
dentry.path(),
|
||||
));
|
||||
}
|
||||
|
||||
Ok(instances)
|
||||
}
|
||||
|
||||
pub fn register_branch_mapping(
|
||||
&mut self,
|
||||
branch_name: String,
|
||||
|
||||
@@ -3,8 +3,6 @@ use crate::{
|
||||
local_env::{LocalEnv, NeonStorageControllerConf},
|
||||
};
|
||||
use camino::{Utf8Path, Utf8PathBuf};
|
||||
use hyper::Uri;
|
||||
use nix::unistd::Pid;
|
||||
use pageserver_api::{
|
||||
controller_api::{
|
||||
NodeConfigureRequest, NodeDescribeResponse, NodeRegisterRequest, TenantCreateRequest,
|
||||
@@ -20,7 +18,7 @@ use pageserver_client::mgmt_api::ResponseErrorMessageExt;
|
||||
use postgres_backend::AuthType;
|
||||
use reqwest::Method;
|
||||
use serde::{de::DeserializeOwned, Deserialize, Serialize};
|
||||
use std::{fs, net::SocketAddr, path::PathBuf, str::FromStr, sync::OnceLock};
|
||||
use std::{fs, str::FromStr, time::Duration};
|
||||
use tokio::process::Command;
|
||||
use tracing::instrument;
|
||||
use url::Url;
|
||||
@@ -31,14 +29,12 @@ use utils::{
|
||||
|
||||
pub struct StorageController {
|
||||
env: LocalEnv,
|
||||
listen: String,
|
||||
private_key: Option<Vec<u8>>,
|
||||
public_key: Option<String>,
|
||||
postgres_port: u16,
|
||||
client: reqwest::Client,
|
||||
config: NeonStorageControllerConf,
|
||||
|
||||
// The listen addresses is learned when starting the storage controller,
|
||||
// hence the use of OnceLock to init it at the right time.
|
||||
listen: OnceLock<SocketAddr>,
|
||||
}
|
||||
|
||||
const COMMAND: &str = "storage_controller";
|
||||
@@ -47,36 +43,6 @@ const STORAGE_CONTROLLER_POSTGRES_VERSION: u32 = 16;
|
||||
|
||||
const DB_NAME: &str = "storage_controller";
|
||||
|
||||
pub struct NeonStorageControllerStartArgs {
|
||||
pub instance_id: u8,
|
||||
pub base_port: Option<u16>,
|
||||
pub start_timeout: humantime::Duration,
|
||||
}
|
||||
|
||||
impl NeonStorageControllerStartArgs {
|
||||
pub fn with_default_instance_id(start_timeout: humantime::Duration) -> Self {
|
||||
Self {
|
||||
instance_id: 1,
|
||||
base_port: None,
|
||||
start_timeout,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
pub struct NeonStorageControllerStopArgs {
|
||||
pub instance_id: u8,
|
||||
pub immediate: bool,
|
||||
}
|
||||
|
||||
impl NeonStorageControllerStopArgs {
|
||||
pub fn with_default_instance_id(immediate: bool) -> Self {
|
||||
Self {
|
||||
instance_id: 1,
|
||||
immediate,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Serialize, Deserialize)]
|
||||
pub struct AttachHookRequest {
|
||||
pub tenant_shard_id: TenantShardId,
|
||||
@@ -101,6 +67,23 @@ pub struct InspectResponse {
|
||||
|
||||
impl StorageController {
|
||||
pub fn from_env(env: &LocalEnv) -> Self {
|
||||
// Makes no sense to construct this if pageservers aren't going to use it: assume
|
||||
// pageservers have control plane API set
|
||||
let listen_url = env.control_plane_api.clone().unwrap();
|
||||
|
||||
let listen = format!(
|
||||
"{}:{}",
|
||||
listen_url.host_str().unwrap(),
|
||||
listen_url.port().unwrap()
|
||||
);
|
||||
|
||||
// Convention: NeonEnv in python tests reserves the next port after the control_plane_api
|
||||
// port, for use by our captive postgres.
|
||||
let postgres_port = listen_url
|
||||
.port()
|
||||
.expect("Control plane API setting should always have a port")
|
||||
+ 1;
|
||||
|
||||
// Assume all pageservers have symmetric auth configuration: this service
|
||||
// expects to use one JWT token to talk to all of them.
|
||||
let ps_conf = env
|
||||
@@ -143,28 +126,20 @@ impl StorageController {
|
||||
|
||||
Self {
|
||||
env: env.clone(),
|
||||
listen,
|
||||
private_key,
|
||||
public_key,
|
||||
postgres_port,
|
||||
client: reqwest::ClientBuilder::new()
|
||||
.build()
|
||||
.expect("Failed to construct http client"),
|
||||
config: env.storage_controller.clone(),
|
||||
listen: OnceLock::default(),
|
||||
}
|
||||
}
|
||||
|
||||
fn storage_controller_instance_dir(&self, instance_id: u8) -> PathBuf {
|
||||
self.env
|
||||
.base_data_dir
|
||||
.join(format!("storage_controller_{}", instance_id))
|
||||
}
|
||||
|
||||
fn pid_file(&self, instance_id: u8) -> Utf8PathBuf {
|
||||
Utf8PathBuf::from_path_buf(
|
||||
self.storage_controller_instance_dir(instance_id)
|
||||
.join("storage_controller.pid"),
|
||||
)
|
||||
.expect("non-Unicode path")
|
||||
fn pid_file(&self) -> Utf8PathBuf {
|
||||
Utf8PathBuf::from_path_buf(self.env.base_data_dir.join("storage_controller.pid"))
|
||||
.expect("non-Unicode path")
|
||||
}
|
||||
|
||||
/// PIDFile for the postgres instance used to store storage controller state
|
||||
@@ -209,9 +184,9 @@ impl StorageController {
|
||||
}
|
||||
|
||||
/// Readiness check for our postgres process
|
||||
async fn pg_isready(&self, pg_bin_dir: &Utf8Path, postgres_port: u16) -> anyhow::Result<bool> {
|
||||
async fn pg_isready(&self, pg_bin_dir: &Utf8Path) -> anyhow::Result<bool> {
|
||||
let bin_path = pg_bin_dir.join("pg_isready");
|
||||
let args = ["-h", "localhost", "-p", &format!("{}", postgres_port)];
|
||||
let args = ["-h", "localhost", "-p", &format!("{}", self.postgres_port)];
|
||||
let exitcode = Command::new(bin_path).args(args).spawn()?.wait().await?;
|
||||
|
||||
Ok(exitcode.success())
|
||||
@@ -224,8 +199,8 @@ impl StorageController {
|
||||
/// who just want to run `cargo neon_local` without knowing about diesel.
|
||||
///
|
||||
/// Returns the database url
|
||||
pub async fn setup_database(&self, postgres_port: u16) -> anyhow::Result<String> {
|
||||
let database_url = format!("postgresql://localhost:{}/{DB_NAME}", postgres_port);
|
||||
pub async fn setup_database(&self) -> anyhow::Result<String> {
|
||||
let database_url = format!("postgresql://localhost:{}/{DB_NAME}", self.postgres_port);
|
||||
|
||||
let pg_bin_dir = self.get_pg_bin_dir().await?;
|
||||
let createdb_path = pg_bin_dir.join("createdb");
|
||||
@@ -234,7 +209,7 @@ impl StorageController {
|
||||
"-h",
|
||||
"localhost",
|
||||
"-p",
|
||||
&format!("{}", postgres_port),
|
||||
&format!("{}", self.postgres_port),
|
||||
DB_NAME,
|
||||
])
|
||||
.output()
|
||||
@@ -255,14 +230,13 @@ impl StorageController {
|
||||
|
||||
pub async fn connect_to_database(
|
||||
&self,
|
||||
postgres_port: u16,
|
||||
) -> anyhow::Result<(
|
||||
tokio_postgres::Client,
|
||||
tokio_postgres::Connection<tokio_postgres::Socket, tokio_postgres::tls::NoTlsStream>,
|
||||
)> {
|
||||
tokio_postgres::Config::new()
|
||||
.host("localhost")
|
||||
.port(postgres_port)
|
||||
.port(self.postgres_port)
|
||||
// The user is the ambient operating system user name.
|
||||
// That is an impurity which we want to fix in => TODO https://github.com/neondatabase/neon/issues/8400
|
||||
//
|
||||
@@ -278,115 +252,72 @@ impl StorageController {
|
||||
.map_err(anyhow::Error::new)
|
||||
}
|
||||
|
||||
pub async fn start(&self, start_args: NeonStorageControllerStartArgs) -> anyhow::Result<()> {
|
||||
let instance_dir = self.storage_controller_instance_dir(start_args.instance_id);
|
||||
if let Err(err) = tokio::fs::create_dir(&instance_dir).await {
|
||||
if err.kind() != std::io::ErrorKind::AlreadyExists {
|
||||
panic!("Failed to create instance dir {instance_dir:?}");
|
||||
}
|
||||
}
|
||||
pub async fn start(&self, retry_timeout: &Duration) -> anyhow::Result<()> {
|
||||
// Start a vanilla Postgres process used by the storage controller for persistence.
|
||||
let pg_data_path = Utf8PathBuf::from_path_buf(self.env.base_data_dir.clone())
|
||||
.unwrap()
|
||||
.join("storage_controller_db");
|
||||
let pg_bin_dir = self.get_pg_bin_dir().await?;
|
||||
let pg_lib_dir = self.get_pg_lib_dir().await?;
|
||||
let pg_log_path = pg_data_path.join("postgres.log");
|
||||
|
||||
let (listen, postgres_port) = {
|
||||
if let Some(base_port) = start_args.base_port {
|
||||
(
|
||||
format!("127.0.0.1:{base_port}"),
|
||||
self.config
|
||||
.database_url
|
||||
.expect("--base-port requires NeonStorageControllerConf::database_url")
|
||||
.port(),
|
||||
)
|
||||
} else {
|
||||
let listen_url = self.env.control_plane_api.clone().unwrap();
|
||||
|
||||
let listen = format!(
|
||||
"{}:{}",
|
||||
listen_url.host_str().unwrap(),
|
||||
listen_url.port().unwrap()
|
||||
);
|
||||
|
||||
(listen, listen_url.port().unwrap() + 1)
|
||||
if !tokio::fs::try_exists(&pg_data_path).await? {
|
||||
// Initialize empty database
|
||||
let initdb_path = pg_bin_dir.join("initdb");
|
||||
let mut child = Command::new(&initdb_path)
|
||||
.envs(vec![
|
||||
("LD_LIBRARY_PATH".to_owned(), pg_lib_dir.to_string()),
|
||||
("DYLD_LIBRARY_PATH".to_owned(), pg_lib_dir.to_string()),
|
||||
])
|
||||
.args(["-D", pg_data_path.as_ref()])
|
||||
.spawn()
|
||||
.expect("Failed to spawn initdb");
|
||||
let status = child.wait().await?;
|
||||
if !status.success() {
|
||||
anyhow::bail!("initdb failed with status {status}");
|
||||
}
|
||||
};
|
||||
|
||||
let socket_addr = listen
|
||||
.parse()
|
||||
.expect("listen address is a valid socket address");
|
||||
self.listen
|
||||
.set(socket_addr)
|
||||
.expect("StorageController::listen is only set here");
|
||||
// Write a minimal config file:
|
||||
// - Specify the port, since this is chosen dynamically
|
||||
// - Switch off fsync, since we're running on lightweight test environments and when e.g. scale testing
|
||||
// the storage controller we don't want a slow local disk to interfere with that.
|
||||
//
|
||||
// NB: it's important that we rewrite this file on each start command so we propagate changes
|
||||
// from `LocalEnv`'s config file (`.neon/config`).
|
||||
tokio::fs::write(
|
||||
&pg_data_path.join("postgresql.conf"),
|
||||
format!("port = {}\nfsync=off\n", self.postgres_port),
|
||||
)
|
||||
.await?;
|
||||
|
||||
// Do we remove the pid file on stop?
|
||||
let pg_started = self.is_postgres_running().await?;
|
||||
let pg_lib_dir = self.get_pg_lib_dir().await?;
|
||||
println!("Starting storage controller database...");
|
||||
let db_start_args = [
|
||||
"-w",
|
||||
"-D",
|
||||
pg_data_path.as_ref(),
|
||||
"-l",
|
||||
pg_log_path.as_ref(),
|
||||
"start",
|
||||
];
|
||||
|
||||
if !pg_started {
|
||||
// Start a vanilla Postgres process used by the storage controller for persistence.
|
||||
let pg_data_path = Utf8PathBuf::from_path_buf(self.env.base_data_dir.clone())
|
||||
.unwrap()
|
||||
.join("storage_controller_db");
|
||||
let pg_bin_dir = self.get_pg_bin_dir().await?;
|
||||
let pg_log_path = pg_data_path.join("postgres.log");
|
||||
background_process::start_process(
|
||||
"storage_controller_db",
|
||||
&self.env.base_data_dir,
|
||||
pg_bin_dir.join("pg_ctl").as_std_path(),
|
||||
db_start_args,
|
||||
vec![
|
||||
("LD_LIBRARY_PATH".to_owned(), pg_lib_dir.to_string()),
|
||||
("DYLD_LIBRARY_PATH".to_owned(), pg_lib_dir.to_string()),
|
||||
],
|
||||
background_process::InitialPidFile::Create(self.postgres_pid_file()),
|
||||
retry_timeout,
|
||||
|| self.pg_isready(&pg_bin_dir),
|
||||
)
|
||||
.await?;
|
||||
|
||||
if !tokio::fs::try_exists(&pg_data_path).await? {
|
||||
// Initialize empty database
|
||||
let initdb_path = pg_bin_dir.join("initdb");
|
||||
let mut child = Command::new(&initdb_path)
|
||||
.envs(vec![
|
||||
("LD_LIBRARY_PATH".to_owned(), pg_lib_dir.to_string()),
|
||||
("DYLD_LIBRARY_PATH".to_owned(), pg_lib_dir.to_string()),
|
||||
])
|
||||
.args(["-D", pg_data_path.as_ref()])
|
||||
.spawn()
|
||||
.expect("Failed to spawn initdb");
|
||||
let status = child.wait().await?;
|
||||
if !status.success() {
|
||||
anyhow::bail!("initdb failed with status {status}");
|
||||
}
|
||||
};
|
||||
|
||||
// Write a minimal config file:
|
||||
// - Specify the port, since this is chosen dynamically
|
||||
// - Switch off fsync, since we're running on lightweight test environments and when e.g. scale testing
|
||||
// the storage controller we don't want a slow local disk to interfere with that.
|
||||
//
|
||||
// NB: it's important that we rewrite this file on each start command so we propagate changes
|
||||
// from `LocalEnv`'s config file (`.neon/config`).
|
||||
tokio::fs::write(
|
||||
&pg_data_path.join("postgresql.conf"),
|
||||
format!("port = {}\nfsync=off\n", postgres_port),
|
||||
)
|
||||
.await?;
|
||||
|
||||
println!("Starting storage controller database...");
|
||||
let db_start_args = [
|
||||
"-w",
|
||||
"-D",
|
||||
pg_data_path.as_ref(),
|
||||
"-l",
|
||||
pg_log_path.as_ref(),
|
||||
"start",
|
||||
];
|
||||
|
||||
background_process::start_process(
|
||||
"storage_controller_db",
|
||||
&self.env.base_data_dir,
|
||||
pg_bin_dir.join("pg_ctl").as_std_path(),
|
||||
db_start_args,
|
||||
vec![
|
||||
("LD_LIBRARY_PATH".to_owned(), pg_lib_dir.to_string()),
|
||||
("DYLD_LIBRARY_PATH".to_owned(), pg_lib_dir.to_string()),
|
||||
],
|
||||
background_process::InitialPidFile::Create(self.postgres_pid_file()),
|
||||
&start_args.start_timeout,
|
||||
|| self.pg_isready(&pg_bin_dir, postgres_port),
|
||||
)
|
||||
.await?;
|
||||
|
||||
// Run migrations on every startup, in case something changed.
|
||||
self.setup_database(postgres_port).await?;
|
||||
}
|
||||
|
||||
let database_url = format!("postgresql://localhost:{}/{DB_NAME}", postgres_port);
|
||||
// Run migrations on every startup, in case something changed.
|
||||
let database_url = self.setup_database().await?;
|
||||
|
||||
// We support running a startup SQL script to fiddle with the database before we launch storcon.
|
||||
// This is used by the test suite.
|
||||
@@ -408,7 +339,7 @@ impl StorageController {
|
||||
}
|
||||
}
|
||||
};
|
||||
let (mut client, conn) = self.connect_to_database(postgres_port).await?;
|
||||
let (mut client, conn) = self.connect_to_database().await?;
|
||||
let conn = tokio::spawn(conn);
|
||||
let tx = client.build_transaction();
|
||||
let tx = tx.start().await?;
|
||||
@@ -417,20 +348,9 @@ impl StorageController {
|
||||
drop(client);
|
||||
conn.await??;
|
||||
|
||||
let listen = self
|
||||
.listen
|
||||
.get()
|
||||
.expect("cell is set earlier in this function");
|
||||
let address_for_peers = Uri::builder()
|
||||
.scheme("http")
|
||||
.authority(format!("{}:{}", listen.ip(), listen.port()))
|
||||
.path_and_query("")
|
||||
.build()
|
||||
.unwrap();
|
||||
|
||||
let mut args = vec![
|
||||
"-l",
|
||||
&listen.to_string(),
|
||||
&self.listen,
|
||||
"--dev",
|
||||
"--database-url",
|
||||
&database_url,
|
||||
@@ -438,17 +358,10 @@ impl StorageController {
|
||||
&humantime::Duration::from(self.config.max_offline).to_string(),
|
||||
"--max-warming-up-interval",
|
||||
&humantime::Duration::from(self.config.max_warming_up).to_string(),
|
||||
"--address-for-peers",
|
||||
&address_for_peers.to_string(),
|
||||
]
|
||||
.into_iter()
|
||||
.map(|s| s.to_string())
|
||||
.collect::<Vec<_>>();
|
||||
|
||||
if self.config.start_as_candidate {
|
||||
args.push("--start-as-candidate".to_string());
|
||||
}
|
||||
|
||||
if let Some(private_key) = &self.private_key {
|
||||
let claims = Claims::new(None, Scope::PageServerApi);
|
||||
let jwt_token =
|
||||
@@ -481,15 +394,15 @@ impl StorageController {
|
||||
|
||||
background_process::start_process(
|
||||
COMMAND,
|
||||
&instance_dir,
|
||||
&self.env.base_data_dir,
|
||||
&self.env.storage_controller_bin(),
|
||||
args,
|
||||
vec![
|
||||
("LD_LIBRARY_PATH".to_owned(), pg_lib_dir.to_string()),
|
||||
("DYLD_LIBRARY_PATH".to_owned(), pg_lib_dir.to_string()),
|
||||
],
|
||||
background_process::InitialPidFile::Create(self.pid_file(start_args.instance_id)),
|
||||
&start_args.start_timeout,
|
||||
background_process::InitialPidFile::Create(self.pid_file()),
|
||||
retry_timeout,
|
||||
|| async {
|
||||
match self.ready().await {
|
||||
Ok(_) => Ok(true),
|
||||
@@ -502,35 +415,8 @@ impl StorageController {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub async fn stop(&self, stop_args: NeonStorageControllerStopArgs) -> anyhow::Result<()> {
|
||||
background_process::stop_process(
|
||||
stop_args.immediate,
|
||||
COMMAND,
|
||||
&self.pid_file(stop_args.instance_id),
|
||||
)?;
|
||||
|
||||
let storcon_instances = self.env.storage_controller_instances().await?;
|
||||
for (instance_id, instanced_dir_path) in storcon_instances {
|
||||
if instance_id == stop_args.instance_id {
|
||||
continue;
|
||||
}
|
||||
|
||||
let pid_file = instanced_dir_path.join("storage_controller.pid");
|
||||
let pid = tokio::fs::read_to_string(&pid_file)
|
||||
.await
|
||||
.map_err(|err| {
|
||||
anyhow::anyhow!("Failed to read storcon pid file at {pid_file:?}: {err}")
|
||||
})?
|
||||
.parse::<i32>()
|
||||
.expect("pid is valid i32");
|
||||
|
||||
let other_proc_alive = !background_process::process_has_stopped(Pid::from_raw(pid))?;
|
||||
if other_proc_alive {
|
||||
// There is another storage controller instance running, so we return
|
||||
// and leave the database running.
|
||||
return Ok(());
|
||||
}
|
||||
}
|
||||
pub async fn stop(&self, immediate: bool) -> anyhow::Result<()> {
|
||||
background_process::stop_process(immediate, COMMAND, &self.pid_file())?;
|
||||
|
||||
let pg_data_path = self.env.base_data_dir.join("storage_controller_db");
|
||||
let pg_bin_dir = self.get_pg_bin_dir().await?;
|
||||
@@ -543,51 +429,27 @@ impl StorageController {
|
||||
.wait()
|
||||
.await?;
|
||||
if !stop_status.success() {
|
||||
match self.is_postgres_running().await {
|
||||
Ok(false) => {
|
||||
println!("Storage controller database is already stopped");
|
||||
return Ok(());
|
||||
}
|
||||
Ok(true) => {
|
||||
anyhow::bail!("Failed to stop storage controller database");
|
||||
}
|
||||
Err(err) => {
|
||||
anyhow::bail!("Failed to stop storage controller database: {err}");
|
||||
}
|
||||
let pg_status_args = ["-D", &pg_data_path.to_string_lossy(), "status"];
|
||||
let status_exitcode = Command::new(pg_bin_dir.join("pg_ctl"))
|
||||
.args(pg_status_args)
|
||||
.spawn()?
|
||||
.wait()
|
||||
.await?;
|
||||
|
||||
// pg_ctl status returns this exit code if postgres is not running: in this case it is
|
||||
// fine that stop failed. Otherwise it is an error that stop failed.
|
||||
const PG_STATUS_NOT_RUNNING: i32 = 3;
|
||||
if Some(PG_STATUS_NOT_RUNNING) == status_exitcode.code() {
|
||||
println!("Storage controller database is already stopped");
|
||||
return Ok(());
|
||||
} else {
|
||||
anyhow::bail!("Failed to stop storage controller database: {stop_status}")
|
||||
}
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
async fn is_postgres_running(&self) -> anyhow::Result<bool> {
|
||||
let pg_data_path = self.env.base_data_dir.join("storage_controller_db");
|
||||
let pg_bin_dir = self.get_pg_bin_dir().await?;
|
||||
|
||||
let pg_status_args = ["-D", &pg_data_path.to_string_lossy(), "status"];
|
||||
let status_exitcode = Command::new(pg_bin_dir.join("pg_ctl"))
|
||||
.args(pg_status_args)
|
||||
.spawn()?
|
||||
.wait()
|
||||
.await?;
|
||||
|
||||
// pg_ctl status returns this exit code if postgres is not running: in this case it is
|
||||
// fine that stop failed. Otherwise it is an error that stop failed.
|
||||
const PG_STATUS_NOT_RUNNING: i32 = 3;
|
||||
const PG_NO_DATA_DIR: i32 = 4;
|
||||
const PG_STATUS_RUNNING: i32 = 0;
|
||||
match status_exitcode.code() {
|
||||
Some(PG_STATUS_NOT_RUNNING) => Ok(false),
|
||||
Some(PG_NO_DATA_DIR) => Ok(false),
|
||||
Some(PG_STATUS_RUNNING) => Ok(true),
|
||||
Some(code) => Err(anyhow::anyhow!(
|
||||
"pg_ctl status returned unexpected status code: {:?}",
|
||||
code
|
||||
)),
|
||||
None => Err(anyhow::anyhow!("pg_ctl status returned no status code")),
|
||||
}
|
||||
}
|
||||
|
||||
fn get_claims_for_path(path: &str) -> anyhow::Result<Option<Claims>> {
|
||||
let category = match path.find('/') {
|
||||
Some(idx) => &path[..idx],
|
||||
@@ -613,31 +475,15 @@ impl StorageController {
|
||||
RQ: Serialize + Sized,
|
||||
RS: DeserializeOwned + Sized,
|
||||
{
|
||||
// In the special case of the `storage_controller start` subcommand, we wish
|
||||
// to use the API endpoint of the newly started storage controller in order
|
||||
// to pass the readiness check. In this scenario [`Self::listen`] will be set
|
||||
// (see [`Self::start`]).
|
||||
//
|
||||
// Otherwise, we infer the storage controller api endpoint from the configured
|
||||
// control plane API.
|
||||
let url = if let Some(socket_addr) = self.listen.get() {
|
||||
Url::from_str(&format!(
|
||||
"http://{}:{}/{path}",
|
||||
socket_addr.ip().to_canonical(),
|
||||
socket_addr.port()
|
||||
))
|
||||
.unwrap()
|
||||
} else {
|
||||
// The configured URL has the /upcall path prefix for pageservers to use: we will strip that out
|
||||
// for general purpose API access.
|
||||
let listen_url = self.env.control_plane_api.clone().unwrap();
|
||||
Url::from_str(&format!(
|
||||
"http://{}:{}/{path}",
|
||||
listen_url.host_str().unwrap(),
|
||||
listen_url.port().unwrap()
|
||||
))
|
||||
.unwrap()
|
||||
};
|
||||
// The configured URL has the /upcall path prefix for pageservers to use: we will strip that out
|
||||
// for general purpose API access.
|
||||
let listen_url = self.env.control_plane_api.clone().unwrap();
|
||||
let url = Url::from_str(&format!(
|
||||
"http://{}:{}/{path}",
|
||||
listen_url.host_str().unwrap(),
|
||||
listen_url.port().unwrap()
|
||||
))
|
||||
.unwrap();
|
||||
|
||||
let mut builder = self.client.request(method, url);
|
||||
if let Some(body) = body {
|
||||
|
||||
@@ -1,265 +0,0 @@
# Physical Replication

This RFC is a bit special in that we implemented physical replication a long time ago. However, we never properly wrote down all the decisions and assumptions, and in the last months, as more users have started to use the feature, numerous issues have surfaced.

This RFC documents the design decisions that have been made.

## Summary

PostgreSQL has a feature called streaming replication, where a replica streams WAL from the primary and continuously applies it. It is also known as "physical replication", to distinguish it from logical replication. In PostgreSQL, a replica is initialized by taking a physical backup of the primary. In Neon, the replica is initialized from a slim "base backup" from the pageserver, just like a primary, and the primary and the replicas connect to the same pageserver, sharing the storage.
There are two kinds of read-only replicas in Neon:
- replicas that follow the primary, and
- "static" replicas that are pinned at a particular LSN.

A static replica is useful e.g. for performing time-travel queries and running one-off slow queries without affecting the primary. A replica that follows the primary can be used e.g. to scale out read-only workloads.

## Motivation

Read-only replicas allow offloading read-only queries. It's useful for isolation, if you want to make sure that read-only queries don't affect the primary, and it's also an easy way to provide guaranteed read-only access to an application without having to mess with access controls.

## Non Goals (if relevant)

This RFC is all about WAL-based *physical* replication. Logical replication is a different feature.

Neon also has the capability to launch "static" read-only nodes which do not follow the primary, but are pinned to a particular LSN. They can be used for long-running one-off queries, or for point-in-time queries. They work similarly to read replicas that follow the primary, but some things are simpler: there are no concerns about cache invalidation when the data changes on the primary, or about transactions that are in progress on the primary.

## Impacted components (e.g. pageserver, safekeeper, console, etc)

- Control plane launches the replica
- Replica Postgres instance connects to the safekeepers, to stream the WAL
- The primary does not know about the standby, except for the hot standby feedback
- The primary and replicas all connect to the same pageservers

# Context

Some useful things to know about hot standby and replicas in PostgreSQL.

## PostgreSQL startup sequence

"Running" and "starting up" are a little imprecise as terms. PostgreSQL replica startup goes through several stages:

1. First, the process is started up, and various initialization steps are performed, like initializing shared memory. If you try to connect to the server in this stage, you get an error: ERROR: the database system is starting up. This stage happens very quickly.

2. Then the server reads the checkpoint record from the WAL and starts the WAL replay from the checkpoint. This works differently in Neon: we start the WAL replay at the basebackup LSN, not from a checkpoint! If you connect to the server in this state, you get an error: ERROR: the database system is not yet accepting connections. We proceed to the next stage when the WAL replay sees a running-xacts record. Or, in Neon, the "CLOG scanning" mechanism can allow us to move directly to the next stage, with all the caveats listed in this RFC.

3. When the running-xacts information is established, the server starts to accept connections normally.

From PostgreSQL's point of view, the server is already running in stage 2, even though it's not accepting connections yet. Our `compute_ctl` does not consider it running until stage 3. If the transition from stage 2 to 3 doesn't happen fast enough, the control plane will mark the start operation as failed.
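To make the three stages above concrete, here is a minimal, illustrative Rust sketch (not part of this diff, and not how `compute_ctl` is implemented) that probes a starting replica from the outside with `pg_isready`. The exit-code meanings (0 accepting, 1 rejecting e.g. during startup, 2 no response) are standard `pg_isready` behavior; the rest is an assumption for illustration.

```rust
use std::process::Command;

/// Where a starting replica appears to be in the sequence above, as seen from
/// the outside. Purely illustrative.
#[derive(Debug)]
enum StartupStage {
    AcceptingConnections, // stage 3
    StartingUp,           // stage 1 or 2: the server rejects connections
    NotResponding,        // nothing is listening on the port (yet)
}

fn probe(host: &str, port: u16) -> std::io::Result<StartupStage> {
    // pg_isready exit codes: 0 = accepting, 1 = rejecting (e.g. still in
    // startup/recovery), 2 = no response, 3 = invalid parameters.
    let status = Command::new("pg_isready")
        .arg("-h")
        .arg(host)
        .arg("-p")
        .arg(port.to_string())
        .status()?;
    Ok(match status.code() {
        Some(0) => StartupStage::AcceptingConnections,
        Some(1) => StartupStage::StartingUp,
        _ => StartupStage::NotResponding,
    })
}

fn main() -> std::io::Result<()> {
    println!("{:?}", probe("localhost", 5432)?);
    Ok(())
}
```

Note that an external probe cannot distinguish stage 1 from stage 2; only the error message returned to a connecting client differs.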
## Decisions, Issues

### Cache invalidation in replica

When a read replica follows the primary in PostgreSQL, it needs to stream all the WAL from the primary and apply all the records, to keep the local copy of the data consistent with the primary. In Neon, the replica can fetch updated page versions from the pageserver, so it's not necessary to apply all the WAL. However, it needs to ensure that any pages that are currently in the Postgres buffer cache, or in the Local File Cache, are either updated or thrown away, so that the next read of the page fetches the latest version.

We choose to apply the WAL records for pages that are already in the buffer cache, and skip records for other pages. Somewhat arbitrarily, we also apply records affecting catalog relations, fetching the old page version from the pageserver first if necessary. See the `neon_redo_read_buffer_filter()` function.

The replica wouldn't necessarily need to see all the WAL records, only the records that apply to cached pages. For simplicity, we do stream all the WAL to the replica, and the replica simply ignores WAL records that require no action.
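The policy just described fits in a few lines. The sketch below is illustrative Rust pseudocode, not the actual `neon_redo_read_buffer_filter()` (which is C code in the Neon Postgres extension); the inputs `page_is_cached` and `is_catalog_relation` are hypothetical simplifications of what that code actually inspects.

```rust
// Illustrative sketch of the redo filtering policy described above.
enum RedoAction {
    Apply,          // page is cached: replay the record against it
    FetchThenApply, // catalog page not in cache: fetch the old version, then replay
    Skip,           // drop the record; the next read fetches the latest page version
}

fn redo_filter(page_is_cached: bool, is_catalog_relation: bool) -> RedoAction {
    if page_is_cached {
        RedoAction::Apply
    } else if is_catalog_relation {
        RedoAction::FetchThenApply
    } else {
        RedoAction::Skip
    }
}

fn main() {
    // A record for an uncached, non-catalog page is simply ignored.
    assert!(matches!(redo_filter(false, false), RedoAction::Skip));
}
```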
Like in PostgreSQL, the read replica maintains a "replay LSN", which is the LSN up to which the replica has received and replayed the WAL. The replica can lag behind the primary if it cannot quite keep up, if a long-running query conflicts with changes that are about to be applied, or even intentionally, if the user wishes to see delayed data (see recovery_min_apply_delay). It's important that the replica sees a consistent view of the whole cluster at the replay LSN when it's lagging behind.

In Neon, the replica connects to a safekeeper to get the WAL stream. That means that the safekeepers must be able to regurgitate the original WAL as far back as the replay LSN of any running read replica. (A static read-only node that does not follow the primary does not require a WAL stream, however.) The primary does not need to be running, and when it is, the replicas don't incur any extra overhead on the primary (see hot standby feedback, though).
### In-progress transactions

In PostgreSQL, when a hot standby server starts up, it cannot immediately open up for queries (see [PostgreSQL startup sequence]). It first needs to establish a complete list of the in-progress transactions, including subtransactions, that are running on the primary at the current replay LSN. Normally that happens quickly, when the replica sees a "running-xacts" WAL record, because the primary writes a running-xacts WAL record at every checkpoint, and in PostgreSQL the replica always starts the WAL replay from a checkpoint REDO point. (A shutdown checkpoint WAL record also implies that all the non-prepared transactions have ended.) If there are a lot of subtransactions in progress, however, the standby might need to wait for old transactions to complete before it can open up for queries.

In Neon that problem is worse: a replica can start at any LSN, so there's no guarantee that it will see a running-xacts record any time soon. In particular, if the primary is not running when the replica is started, it might never see a running-xacts record.

To make things worse, we initially missed this issue and always started accepting queries at replica startup, even if the replica didn't have the transaction information. That could lead to incorrect query results and data corruption later. However, as we fixed that, we introduced a new problem compared to what we had before: previously the replica would always start up, but after fixing that bug, it might not. In a superficial way, the old behavior was better (but could lead to serious issues later!). That made fixing the bug very hard, because as we fixed it, we made things (superficially) worse for others.

See https://github.com/neondatabase/neon/pull/7288, which fixed the bug, and the follow-up PRs https://github.com/neondatabase/neon/pull/8323 and https://github.com/neondatabase/neon/pull/8484, which try to claw back the cases that started to cause trouble as a result of the fix. As of this writing, there are still cases where a replica might not immediately start up, causing the control plane operation to fail; the remaining issues are tracked in https://github.com/neondatabase/neon/issues/6211.

One long-term fix for this is to switch to using so-called CSN snapshots in the read replica. That would make it unnecessary to have the full in-progress transaction list in the replica at startup time. See https://commitfest.postgresql.org/48/4912/ for a work-in-progress patch to implement that upstream.

Another thing we could do is to teach the control plane about the distinction between "starting up" and "running, but hasn't received running-xacts information yet", so that we could keep the replica waiting longer in that stage, and also give any client connections the same `ERROR: the database system is not yet accepting connections` error that you get in standalone PostgreSQL in that state.

### Recovery conflicts and Hot standby feedback

It's possible that a tuple version is vacuumed away on the primary even though it is still needed by a running transaction in the replica. This is called a "recovery conflict", and PostgreSQL provides various options for dealing with it. By default, the WAL replay will wait up to 30 s for the conflicting query to finish. After that, it will kill the running query so that the WAL replay can proceed.

Another way to avoid the situation is to enable the [`hot_standby_feedback`](https://www.postgresql.org/docs/current/runtime-config-replication.html#GUC-HOT-STANDBY-FEEDBACK) option. When it is enabled, the primary will refrain from vacuuming tuples that are still needed by the replica. That means potentially bloating the primary, which violates the usual rule that read replicas don't affect the operations on the primary, which is why it's off by default. We leave it to users to decide if they want to turn it on, same as PostgreSQL.

Neon supports `hot_standby_feedback` by passing the feedback messages from the replica to the safekeepers, and from the safekeepers to the primary.
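As a rough illustration of what the feedback changes on the primary, here is a simplified Rust sketch. Transaction IDs are flattened to `u64`, wraparound is ignored, and the function name is hypothetical; it does not mirror actual PostgreSQL data structures.

```rust
// Simplified model: with hot_standby_feedback enabled, the standby's reported
// xmin also holds back the primary's vacuum cutoff.
fn vacuum_cutoff_xid(local_oldest_xmin: u64, standby_feedback_xmin: Option<u64>) -> u64 {
    match standby_feedback_xmin {
        // Tuple versions still visible to some standby snapshot are protected,
        // so the cutoff can only move further back.
        Some(feedback_xmin) => local_oldest_xmin.min(feedback_xmin),
        // Without feedback, only transactions on the primary hold back vacuum.
        None => local_oldest_xmin,
    }
}

fn main() {
    // A standby still reading as of xid 900 keeps the primary from vacuuming
    // tuple versions deleted at or after that point.
    assert_eq!(vacuum_cutoff_xid(1000, Some(900)), 900);
    assert_eq!(vacuum_cutoff_xid(1000, None), 1000);
}
```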
### Relationship of settings between primary and replica

In order to enter hot standby mode, some configuration options need to be set to the same or larger values in the standby compared to the primary. See the [explanation in the PostgreSQL docs](https://www.postgresql.org/docs/current/hot-standby.html#HOT-STANDBY-ADMIN).

In Neon, we have this problem too. To prevent customers from hitting it, the control plane automatically adjusts the settings of a replica so that they match or exceed the primary's settings (see https://github.com/neondatabase/cloud/issues/14903). However, you can still hit the issue if the primary is restarted with larger settings while the replica is running.

### Interaction with Pageserver GC

The read replica can lag behind the primary. If there are recovery conflicts, or the replica cannot keep up for some reason, the lag can in principle grow indefinitely. The replica issues all GetPage requests to the pageservers at its current replay LSN, and needs to see the old page versions.

If the retention period in the pageserver is set to be small, it may have already garbage collected away the old page versions. That will cause read errors in the compute, and can mean that the replica cannot make progress with replication anymore.

There is a mechanism for the replica to pass information about its replay LSN to the pageserver, so that the pageserver refrains from GC'ing data that is still needed by the standby. It's called 'standby_horizon' in the pageserver code; see https://github.com/neondatabase/neon/pull/7368. A separate "lease" mechanism is also in the works, where the replica could hold a lease on the old LSN, preventing the pageserver from advancing the GC horizon past that point. The difference is that the standby_horizon mechanism relies on a feedback message from replica to safekeeper, while the lease API is exposed directly by the pageserver. A static read-only node is not connected to safekeepers, so it cannot use the standby_horizon mechanism.
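A simplified sketch of how these inputs could combine into a GC cutoff is shown below. LSNs are plain `u64`s here, and the function and parameter names are hypothetical rather than the pageserver's actual API; it only illustrates the "minimum of all horizons" idea described above.

```rust
// Sketch: a replica's replay LSN (via standby_horizon feedback) or an LSN
// lease can hold back the GC cutoff that retention alone would allow.
fn effective_gc_cutoff(
    retention_cutoff: u64,        // e.g. last_record_lsn minus the configured retention
    standby_horizon: Option<u64>, // replay LSN reported via safekeeper feedback
    leased_lsns: &[u64],          // LSN leases taken directly against the pageserver
) -> u64 {
    let mut cutoff = retention_cutoff;
    if let Some(horizon) = standby_horizon {
        cutoff = cutoff.min(horizon);
    }
    if let Some(oldest_lease) = leased_lsns.iter().copied().min() {
        cutoff = cutoff.min(oldest_lease);
    }
    // GC may only remove page versions strictly below the returned cutoff.
    cutoff
}

fn main() {
    // A lagging replica at LSN 0x5000 pins GC even if retention alone would
    // allow cleaning everything below 0x8000.
    assert_eq!(effective_gc_cutoff(0x8000, Some(0x5000), &[]), 0x5000);
}
```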
### Synchronous replication

We haven't put any effort into synchronous replication yet.

PostgreSQL provides multiple levels of synchronicity. In the weaker levels, a transaction is not acknowledged as committed to the client on the primary until the WAL has been streamed to a replica or flushed to disk there. Those modes don't make sense in Neon, because the safekeepers handle durability.

`synchronous_commit=remote_apply` mode would make sense. In that mode, the commit is not acknowledged to the client until it has been replayed in the replica. That ensures that after a commit, you can see the commit in the replica too (aka read-your-writes consistency).

@@ -50,6 +50,7 @@ pub mod defaults {
|
||||
DEFAULT_HTTP_LISTEN_ADDR, DEFAULT_HTTP_LISTEN_PORT, DEFAULT_PG_LISTEN_ADDR,
|
||||
DEFAULT_PG_LISTEN_PORT,
|
||||
};
|
||||
use pageserver_api::models::ImageCompressionAlgorithm;
|
||||
pub use storage_broker::DEFAULT_ENDPOINT as BROKER_DEFAULT_ENDPOINT;
|
||||
|
||||
pub const DEFAULT_WAIT_LSN_TIMEOUT: &str = "300 s";
|
||||
@@ -89,7 +90,8 @@ pub mod defaults {
|
||||
|
||||
pub const DEFAULT_MAX_VECTORED_READ_BYTES: usize = 128 * 1024; // 128 KiB
|
||||
|
||||
pub const DEFAULT_IMAGE_COMPRESSION: &str = "zstd(1)";
|
||||
pub const DEFAULT_IMAGE_COMPRESSION: ImageCompressionAlgorithm =
|
||||
ImageCompressionAlgorithm::Disabled;
|
||||
|
||||
pub const DEFAULT_VALIDATE_VECTORED_GET: bool = false;
|
||||
|
||||
@@ -476,7 +478,7 @@ impl PageServerConfigBuilder {
|
||||
max_vectored_read_bytes: Set(MaxVectoredReadBytes(
|
||||
NonZeroUsize::new(DEFAULT_MAX_VECTORED_READ_BYTES).unwrap(),
|
||||
)),
|
||||
image_compression: Set(DEFAULT_IMAGE_COMPRESSION.parse().unwrap()),
|
||||
image_compression: Set(DEFAULT_IMAGE_COMPRESSION),
|
||||
ephemeral_bytes_per_memory_kb: Set(DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB),
|
||||
l0_flush: Set(L0FlushConfig::default()),
|
||||
compact_level0_phase1_value_access: Set(CompactL0Phase1ValueAccess::default()),
|
||||
@@ -1063,7 +1065,7 @@ impl PageServerConf {
|
||||
NonZeroUsize::new(defaults::DEFAULT_MAX_VECTORED_READ_BYTES)
|
||||
.expect("Invalid default constant"),
|
||||
),
|
||||
image_compression: defaults::DEFAULT_IMAGE_COMPRESSION.parse().unwrap(),
|
||||
image_compression: defaults::DEFAULT_IMAGE_COMPRESSION,
|
||||
ephemeral_bytes_per_memory_kb: defaults::DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB,
|
||||
l0_flush: L0FlushConfig::default(),
|
||||
compact_level0_phase1_value_access: CompactL0Phase1ValueAccess::default(),
|
||||
@@ -1303,7 +1305,7 @@ background_task_maximum_delay = '334 s'
|
||||
NonZeroUsize::new(defaults::DEFAULT_MAX_VECTORED_READ_BYTES)
|
||||
.expect("Invalid default constant")
|
||||
),
|
||||
image_compression: defaults::DEFAULT_IMAGE_COMPRESSION.parse().unwrap(),
|
||||
image_compression: defaults::DEFAULT_IMAGE_COMPRESSION,
|
||||
ephemeral_bytes_per_memory_kb: defaults::DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB,
|
||||
l0_flush: L0FlushConfig::default(),
|
||||
compact_level0_phase1_value_access: CompactL0Phase1ValueAccess::default(),
|
||||
@@ -1376,7 +1378,7 @@ background_task_maximum_delay = '334 s'
|
||||
NonZeroUsize::new(defaults::DEFAULT_MAX_VECTORED_READ_BYTES)
|
||||
.expect("Invalid default constant")
|
||||
),
|
||||
image_compression: defaults::DEFAULT_IMAGE_COMPRESSION.parse().unwrap(),
|
||||
image_compression: defaults::DEFAULT_IMAGE_COMPRESSION,
|
||||
ephemeral_bytes_per_memory_kb: defaults::DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB,
|
||||
l0_flush: L0FlushConfig::default(),
|
||||
compact_level0_phase1_value_access: CompactL0Phase1ValueAccess::default(),
|
||||
|
||||
@@ -29,16 +29,16 @@ pub(super) struct HeatMapTenant {
|
||||
#[derive(Serialize, Deserialize)]
|
||||
pub(crate) struct HeatMapTimeline {
|
||||
#[serde_as(as = "DisplayFromStr")]
|
||||
pub(crate) timeline_id: TimelineId,
|
||||
pub(super) timeline_id: TimelineId,
|
||||
|
||||
pub(crate) layers: Vec<HeatMapLayer>,
|
||||
pub(super) layers: Vec<HeatMapLayer>,
|
||||
}
|
||||
|
||||
#[serde_as]
|
||||
#[derive(Serialize, Deserialize)]
|
||||
pub(crate) struct HeatMapLayer {
|
||||
pub(crate) name: LayerName,
|
||||
pub(crate) metadata: LayerFileMetadata,
|
||||
pub(super) name: LayerName,
|
||||
pub(super) metadata: LayerFileMetadata,
|
||||
|
||||
#[serde_as(as = "TimestampSeconds<i64>")]
|
||||
pub(super) access_time: SystemTime,
|
||||
|
||||
@@ -208,8 +208,6 @@ impl SplitDeltaLayerWriter {
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use rand::{RngCore, SeedableRng};
|
||||
|
||||
use crate::{
|
||||
tenant::{
|
||||
harness::{TenantHarness, TIMELINE_ID},
|
||||
@@ -231,10 +229,7 @@ mod tests {
|
||||
}
|
||||
|
||||
fn get_large_img() -> Bytes {
|
||||
let mut rng = rand::rngs::SmallRng::seed_from_u64(42);
|
||||
let mut data = vec![0; 8192];
|
||||
rng.fill_bytes(&mut data);
|
||||
data.into()
|
||||
vec![0; 8192].into()
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
|
||||
@@ -2977,7 +2977,11 @@ impl Timeline {
|
||||
LayerVisibilityHint::Visible => {
|
||||
// Layer is visible to one or more read LSNs: eligible for inclusion in layer map
|
||||
let last_activity_ts = layer.latest_activity();
|
||||
Some((layer.layer_desc(), layer.metadata(), last_activity_ts))
|
||||
Some(HeatMapLayer::new(
|
||||
layer.layer_desc().layer_name(),
|
||||
layer.metadata(),
|
||||
last_activity_ts,
|
||||
))
|
||||
}
|
||||
LayerVisibilityHint::Covered => {
|
||||
// Layer is resident but unlikely to be read: not eligible for inclusion in heatmap.
|
||||
@@ -2986,23 +2990,7 @@ impl Timeline {
|
||||
}
|
||||
});
|
||||
|
||||
let mut layers = resident.collect::<Vec<_>>();
|
||||
|
||||
// Sort layers in order of which to download first. For a large set of layers to download, we
|
||||
// want to prioritize those layers which are most likely to still be resident many minutes
|
||||
// or hours later:
|
||||
// - Download L0s last, because they churn the fastest: L0s on a fast-writing tenant might
|
||||
// only exist for a few minutes before being compacted into L1s.
|
||||
// - For L1 & image layers, download most recent LSNs first: the older the LSN, the sooner
|
||||
// the layer is likely to be covered by an image layer during compaction.
|
||||
layers.sort_by_key(|(desc, _meta, _atime)| {
|
||||
std::cmp::Reverse((!LayerMap::is_l0(&desc.key_range), desc.lsn_range.end))
|
||||
});
|
||||
|
||||
let layers = layers
|
||||
.into_iter()
|
||||
.map(|(desc, meta, atime)| HeatMapLayer::new(desc.layer_name(), meta, atime))
|
||||
.collect();
|
||||
let layers = resident.collect();
|
||||
|
||||
Some(HeatMapTimeline::new(self.timeline_id, layers))
|
||||
}
|
||||
@@ -4528,7 +4516,6 @@ impl DurationRecorder {
|
||||
/// the layer descriptor requires the user to provide the ranges, which should cover all
|
||||
/// keys specified in the `data` field.
|
||||
#[cfg(test)]
|
||||
#[derive(Clone)]
|
||||
pub struct DeltaLayerTestDesc {
|
||||
pub lsn_range: Range<Lsn>,
|
||||
pub key_range: Range<Key>,
|
||||
@@ -4558,13 +4545,6 @@ impl DeltaLayerTestDesc {
|
||||
data,
|
||||
}
|
||||
}
|
||||
|
||||
pub(crate) fn layer_name(&self) -> LayerName {
|
||||
LayerName::Delta(super::storage_layer::DeltaLayerName {
|
||||
key_range: self.key_range.clone(),
|
||||
lsn_range: self.lsn_range.clone(),
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
impl Timeline {
|
||||
@@ -5788,110 +5768,12 @@ fn is_send() {
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use pageserver_api::key::Key;
|
||||
use utils::{id::TimelineId, lsn::Lsn};
|
||||
|
||||
use crate::{
|
||||
repository::Value,
|
||||
tenant::{
|
||||
harness::{test_img, TenantHarness},
|
||||
layer_map::LayerMap,
|
||||
storage_layer::{Layer, LayerName},
|
||||
timeline::{DeltaLayerTestDesc, EvictionError},
|
||||
Timeline,
|
||||
},
|
||||
use crate::tenant::{
|
||||
harness::TenantHarness, storage_layer::Layer, timeline::EvictionError, Timeline,
|
||||
};
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_heatmap_generation() {
|
||||
let harness = TenantHarness::create("heatmap_generation").await.unwrap();
|
||||
|
||||
let covered_delta = DeltaLayerTestDesc::new_with_inferred_key_range(
|
||||
Lsn(0x10)..Lsn(0x20),
|
||||
vec![(
|
||||
Key::from_hex("620000000033333333444444445500000000").unwrap(),
|
||||
Lsn(0x11),
|
||||
Value::Image(test_img("foo")),
|
||||
)],
|
||||
);
|
||||
let visible_delta = DeltaLayerTestDesc::new_with_inferred_key_range(
|
||||
Lsn(0x10)..Lsn(0x20),
|
||||
vec![(
|
||||
Key::from_hex("720000000033333333444444445500000000").unwrap(),
|
||||
Lsn(0x11),
|
||||
Value::Image(test_img("foo")),
|
||||
)],
|
||||
);
|
||||
let l0_delta = DeltaLayerTestDesc::new(
|
||||
Lsn(0x20)..Lsn(0x30),
|
||||
Key::from_hex("000000000000000000000000000000000000").unwrap()
|
||||
..Key::from_hex("FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF").unwrap(),
|
||||
vec![(
|
||||
Key::from_hex("720000000033333333444444445500000000").unwrap(),
|
||||
Lsn(0x25),
|
||||
Value::Image(test_img("foo")),
|
||||
)],
|
||||
);
|
||||
let delta_layers = vec![
|
||||
covered_delta.clone(),
|
||||
visible_delta.clone(),
|
||||
l0_delta.clone(),
|
||||
];
|
||||
|
||||
let image_layer = (
|
||||
Lsn(0x40),
|
||||
vec![(
|
||||
Key::from_hex("620000000033333333444444445500000000").unwrap(),
|
||||
test_img("bar"),
|
||||
)],
|
||||
);
|
||||
let image_layers = vec![image_layer];
|
||||
|
||||
let (tenant, ctx) = harness.load().await;
|
||||
let timeline = tenant
|
||||
.create_test_timeline_with_layers(
|
||||
TimelineId::generate(),
|
||||
Lsn(0x10),
|
||||
14,
|
||||
&ctx,
|
||||
delta_layers,
|
||||
image_layers,
|
||||
Lsn(0x100),
|
||||
)
|
||||
.await
|
||||
.unwrap();
|
||||
|
||||
// Layer visibility is an input to heatmap generation, so refresh it first
|
||||
timeline.update_layer_visibility().await.unwrap();
|
||||
|
||||
let heatmap = timeline
|
||||
.generate_heatmap()
|
||||
.await
|
||||
.expect("Infallible while timeline is not shut down");
|
||||
|
||||
assert_eq!(heatmap.timeline_id, timeline.timeline_id);
|
||||
|
||||
// L0 should come last
|
||||
assert_eq!(heatmap.layers.last().unwrap().name, l0_delta.layer_name());
|
||||
|
||||
let mut last_lsn = Lsn::MAX;
|
||||
for layer in heatmap.layers {
|
||||
// Covered layer should be omitted
|
||||
assert!(layer.name != covered_delta.layer_name());
|
||||
|
||||
let layer_lsn = match &layer.name {
|
||||
LayerName::Delta(d) => d.lsn_range.end,
|
||||
LayerName::Image(i) => i.lsn,
|
||||
};
|
||||
|
||||
// Apart from L0s, newest Layers should come first
|
||||
if !LayerMap::is_l0(layer.name.key_range()) {
|
||||
assert!(layer_lsn <= last_lsn);
|
||||
last_lsn = layer_lsn;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn two_layer_eviction_attempts_at_the_same_time() {
|
||||
let harness = TenantHarness::create("two_layer_eviction_attempts_at_the_same_time")
|
||||
|
||||
@@ -41,8 +41,6 @@
|
||||
|
||||
#include "hll.h"
|
||||
|
||||
#define CriticalAssert(cond) do if (!(cond)) elog(PANIC, "Assertion %s failed at %s:%d: ", #cond, __FILE__, __LINE__); while (0)
|
||||
|
||||
/*
|
||||
* Local file cache is used to temporarily store relation pages in the local file system.
|
||||
* All blocks of all relations are stored inside one file and addressed using shared hash map.
|
||||
@@ -53,43 +51,19 @@
|
||||
*
|
||||
* Cache is always reconstructed at node startup, so we do not need to save mapping somewhere and worry about
|
||||
* its consistency.
|
||||
|
||||
*
|
||||
* ## Holes
|
||||
*
|
||||
* The LFC can be resized on the fly, up to a maximum size that's determined
|
||||
* at server startup (neon.max_file_cache_size). After server startup, we
|
||||
* expand the underlying file when needed, until it reaches the soft limit
|
||||
* (neon.file_cache_size_limit). If the soft limit is later reduced, we shrink
|
||||
* the LFC by punching holes in the underlying file with a
|
||||
* fallocate(FALLOC_FL_PUNCH_HOLE) call. The nominal size of the file doesn't
|
||||
* shrink, but the disk space it uses does.
|
||||
*
|
||||
* Each hole is tracked by a dummy FileCacheEntry, which are kept in the
|
||||
* 'holes' linked list. They are entered into the chunk hash table, with a
|
||||
* special key where the blockNumber is used to store the 'offset' of the
|
||||
* hole, and all other fields are zero. Holes are never looked up in the hash
|
||||
* table, we only enter them there to have a FileCacheEntry that we can keep
|
||||
* in the linked list. If the soft limit is raised again, we reuse the holes
|
||||
* before extending the nominal size of the file.
|
||||
*/
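The bookkeeping described in the comment above can be summarized with a small model. The sketch below is illustrative Rust with simplified types, not the C implementation: in the real code each hole is a dummy FileCacheEntry kept on the 'holes' dlist, and freeing a chunk issues a fallocate(FALLOC_FL_PUNCH_HOLE) call.

```rust
// Illustrative model of the LFC hole bookkeeping: freeing a chunk punches a
// hole at its offset and remembers it; allocating reuses remembered holes
// before growing the nominal file size.
struct LfcFile {
    len_chunks: u32, // nominal file size, in chunks; never shrinks
    holes: Vec<u32>, // offsets (in chunks) of punched holes
}

impl LfcFile {
    fn free_chunk(&mut self, offset: u32) {
        // The real code calls fallocate(FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE)
        // here, so disk space is returned while the file length stays the same.
        self.holes.push(offset);
    }

    fn alloc_chunk(&mut self) -> u32 {
        if let Some(offset) = self.holes.pop() {
            return offset; // reuse a punched hole first
        }
        let offset = self.len_chunks;
        self.len_chunks += 1; // extend the nominal size only when no hole is free
        offset
    }
}

fn main() {
    let mut lfc = LfcFile { len_chunks: 2, holes: Vec::new() };
    lfc.free_chunk(0);                // soft limit shrank: chunk 0 becomes a hole
    assert_eq!(lfc.alloc_chunk(), 0); // limit raised again: the hole is reused first
    assert_eq!(lfc.alloc_chunk(), 2); // only then does the file grow
    assert_eq!(lfc.len_chunks, 3);
}
```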
|
||||
|
||||
/* Local file storage allocation chunk.
|
||||
* Should be power of two. Using larger than page chunks can
|
||||
* Should be a power of two and not less than 32. Using larger-than-page chunks can
|
||||
* 1. Reduce hash-map memory footprint: 8TB database contains billion pages
|
||||
* and size of hash entry is 40 bytes, so we need 40Gb just for hash map.
|
||||
* 1Mb chunks can reduce hash map size to 320Mb.
|
||||
* 2. Improve access locality, subsequent pages will be allocated together improving seqscan speed
|
||||
*/
|
||||
#define BLOCKS_PER_CHUNK 128 /* 1Mb chunk */
|
||||
/*
|
||||
* Smaller chunk seems to be better for OLTP workload
|
||||
*/
|
||||
// #define BLOCKS_PER_CHUNK 8 /* 64kb chunk */
|
||||
#define MB ((uint64)1024*1024)
|
||||
|
||||
#define SIZE_MB_TO_CHUNKS(size) ((uint32)((size) * MB / BLCKSZ / BLOCKS_PER_CHUNK))
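As a sanity check of the sizing numbers in the comment above (an 8 TB database is about a billion 8 KB pages, so 40-byte per-page entries would need roughly 40 GB, versus roughly 320 MB with 1 MB chunks), here is a small standalone Rust sketch. The constants mirror BLCKSZ, BLOCKS_PER_CHUNK and SIZE_MB_TO_CHUNKS from the C code above; the 40-byte entry size is taken from the comment, not measured.

```rust
// Back-of-the-envelope check of the hash-map sizing argument above.
fn main() {
    const BLCKSZ: u64 = 8192;          // PostgreSQL page size
    const BLOCKS_PER_CHUNK: u64 = 128; // 128 * 8 KiB = 1 MiB chunk
    const ENTRY_SIZE: u64 = 40;        // approximate hash entry size, per the comment
    const MB: u64 = 1024 * 1024;

    let db_size: u64 = 8 * 1024 * 1024 * MB; // 8 TiB database
    let pages = db_size / BLCKSZ;            // ~1 billion pages
    println!("per-page entries:  {} GiB", pages * ENTRY_SIZE / (1024 * MB)); // 40
    println!("per-chunk entries: {} MiB", pages / BLOCKS_PER_CHUNK * ENTRY_SIZE / MB); // 320

    // SIZE_MB_TO_CHUNKS(size) from the C code: how many chunks fit in `size` MB.
    let size_mb_to_chunks = |size_mb: u64| size_mb * MB / BLCKSZ / BLOCKS_PER_CHUNK;
    println!("chunks in 1024 MB: {}", size_mb_to_chunks(1024)); // 1024
}
```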
|
||||
#define CHUNK_BITMAP_SIZE ((BLOCKS_PER_CHUNK + 31) / 32)

typedef struct FileCacheEntry
{
@@ -97,8 +71,8 @@ typedef struct FileCacheEntry
uint32 hash;
uint32 offset;
uint32 access_count;
uint32 bitmap[CHUNK_BITMAP_SIZE];
dlist_node list_node; /* LRU/holes list node */
uint32 bitmap[BLOCKS_PER_CHUNK / 32];
dlist_node lru_node; /* LRU list node */
} FileCacheEntry;

typedef struct FileCacheControl
@@ -113,7 +87,6 @@ typedef struct FileCacheControl
uint64 writes;
dlist_head lru; /* double linked list for LRU replacement
* algorithm */
dlist_head holes; /* double linked list of punched holes */
HyperLogLogState wss_estimation; /* estimation of working set size */
} FileCacheControl;

@@ -162,7 +135,6 @@ lfc_disable(char const *op)
|
||||
lfc_ctl->used = 0;
|
||||
lfc_ctl->limit = 0;
|
||||
dlist_init(&lfc_ctl->lru);
|
||||
dlist_init(&lfc_ctl->holes);
|
||||
|
||||
if (lfc_desc > 0)
|
||||
{
|
||||
@@ -242,18 +214,18 @@ lfc_shmem_startup(void)
|
||||
if (!found)
|
||||
{
|
||||
int fd;
|
||||
uint32 n_chunks = SIZE_MB_TO_CHUNKS(lfc_max_size);
|
||||
uint32 lfc_size = SIZE_MB_TO_CHUNKS(lfc_max_size);
|
||||
|
||||
lfc_lock = (LWLockId) GetNamedLWLockTranche("lfc_lock");
|
||||
info.keysize = sizeof(BufferTag);
|
||||
info.entrysize = sizeof(FileCacheEntry);
|
||||
|
||||
/*
|
||||
* n_chunks+1 because we add new element to hash table before eviction
|
||||
* lfc_size+1 because we add new element to hash table before eviction
|
||||
* of victim
|
||||
*/
|
||||
lfc_hash = ShmemInitHash("lfc_hash",
|
||||
n_chunks + 1, n_chunks + 1,
|
||||
lfc_size + 1, lfc_size + 1,
|
||||
&info,
|
||||
HASH_ELEM | HASH_BLOBS);
|
||||
lfc_ctl->generation = 0;
|
||||
@@ -263,7 +235,6 @@ lfc_shmem_startup(void)
|
||||
lfc_ctl->misses = 0;
|
||||
lfc_ctl->writes = 0;
|
||||
dlist_init(&lfc_ctl->lru);
|
||||
dlist_init(&lfc_ctl->holes);
|
||||
|
||||
/* Initialize hyper-log-log structure for estimating working set size */
|
||||
initSHLL(&lfc_ctl->wss_estimation);
|
||||
@@ -339,31 +310,14 @@ lfc_change_limit_hook(int newval, void *extra)
|
||||
* Shrink cache by throwing away least recently accessed chunks and
|
||||
* returning their space to file system
|
||||
*/
|
||||
FileCacheEntry *victim = dlist_container(FileCacheEntry, list_node, dlist_pop_head_node(&lfc_ctl->lru));
|
||||
FileCacheEntry *hole;
|
||||
uint32 offset = victim->offset;
|
||||
uint32 hash;
|
||||
bool found;
|
||||
BufferTag holetag;
|
||||
FileCacheEntry *victim = dlist_container(FileCacheEntry, lru_node, dlist_pop_head_node(&lfc_ctl->lru));
|
||||
|
||||
CriticalAssert(victim->access_count == 0);
|
||||
Assert(victim->access_count == 0);
|
||||
#ifdef FALLOC_FL_PUNCH_HOLE
|
||||
if (fallocate(lfc_desc, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE, (off_t) victim->offset * BLOCKS_PER_CHUNK * BLCKSZ, BLOCKS_PER_CHUNK * BLCKSZ) < 0)
|
||||
neon_log(LOG, "Failed to punch hole in file: %m");
|
||||
#endif
|
||||
/* We remove the old entry, and re-enter a hole to the hash table */
|
||||
hash_search_with_hash_value(lfc_hash, &victim->key, victim->hash, HASH_REMOVE, NULL);
|
||||
|
||||
memset(&holetag, 0, sizeof(holetag));
|
||||
holetag.blockNum = offset;
|
||||
hash = get_hash_value(lfc_hash, &holetag);
|
||||
hole = hash_search_with_hash_value(lfc_hash, &holetag, hash, HASH_ENTER, &found);
|
||||
hole->hash = hash;
|
||||
hole->offset = offset;
|
||||
hole->access_count = 0;
|
||||
CriticalAssert(!found);
|
||||
dlist_push_tail(&lfc_ctl->holes, &hole->list_node);
|
||||
|
||||
lfc_ctl->used -= 1;
|
||||
}
|
||||
lfc_ctl->limit = new_size;
|
||||
@@ -455,8 +409,6 @@ lfc_cache_contains(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno)
|
||||
CopyNRelFileInfoToBufTag(tag, rinfo);
|
||||
tag.forkNum = forkNum;
|
||||
tag.blockNum = blkno & ~(BLOCKS_PER_CHUNK - 1);
|
||||
|
||||
CriticalAssert(BufTagGetRelNumber(&tag) != InvalidRelFileNumber);
|
||||
hash = get_hash_value(lfc_hash, &tag);
|
||||
|
||||
LWLockAcquire(lfc_lock, LW_SHARED);
|
||||
@@ -488,7 +440,6 @@ lfc_evict(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno)
|
||||
tag.forkNum = forkNum;
|
||||
tag.blockNum = (blkno & ~(BLOCKS_PER_CHUNK - 1));
|
||||
|
||||
CriticalAssert(BufTagGetRelNumber(&tag) != InvalidRelFileNumber);
|
||||
hash = get_hash_value(lfc_hash, &tag);
|
||||
|
||||
LWLockAcquire(lfc_lock, LW_EXCLUSIVE);
|
||||
@@ -519,7 +470,7 @@ lfc_evict(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno)
|
||||
{
|
||||
bool has_remaining_pages;
|
||||
|
||||
for (int i = 0; i < CHUNK_BITMAP_SIZE; i++)
|
||||
for (int i = 0; i < (BLOCKS_PER_CHUNK / 32); i++)
|
||||
{
|
||||
if (entry->bitmap[i] != 0)
|
||||
{
|
||||
@@ -534,8 +485,8 @@ lfc_evict(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno)
|
||||
*/
|
||||
if (!has_remaining_pages)
|
||||
{
|
||||
dlist_delete(&entry->list_node);
|
||||
dlist_push_head(&lfc_ctl->lru, &entry->list_node);
|
||||
dlist_delete(&entry->lru_node);
|
||||
dlist_push_head(&lfc_ctl->lru, &entry->lru_node);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -574,8 +525,6 @@ lfc_read(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno,
|
||||
CopyNRelFileInfoToBufTag(tag, rinfo);
|
||||
tag.forkNum = forkNum;
|
||||
tag.blockNum = blkno & ~(BLOCKS_PER_CHUNK - 1);
|
||||
|
||||
CriticalAssert(BufTagGetRelNumber(&tag) != InvalidRelFileNumber);
|
||||
hash = get_hash_value(lfc_hash, &tag);
|
||||
|
||||
LWLockAcquire(lfc_lock, LW_EXCLUSIVE);
|
||||
@@ -602,7 +551,7 @@ lfc_read(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno,
|
||||
}
|
||||
/* Unlink entry from LRU list to pin it for the duration of IO operation */
|
||||
if (entry->access_count++ == 0)
|
||||
dlist_delete(&entry->list_node);
|
||||
dlist_delete(&entry->lru_node);
|
||||
generation = lfc_ctl->generation;
|
||||
entry_offset = entry->offset;
|
||||
|
||||
@@ -620,12 +569,12 @@ lfc_read(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno,
|
||||
|
||||
if (lfc_ctl->generation == generation)
|
||||
{
|
||||
CriticalAssert(LFC_ENABLED());
|
||||
Assert(LFC_ENABLED());
|
||||
lfc_ctl->hits += 1;
|
||||
pgBufferUsage.file_cache.hits += 1;
|
||||
CriticalAssert(entry->access_count > 0);
|
||||
Assert(entry->access_count > 0);
|
||||
if (--entry->access_count == 0)
|
||||
dlist_push_tail(&lfc_ctl->lru, &entry->list_node);
|
||||
dlist_push_tail(&lfc_ctl->lru, &entry->lru_node);
|
||||
}
|
||||
else
|
||||
result = false;
|
||||
@@ -664,8 +613,6 @@ lfc_write(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno, const void
|
||||
tag.forkNum = forkNum;
|
||||
tag.blockNum = blkno & ~(BLOCKS_PER_CHUNK - 1);
|
||||
CopyNRelFileInfoToBufTag(tag, rinfo);
|
||||
|
||||
CriticalAssert(BufTagGetRelNumber(&tag) != InvalidRelFileNumber);
|
||||
hash = get_hash_value(lfc_hash, &tag);
|
||||
|
||||
LWLockAcquire(lfc_lock, LW_EXCLUSIVE);
|
||||
@@ -685,7 +632,7 @@ lfc_write(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno, const void
|
||||
* operation
|
||||
*/
|
||||
if (entry->access_count++ == 0)
|
||||
dlist_delete(&entry->list_node);
|
||||
dlist_delete(&entry->lru_node);
|
||||
}
|
||||
else
|
||||
{
|
||||
@@ -708,26 +655,13 @@ lfc_write(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno, const void
|
||||
if (lfc_ctl->used >= lfc_ctl->limit && !dlist_is_empty(&lfc_ctl->lru))
|
||||
{
|
||||
/* Cache overflow: evict least recently used chunk */
|
||||
FileCacheEntry *victim = dlist_container(FileCacheEntry, list_node, dlist_pop_head_node(&lfc_ctl->lru));
|
||||
FileCacheEntry *victim = dlist_container(FileCacheEntry, lru_node, dlist_pop_head_node(&lfc_ctl->lru));
|
||||
|
||||
CriticalAssert(victim->access_count == 0);
|
||||
Assert(victim->access_count == 0);
|
||||
entry->offset = victim->offset; /* grab victim's chunk */
|
||||
hash_search_with_hash_value(lfc_hash, &victim->key, victim->hash, HASH_REMOVE, NULL);
|
||||
neon_log(DEBUG2, "Swap file cache page");
|
||||
}
|
||||
else if (!dlist_is_empty(&lfc_ctl->holes))
|
||||
{
|
||||
/* We can reuse a hole that was left behind when the LFC was shrunk previously */
|
||||
FileCacheEntry *hole = dlist_container(FileCacheEntry, list_node, dlist_pop_head_node(&lfc_ctl->holes));
|
||||
uint32 offset = hole->offset;
|
||||
bool found;
|
||||
|
||||
hash_search_with_hash_value(lfc_hash, &hole->key, hole->hash, HASH_REMOVE, &found);
|
||||
CriticalAssert(found);
|
||||
|
||||
lfc_ctl->used += 1;
|
||||
entry->offset = offset; /* reuse the hole */
|
||||
}
|
||||
else
|
||||
{
|
||||
lfc_ctl->used += 1;
|
||||
@@ -755,11 +689,11 @@ lfc_write(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno, const void
|
||||
|
||||
if (lfc_ctl->generation == generation)
|
||||
{
|
||||
CriticalAssert(LFC_ENABLED());
|
||||
Assert(LFC_ENABLED());
|
||||
/* Place entry to the head of LRU list */
|
||||
CriticalAssert(entry->access_count > 0);
|
||||
Assert(entry->access_count > 0);
|
||||
if (--entry->access_count == 0)
|
||||
dlist_push_tail(&lfc_ctl->lru, &entry->list_node);
|
||||
dlist_push_tail(&lfc_ctl->lru, &entry->lru_node);
|
||||
|
||||
entry->bitmap[chunk_offs >> 5] |= (1 << (chunk_offs & 31));
|
||||
}
|
||||
@@ -774,6 +708,7 @@ typedef struct
|
||||
} NeonGetStatsCtx;
|
||||
|
||||
#define NUM_NEON_GET_STATS_COLS 2
|
||||
#define NUM_NEON_GET_STATS_ROWS 3
|
||||
|
||||
PG_FUNCTION_INFO_V1(neon_get_lfc_stats);
|
||||
Datum
|
||||
@@ -809,6 +744,7 @@ neon_get_lfc_stats(PG_FUNCTION_ARGS)
|
||||
INT8OID, -1, 0);
|
||||
|
||||
fctx->tupdesc = BlessTupleDesc(tupledesc);
|
||||
funcctx->max_calls = NUM_NEON_GET_STATS_ROWS;
|
||||
funcctx->user_fctx = fctx;
|
||||
|
||||
/* Return to original context when allocating transient memory */
|
||||
@@ -842,11 +778,6 @@ neon_get_lfc_stats(PG_FUNCTION_ARGS)
|
||||
if (lfc_ctl)
|
||||
value = lfc_ctl->writes;
|
||||
break;
|
||||
case 4:
|
||||
key = "file_cache_size";
|
||||
if (lfc_ctl)
|
||||
value = lfc_ctl->size;
|
||||
break;
|
||||
default:
|
||||
SRF_RETURN_DONE(funcctx);
|
||||
}
|
||||
@@ -970,7 +901,7 @@ local_cache_pages(PG_FUNCTION_ARGS)
|
||||
hash_seq_init(&status, lfc_hash);
|
||||
while ((entry = hash_seq_search(&status)) != NULL)
|
||||
{
|
||||
for (int i = 0; i < CHUNK_BITMAP_SIZE; i++)
|
||||
for (int i = 0; i < BLOCKS_PER_CHUNK / 32; i++)
|
||||
n_pages += pg_popcount32(entry->bitmap[i]);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -54,10 +54,6 @@
|
||||
|
||||
#define BufTagGetNRelFileInfo(tag) tag.rnode
|
||||
|
||||
#define BufTagGetRelNumber(tagp) ((tagp)->rnode.relNode)
|
||||
|
||||
#define InvalidRelFileNumber InvalidOid
|
||||
|
||||
#define SMgrRelGetRelInfo(reln) \
|
||||
(reln->smgr_rnode.node)
|
||||
|
||||
|
||||
@@ -151,21 +151,34 @@ impl<P: CancellationPublisherMut> CancellationHandler<Option<Arc<Mutex<P>>>> {
#[derive(Clone)]
pub struct CancelClosure {
socket_addr: SocketAddr,
cancel_token: CancelToken,
cancel_token: Option<CancelToken>,
}

impl CancelClosure {
pub fn new(socket_addr: SocketAddr, cancel_token: CancelToken) -> Self {
Self {
socket_addr,
cancel_token,
cancel_token: Some(cancel_token),
}
}

#[cfg(test)]
pub fn test() -> Self {
use std::net::{Ipv4Addr, SocketAddrV4};

Self {
socket_addr: SocketAddr::V4(SocketAddrV4::new(Ipv4Addr::from_bits(0), 0)),
cancel_token: None,
}
}

/// Cancels the query running on user's compute node.
pub async fn try_cancel_query(self) -> Result<(), CancelError> {
let socket = TcpStream::connect(self.socket_addr).await?;
self.cancel_token.cancel_query_raw(socket, NoTls).await?;
info!("query was cancelled");
if let Some(cancel_token) = self.cancel_token {
let socket = TcpStream::connect(self.socket_addr).await?;
cancel_token.cancel_query_raw(socket, NoTls).await?;
info!("query was cancelled");
}
Ok(())
}
}
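
// Illustrative usage sketch (not part of this diff). With cancel_token now an
// Option, a closure built from a live connection cancels the in-flight query,
// while the #[cfg(test)] constructor yields a no-op closure (cancel_token is
// None). The `addr` and `client` values below are hypothetical.
async fn cancel_example(
    addr: std::net::SocketAddr,
    client: &tokio_postgres::Client,
) -> Result<(), CancelError> {
    // CancelClosure is Clone, so one copy can stay in the connection pool
    // while another is consumed to issue the cancel request.
    let closure = CancelClosure::new(addr, client.cancel_token());
    closure.clone().try_cancel_query().await
}
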
@@ -16,8 +16,10 @@ use rustls::{client::danger::ServerCertVerifier, pki_types::InvalidDnsNameError}
|
||||
use std::{io, net::SocketAddr, sync::Arc, time::Duration};
|
||||
use thiserror::Error;
|
||||
use tokio::net::TcpStream;
|
||||
use tokio_postgres::tls::MakeTlsConnect;
|
||||
use tokio_postgres_rustls::MakeRustlsConnect;
|
||||
use tokio_postgres::{
|
||||
tls::{MakeTlsConnect, NoTlsError},
|
||||
Client, Connection,
|
||||
};
|
||||
use tracing::{error, info, warn};
|
||||
|
||||
const COULD_NOT_CONNECT: &str = "Couldn't connect to compute node";
|
||||
@@ -42,6 +44,12 @@ pub enum ConnectionError {
|
||||
TooManyConnectionAttempts(#[from] ApiLockError),
|
||||
}
|
||||
|
||||
impl From<NoTlsError> for ConnectionError {
|
||||
fn from(value: NoTlsError) -> Self {
|
||||
Self::CouldNotConnect(io::Error::new(io::ErrorKind::Other, value.to_string()))
|
||||
}
|
||||
}
|
||||
|
||||
impl UserFacingError for ConnectionError {
|
||||
fn to_string_client(&self) -> String {
|
||||
use ConnectionError::*;
|
||||
@@ -273,6 +281,30 @@ pub struct PostgresConnection {
|
||||
}
|
||||
|
||||
impl ConnCfg {
|
||||
/// Connect to a corresponding compute node.
|
||||
pub async fn managed_connect<M: MakeTlsConnect<tokio::net::TcpStream>>(
|
||||
&self,
|
||||
ctx: &RequestMonitoring,
|
||||
timeout: Duration,
|
||||
mktls: &mut M,
|
||||
) -> Result<(SocketAddr, Client, Connection<TcpStream, M::Stream>), ConnectionError>
|
||||
where
|
||||
ConnectionError: From<M::Error>,
|
||||
{
|
||||
let pause = ctx.latency_timer_pause(crate::metrics::Waiting::Compute);
|
||||
let (socket_addr, stream, host) = self.connect_raw(timeout).await?;
|
||||
drop(pause);
|
||||
|
||||
let tls = mktls.make_tls_connect(host)?;
|
||||
|
||||
// connect_raw() will not use TLS if sslmode is "disable"
|
||||
let pause = ctx.latency_timer_pause(crate::metrics::Waiting::Compute);
|
||||
let (client, connection) = self.0.connect_raw(stream, tls).await?;
|
||||
drop(pause);
|
||||
|
||||
Ok((socket_addr, client, connection))
|
||||
}
|
||||
|
||||
/// Connect to a corresponding compute node.
|
||||
pub async fn connect(
|
||||
&self,
|
||||
@@ -281,10 +313,6 @@ impl ConnCfg {
|
||||
aux: MetricsAuxInfo,
|
||||
timeout: Duration,
|
||||
) -> Result<PostgresConnection, ConnectionError> {
|
||||
let pause = ctx.latency_timer_pause(crate::metrics::Waiting::Compute);
|
||||
let (socket_addr, stream, host) = self.connect_raw(timeout).await?;
|
||||
drop(pause);
|
||||
|
||||
let client_config = if allow_self_signed_compute {
|
||||
// Allow all certificates for creating the connection
|
||||
let verifier = Arc::new(AcceptEverythingVerifier) as Arc<dyn ServerCertVerifier>;
|
||||
@@ -298,21 +326,15 @@ impl ConnCfg {
|
||||
let client_config = client_config.with_no_client_auth();
|
||||
|
||||
let mut mk_tls = tokio_postgres_rustls::MakeRustlsConnect::new(client_config);
|
||||
let tls = <MakeRustlsConnect as MakeTlsConnect<tokio::net::TcpStream>>::make_tls_connect(
|
||||
&mut mk_tls,
|
||||
host,
|
||||
)?;
|
||||
|
||||
// connect_raw() will not use TLS if sslmode is "disable"
|
||||
let pause = ctx.latency_timer_pause(crate::metrics::Waiting::Compute);
|
||||
let (client, connection) = self.0.connect_raw(stream, tls).await?;
|
||||
drop(pause);
|
||||
let (socket_addr, client, connection) =
|
||||
self.managed_connect(ctx, timeout, &mut mk_tls).await?;
|
||||
tracing::Span::current().record("pid", tracing::field::display(client.get_process_id()));
|
||||
let stream = connection.stream.into_inner();
|
||||
|
||||
info!(
|
||||
cold_start_info = ctx.cold_start_info().as_str(),
|
||||
"connected to compute node at {host} ({socket_addr}) sslmode={:?}",
|
||||
"connected to compute node ({socket_addr}) sslmode={:?}",
|
||||
self.0.get_ssl_mode()
|
||||
);
|
||||
|
||||
|
||||
@@ -5,7 +5,8 @@ use tracing::{field::display, info};
|
||||
|
||||
use crate::{
|
||||
auth::{backend::ComputeCredentials, check_peer_addr_is_in_list, AuthError},
|
||||
compute,
|
||||
cancellation::CancelClosure,
|
||||
compute::{self, ConnectionError},
|
||||
config::{AuthenticationConfig, ProxyConfig},
|
||||
console::{
|
||||
errors::{GetAuthInfoError, WakeComputeError},
|
||||
@@ -142,7 +143,7 @@ pub enum HttpConnError {
|
||||
#[error("pooled connection closed at inconsistent state")]
|
||||
ConnectionClosedAbruptly(#[from] tokio::sync::watch::error::SendError<uuid::Uuid>),
|
||||
#[error("could not connection to compute")]
|
||||
ConnectionError(#[from] tokio_postgres::Error),
|
||||
ConnectionError(#[from] ConnectionError),
|
||||
|
||||
#[error("could not get auth info")]
|
||||
GetAuthInfo(#[from] GetAuthInfoError),
|
||||
@@ -229,17 +230,16 @@ impl ConnectMechanism for TokioMechanism {
|
||||
let host = node_info.config.get_host()?;
|
||||
let permit = self.locks.get_permit(&host).await?;
|
||||
|
||||
let mut config = (*node_info.config).clone();
|
||||
let config = config
|
||||
.user(&self.conn_info.user_info.user)
|
||||
.password(&*self.conn_info.password)
|
||||
.dbname(&self.conn_info.dbname)
|
||||
.connect_timeout(timeout);
|
||||
let (socket_addr, client, connection) = permit.release_result(
|
||||
node_info
|
||||
.config
|
||||
.managed_connect(ctx, timeout, &mut tokio_postgres::NoTls)
|
||||
.await,
|
||||
)?;
|
||||
|
||||
let pause = ctx.latency_timer_pause(crate::metrics::Waiting::Compute);
|
||||
let res = config.connect(tokio_postgres::NoTls).await;
|
||||
drop(pause);
|
||||
let (client, connection) = permit.release_result(res)?;
|
||||
// NB: CancelToken is supposed to hold socket_addr, but we use connect_raw.
|
||||
// Yet another reason to rework the connection establishing code.
|
||||
let cancel_closure = CancelClosure::new(socket_addr, client.cancel_token());
|
||||
|
||||
tracing::Span::current().record("pid", tracing::field::display(client.get_process_id()));
|
||||
Ok(poll_client(
|
||||
@@ -250,8 +250,14 @@ impl ConnectMechanism for TokioMechanism {
|
||||
connection,
|
||||
self.conn_id,
|
||||
node_info.aux.clone(),
|
||||
cancel_closure,
|
||||
))
|
||||
}
|
||||
|
||||
fn update_connect_config(&self, _config: &mut compute::ConnCfg) {}
|
||||
fn update_connect_config(&self, config: &mut compute::ConnCfg) {
|
||||
config
|
||||
.user(&self.conn_info.user_info.user)
|
||||
.dbname(&self.conn_info.dbname)
|
||||
.password(&self.conn_info.password);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -12,11 +12,13 @@ use std::{
|
||||
ops::Deref,
|
||||
sync::atomic::{self, AtomicUsize},
|
||||
};
|
||||
use tokio::net::TcpStream;
|
||||
use tokio::time::Instant;
|
||||
use tokio_postgres::tls::NoTlsStream;
|
||||
use tokio_postgres::{AsyncMessage, ReadyForQueryStatus, Socket};
|
||||
use tokio_postgres::{AsyncMessage, ReadyForQueryStatus};
|
||||
use tokio_util::sync::CancellationToken;
|
||||
|
||||
use crate::cancellation::CancelClosure;
|
||||
use crate::console::messages::{ColdStartInfo, MetricsAuxInfo};
|
||||
use crate::metrics::{HttpEndpointPoolsGuard, Metrics};
|
||||
use crate::usage_metrics::{Ids, MetricCounter, USAGE_METRICS};
|
||||
@@ -463,14 +465,16 @@ impl<C: ClientInnerExt> GlobalConnPool<C> {
|
||||
}
|
||||
}
|
||||
|
||||
#[allow(clippy::too_many_arguments)]
|
||||
pub fn poll_client<C: ClientInnerExt>(
|
||||
global_pool: Arc<GlobalConnPool<C>>,
|
||||
ctx: &RequestMonitoring,
|
||||
conn_info: ConnInfo,
|
||||
client: C,
|
||||
mut connection: tokio_postgres::Connection<Socket, NoTlsStream>,
|
||||
mut connection: tokio_postgres::Connection<TcpStream, NoTlsStream>,
|
||||
conn_id: uuid::Uuid,
|
||||
aux: MetricsAuxInfo,
|
||||
cancel_closure: CancelClosure,
|
||||
) -> Client<C> {
|
||||
let conn_gauge = Metrics::get().proxy.db_connections.guard(ctx.protocol());
|
||||
let mut session_id = ctx.session_id();
|
||||
@@ -572,6 +576,7 @@ pub fn poll_client<C: ClientInnerExt>(
|
||||
cancel,
|
||||
aux,
|
||||
conn_id,
|
||||
cancel_closure,
|
||||
};
|
||||
Client::new(inner, conn_info, pool_clone)
|
||||
}
|
||||
@@ -582,6 +587,7 @@ struct ClientInner<C: ClientInnerExt> {
|
||||
cancel: CancellationToken,
|
||||
aux: MetricsAuxInfo,
|
||||
conn_id: uuid::Uuid,
|
||||
cancel_closure: CancelClosure,
|
||||
}
|
||||
|
||||
impl<C: ClientInnerExt> Drop for ClientInner<C> {
|
||||
@@ -646,7 +652,7 @@ impl<C: ClientInnerExt> Client<C> {
|
||||
pool,
|
||||
}
|
||||
}
|
||||
pub fn inner(&mut self) -> (&mut C, Discard<'_, C>) {
|
||||
pub fn inner(&mut self) -> (&mut C, &CancelClosure, Discard<'_, C>) {
|
||||
let Self {
|
||||
inner,
|
||||
pool,
|
||||
@@ -654,7 +660,11 @@ impl<C: ClientInnerExt> Client<C> {
|
||||
span: _,
|
||||
} = self;
|
||||
let inner = inner.as_mut().expect("client inner should not be removed");
|
||||
(&mut inner.inner, Discard { pool, conn_info })
|
||||
(
|
||||
&mut inner.inner,
|
||||
&inner.cancel_closure,
|
||||
Discard { pool, conn_info },
|
||||
)
|
||||
}
|
||||
}
|
||||
|
||||
@@ -751,6 +761,7 @@ mod tests {
|
||||
cold_start_info: crate::console::messages::ColdStartInfo::Warm,
|
||||
},
|
||||
conn_id: uuid::Uuid::new_v4(),
|
||||
cancel_closure: CancelClosure::test(),
|
||||
}
|
||||
}
|
||||
|
||||
@@ -785,7 +796,7 @@ mod tests {
|
||||
{
|
||||
let mut client = Client::new(create_inner(), conn_info.clone(), ep_pool.clone());
|
||||
assert_eq!(0, pool.get_global_connections_count());
|
||||
client.inner().1.discard();
|
||||
client.inner().2.discard();
|
||||
// Discard should not add the connection from the pool.
|
||||
assert_eq!(0, pool.get_global_connections_count());
|
||||
}
|
||||
|
||||
@@ -26,7 +26,6 @@ use tokio_postgres::error::ErrorPosition;
|
||||
use tokio_postgres::error::SqlState;
|
||||
use tokio_postgres::GenericClient;
|
||||
use tokio_postgres::IsolationLevel;
|
||||
use tokio_postgres::NoTls;
|
||||
use tokio_postgres::ReadyForQueryStatus;
|
||||
use tokio_postgres::Transaction;
|
||||
use tokio_util::sync::CancellationToken;
|
||||
@@ -261,7 +260,9 @@ pub async fn handle(
|
||||
|
||||
let mut message = e.to_string_client();
|
||||
let db_error = match &e {
|
||||
SqlOverHttpError::ConnectCompute(HttpConnError::ConnectionError(e))
|
||||
SqlOverHttpError::ConnectCompute(HttpConnError::ConnectionError(
|
||||
crate::compute::ConnectionError::Postgres(e),
|
||||
))
|
||||
| SqlOverHttpError::Postgres(e) => e.as_db_error(),
|
||||
_ => None,
|
||||
};
|
||||
@@ -622,8 +623,7 @@ impl QueryData {
|
||||
client: &mut Client<tokio_postgres::Client>,
|
||||
parsed_headers: HttpHeaders,
|
||||
) -> Result<String, SqlOverHttpError> {
|
||||
let (inner, mut discard) = client.inner();
|
||||
let cancel_token = inner.cancel_token();
|
||||
let (inner, cancel_token, mut discard) = client.inner();
|
||||
|
||||
let res = match select(
|
||||
pin!(query_to_json(&*inner, self, &mut 0, parsed_headers)),
|
||||
@@ -647,7 +647,7 @@ impl QueryData {
|
||||
// The query was cancelled.
|
||||
Either::Right((_cancelled, query)) => {
|
||||
tracing::info!("cancelling query");
|
||||
if let Err(err) = cancel_token.cancel_query(NoTls).await {
|
||||
if let Err(err) = cancel_token.clone().try_cancel_query().await {
|
||||
tracing::error!(?err, "could not cancel query");
|
||||
}
|
||||
// wait for the query cancellation
|
||||
@@ -663,7 +663,9 @@ impl QueryData {
|
||||
// query failed or was cancelled.
|
||||
Ok(Err(error)) => {
|
||||
let db_error = match &error {
|
||||
SqlOverHttpError::ConnectCompute(HttpConnError::ConnectionError(e))
|
||||
SqlOverHttpError::ConnectCompute(HttpConnError::ConnectionError(
|
||||
crate::compute::ConnectionError::Postgres(e),
|
||||
))
|
||||
| SqlOverHttpError::Postgres(e) => e.as_db_error(),
|
||||
_ => None,
|
||||
};
|
||||
@@ -694,8 +696,7 @@ impl BatchQueryData {
|
||||
parsed_headers: HttpHeaders,
|
||||
) -> Result<String, SqlOverHttpError> {
|
||||
info!("starting transaction");
|
||||
let (inner, mut discard) = client.inner();
|
||||
let cancel_token = inner.cancel_token();
|
||||
let (inner, cancel_token, mut discard) = client.inner();
|
||||
let mut builder = inner.build_transaction();
|
||||
if let Some(isolation_level) = parsed_headers.txn_isolation_level {
|
||||
builder = builder.isolation_level(isolation_level);
|
||||
@@ -728,7 +729,7 @@ impl BatchQueryData {
|
||||
json_output
|
||||
}
|
||||
Err(SqlOverHttpError::Cancelled(_)) => {
|
||||
if let Err(err) = cancel_token.cancel_query(NoTls).await {
|
||||
if let Err(err) = cancel_token.clone().try_cancel_query().await {
|
||||
tracing::error!(?err, "could not cancel query");
|
||||
}
|
||||
// TODO: after cancelling, wait to see if we can get a status. maybe the connection is still safe.
|
||||
|
||||
@@ -92,7 +92,7 @@ impl TermHistory {
|
||||
}
|
||||
|
||||
/// Find point of divergence between leader (walproposer) term history and
|
||||
/// safekeeper. Arguments are not symmetric as proposer history ends at
|
||||
/// safekeeper. Arguments are not symmetrics as proposer history ends at
|
||||
/// +infinity while safekeeper at flush_lsn.
|
||||
/// C version is at walproposer SendProposerElected.
|
||||
pub fn find_highest_common_point(
|
||||
@@ -701,13 +701,7 @@ where
|
||||
.with_label_values(&["handle_elected"])
|
||||
.start_timer();
|
||||
|
||||
info!(
|
||||
"received ProposerElected {:?}, term={}, last_log_term={}, flush_lsn={}",
|
||||
msg,
|
||||
self.state.acceptor_state.term,
|
||||
self.get_last_log_term(),
|
||||
self.flush_lsn()
|
||||
);
|
||||
info!("received ProposerElected {:?}", msg);
|
||||
if self.state.acceptor_state.term < msg.term {
|
||||
let mut state = self.state.start_change();
|
||||
state.acceptor_state.term = msg.term;
|
||||
@@ -719,43 +713,22 @@ where
|
||||
return Ok(None);
|
||||
}
|
||||
|
||||
// Before truncating WAL check-cross the check divergence point received
|
||||
// from the walproposer.
|
||||
let sk_th = self.get_term_history();
|
||||
let last_common_point = match TermHistory::find_highest_common_point(
|
||||
&msg.term_history,
|
||||
&sk_th,
|
||||
self.flush_lsn(),
|
||||
) {
|
||||
// No common point. Expect streaming from the beginning of the
|
||||
// history like walproposer while we don't have proper init.
|
||||
None => *msg.term_history.0.first().ok_or(anyhow::anyhow!(
|
||||
"empty walproposer term history {:?}",
|
||||
msg.term_history
|
||||
))?,
|
||||
Some(lcp) => lcp,
|
||||
};
|
||||
// This is expected to happen in a rare race when another connection
|
||||
// from the same walproposer writes + flushes WAL after this connection
|
||||
// sent flush_lsn in VoteRequest; for instance, very late
|
||||
// ProposerElected message delivery after another connection was
|
||||
// established and wrote WAL. In such cases error is transient;
|
||||
// reconnection makes safekeeper send newest term history and flush_lsn
|
||||
// and walproposer recalculates the streaming point. OTOH repeating
|
||||
// error indicates a serious bug.
|
||||
if last_common_point.lsn != msg.start_streaming_at {
|
||||
bail!("refusing ProposerElected with unexpected truncation point: lcp={:?} start_streaming_at={}, term={}, sk_th={:?} flush_lsn={}, wp_th={:?}",
|
||||
last_common_point, msg.start_streaming_at,
|
||||
self.state.acceptor_state.term, sk_th, self.flush_lsn(), msg.term_history,
|
||||
);
|
||||
// This might happen in a rare race when another (old) connection from
|
||||
// the same walproposer writes + flushes WAL after this connection
|
||||
// already sent flush_lsn in VoteRequest. It is generally safe to
|
||||
// proceed, but to prevent commit_lsn surprisingly going down we should
|
||||
// either refuse the session (simpler) or skip the part we already have
|
||||
// from the stream (can be implemented).
|
||||
if msg.term == self.get_last_log_term() && self.flush_lsn() > msg.start_streaming_at {
|
||||
bail!("refusing ProposerElected which is going to overwrite correct WAL: term={}, flush_lsn={}, start_streaming_at={}; restarting the handshake should help",
|
||||
msg.term, self.flush_lsn(), msg.start_streaming_at)
|
||||
}
|
||||
|
||||
// We are also expected to never attempt to truncate committed data.
|
||||
// Otherwise we must never attempt to truncate committed data.
|
||||
assert!(
|
||||
msg.start_streaming_at >= self.state.inmem.commit_lsn,
|
||||
"attempt to truncate committed data: start_streaming_at={}, commit_lsn={}, term={}, sk_th={:?} flush_lsn={}, wp_th={:?}",
|
||||
msg.start_streaming_at, self.state.inmem.commit_lsn,
|
||||
self.state.acceptor_state.term, sk_th, self.flush_lsn(), msg.term_history,
|
||||
"attempt to truncate committed data: start_streaming_at={}, commit_lsn={}",
|
||||
msg.start_streaming_at,
|
||||
self.state.inmem.commit_lsn
|
||||
);
|
||||
|
||||
// Before first WAL write initialize its segment. It makes first segment
|
||||
@@ -770,6 +743,9 @@ where
|
||||
.await?;
|
||||
}
|
||||
|
||||
// TODO: cross check divergence point, check if msg.start_streaming_at corresponds to
|
||||
// intersection of our history and history from msg
|
||||
|
||||
// truncate wal, update the LSNs
|
||||
self.wal_store.truncate_wal(msg.start_streaming_at).await?;
|
||||
|
||||
@@ -1093,7 +1069,7 @@ mod tests {
|
||||
|
||||
let pem = ProposerElected {
|
||||
term: 1,
|
||||
start_streaming_at: Lsn(3),
|
||||
start_streaming_at: Lsn(1),
|
||||
term_history: TermHistory(vec![TermLsn {
|
||||
term: 1,
|
||||
lsn: Lsn(3),
|
||||
|
||||
@@ -520,19 +520,6 @@ async fn handle_node_status(req: Request<Body>) -> Result<Response<Body>, ApiErr
|
||||
json_response(StatusCode::OK, node_status)
|
||||
}
|
||||
|
||||
async fn handle_get_leader(req: Request<Body>) -> Result<Response<Body>, ApiError> {
|
||||
check_permissions(&req, Scope::Admin)?;
|
||||
|
||||
let state = get_state(&req);
|
||||
let leader = state.service.get_leader().await.map_err(|err| {
|
||||
ApiError::InternalServerError(anyhow::anyhow!(
|
||||
"Failed to read leader from database: {err}"
|
||||
))
|
||||
})?;
|
||||
|
||||
json_response(StatusCode::OK, leader)
|
||||
}
|
||||
|
||||
async fn handle_node_drain(req: Request<Body>) -> Result<Response<Body>, ApiError> {
|
||||
check_permissions(&req, Scope::Admin)?;
|
||||
|
||||
@@ -1029,9 +1016,6 @@ pub fn make_router(
|
||||
.get("/control/v1/node/:node_id", |r| {
|
||||
named_request_span(r, handle_node_status, RequestName("control_v1_node_status"))
|
||||
})
|
||||
.get("/control/v1/leader", |r| {
|
||||
named_request_span(r, handle_get_leader, RequestName("control_v1_get_leader"))
|
||||
})
|
||||
.put("/control/v1/node/:node_id/drain", |r| {
|
||||
named_request_span(r, handle_node_drain, RequestName("control_v1_node_drain"))
|
||||
})
|
||||
|
||||
@@ -1,7 +1,7 @@
|
||||
use crate::tenant_shard::ObservedState;
|
||||
use pageserver_api::shard::TenantShardId;
|
||||
use serde::{Deserialize, Serialize};
|
||||
use std::{collections::HashMap, time::Duration};
|
||||
use std::collections::HashMap;
|
||||
use tokio_util::sync::CancellationToken;
|
||||
|
||||
use hyper::Uri;
|
||||
@@ -69,8 +69,6 @@ impl PeerClient {
|
||||
req
|
||||
};
|
||||
|
||||
let req = req.timeout(Duration::from_secs(2));
|
||||
|
||||
let res = req
|
||||
.send()
|
||||
.await
|
||||
|
||||
@@ -20,8 +20,7 @@ use crate::{
|
||||
metrics,
|
||||
peer_client::{GlobalObservedState, PeerClient},
|
||||
persistence::{
|
||||
AbortShardSplitStatus, ControllerPersistence, DatabaseResult, MetadataHealthPersistence,
|
||||
TenantFilter,
|
||||
AbortShardSplitStatus, ControllerPersistence, MetadataHealthPersistence, TenantFilter,
|
||||
},
|
||||
reconciler::{ReconcileError, ReconcileUnits, ReconcilerConfig, ReconcilerConfigBuilder},
|
||||
scheduler::{MaySchedule, ScheduleContext, ScheduleMode},
|
||||
@@ -490,6 +489,11 @@ pub(crate) enum ReconcileResultRequest {
|
||||
Stop,
|
||||
}
|
||||
|
||||
struct LeaderStepDownState {
|
||||
observed: GlobalObservedState,
|
||||
leader: ControllerPersistence,
|
||||
}
|
||||
|
||||
impl Service {
|
||||
pub fn get_config(&self) -> &Config {
|
||||
&self.config
|
||||
@@ -500,8 +504,7 @@ impl Service {
|
||||
#[instrument(skip_all)]
|
||||
async fn startup_reconcile(
|
||||
self: &Arc<Service>,
|
||||
current_leader: Option<ControllerPersistence>,
|
||||
leader_step_down_state: Option<GlobalObservedState>,
|
||||
leader_step_down_state: Option<LeaderStepDownState>,
|
||||
bg_compute_notify_result_tx: tokio::sync::mpsc::Sender<
|
||||
Result<(), (TenantShardId, NotifyError)>,
|
||||
>,
|
||||
@@ -519,15 +522,17 @@ impl Service {
|
||||
.checked_add(STARTUP_RECONCILE_TIMEOUT / 2)
|
||||
.expect("Reconcile timeout is a modest constant");
|
||||
|
||||
let observed = if let Some(state) = leader_step_down_state {
|
||||
let (observed, current_leader) = if let Some(state) = leader_step_down_state {
|
||||
tracing::info!(
|
||||
"Using observed state received from leader at {}",
|
||||
current_leader.as_ref().unwrap().address
|
||||
state.leader.address,
|
||||
);
|
||||
|
||||
state
|
||||
(state.observed, Some(state.leader))
|
||||
} else {
|
||||
self.build_global_observed_state(node_scan_deadline).await
|
||||
(
|
||||
self.build_global_observed_state(node_scan_deadline).await,
|
||||
None,
|
||||
)
|
||||
};
|
||||
|
||||
// Accumulate a list of any tenant locations that ought to be detached
|
||||
@@ -1377,32 +1382,13 @@ impl Service {
|
||||
};
|
||||
|
||||
let leadership_status = this.inner.read().unwrap().get_leadership_status();
|
||||
let leader = match this.get_leader().await {
|
||||
Ok(ok) => ok,
|
||||
Err(err) => {
|
||||
tracing::error!(
|
||||
"Failed to query database for current leader: {err}. Aborting start-up ..."
|
||||
);
|
||||
std::process::exit(1);
|
||||
}
|
||||
};
|
||||
|
||||
let leader_step_down_state = match leadership_status {
|
||||
LeadershipStatus::Candidate => {
|
||||
if let Some(ref leader) = leader {
|
||||
this.request_step_down(leader).await
|
||||
} else {
|
||||
tracing::info!(
|
||||
"No leader found to request step down from. Will build observed state."
|
||||
);
|
||||
None
|
||||
}
|
||||
}
|
||||
let peer_observed_state = match leadership_status {
|
||||
LeadershipStatus::Candidate => this.request_step_down().await,
|
||||
LeadershipStatus::Leader => None,
|
||||
LeadershipStatus::SteppedDown => unreachable!(),
|
||||
};
|
||||
|
||||
this.startup_reconcile(leader, leader_step_down_state, bg_compute_notify_result_tx)
|
||||
this.startup_reconcile(peer_observed_state, bg_compute_notify_result_tx)
|
||||
.await;
|
||||
|
||||
drop(startup_completion);
|
||||
@@ -4664,10 +4650,6 @@ impl Service {
|
||||
))
|
||||
}
|
||||
|
||||
pub(crate) async fn get_leader(&self) -> DatabaseResult<Option<ControllerPersistence>> {
|
||||
self.persistence.get_leader().await
|
||||
}
|
||||
|
||||
pub(crate) async fn node_register(
|
||||
&self,
|
||||
register_req: NodeRegisterRequest,
|
||||
@@ -6360,7 +6342,6 @@ impl Service {
|
||||
|
||||
pub(crate) async fn step_down(&self) -> GlobalObservedState {
|
||||
tracing::info!("Received step down request from peer");
|
||||
failpoint_support::sleep_millis_async!("sleep-on-step-down-handling");
|
||||
|
||||
self.inner.write().unwrap().step_down();
|
||||
// TODO: would it make sense to have a time-out for this?
|
||||
@@ -6386,31 +6367,50 @@ impl Service {
|
||||
///
|
||||
/// On failures to query the database or step down error responses the process is killed
|
||||
/// and we rely on k8s to retry.
|
||||
async fn request_step_down(
|
||||
&self,
|
||||
leader: &ControllerPersistence,
|
||||
) -> Option<GlobalObservedState> {
|
||||
tracing::info!("Sending step down request to {leader:?}");
|
||||
|
||||
// TODO: jwt token
|
||||
let client = PeerClient::new(
|
||||
Uri::try_from(leader.address.as_str()).expect("Failed to build leader URI"),
|
||||
self.config.jwt_token.clone(),
|
||||
);
|
||||
let state = client.step_down(&self.cancel).await;
|
||||
match state {
|
||||
Ok(state) => Some(state),
|
||||
async fn request_step_down(&self) -> Option<LeaderStepDownState> {
|
||||
let leader = match self.persistence.get_leader().await {
|
||||
Ok(leader) => leader,
|
||||
Err(err) => {
|
||||
// TODO: Make leaders periodically update a timestamp field in the
|
||||
// database and, if the leader is not reachable from the current instance,
|
||||
// but inferred as alive from the timestamp, abort start-up. This avoids
|
||||
// a potential scenario in which we have two controllers acting as leaders.
|
||||
tracing::error!(
|
||||
"Leader ({}) did not respond to step-down request: {}",
|
||||
leader.address,
|
||||
err
|
||||
"Failed to query database for current leader: {err}. Aborting start-up ..."
|
||||
);
|
||||
std::process::exit(1);
|
||||
}
|
||||
};
|
||||
|
||||
match leader {
|
||||
Some(leader) => {
|
||||
tracing::info!("Sending step down request to {leader:?}");
|
||||
|
||||
// TODO: jwt token
|
||||
let client = PeerClient::new(
|
||||
Uri::try_from(leader.address.as_str()).expect("Failed to build leader URI"),
|
||||
self.config.jwt_token.clone(),
|
||||
);
|
||||
let state = client.step_down(&self.cancel).await;
|
||||
match state {
|
||||
Ok(state) => Some(LeaderStepDownState {
|
||||
observed: state,
|
||||
leader: leader.clone(),
|
||||
}),
|
||||
Err(err) => {
|
||||
// TODO: Make leaders periodically update a timestamp field in the
|
||||
// database and, if the leader is not reachable from the current instance,
|
||||
// but inferred as alive from the timestamp, abort start-up. This avoids
|
||||
// a potential scenario in which we have two controllers acting as leaders.
|
||||
tracing::error!(
|
||||
"Leader ({}) did not respond to step-down request: {}",
|
||||
leader.address,
|
||||
err
|
||||
);
|
||||
None
|
||||
}
|
||||
}
|
||||
}
|
||||
None => {
|
||||
tracing::info!(
|
||||
"No leader found to request step down from. Will build observed state."
|
||||
);
|
||||
None
|
||||
}
|
||||
}
|
||||
|
||||
@@ -3,10 +3,9 @@ use camino::Utf8PathBuf;
|
||||
use pageserver_api::controller_api::{MetadataHealthUpdateRequest, MetadataHealthUpdateResponse};
|
||||
use pageserver_api::shard::TenantShardId;
|
||||
use reqwest::{Method, Url};
|
||||
use storage_controller_client::control_api;
|
||||
use storage_scrubber::garbage::{find_garbage, purge_garbage, PurgeMode};
|
||||
use storage_scrubber::pageserver_physical_gc::GcMode;
|
||||
use storage_scrubber::scan_pageserver_metadata::scan_pageserver_metadata;
|
||||
use storage_scrubber::scan_pageserver_metadata::scan_metadata;
|
||||
use storage_scrubber::tenant_snapshot::SnapshotDownloader;
|
||||
use storage_scrubber::{find_large_objects, ControllerClientConfig};
|
||||
use storage_scrubber::{
|
||||
@@ -69,7 +68,7 @@ enum Command {
|
||||
#[arg(long = "tenant-id", num_args = 0..)]
|
||||
tenant_ids: Vec<TenantShardId>,
|
||||
#[arg(long = "post", default_value_t = false)]
|
||||
post_to_storcon: bool,
|
||||
post_to_storage_controller: bool,
|
||||
#[arg(long, default_value = None)]
|
||||
/// For safekeeper node_kind only, points to db with debug dump
|
||||
dump_db_connstr: Option<String>,
|
||||
@@ -101,16 +100,6 @@ enum Command {
|
||||
#[arg(long = "concurrency", short = 'j', default_value_t = 64)]
|
||||
concurrency: usize,
|
||||
},
|
||||
CronJob {
|
||||
// PageserverPhysicalGc
|
||||
#[arg(long = "min-age")]
|
||||
gc_min_age: humantime::Duration,
|
||||
#[arg(short, long, default_value_t = GcMode::IndicesOnly)]
|
||||
gc_mode: GcMode,
|
||||
// ScanMetadata
|
||||
#[arg(long = "post", default_value_t = false)]
|
||||
post_to_storcon: bool,
|
||||
},
|
||||
}
|
||||
|
||||
#[tokio::main]
|
||||
@@ -128,7 +117,6 @@ async fn main() -> anyhow::Result<()> {
|
||||
Command::TenantSnapshot { .. } => "tenant-snapshot",
|
||||
Command::PageserverPhysicalGc { .. } => "pageserver-physical-gc",
|
||||
Command::FindLargeObjects { .. } => "find-large-objects",
|
||||
Command::CronJob { .. } => "cron-job",
|
||||
};
|
||||
let _guard = init_logging(&format!(
|
||||
"{}_{}_{}_{}.log",
|
||||
@@ -138,13 +126,12 @@ async fn main() -> anyhow::Result<()> {
|
||||
chrono::Utc::now().format("%Y_%m_%d__%H_%M_%S")
|
||||
));
|
||||
|
||||
let controller_client = cli.controller_api.map(|controller_api| {
|
||||
let controller_client_conf = cli.controller_api.map(|controller_api| {
|
||||
ControllerClientConfig {
|
||||
controller_api,
|
||||
// Default to no key: this is a convenience when working in a development environment
|
||||
controller_jwt: cli.controller_jwt.unwrap_or("".to_owned()),
|
||||
}
|
||||
.build_client()
|
||||
});
|
||||
|
||||
match cli.command {
|
||||
@@ -152,7 +139,7 @@ async fn main() -> anyhow::Result<()> {
|
||||
json,
|
||||
tenant_ids,
|
||||
node_kind,
|
||||
post_to_storcon,
|
||||
post_to_storage_controller,
|
||||
dump_db_connstr,
|
||||
dump_db_table,
|
||||
} => {
|
||||
@@ -191,14 +178,53 @@ async fn main() -> anyhow::Result<()> {
|
||||
}
|
||||
Ok(())
|
||||
} else {
|
||||
scan_pageserver_metadata_cmd(
|
||||
bucket_config,
|
||||
controller_client.as_ref(),
|
||||
tenant_ids,
|
||||
json,
|
||||
post_to_storcon,
|
||||
)
|
||||
.await
|
||||
if controller_client_conf.is_none() && post_to_storage_controller {
|
||||
return Err(anyhow!("Posting pageserver scan health status to storage controller requires `--controller-api` and `--controller-jwt` to run"));
|
||||
}
|
||||
match scan_metadata(bucket_config.clone(), tenant_ids).await {
|
||||
Err(e) => {
|
||||
tracing::error!("Failed: {e}");
|
||||
Err(e)
|
||||
}
|
||||
Ok(summary) => {
|
||||
if json {
|
||||
println!("{}", serde_json::to_string(&summary).unwrap())
|
||||
} else {
|
||||
println!("{}", summary.summary_string());
|
||||
}
|
||||
|
||||
if post_to_storage_controller {
|
||||
if let Some(conf) = controller_client_conf {
|
||||
let controller_client = conf.build_client();
|
||||
let body = summary.build_health_update_request();
|
||||
controller_client
|
||||
.dispatch::<MetadataHealthUpdateRequest, MetadataHealthUpdateResponse>(
|
||||
Method::POST,
|
||||
"control/v1/metadata_health/update".to_string(),
|
||||
Some(body),
|
||||
)
|
||||
.await?;
|
||||
}
|
||||
}
|
||||
|
||||
if summary.is_fatal() {
|
||||
tracing::error!("Fatal scrub errors detected");
|
||||
} else if summary.is_empty() {
|
||||
// Strictly speaking an empty bucket is a valid bucket, but if someone ran the
|
||||
// scrubber they were likely expecting to scan something, and if we see no timelines
|
||||
// at all then it's likely due to some configuration issues like a bad prefix
|
||||
tracing::error!(
|
||||
"No timelines found in bucket {} prefix {}",
|
||||
bucket_config.bucket,
|
||||
bucket_config
|
||||
.prefix_in_bucket
|
||||
.unwrap_or("<none>".to_string())
|
||||
);
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
Command::FindGarbage {
|
||||
@@ -228,14 +254,31 @@ async fn main() -> anyhow::Result<()> {
|
||||
min_age,
|
||||
mode,
|
||||
} => {
|
||||
pageserver_physical_gc_cmd(
|
||||
&bucket_config,
|
||||
controller_client.as_ref(),
|
||||
match (&controller_client_conf, mode) {
|
||||
(Some(_), _) => {
|
||||
// Any mode may run when controller API is set
|
||||
}
|
||||
(None, GcMode::Full) => {
|
||||
// The part of physical GC where we erase ancestor layers cannot be done safely without
|
||||
// confirming the most recent complete shard split with the controller. Refuse to run, rather
|
||||
// than doing it unsafely.
|
||||
return Err(anyhow!("Full physical GC requires `--controller-api` and `--controller-jwt` to run"));
|
||||
}
|
||||
(None, GcMode::DryRun | GcMode::IndicesOnly) => {
|
||||
// These GcModes do not require the controller to run.
|
||||
}
|
||||
}
|
||||
|
||||
let summary = pageserver_physical_gc(
|
||||
bucket_config,
|
||||
controller_client_conf,
|
||||
tenant_ids,
|
||||
min_age,
|
||||
min_age.into(),
|
||||
mode,
|
||||
)
|
||||
.await
|
||||
.await?;
|
||||
println!("{}", serde_json::to_string(&summary).unwrap());
|
||||
Ok(())
|
||||
}
|
||||
Command::FindLargeObjects {
|
||||
min_size,
|
||||
@@ -252,142 +295,5 @@ async fn main() -> anyhow::Result<()> {
|
||||
println!("{}", serde_json::to_string(&summary).unwrap());
|
||||
Ok(())
|
||||
}
|
||||
Command::CronJob {
|
||||
gc_min_age,
|
||||
gc_mode,
|
||||
post_to_storcon,
|
||||
} => {
|
||||
run_cron_job(
|
||||
bucket_config,
|
||||
controller_client.as_ref(),
|
||||
gc_min_age,
|
||||
gc_mode,
|
||||
post_to_storcon,
|
||||
)
|
||||
.await
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Runs the scrubber cron job.
|
||||
/// 1. Do pageserver physical gc
|
||||
/// 2. Scan pageserver metadata
|
||||
pub async fn run_cron_job(
|
||||
bucket_config: BucketConfig,
|
||||
controller_client: Option<&control_api::Client>,
|
||||
gc_min_age: humantime::Duration,
|
||||
gc_mode: GcMode,
|
||||
post_to_storcon: bool,
|
||||
) -> anyhow::Result<()> {
|
||||
tracing::info!(%gc_min_age, %gc_mode, "Running pageserver-physical-gc");
|
||||
pageserver_physical_gc_cmd(
|
||||
&bucket_config,
|
||||
controller_client,
|
||||
Vec::new(),
|
||||
gc_min_age,
|
||||
gc_mode,
|
||||
)
|
||||
.await?;
|
||||
tracing::info!(%post_to_storcon, node_kind = %NodeKind::Pageserver, "Running scan-metadata");
|
||||
scan_pageserver_metadata_cmd(
|
||||
bucket_config,
|
||||
controller_client,
|
||||
Vec::new(),
|
||||
true,
|
||||
post_to_storcon,
|
||||
)
|
||||
.await?;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub async fn pageserver_physical_gc_cmd(
|
||||
bucket_config: &BucketConfig,
|
||||
controller_client: Option<&control_api::Client>,
|
||||
tenant_shard_ids: Vec<TenantShardId>,
|
||||
min_age: humantime::Duration,
|
||||
mode: GcMode,
|
||||
) -> anyhow::Result<()> {
|
||||
match (controller_client, mode) {
|
||||
(Some(_), _) => {
|
||||
// Any mode may run when controller API is set
|
||||
}
|
||||
(None, GcMode::Full) => {
|
||||
// The part of physical GC where we erase ancestor layers cannot be done safely without
|
||||
// confirming the most recent complete shard split with the controller. Refuse to run, rather
|
||||
// than doing it unsafely.
|
||||
return Err(anyhow!(
|
||||
"Full physical GC requires `--controller-api` and `--controller-jwt` to run"
|
||||
));
|
||||
}
|
||||
(None, GcMode::DryRun | GcMode::IndicesOnly) => {
|
||||
// These GcModes do not require the controller to run.
|
||||
}
|
||||
}
|
||||
|
||||
let summary = pageserver_physical_gc(
|
||||
bucket_config,
|
||||
controller_client,
|
||||
tenant_shard_ids,
|
||||
min_age.into(),
|
||||
mode,
|
||||
)
|
||||
.await?;
|
||||
println!("{}", serde_json::to_string(&summary).unwrap());
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub async fn scan_pageserver_metadata_cmd(
|
||||
bucket_config: BucketConfig,
|
||||
controller_client: Option<&control_api::Client>,
|
||||
tenant_shard_ids: Vec<TenantShardId>,
|
||||
json: bool,
|
||||
post_to_storcon: bool,
|
||||
) -> anyhow::Result<()> {
|
||||
if controller_client.is_none() && post_to_storcon {
|
||||
return Err(anyhow!("Posting pageserver scan health status to storage controller requires `--controller-api` and `--controller-jwt` to run"));
|
||||
}
|
||||
match scan_pageserver_metadata(bucket_config.clone(), tenant_shard_ids).await {
|
||||
Err(e) => {
|
||||
tracing::error!("Failed: {e}");
|
||||
Err(e)
|
||||
}
|
||||
Ok(summary) => {
|
||||
if json {
|
||||
println!("{}", serde_json::to_string(&summary).unwrap())
|
||||
} else {
|
||||
println!("{}", summary.summary_string());
|
||||
}
|
||||
|
||||
if post_to_storcon {
|
||||
if let Some(client) = controller_client {
|
||||
let body = summary.build_health_update_request();
|
||||
client
|
||||
.dispatch::<MetadataHealthUpdateRequest, MetadataHealthUpdateResponse>(
|
||||
Method::POST,
|
||||
"control/v1/metadata_health/update".to_string(),
|
||||
Some(body),
|
||||
)
|
||||
.await?;
|
||||
}
|
||||
}
|
||||
|
||||
if summary.is_fatal() {
|
||||
tracing::error!("Fatal scrub errors detected");
|
||||
} else if summary.is_empty() {
|
||||
// Strictly speaking an empty bucket is a valid bucket, but if someone ran the
|
||||
// scrubber they were likely expecting to scan something, and if we see no timelines
|
||||
// at all then it's likely due to some configuration issues like a bad prefix
|
||||
tracing::error!(
|
||||
"No timelines found in bucket {} prefix {}",
|
||||
bucket_config.bucket,
|
||||
bucket_config
|
||||
.prefix_in_bucket
|
||||
.unwrap_or("<none>".to_string())
|
||||
);
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -4,7 +4,9 @@ use std::time::{Duration, SystemTime};
|
||||
|
||||
use crate::checks::{list_timeline_blobs, BlobDataParseResult};
|
||||
use crate::metadata_stream::{stream_tenant_timelines, stream_tenants};
|
||||
use crate::{init_remote, BucketConfig, NodeKind, RootTarget, TenantShardTimelineId};
|
||||
use crate::{
|
||||
init_remote, BucketConfig, ControllerClientConfig, NodeKind, RootTarget, TenantShardTimelineId,
|
||||
};
|
||||
use aws_sdk_s3::Client;
|
||||
use futures_util::{StreamExt, TryStreamExt};
|
||||
use pageserver::tenant::remote_timeline_client::index::LayerFileMetadata;
|
||||
@@ -471,8 +473,8 @@ async fn gc_ancestor(
|
||||
/// This type of GC is not necessary for correctness: rather it serves to reduce wasted storage capacity, and
|
||||
/// make sure that object listings don't get slowed down by large numbers of garbage objects.
|
||||
pub async fn pageserver_physical_gc(
|
||||
bucket_config: &BucketConfig,
|
||||
controller_client: Option<&control_api::Client>,
|
||||
bucket_config: BucketConfig,
|
||||
controller_client_conf: Option<ControllerClientConfig>,
|
||||
tenant_shard_ids: Vec<TenantShardId>,
|
||||
min_age: Duration,
|
||||
mode: GcMode,
|
||||
@@ -556,7 +558,7 @@ pub async fn pageserver_physical_gc(
|
||||
let timelines = timelines.map_ok(|ttid| {
|
||||
gc_timeline(
|
||||
&s3_client,
|
||||
bucket_config,
|
||||
&bucket_config,
|
||||
&min_age,
|
||||
&target,
|
||||
mode,
|
||||
@@ -572,7 +574,7 @@ pub async fn pageserver_physical_gc(
|
||||
}
|
||||
|
||||
// Execute cross-shard GC, using the accumulator's full view of all the shards built in the per-shard GC
|
||||
let Some(client) = controller_client else {
|
||||
let Some(controller_client) = controller_client_conf.map(|c| c.build_client()) else {
|
||||
tracing::info!("Skipping ancestor layer GC, because no `--controller-api` was specified");
|
||||
return Ok(summary);
|
||||
};
|
||||
@@ -581,13 +583,13 @@ pub async fn pageserver_physical_gc(
|
||||
.unwrap()
|
||||
.into_inner()
|
||||
.unwrap()
|
||||
.into_gc_ancestors(client, &mut summary)
|
||||
.into_gc_ancestors(&controller_client, &mut summary)
|
||||
.await;
|
||||
|
||||
for ancestor_shard in ancestor_shards {
|
||||
gc_ancestor(
|
||||
&s3_client,
|
||||
bucket_config,
|
||||
&bucket_config,
|
||||
&target,
|
||||
&min_age,
|
||||
ancestor_shard,
|
||||
|
||||
@@ -116,7 +116,7 @@ Index versions: {version_summary}
|
||||
}
|
||||
|
||||
/// Scan the pageserver metadata in an S3 bucket, reporting errors and statistics.
|
||||
pub async fn scan_pageserver_metadata(
|
||||
pub async fn scan_metadata(
|
||||
bucket_config: BucketConfig,
|
||||
tenant_ids: Vec<TenantShardId>,
|
||||
) -> anyhow::Result<MetadataSummary> {
|
||||
|
||||
@@ -3,7 +3,6 @@ pytest_plugins = (
|
||||
"fixtures.parametrize",
|
||||
"fixtures.httpserver",
|
||||
"fixtures.compute_reconfigure",
|
||||
"fixtures.storage_controller_proxy",
|
||||
"fixtures.neon_fixtures",
|
||||
"fixtures.benchmark_fixture",
|
||||
"fixtures.pg_stats",
|
||||
|
||||
@@ -497,7 +497,6 @@ class NeonEnvBuilder:
|
||||
pageserver_aux_file_policy: Optional[AuxFileStore] = None,
|
||||
pageserver_default_tenant_config_compaction_algorithm: Optional[Dict[str, Any]] = None,
|
||||
safekeeper_extra_opts: Optional[list[str]] = None,
|
||||
storage_controller_port_override: Optional[int] = None,
|
||||
):
|
||||
self.repo_dir = repo_dir
|
||||
self.rust_log_override = rust_log_override
|
||||
@@ -550,8 +549,6 @@ class NeonEnvBuilder:
|
||||
|
||||
self.safekeeper_extra_opts = safekeeper_extra_opts
|
||||
|
||||
self.storage_controller_port_override = storage_controller_port_override
|
||||
|
||||
assert test_name.startswith(
|
||||
"test_"
|
||||
), "Unexpectedly instantiated from outside a test function"
|
||||
@@ -1057,7 +1054,6 @@ class NeonEnv:
|
||||
"""
|
||||
|
||||
BASE_PAGESERVER_ID = 1
|
||||
storage_controller: NeonStorageController | NeonProxiedStorageController
|
||||
|
||||
def __init__(self, config: NeonEnvBuilder):
|
||||
self.repo_dir = config.repo_dir
|
||||
@@ -1088,41 +1084,27 @@ class NeonEnv:
|
||||
self.initial_tenant = config.initial_tenant
|
||||
self.initial_timeline = config.initial_timeline
|
||||
|
||||
# Find two adjacent ports for storage controller and its postgres DB. This
|
||||
# loop would eventually throw from get_port() if we run out of ports (extremely
|
||||
# unlikely): usually we find two adjacent free ports on the first iteration.
|
||||
while True:
|
||||
self.storage_controller_port = self.port_distributor.get_port()
|
||||
storage_controller_pg_port = self.port_distributor.get_port()
|
||||
if storage_controller_pg_port == self.storage_controller_port + 1:
|
||||
break
|
||||
|
||||
# The URL for the pageserver to use as its control_plane_api config
|
||||
if config.storage_controller_port_override is not None:
|
||||
log.info(
|
||||
f"Using storage controller api override {config.storage_controller_port_override}"
|
||||
)
|
||||
|
||||
self.storage_controller_port = config.storage_controller_port_override
|
||||
self.storage_controller = NeonProxiedStorageController(
|
||||
self, config.storage_controller_port_override, config.auth_enabled
|
||||
)
|
||||
else:
|
||||
# Find two adjacent ports for storage controller and its postgres DB. This
|
||||
# loop would eventually throw from get_port() if we run out of ports (extremely
|
||||
# unlikely): usually we find two adjacent free ports on the first iteration.
|
||||
while True:
|
||||
storage_controller_port = self.port_distributor.get_port()
|
||||
storage_controller_pg_port = self.port_distributor.get_port()
|
||||
if storage_controller_pg_port == storage_controller_port + 1:
|
||||
break
|
||||
|
||||
self.storage_controller_port = storage_controller_port
|
||||
self.storage_controller = NeonStorageController(
|
||||
self, storage_controller_port, config.auth_enabled
|
||||
)
|
||||
|
||||
log.info(
|
||||
f"Using generated control_plane_api: {self.storage_controller.upcall_api_endpoint()}"
|
||||
)
|
||||
|
||||
self.storage_controller_api: str = self.storage_controller.api_root()
|
||||
self.control_plane_api: str = self.storage_controller.upcall_api_endpoint()
|
||||
self.control_plane_api: str = f"http://127.0.0.1:{self.storage_controller_port}/upcall/v1"
|
||||
# The base URL of the storage controller
|
||||
self.storage_controller_api: str = f"http://127.0.0.1:{self.storage_controller_port}"
|
||||
|
||||
# For testing this with a fake HTTP server, enable passing through a URL from config
|
||||
self.control_plane_compute_hook_api = config.control_plane_compute_hook_api
|
||||
|
||||
self.storage_controller: NeonStorageController = NeonStorageController(
|
||||
self, config.auth_enabled
|
||||
)
|
||||
|
||||
self.pageserver_virtual_file_io_engine = config.pageserver_virtual_file_io_engine
|
||||
self.pageserver_aux_file_policy = config.pageserver_aux_file_policy
|
||||
|
||||
@@ -1162,6 +1144,7 @@ class NeonEnv:
|
||||
"listen_http_addr": f"localhost:{pageserver_port.http}",
|
||||
"pg_auth_type": pg_auth_type,
|
||||
"http_auth_type": http_auth_type,
|
||||
"image_compression": "zstd",
|
||||
}
|
||||
if self.pageserver_virtual_file_io_engine is not None:
|
||||
ps_cfg["virtual_file_io_engine"] = self.pageserver_virtual_file_io_engine
|
||||
@@ -1886,24 +1869,16 @@ class NeonCli(AbstractNeonCli):
|
||||
def storage_controller_start(
|
||||
self,
|
||||
timeout_in_seconds: Optional[int] = None,
|
||||
instance_id: Optional[int] = None,
|
||||
base_port: Optional[int] = None,
|
||||
):
|
||||
cmd = ["storage_controller", "start"]
|
||||
if timeout_in_seconds is not None:
|
||||
cmd.append(f"--start-timeout={timeout_in_seconds}s")
|
||||
if instance_id is not None:
|
||||
cmd.append(f"--instance-id={instance_id}")
|
||||
if base_port is not None:
|
||||
cmd.append(f"--base-port={base_port}")
|
||||
return self.raw_cli(cmd)
|
||||
|
||||
def storage_controller_stop(self, immediate: bool, instance_id: Optional[int] = None):
|
||||
def storage_controller_stop(self, immediate: bool):
|
||||
cmd = ["storage_controller", "stop"]
|
||||
if immediate:
|
||||
cmd.extend(["-m", "immediate"])
|
||||
if instance_id is not None:
|
||||
cmd.append(f"--instance-id={instance_id}")
|
||||
return self.raw_cli(cmd)
|
||||
|
||||
def pageserver_start(
|
||||
@@ -2214,30 +2189,17 @@ class PageserverSchedulingPolicy(str, Enum):
PAUSE_FOR_RESTART = "PauseForRestart"


class StorageControllerLeadershipStatus(str, Enum):
LEADER = "leader"
STEPPED_DOWN = "stepped_down"
CANDIDATE = "candidate"


class NeonStorageController(MetricsGetter, LogUtils):
def __init__(self, env: NeonEnv, port: int, auth_enabled: bool):
def __init__(self, env: NeonEnv, auth_enabled: bool):
self.env = env
self.port: int = port
self.api: str = f"http://127.0.0.1:{port}"
self.running = False
self.auth_enabled = auth_enabled
self.allowed_errors: list[str] = DEFAULT_STORAGE_CONTROLLER_ALLOWED_ERRORS
self.logfile = self.env.repo_dir / "storage_controller_1" / "storage_controller.log"
self.logfile = self.workdir / "storage_controller.log"

def start(
self,
timeout_in_seconds: Optional[int] = None,
instance_id: Optional[int] = None,
base_port: Optional[int] = None,
):
def start(self, timeout_in_seconds: Optional[int] = None):
assert not self.running
self.env.neon_cli.storage_controller_start(timeout_in_seconds, instance_id, base_port)
self.env.neon_cli.storage_controller_start(timeout_in_seconds)
self.running = True
return self

@@ -2247,12 +2209,6 @@ class NeonStorageController(MetricsGetter, LogUtils):
self.running = False
return self

def upcall_api_endpoint(self) -> str:
return f"{self.api}/upcall/v1"

def api_root(self) -> str:
return self.api

@staticmethod
def retryable_node_operation(op, ps_id, max_attempts, backoff):
while max_attempts > 0:
@@ -2281,9 +2237,7 @@ class NeonStorageController(MetricsGetter, LogUtils):

def assert_no_errors(self):
assert_no_errors(
self.logfile,
"storage_controller",
self.allowed_errors,
self.env.repo_dir / "storage_controller.log", "storage_controller", self.allowed_errors
)

def pageserver_api(self) -> PageserverHttpClient:
@@ -2295,7 +2249,7 @@ class NeonStorageController(MetricsGetter, LogUtils):
auth_token = None
if self.auth_enabled:
auth_token = self.env.auth_keys.generate_token(scope=TokenScope.PAGE_SERVER_API)
return PageserverHttpClient(self.port, lambda: True, auth_token)
return PageserverHttpClient(self.env.storage_controller_port, lambda: True, auth_token)

def request(self, method, *args, **kwargs) -> requests.Response:
resp = requests.request(method, *args, **kwargs)
@@ -2312,13 +2266,13 @@ class NeonStorageController(MetricsGetter, LogUtils):
return headers

def get_metrics(self) -> Metrics:
res = self.request("GET", f"{self.api}/metrics")
res = self.request("GET", f"{self.env.storage_controller_api}/metrics")
return parse_metrics(res.text)

def ready(self) -> bool:
status = None
try:
resp = self.request("GET", f"{self.api}/ready")
resp = self.request("GET", f"{self.env.storage_controller_api}/ready")
status = resp.status_code
except StorageControllerApiException as e:
status = e.status_code
@@ -2351,7 +2305,7 @@ class NeonStorageController(MetricsGetter, LogUtils):

response = self.request(
"POST",
f"{self.api}/debug/v1/attach-hook",
f"{self.env.storage_controller_api}/debug/v1/attach-hook",
json=body,
headers=self.headers(TokenScope.ADMIN),
)
@@ -2362,7 +2316,7 @@ class NeonStorageController(MetricsGetter, LogUtils):
def attach_hook_drop(self, tenant_shard_id: Union[TenantId, TenantShardId]):
self.request(
"POST",
f"{self.api}/debug/v1/attach-hook",
f"{self.env.storage_controller_api}/debug/v1/attach-hook",
json={"tenant_shard_id": str(tenant_shard_id), "node_id": None},
headers=self.headers(TokenScope.ADMIN),
)
@@ -2373,7 +2327,7 @@ class NeonStorageController(MetricsGetter, LogUtils):
"""
response = self.request(
"POST",
f"{self.api}/debug/v1/inspect",
f"{self.env.storage_controller_api}/debug/v1/inspect",
json={"tenant_shard_id": str(tenant_shard_id)},
headers=self.headers(TokenScope.ADMIN),
)
@@ -2396,7 +2350,7 @@ class NeonStorageController(MetricsGetter, LogUtils):
log.info(f"node_register({body})")
self.request(
"POST",
f"{self.api}/control/v1/node",
f"{self.env.storage_controller_api}/control/v1/node",
json=body,
headers=self.headers(TokenScope.ADMIN),
)
@@ -2405,7 +2359,7 @@ class NeonStorageController(MetricsGetter, LogUtils):
log.info(f"node_delete({node_id})")
self.request(
"DELETE",
f"{self.api}/control/v1/node/{node_id}",
f"{self.env.storage_controller_api}/control/v1/node/{node_id}",
headers=self.headers(TokenScope.ADMIN),
)

@@ -2413,7 +2367,7 @@ class NeonStorageController(MetricsGetter, LogUtils):
log.info(f"node_drain({node_id})")
self.request(
"PUT",
f"{self.api}/control/v1/node/{node_id}/drain",
f"{self.env.storage_controller_api}/control/v1/node/{node_id}/drain",
headers=self.headers(TokenScope.ADMIN),
)

@@ -2421,7 +2375,7 @@ class NeonStorageController(MetricsGetter, LogUtils):
log.info(f"cancel_node_drain({node_id})")
self.request(
"DELETE",
f"{self.api}/control/v1/node/{node_id}/drain",
f"{self.env.storage_controller_api}/control/v1/node/{node_id}/drain",
headers=self.headers(TokenScope.ADMIN),
)

@@ -2429,7 +2383,7 @@ class NeonStorageController(MetricsGetter, LogUtils):
log.info(f"node_fill({node_id})")
self.request(
"PUT",
f"{self.api}/control/v1/node/{node_id}/fill",
f"{self.env.storage_controller_api}/control/v1/node/{node_id}/fill",
headers=self.headers(TokenScope.ADMIN),
)

@@ -2437,22 +2391,14 @@ class NeonStorageController(MetricsGetter, LogUtils):
log.info(f"cancel_node_fill({node_id})")
self.request(
"DELETE",
f"{self.api}/control/v1/node/{node_id}/fill",
f"{self.env.storage_controller_api}/control/v1/node/{node_id}/fill",
headers=self.headers(TokenScope.ADMIN),
)

def node_status(self, node_id):
response = self.request(
"GET",
f"{self.api}/control/v1/node/{node_id}",
headers=self.headers(TokenScope.ADMIN),
)
return response.json()

def get_leader(self):
response = self.request(
"GET",
f"{self.api}/control/v1/leader",
f"{self.env.storage_controller_api}/control/v1/node/{node_id}",
headers=self.headers(TokenScope.ADMIN),
)
return response.json()
@@ -2460,7 +2406,7 @@ class NeonStorageController(MetricsGetter, LogUtils):
def node_list(self):
response = self.request(
"GET",
f"{self.api}/control/v1/node",
f"{self.env.storage_controller_api}/control/v1/node",
headers=self.headers(TokenScope.ADMIN),
)
return response.json()
@@ -2468,7 +2414,7 @@ class NeonStorageController(MetricsGetter, LogUtils):
def tenant_list(self):
response = self.request(
"GET",
f"{self.api}/debug/v1/tenant",
f"{self.env.storage_controller_api}/debug/v1/tenant",
headers=self.headers(TokenScope.ADMIN),
)
return response.json()
@@ -2478,7 +2424,7 @@ class NeonStorageController(MetricsGetter, LogUtils):
body["node_id"] = node_id
self.request(
"PUT",
f"{self.api}/control/v1/node/{node_id}/config",
f"{self.env.storage_controller_api}/control/v1/node/{node_id}/config",
json=body,
headers=self.headers(TokenScope.ADMIN),
)
@@ -2513,7 +2459,7 @@ class NeonStorageController(MetricsGetter, LogUtils):

response = self.request(
"POST",
f"{self.api}/v1/tenant",
f"{self.env.storage_controller_api}/v1/tenant",
json=body,
headers=self.headers(TokenScope.PAGE_SERVER_API),
)
@@ -2526,7 +2472,7 @@ class NeonStorageController(MetricsGetter, LogUtils):
"""
response = self.request(
"GET",
f"{self.api}/debug/v1/tenant/{tenant_id}/locate",
f"{self.env.storage_controller_api}/debug/v1/tenant/{tenant_id}/locate",
headers=self.headers(TokenScope.ADMIN),
)
body = response.json()
@@ -2539,7 +2485,7 @@ class NeonStorageController(MetricsGetter, LogUtils):
"""
response = self.request(
"GET",
f"{self.api}/control/v1/tenant/{tenant_id}",
f"{self.env.storage_controller_api}/control/v1/tenant/{tenant_id}",
headers=self.headers(TokenScope.ADMIN),
)
response.raise_for_status()
@@ -2550,7 +2496,7 @@ class NeonStorageController(MetricsGetter, LogUtils):
) -> list[TenantShardId]:
response = self.request(
"PUT",
f"{self.api}/control/v1/tenant/{tenant_id}/shard_split",
f"{self.env.storage_controller_api}/control/v1/tenant/{tenant_id}/shard_split",
json={"new_shard_count": shard_count, "new_stripe_size": shard_stripe_size},
headers=self.headers(TokenScope.ADMIN),
)
@@ -2562,7 +2508,7 @@ class NeonStorageController(MetricsGetter, LogUtils):
def tenant_shard_migrate(self, tenant_shard_id: TenantShardId, dest_ps_id: int):
self.request(
"PUT",
f"{self.api}/control/v1/tenant/{tenant_shard_id}/migrate",
f"{self.env.storage_controller_api}/control/v1/tenant/{tenant_shard_id}/migrate",
json={"tenant_shard_id": str(tenant_shard_id), "node_id": dest_ps_id},
headers=self.headers(TokenScope.ADMIN),
)
@@ -2573,7 +2519,7 @@ class NeonStorageController(MetricsGetter, LogUtils):
log.info(f"tenant_policy_update({tenant_id}, {body})")
self.request(
"PUT",
f"{self.api}/control/v1/tenant/{tenant_id}/policy",
f"{self.env.storage_controller_api}/control/v1/tenant/{tenant_id}/policy",
json=body,
headers=self.headers(TokenScope.ADMIN),
)
@@ -2581,14 +2527,14 @@ class NeonStorageController(MetricsGetter, LogUtils):
def tenant_import(self, tenant_id: TenantId):
self.request(
"POST",
f"{self.api}/debug/v1/tenant/{tenant_id}/import",
f"{self.env.storage_controller_api}/debug/v1/tenant/{tenant_id}/import",
headers=self.headers(TokenScope.ADMIN),
)

def reconcile_all(self):
r = self.request(
"POST",
f"{self.api}/debug/v1/reconcile_all",
f"{self.env.storage_controller_api}/debug/v1/reconcile_all",
headers=self.headers(TokenScope.ADMIN),
)
r.raise_for_status()
@@ -2621,7 +2567,7 @@ class NeonStorageController(MetricsGetter, LogUtils):
"""
self.request(
"POST",
f"{self.api}/debug/v1/consistency_check",
f"{self.env.storage_controller_api}/debug/v1/consistency_check",
headers=self.headers(TokenScope.ADMIN),
)
log.info("storage controller passed consistency check")
@@ -2694,7 +2640,7 @@ class NeonStorageController(MetricsGetter, LogUtils):

self.request(
"POST",
f"{self.api}/control/v1/metadata_health/update",
f"{self.env.storage_controller_api}/control/v1/metadata_health/update",
json=body,
headers=self.headers(TokenScope.SCRUBBER),
)
@@ -2702,7 +2648,7 @@ class NeonStorageController(MetricsGetter, LogUtils):
def metadata_health_list_unhealthy(self):
response = self.request(
"GET",
f"{self.api}/control/v1/metadata_health/unhealthy",
f"{self.env.storage_controller_api}/control/v1/metadata_health/unhealthy",
headers=self.headers(TokenScope.ADMIN),
)
return response.json()
@@ -2712,7 +2658,7 @@ class NeonStorageController(MetricsGetter, LogUtils):

response = self.request(
"POST",
f"{self.api}/control/v1/metadata_health/outdated",
f"{self.env.storage_controller_api}/control/v1/metadata_health/outdated",
json=body,
headers=self.headers(TokenScope.ADMIN),
)
@@ -2735,7 +2681,7 @@ class NeonStorageController(MetricsGetter, LogUtils):
log.info("Asking storage controller to step down")
response = self.request(
"PUT",
f"{self.api}/control/v1/step_down",
f"{self.env.storage_controller_api}/control/v1/step_down",
headers=self.headers(TokenScope.ADMIN),
)

@@ -2752,7 +2698,7 @@ class NeonStorageController(MetricsGetter, LogUtils):

res = self.request(
"PUT",
f"{self.api}/debug/v1/failpoints",
f"{self.env.storage_controller_api}/debug/v1/failpoints",
json=[{"name": name, "actions": actions} for name, actions in pairs],
headers=self.headers(TokenScope.ADMIN),
)
@@ -2822,21 +2768,9 @@ class NeonStorageController(MetricsGetter, LogUtils):
parsed_tid, wait_ms=250
)

def get_leadership_status(self) -> StorageControllerLeadershipStatus:
metric_values = {}
for status in StorageControllerLeadershipStatus:
metric_value = self.get_metric_value(
"storage_controller_leadership_status", filter={"status": status}
)
metric_values[status] = metric_value

assert list(metric_values.values()).count(1) == 1

for status, metric_value in metric_values.items():
if metric_value == 1:
return status

raise AssertionError("unreachable")
@property
def workdir(self) -> Path:
return self.env.repo_dir

def __enter__(self) -> "NeonStorageController":
return self
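The metric-derived leadership status above is normally consumed by polling it until the expected state is reached, as the leadership-transfer test further down does with wait_until(retries, interval, fn). A minimal illustrative helper, with a hypothetical name, might look like:

def wait_for_leadership(storage_controller, expected, retries=15, interval=1):
    # Re-check the storage_controller_leadership_status metric until it reports
    # the expected state; wait_until retries the failing assertion for us.
    def status_matches():
        assert storage_controller.get_leadership_status() == expected

    wait_until(retries, interval, status_matches)
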
@@ -2850,59 +2784,6 @@ class NeonStorageController(MetricsGetter, LogUtils):
self.stop(immediate=True)


class NeonProxiedStorageController(NeonStorageController):
def __init__(self, env: NeonEnv, proxy_port: int, auth_enabled: bool):
super(NeonProxiedStorageController, self).__init__(env, proxy_port, auth_enabled)
self.instances: dict[int, dict[str, Any]] = {}

def start(
self,
timeout_in_seconds: Optional[int] = None,
instance_id: Optional[int] = None,
base_port: Optional[int] = None,
):
assert instance_id is not None and base_port is not None

self.env.neon_cli.storage_controller_start(timeout_in_seconds, instance_id, base_port)
self.instances[instance_id] = {"running": True}

self.running = True
return self

def stop_instance(
self, immediate: bool = False, instance_id: Optional[int] = None
) -> "NeonStorageController":
assert instance_id in self.instances
if self.instances[instance_id]["running"]:
self.env.neon_cli.storage_controller_stop(immediate, instance_id)
self.instances[instance_id]["running"] = False

self.running = any(meta["running"] for meta in self.instances.values())
return self

def stop(self, immediate: bool = False) -> "NeonStorageController":
for iid, details in self.instances.items():
if details["running"]:
self.env.neon_cli.storage_controller_stop(immediate, iid)
self.instances[iid]["running"] = False

self.running = False
return self

def assert_no_errors(self):
for instance_id in self.instances.keys():
assert_no_errors(
self.env.repo_dir / f"storage_controller_{instance_id}" / "storage_controller.log",
"storage_controller",
self.allowed_errors,
)

def log_contains(
self, pattern: str, offset: None | LogCursor = None
) -> Optional[Tuple[str, LogCursor]]:
raise NotImplementedError()


@dataclass
class LogCursor:
_line_no: int
@@ -4639,7 +4520,7 @@ class StorageScrubber:

base_args = [
str(self.env.neon_binpath / "storage_scrubber"),
f"--controller-api={self.env.storage_controller.api_root()}",
f"--controller-api={self.env.storage_controller_api}",
]
args = base_args + args


@@ -1,73 +0,0 @@
import re
from typing import Any, Optional

import pytest
import requests
from pytest_httpserver import HTTPServer
from werkzeug.datastructures import Headers
from werkzeug.wrappers.request import Request
from werkzeug.wrappers.response import Response

from fixtures.log_helper import log


class StorageControllerProxy:
def __init__(self, server: HTTPServer):
self.server: HTTPServer = server
self.listen: str = f"http://{server.host}:{server.port}"
self.routing_to: Optional[str] = None

def route_to(self, storage_controller_api: str):
self.routing_to = storage_controller_api

def port(self) -> int:
return self.server.port

def upcall_api_endpoint(self) -> str:
return f"{self.listen}/upcall/v1"


def proxy_request(method: str, url: str, **kwargs) -> requests.Response:
return requests.request(method, url, **kwargs)


@pytest.fixture(scope="function")
def storage_controller_proxy(make_httpserver):
"""
Proxies requests into the storage controller to the currently
selected storage controller instance via `StorageControllerProxy.route_to`.

This fixture is intended for tests that need to run multiple instances
of the storage controller at the same time.
"""
server = make_httpserver

self = StorageControllerProxy(server)

log.info(f"Storage controller proxy listening on {self.listen}")

def handler(request: Request):
if self.routing_to is None:
log.info(f"Storage controller proxy has no routing configured for {request.url}")
return Response("Routing not configured", status=503)

route_to_url = f"{self.routing_to}{request.path}"

log.info(f"Routing {request.url} to {route_to_url}")

args: dict[str, Any] = {"headers": request.headers}
if request.is_json:
args["json"] = request.json

response = proxy_request(request.method, route_to_url, **args)

headers = Headers()
for key, value in response.headers.items():
headers.add(key, value)

return Response(response.content, headers=headers, status=response.status_code)

self.server.expect_request(re.compile(".*")).respond_with_handler(handler)

yield self
server.clear()
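For context, a hypothetical test consuming this fixture looks roughly like the sketch below; it relies only on route_to() and the fixtures shown in this change, while the test name and port variables are illustrative:

def test_reroute_between_controllers_sketch(storage_controller_proxy, port_distributor):
    primary_port = port_distributor.get_port()
    secondary_port = port_distributor.get_port()

    # All traffic sent to the proxy's listen address now reaches the primary instance.
    storage_controller_proxy.route_to(f"http://127.0.0.1:{primary_port}")

    # ... start both controller instances and exercise the primary ...

    # Fail over: clients keep talking to the proxy while it targets the secondary.
    storage_controller_proxy.route_to(f"http://127.0.0.1:{secondary_port}")
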
@@ -403,7 +403,7 @@ def wait_until(
try:
res = func()
except Exception as e:
log.info("waiting for %s iteration %s failed: %s", func, i + 1, e)
log.info("waiting for %s iteration %s failed", func, i + 1)
last_exception = e
if show_intermediate_error:
log.info(e)

@@ -282,16 +282,15 @@ def test_snap_files(

env = benchmark_project_pub.pgbench_env
connstr = benchmark_project_pub.connstr
pg_bin.run_capture(["pgbench", "-i", "-s100"], env=env)

with psycopg2.connect(connstr) as conn:
conn.autocommit = True
with conn.cursor() as cur:
cur.execute("SELECT rolsuper FROM pg_roles WHERE rolname = 'neondb_owner'")
is_super = cur.fetchall()[0][0]
is_super = cur.fetchall()[0]
assert is_super, "This benchmark won't work if we don't have superuser"

pg_bin.run_capture(["pgbench", "-i", "-s100"], env=env)

conn = psycopg2.connect(connstr)
conn.autocommit = True
cur = conn.cursor()

@@ -1,7 +1,3 @@
import os
import random
import re
import subprocess
import threading
import time

@@ -21,17 +17,17 @@ def test_lfc_resize(neon_simple_env: NeonEnv, pg_bin: PgBin):
"test_lfc_resize",
config_lines=[
"neon.file_cache_path='file.cache'",
"neon.max_file_cache_size=512MB",
"neon.file_cache_size_limit=512MB",
"neon.max_file_cache_size=1GB",
"neon.file_cache_size_limit=1GB",
],
)
n_resize = 10
scale = 100
scale = 10

def run_pgbench(connstr: str):
log.info(f"Start a pgbench workload on pg {connstr}")
pg_bin.run_capture(["pgbench", "-i", f"-s{scale}", connstr])
pg_bin.run_capture(["pgbench", "-c10", f"-T{n_resize}", "-Mprepared", "-S", connstr])
pg_bin.run_capture(["pgbench", "-c4", f"-T{n_resize}", "-Mprepared", connstr])

thread = threading.Thread(target=run_pgbench, args=(endpoint.connstr(),), daemon=True)
thread.start()
@@ -39,21 +35,9 @@ def test_lfc_resize(neon_simple_env: NeonEnv, pg_bin: PgBin):
conn = endpoint.connect()
cur = conn.cursor()

for _ in range(n_resize):
size = random.randint(1, 512)
cur.execute(f"alter system set neon.file_cache_size_limit='{size}MB'")
for i in range(n_resize):
cur.execute(f"alter system set neon.file_cache_size_limit='{i*10}MB'")
cur.execute("select pg_reload_conf()")
time.sleep(1)

cur.execute("alter system set neon.file_cache_size_limit='100MB'")
cur.execute("select pg_reload_conf()")

thread.join()

lfc_file_path = f"{endpoint.pg_data_dir_path()}/file.cache"
lfc_file_size = os.path.getsize(lfc_file_path)
res = subprocess.run(["ls", "-sk", lfc_file_path], check=True, text=True, capture_output=True)
lfc_file_blocks = re.findall("([0-9A-F]+)", res.stdout)[0]
log.info(f"Size of LFC file {lfc_file_size}, blocks {lfc_file_blocks}")
assert lfc_file_size <= 512 * 1024 * 1024
assert int(lfc_file_blocks) <= 128 * 1024

@@ -1,4 +1,3 @@
import concurrent.futures
import json
import threading
import time
@@ -17,7 +16,6 @@ from fixtures.neon_fixtures import (
PageserverSchedulingPolicy,
PgBin,
StorageControllerApiException,
StorageControllerLeadershipStatus,
TokenScope,
last_flush_lsn_upload,
)
@@ -32,9 +30,7 @@ from fixtures.pageserver.utils import (
timeline_delete_wait_completed,
)
from fixtures.pg_version import PgVersion
from fixtures.port_distributor import PortDistributor
from fixtures.remote_storage import RemoteStorageKind, s3_storage
from fixtures.storage_controller_proxy import StorageControllerProxy
from fixtures.utils import run_pg_bench_small, subprocess_capture, wait_until
from fixtures.workload import Workload
from mypy_boto3_s3.type_defs import (
@@ -2097,131 +2093,6 @@ def test_storage_controller_step_down(neon_env_builder: NeonEnvBuilder):
)


# This is a copy of NeonEnv.start which injects the instance id and port
# into the call to NeonStorageController.start
def start_env(env: NeonEnv, storage_controller_port: int):
timeout_in_seconds = 30

# Storage controller starts first, so that pageserver /re-attach calls don't
# bounce through retries on startup
env.storage_controller.start(timeout_in_seconds, 1, storage_controller_port)

# Wait for storage controller readiness to prevent unnecessary post start-up
# reconcile.
env.storage_controller.wait_until_ready()

# Start up broker, pageserver and all safekeepers
futs = []
with concurrent.futures.ThreadPoolExecutor(
max_workers=2 + len(env.pageservers) + len(env.safekeepers)
) as executor:
futs.append(
executor.submit(lambda: env.broker.try_start() or None)
) # The `or None` is for the linter

for pageserver in env.pageservers:
futs.append(
executor.submit(
lambda ps=pageserver: ps.start(timeout_in_seconds=timeout_in_seconds)
)
)

for safekeeper in env.safekeepers:
futs.append(
executor.submit(
lambda sk=safekeeper: sk.start(timeout_in_seconds=timeout_in_seconds)
)
)

for f in futs:
f.result()


@pytest.mark.parametrize("step_down_times_out", [False, True])
def test_storage_controller_leadership_transfer(
neon_env_builder: NeonEnvBuilder,
storage_controller_proxy: StorageControllerProxy,
port_distributor: PortDistributor,
step_down_times_out: bool,
):
neon_env_builder.num_pageservers = 3

neon_env_builder.storage_controller_config = {
"database_url": f"127.0.0.1:{port_distributor.get_port()}",
"start_as_candidate": True,
}

neon_env_builder.storage_controller_port_override = storage_controller_proxy.port()

storage_controller_1_port = port_distributor.get_port()
storage_controller_2_port = port_distributor.get_port()

storage_controller_proxy.route_to(f"http://127.0.0.1:{storage_controller_1_port}")

env = neon_env_builder.init_configs()
start_env(env, storage_controller_1_port)

assert (
env.storage_controller.get_leadership_status() == StorageControllerLeadershipStatus.LEADER
)
leader = env.storage_controller.get_leader()
assert leader["address"] == f"http://127.0.0.1:{storage_controller_1_port}/"

if step_down_times_out:
env.storage_controller.configure_failpoints(
("sleep-on-step-down-handling", "return(10000)")
)
env.storage_controller.allowed_errors.append(".*request was dropped before completing.*")

tenant_count = 2
shard_count = 4
tenants = set(TenantId.generate() for _ in range(0, tenant_count))

for tid in tenants:
env.storage_controller.tenant_create(
tid, shard_count=shard_count, placement_policy={"Attached": 1}
)
env.storage_controller.reconcile_until_idle()

env.storage_controller.start(
timeout_in_seconds=30, instance_id=2, base_port=storage_controller_2_port
)

if not step_down_times_out:

def previous_stepped_down():
assert (
env.storage_controller.get_leadership_status()
== StorageControllerLeadershipStatus.STEPPED_DOWN
)

wait_until(5, 1, previous_stepped_down)

storage_controller_proxy.route_to(f"http://127.0.0.1:{storage_controller_2_port}")

def new_becomes_leader():
assert (
env.storage_controller.get_leadership_status()
== StorageControllerLeadershipStatus.LEADER
)

wait_until(15, 1, new_becomes_leader)
leader = env.storage_controller.get_leader()
assert leader["address"] == f"http://127.0.0.1:{storage_controller_2_port}/"

env.storage_controller.wait_until_ready()
env.storage_controller.consistency_check()

if step_down_times_out:
env.storage_controller.allowed_errors.extend(
[
".*Leader.*did not respond to step-down request.*",
".*Send step down request failed.*",
".*Send step down request still failed.*",
]
)


def test_storage_controller_ps_restarted_during_drain(neon_env_builder: NeonEnvBuilder):
# single unsharded tenant, two locations
neon_env_builder.num_pageservers = 2
