Compare commits

..

3 Commits

Author SHA1 Message Date
Conrad Ludgate
0e551edb06 rename fn 2024-08-16 08:54:25 +01:00
Conrad Ludgate
484cdccbf2 fix cancellation 2024-08-16 08:44:03 +01:00
Conrad Ludgate
39d1b78817 fix http connect config 2024-08-16 08:42:14 +01:00
30 changed files with 506 additions and 1634 deletions

View File

@@ -379,7 +379,7 @@ where
}
}
pub(crate) fn process_has_stopped(pid: Pid) -> anyhow::Result<bool> {
fn process_has_stopped(pid: Pid) -> anyhow::Result<bool> {
match kill(pid, None) {
// Process exists, keep waiting
Ok(_) => Ok(false),

View File

@@ -15,9 +15,7 @@ use control_plane::local_env::{
};
use control_plane::pageserver::PageServerNode;
use control_plane::safekeeper::SafekeeperNode;
use control_plane::storage_controller::{
NeonStorageControllerStartArgs, NeonStorageControllerStopArgs, StorageController,
};
use control_plane::storage_controller::StorageController;
use control_plane::{broker, local_env};
use pageserver_api::config::{
DEFAULT_HTTP_LISTEN_PORT as DEFAULT_PAGESERVER_HTTP_PORT,
@@ -1054,36 +1052,6 @@ fn get_start_timeout(args: &ArgMatches) -> &Duration {
humantime_duration.as_ref()
}
fn storage_controller_start_args(args: &ArgMatches) -> NeonStorageControllerStartArgs {
let maybe_instance_id = args.get_one::<u8>("instance-id");
let base_port = args.get_one::<u16>("base-port");
if maybe_instance_id.is_some() && base_port.is_none() {
panic!("storage-controller start specificied instance-id but did not provide base-port");
}
let start_timeout = args
.get_one::<humantime::Duration>("start-timeout")
.expect("invalid value for start-timeout");
NeonStorageControllerStartArgs {
instance_id: maybe_instance_id.copied().unwrap_or(1),
base_port: base_port.copied(),
start_timeout: *start_timeout,
}
}
fn storage_controller_stop_args(args: &ArgMatches) -> NeonStorageControllerStopArgs {
let maybe_instance_id = args.get_one::<u8>("instance-id");
let immediate = args.get_one::<String>("stop-mode").map(|s| s.as_str()) == Some("immediate");
NeonStorageControllerStopArgs {
instance_id: maybe_instance_id.copied().unwrap_or(1),
immediate,
}
}
async fn handle_pageserver(sub_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<()> {
match sub_match.subcommand() {
Some(("start", subcommand_args)) => {
@@ -1145,14 +1113,19 @@ async fn handle_storage_controller(
let svc = StorageController::from_env(env);
match sub_match.subcommand() {
Some(("start", start_match)) => {
if let Err(e) = svc.start(storage_controller_start_args(start_match)).await {
if let Err(e) = svc.start(get_start_timeout(start_match)).await {
eprintln!("start failed: {e}");
exit(1);
}
}
Some(("stop", stop_match)) => {
if let Err(e) = svc.stop(storage_controller_stop_args(stop_match)).await {
let immediate = stop_match
.get_one::<String>("stop-mode")
.map(|s| s.as_str())
== Some("immediate");
if let Err(e) = svc.stop(immediate).await {
eprintln!("stop failed: {}", e);
exit(1);
}
@@ -1255,12 +1228,7 @@ async fn handle_start_all(
// Only start the storage controller if the pageserver is configured to need it
if env.control_plane_api.is_some() {
let storage_controller = StorageController::from_env(env);
if let Err(e) = storage_controller
.start(NeonStorageControllerStartArgs::with_default_instance_id(
(*retry_timeout).into(),
))
.await
{
if let Err(e) = storage_controller.start(retry_timeout).await {
eprintln!("storage_controller start failed: {:#}", e);
try_stop_all(env, true).await;
exit(1);
@@ -1390,21 +1358,10 @@ async fn try_stop_all(env: &local_env::LocalEnv, immediate: bool) {
eprintln!("neon broker stop failed: {e:#}");
}
// Stop all storage controller instances. In the most common case there's only one,
// but iterate though the base data directory in order to discover the instances.
let storcon_instances = env
.storage_controller_instances()
.await
.expect("Must inspect data dir");
for (instance_id, _instance_dir_path) in storcon_instances {
if env.control_plane_api.is_some() {
let storage_controller = StorageController::from_env(env);
let stop_args = NeonStorageControllerStopArgs {
instance_id,
immediate,
};
if let Err(e) = storage_controller.stop(stop_args).await {
eprintln!("Storage controller instance {instance_id} stop failed: {e:#}");
if let Err(e) = storage_controller.stop(immediate).await {
eprintln!("storage controller stop failed: {e:#}");
}
}
}
@@ -1544,18 +1501,6 @@ fn cli() -> Command {
.action(ArgAction::SetTrue)
.required(false);
let instance_id = Arg::new("instance-id")
.long("instance-id")
.help("Identifier used to distinguish storage controller instances (default 1)")
.value_parser(value_parser!(u8))
.required(false);
let base_port = Arg::new("base-port")
.long("base-port")
.help("Base port for the storage controller instance idenfified by instance-id (defaults to pagserver cplane api)")
.value_parser(value_parser!(u16))
.required(false);
Command::new("Neon CLI")
.arg_required_else_help(true)
.version(GIT_VERSION)
@@ -1664,12 +1609,9 @@ fn cli() -> Command {
.arg_required_else_help(true)
.about("Manage storage_controller")
.subcommand(Command::new("start").about("Start storage controller")
.arg(timeout_arg.clone())
.arg(instance_id.clone())
.arg(base_port))
.arg(timeout_arg.clone()))
.subcommand(Command::new("stop").about("Stop storage controller")
.arg(stop_mode_arg.clone())
.arg(instance_id))
.arg(stop_mode_arg.clone()))
)
.subcommand(
Command::new("safekeeper")

View File

@@ -156,11 +156,6 @@ pub struct NeonStorageControllerConf {
#[serde(with = "humantime_serde")]
pub max_warming_up: Duration,
pub start_as_candidate: bool,
/// Database url used when running multiple storage controller instances
pub database_url: Option<SocketAddr>,
/// Threshold for auto-splitting a tenant into shards
pub split_threshold: Option<u64>,
@@ -179,8 +174,6 @@ impl Default for NeonStorageControllerConf {
Self {
max_offline: Self::DEFAULT_MAX_OFFLINE_INTERVAL,
max_warming_up: Self::DEFAULT_MAX_WARMING_UP_INTERVAL,
start_as_candidate: false,
database_url: None,
split_threshold: None,
max_secondary_lag_bytes: None,
}
@@ -399,36 +392,6 @@ impl LocalEnv {
}
}
/// Inspect the base data directory and extract the instance id and instance directory path
/// for all storage controller instances
pub async fn storage_controller_instances(&self) -> std::io::Result<Vec<(u8, PathBuf)>> {
let mut instances = Vec::default();
let dir = std::fs::read_dir(self.base_data_dir.clone())?;
for dentry in dir {
let dentry = dentry?;
let is_dir = dentry.metadata()?.is_dir();
let filename = dentry.file_name().into_string().unwrap();
let parsed_instance_id = match filename.strip_prefix("storage_controller_") {
Some(suffix) => suffix.parse::<u8>().ok(),
None => None,
};
let is_instance_dir = is_dir && parsed_instance_id.is_some();
if !is_instance_dir {
continue;
}
instances.push((
parsed_instance_id.expect("Checked previously"),
dentry.path(),
));
}
Ok(instances)
}
pub fn register_branch_mapping(
&mut self,
branch_name: String,

View File

@@ -3,8 +3,6 @@ use crate::{
local_env::{LocalEnv, NeonStorageControllerConf},
};
use camino::{Utf8Path, Utf8PathBuf};
use hyper::Uri;
use nix::unistd::Pid;
use pageserver_api::{
controller_api::{
NodeConfigureRequest, NodeDescribeResponse, NodeRegisterRequest, TenantCreateRequest,
@@ -20,7 +18,7 @@ use pageserver_client::mgmt_api::ResponseErrorMessageExt;
use postgres_backend::AuthType;
use reqwest::Method;
use serde::{de::DeserializeOwned, Deserialize, Serialize};
use std::{fs, net::SocketAddr, path::PathBuf, str::FromStr, sync::OnceLock};
use std::{fs, str::FromStr, time::Duration};
use tokio::process::Command;
use tracing::instrument;
use url::Url;
@@ -31,14 +29,12 @@ use utils::{
pub struct StorageController {
env: LocalEnv,
listen: String,
private_key: Option<Vec<u8>>,
public_key: Option<String>,
postgres_port: u16,
client: reqwest::Client,
config: NeonStorageControllerConf,
// The listen addresses is learned when starting the storage controller,
// hence the use of OnceLock to init it at the right time.
listen: OnceLock<SocketAddr>,
}
const COMMAND: &str = "storage_controller";
@@ -47,36 +43,6 @@ const STORAGE_CONTROLLER_POSTGRES_VERSION: u32 = 16;
const DB_NAME: &str = "storage_controller";
pub struct NeonStorageControllerStartArgs {
pub instance_id: u8,
pub base_port: Option<u16>,
pub start_timeout: humantime::Duration,
}
impl NeonStorageControllerStartArgs {
pub fn with_default_instance_id(start_timeout: humantime::Duration) -> Self {
Self {
instance_id: 1,
base_port: None,
start_timeout,
}
}
}
pub struct NeonStorageControllerStopArgs {
pub instance_id: u8,
pub immediate: bool,
}
impl NeonStorageControllerStopArgs {
pub fn with_default_instance_id(immediate: bool) -> Self {
Self {
instance_id: 1,
immediate,
}
}
}
#[derive(Serialize, Deserialize)]
pub struct AttachHookRequest {
pub tenant_shard_id: TenantShardId,
@@ -101,6 +67,23 @@ pub struct InspectResponse {
impl StorageController {
pub fn from_env(env: &LocalEnv) -> Self {
// Makes no sense to construct this if pageservers aren't going to use it: assume
// pageservers have control plane API set
let listen_url = env.control_plane_api.clone().unwrap();
let listen = format!(
"{}:{}",
listen_url.host_str().unwrap(),
listen_url.port().unwrap()
);
// Convention: NeonEnv in python tests reserves the next port after the control_plane_api
// port, for use by our captive postgres.
let postgres_port = listen_url
.port()
.expect("Control plane API setting should always have a port")
+ 1;
// Assume all pageservers have symmetric auth configuration: this service
// expects to use one JWT token to talk to all of them.
let ps_conf = env
@@ -143,28 +126,20 @@ impl StorageController {
Self {
env: env.clone(),
listen,
private_key,
public_key,
postgres_port,
client: reqwest::ClientBuilder::new()
.build()
.expect("Failed to construct http client"),
config: env.storage_controller.clone(),
listen: OnceLock::default(),
}
}
fn storage_controller_instance_dir(&self, instance_id: u8) -> PathBuf {
self.env
.base_data_dir
.join(format!("storage_controller_{}", instance_id))
}
fn pid_file(&self, instance_id: u8) -> Utf8PathBuf {
Utf8PathBuf::from_path_buf(
self.storage_controller_instance_dir(instance_id)
.join("storage_controller.pid"),
)
.expect("non-Unicode path")
fn pid_file(&self) -> Utf8PathBuf {
Utf8PathBuf::from_path_buf(self.env.base_data_dir.join("storage_controller.pid"))
.expect("non-Unicode path")
}
/// PIDFile for the postgres instance used to store storage controller state
@@ -209,9 +184,9 @@ impl StorageController {
}
/// Readiness check for our postgres process
async fn pg_isready(&self, pg_bin_dir: &Utf8Path, postgres_port: u16) -> anyhow::Result<bool> {
async fn pg_isready(&self, pg_bin_dir: &Utf8Path) -> anyhow::Result<bool> {
let bin_path = pg_bin_dir.join("pg_isready");
let args = ["-h", "localhost", "-p", &format!("{}", postgres_port)];
let args = ["-h", "localhost", "-p", &format!("{}", self.postgres_port)];
let exitcode = Command::new(bin_path).args(args).spawn()?.wait().await?;
Ok(exitcode.success())
@@ -224,8 +199,8 @@ impl StorageController {
/// who just want to run `cargo neon_local` without knowing about diesel.
///
/// Returns the database url
pub async fn setup_database(&self, postgres_port: u16) -> anyhow::Result<String> {
let database_url = format!("postgresql://localhost:{}/{DB_NAME}", postgres_port);
pub async fn setup_database(&self) -> anyhow::Result<String> {
let database_url = format!("postgresql://localhost:{}/{DB_NAME}", self.postgres_port);
let pg_bin_dir = self.get_pg_bin_dir().await?;
let createdb_path = pg_bin_dir.join("createdb");
@@ -234,7 +209,7 @@ impl StorageController {
"-h",
"localhost",
"-p",
&format!("{}", postgres_port),
&format!("{}", self.postgres_port),
DB_NAME,
])
.output()
@@ -255,14 +230,13 @@ impl StorageController {
pub async fn connect_to_database(
&self,
postgres_port: u16,
) -> anyhow::Result<(
tokio_postgres::Client,
tokio_postgres::Connection<tokio_postgres::Socket, tokio_postgres::tls::NoTlsStream>,
)> {
tokio_postgres::Config::new()
.host("localhost")
.port(postgres_port)
.port(self.postgres_port)
// The user is the ambient operating system user name.
// That is an impurity which we want to fix in => TODO https://github.com/neondatabase/neon/issues/8400
//
@@ -278,115 +252,72 @@ impl StorageController {
.map_err(anyhow::Error::new)
}
pub async fn start(&self, start_args: NeonStorageControllerStartArgs) -> anyhow::Result<()> {
let instance_dir = self.storage_controller_instance_dir(start_args.instance_id);
if let Err(err) = tokio::fs::create_dir(&instance_dir).await {
if err.kind() != std::io::ErrorKind::AlreadyExists {
panic!("Failed to create instance dir {instance_dir:?}");
}
}
pub async fn start(&self, retry_timeout: &Duration) -> anyhow::Result<()> {
// Start a vanilla Postgres process used by the storage controller for persistence.
let pg_data_path = Utf8PathBuf::from_path_buf(self.env.base_data_dir.clone())
.unwrap()
.join("storage_controller_db");
let pg_bin_dir = self.get_pg_bin_dir().await?;
let pg_lib_dir = self.get_pg_lib_dir().await?;
let pg_log_path = pg_data_path.join("postgres.log");
let (listen, postgres_port) = {
if let Some(base_port) = start_args.base_port {
(
format!("127.0.0.1:{base_port}"),
self.config
.database_url
.expect("--base-port requires NeonStorageControllerConf::database_url")
.port(),
)
} else {
let listen_url = self.env.control_plane_api.clone().unwrap();
let listen = format!(
"{}:{}",
listen_url.host_str().unwrap(),
listen_url.port().unwrap()
);
(listen, listen_url.port().unwrap() + 1)
if !tokio::fs::try_exists(&pg_data_path).await? {
// Initialize empty database
let initdb_path = pg_bin_dir.join("initdb");
let mut child = Command::new(&initdb_path)
.envs(vec![
("LD_LIBRARY_PATH".to_owned(), pg_lib_dir.to_string()),
("DYLD_LIBRARY_PATH".to_owned(), pg_lib_dir.to_string()),
])
.args(["-D", pg_data_path.as_ref()])
.spawn()
.expect("Failed to spawn initdb");
let status = child.wait().await?;
if !status.success() {
anyhow::bail!("initdb failed with status {status}");
}
};
let socket_addr = listen
.parse()
.expect("listen address is a valid socket address");
self.listen
.set(socket_addr)
.expect("StorageController::listen is only set here");
// Write a minimal config file:
// - Specify the port, since this is chosen dynamically
// - Switch off fsync, since we're running on lightweight test environments and when e.g. scale testing
// the storage controller we don't want a slow local disk to interfere with that.
//
// NB: it's important that we rewrite this file on each start command so we propagate changes
// from `LocalEnv`'s config file (`.neon/config`).
tokio::fs::write(
&pg_data_path.join("postgresql.conf"),
format!("port = {}\nfsync=off\n", self.postgres_port),
)
.await?;
// Do we remove the pid file on stop?
let pg_started = self.is_postgres_running().await?;
let pg_lib_dir = self.get_pg_lib_dir().await?;
println!("Starting storage controller database...");
let db_start_args = [
"-w",
"-D",
pg_data_path.as_ref(),
"-l",
pg_log_path.as_ref(),
"start",
];
if !pg_started {
// Start a vanilla Postgres process used by the storage controller for persistence.
let pg_data_path = Utf8PathBuf::from_path_buf(self.env.base_data_dir.clone())
.unwrap()
.join("storage_controller_db");
let pg_bin_dir = self.get_pg_bin_dir().await?;
let pg_log_path = pg_data_path.join("postgres.log");
background_process::start_process(
"storage_controller_db",
&self.env.base_data_dir,
pg_bin_dir.join("pg_ctl").as_std_path(),
db_start_args,
vec![
("LD_LIBRARY_PATH".to_owned(), pg_lib_dir.to_string()),
("DYLD_LIBRARY_PATH".to_owned(), pg_lib_dir.to_string()),
],
background_process::InitialPidFile::Create(self.postgres_pid_file()),
retry_timeout,
|| self.pg_isready(&pg_bin_dir),
)
.await?;
if !tokio::fs::try_exists(&pg_data_path).await? {
// Initialize empty database
let initdb_path = pg_bin_dir.join("initdb");
let mut child = Command::new(&initdb_path)
.envs(vec![
("LD_LIBRARY_PATH".to_owned(), pg_lib_dir.to_string()),
("DYLD_LIBRARY_PATH".to_owned(), pg_lib_dir.to_string()),
])
.args(["-D", pg_data_path.as_ref()])
.spawn()
.expect("Failed to spawn initdb");
let status = child.wait().await?;
if !status.success() {
anyhow::bail!("initdb failed with status {status}");
}
};
// Write a minimal config file:
// - Specify the port, since this is chosen dynamically
// - Switch off fsync, since we're running on lightweight test environments and when e.g. scale testing
// the storage controller we don't want a slow local disk to interfere with that.
//
// NB: it's important that we rewrite this file on each start command so we propagate changes
// from `LocalEnv`'s config file (`.neon/config`).
tokio::fs::write(
&pg_data_path.join("postgresql.conf"),
format!("port = {}\nfsync=off\n", postgres_port),
)
.await?;
println!("Starting storage controller database...");
let db_start_args = [
"-w",
"-D",
pg_data_path.as_ref(),
"-l",
pg_log_path.as_ref(),
"start",
];
background_process::start_process(
"storage_controller_db",
&self.env.base_data_dir,
pg_bin_dir.join("pg_ctl").as_std_path(),
db_start_args,
vec![
("LD_LIBRARY_PATH".to_owned(), pg_lib_dir.to_string()),
("DYLD_LIBRARY_PATH".to_owned(), pg_lib_dir.to_string()),
],
background_process::InitialPidFile::Create(self.postgres_pid_file()),
&start_args.start_timeout,
|| self.pg_isready(&pg_bin_dir, postgres_port),
)
.await?;
// Run migrations on every startup, in case something changed.
self.setup_database(postgres_port).await?;
}
let database_url = format!("postgresql://localhost:{}/{DB_NAME}", postgres_port);
// Run migrations on every startup, in case something changed.
let database_url = self.setup_database().await?;
// We support running a startup SQL script to fiddle with the database before we launch storcon.
// This is used by the test suite.
@@ -408,7 +339,7 @@ impl StorageController {
}
}
};
let (mut client, conn) = self.connect_to_database(postgres_port).await?;
let (mut client, conn) = self.connect_to_database().await?;
let conn = tokio::spawn(conn);
let tx = client.build_transaction();
let tx = tx.start().await?;
@@ -417,20 +348,9 @@ impl StorageController {
drop(client);
conn.await??;
let listen = self
.listen
.get()
.expect("cell is set earlier in this function");
let address_for_peers = Uri::builder()
.scheme("http")
.authority(format!("{}:{}", listen.ip(), listen.port()))
.path_and_query("")
.build()
.unwrap();
let mut args = vec![
"-l",
&listen.to_string(),
&self.listen,
"--dev",
"--database-url",
&database_url,
@@ -438,17 +358,10 @@ impl StorageController {
&humantime::Duration::from(self.config.max_offline).to_string(),
"--max-warming-up-interval",
&humantime::Duration::from(self.config.max_warming_up).to_string(),
"--address-for-peers",
&address_for_peers.to_string(),
]
.into_iter()
.map(|s| s.to_string())
.collect::<Vec<_>>();
if self.config.start_as_candidate {
args.push("--start-as-candidate".to_string());
}
if let Some(private_key) = &self.private_key {
let claims = Claims::new(None, Scope::PageServerApi);
let jwt_token =
@@ -481,15 +394,15 @@ impl StorageController {
background_process::start_process(
COMMAND,
&instance_dir,
&self.env.base_data_dir,
&self.env.storage_controller_bin(),
args,
vec![
("LD_LIBRARY_PATH".to_owned(), pg_lib_dir.to_string()),
("DYLD_LIBRARY_PATH".to_owned(), pg_lib_dir.to_string()),
],
background_process::InitialPidFile::Create(self.pid_file(start_args.instance_id)),
&start_args.start_timeout,
background_process::InitialPidFile::Create(self.pid_file()),
retry_timeout,
|| async {
match self.ready().await {
Ok(_) => Ok(true),
@@ -502,35 +415,8 @@ impl StorageController {
Ok(())
}
pub async fn stop(&self, stop_args: NeonStorageControllerStopArgs) -> anyhow::Result<()> {
background_process::stop_process(
stop_args.immediate,
COMMAND,
&self.pid_file(stop_args.instance_id),
)?;
let storcon_instances = self.env.storage_controller_instances().await?;
for (instance_id, instanced_dir_path) in storcon_instances {
if instance_id == stop_args.instance_id {
continue;
}
let pid_file = instanced_dir_path.join("storage_controller.pid");
let pid = tokio::fs::read_to_string(&pid_file)
.await
.map_err(|err| {
anyhow::anyhow!("Failed to read storcon pid file at {pid_file:?}: {err}")
})?
.parse::<i32>()
.expect("pid is valid i32");
let other_proc_alive = !background_process::process_has_stopped(Pid::from_raw(pid))?;
if other_proc_alive {
// There is another storage controller instance running, so we return
// and leave the database running.
return Ok(());
}
}
pub async fn stop(&self, immediate: bool) -> anyhow::Result<()> {
background_process::stop_process(immediate, COMMAND, &self.pid_file())?;
let pg_data_path = self.env.base_data_dir.join("storage_controller_db");
let pg_bin_dir = self.get_pg_bin_dir().await?;
@@ -543,51 +429,27 @@ impl StorageController {
.wait()
.await?;
if !stop_status.success() {
match self.is_postgres_running().await {
Ok(false) => {
println!("Storage controller database is already stopped");
return Ok(());
}
Ok(true) => {
anyhow::bail!("Failed to stop storage controller database");
}
Err(err) => {
anyhow::bail!("Failed to stop storage controller database: {err}");
}
let pg_status_args = ["-D", &pg_data_path.to_string_lossy(), "status"];
let status_exitcode = Command::new(pg_bin_dir.join("pg_ctl"))
.args(pg_status_args)
.spawn()?
.wait()
.await?;
// pg_ctl status returns this exit code if postgres is not running: in this case it is
// fine that stop failed. Otherwise it is an error that stop failed.
const PG_STATUS_NOT_RUNNING: i32 = 3;
if Some(PG_STATUS_NOT_RUNNING) == status_exitcode.code() {
println!("Storage controller database is already stopped");
return Ok(());
} else {
anyhow::bail!("Failed to stop storage controller database: {stop_status}")
}
}
Ok(())
}
async fn is_postgres_running(&self) -> anyhow::Result<bool> {
let pg_data_path = self.env.base_data_dir.join("storage_controller_db");
let pg_bin_dir = self.get_pg_bin_dir().await?;
let pg_status_args = ["-D", &pg_data_path.to_string_lossy(), "status"];
let status_exitcode = Command::new(pg_bin_dir.join("pg_ctl"))
.args(pg_status_args)
.spawn()?
.wait()
.await?;
// pg_ctl status returns this exit code if postgres is not running: in this case it is
// fine that stop failed. Otherwise it is an error that stop failed.
const PG_STATUS_NOT_RUNNING: i32 = 3;
const PG_NO_DATA_DIR: i32 = 4;
const PG_STATUS_RUNNING: i32 = 0;
match status_exitcode.code() {
Some(PG_STATUS_NOT_RUNNING) => Ok(false),
Some(PG_NO_DATA_DIR) => Ok(false),
Some(PG_STATUS_RUNNING) => Ok(true),
Some(code) => Err(anyhow::anyhow!(
"pg_ctl status returned unexpected status code: {:?}",
code
)),
None => Err(anyhow::anyhow!("pg_ctl status returned no status code")),
}
}
fn get_claims_for_path(path: &str) -> anyhow::Result<Option<Claims>> {
let category = match path.find('/') {
Some(idx) => &path[..idx],
@@ -613,31 +475,15 @@ impl StorageController {
RQ: Serialize + Sized,
RS: DeserializeOwned + Sized,
{
// In the special case of the `storage_controller start` subcommand, we wish
// to use the API endpoint of the newly started storage controller in order
// to pass the readiness check. In this scenario [`Self::listen`] will be set
// (see [`Self::start`]).
//
// Otherwise, we infer the storage controller api endpoint from the configured
// control plane API.
let url = if let Some(socket_addr) = self.listen.get() {
Url::from_str(&format!(
"http://{}:{}/{path}",
socket_addr.ip().to_canonical(),
socket_addr.port()
))
.unwrap()
} else {
// The configured URL has the /upcall path prefix for pageservers to use: we will strip that out
// for general purpose API access.
let listen_url = self.env.control_plane_api.clone().unwrap();
Url::from_str(&format!(
"http://{}:{}/{path}",
listen_url.host_str().unwrap(),
listen_url.port().unwrap()
))
.unwrap()
};
// The configured URL has the /upcall path prefix for pageservers to use: we will strip that out
// for general purpose API access.
let listen_url = self.env.control_plane_api.clone().unwrap();
let url = Url::from_str(&format!(
"http://{}:{}/{path}",
listen_url.host_str().unwrap(),
listen_url.port().unwrap()
))
.unwrap();
let mut builder = self.client.request(method, url);
if let Some(body) = body {

View File

@@ -1,265 +0,0 @@
# Physical Replication
This RFC is a bit special in that we have already implemented physical
replication a long time ago. However, we never properly wrote down all
the decisions and assumptions, and in the last months when more users
have started to use the feature, numerous issues have surfaced.
This RFC documents the design decisions that have been made.
## Summary
PostgreSQL has a feature called streaming replication, where a replica
streams WAL from the primary and continuously applies it. It is also
known as "physical replication", to distinguish it from logical
replication. In PostgreSQL, a replica is initialized by taking a
physical backup of the primary. In Neon, the replica is initialized
from a slim "base backup" from the pageserver, just like a primary,
and the primary and the replicas connect to the same pageserver,
sharing the storage.
There are two kinds of read-only replicas in Neon:
- replicas that follow the primary, and
- "static" replicas that are pinned at a particular LSN.
A static replica is useful e.g. for performing time-travel queries and
running one-off slow queries without affecting the primary. A replica
that follows the primary can be used e.g. to scale out read-only
workloads.
## Motivation
Read-only replicas allow offloading read-only queries. It's useful for
isolation, if you want to make sure that read-only queries don't
affect the primary, and it's also an easy way to provide guaranteed
read-only access to an application, without having to mess with access
controls.
## Non Goals (if relevant)
This RFC is all about WAL-based *physical* replication. Logical
replication is a different feature.
Neon also has the capability to launch "static" read-only nodes which
do not follow the primary, but are pinned to a particular LSN. They
can be used for long-running one-off queries, or for Point-in-time
queries. They work similarly to read replicas that follow the primary,
but some things are simpler: there are no concerns about cache
invalidation when the data changes on the primary, or worrying about
transactions that are in-progress on the primary.
## Impacted components (e.g. pageserver, safekeeper, console, etc)
- Control plane launches the replica
- Replica Postgres instance connects to the safekeepers, to stream the WAL
- The primary does not know about the standby, except for the hot standby feedback
- The primary and replicas all connect to the same pageservers
# Context
Some useful things to know about hot standby and replicas in
PostgreSQL.
## PostgreSQL startup sequence
"Running" and "start up" terms are little imprecise. PostgreSQL
replica startup goes through several stages:
1. First, the process is started up, and various initialization steps
are performed, like initializing shared memory. If you try to
connect to the server in this stage, you get an error: ERROR: the
database system is starting up. This stage happens very quickly, no
2. Then the server reads the checpoint record from the WAL and starts
the WAL replay starting from the checkpoint. This works differently
in Neon: we start the WAL replay at the basebackup LSN, not from a
checkpoint! If you connect to the server in this state, you get an
error: ERROR: the database system is not yet accepting
connections. We proceed to the next stage, when the WAL replay sees
a running-xacts record. Or in Neon, the "CLOG scanning" mechanism
can allow us to move directly to next stage, with all the caveats
listed in this RFC.
3. When the running-xacts information is established, the server
starts to accept connections normally.
From PostgreSQL's point of view, the server is already running in
stage 2, even though it's not accepting connections yet. Our
`compute_ctl` does not consider it as running until stage 3. If the
transition from stage 2 to 3 doesn't happen fast enough, the control
plane will mark the start operation as failed.
## Decisions, Issues
### Cache invalidation in replica
When a read replica follows the primary in PostgreSQL, it needs to
stream all the WAL from the primary and apply all the records, to keep
the local copy of the data consistent with the primary. In Neon, the
replica can fetch the updated page versions from the pageserver, so
it's not necessary to apply all the WAL. However, it needs to ensure
that any pages that are currently in the Postgres buffer cache, or the
Local File Cache, are either updated, or thrown away so that the next
read of the page will fetch the latest version.
We choose to apply the WAL records for pages that are already in the
buffer cache, and skip records for other pages. Somewhat arbitrarily,
we also apply records affecting catalog relations, fetching the old
page version from the pageserver if necessary first. See
`neon_redo_read_buffer_filter()` function.
The replica wouldn't necessarily need to see all the WAL records, only
the records that apply to cached pages. For simplicity, we do stream
all the WAL to the replica, and the replica simply ignores WAL records
that require no action.
Like in PostgreSQL, the read replica maintains a "replay LSN", which
is the LSN up to which the replica has received and replayed the
WAL. The replica can lag behind the primary, if it cannot quite keep
up with the primary, or if a long-running query conflicts with changes
that are about to be applied, or even intentionally if the user wishes
to see delayed data (see recovery_min_apply_delay). It's important
that the replica sees a consistent view of the whole cluster at the
replay LSN, when it's lagging behind.
In Neon, the replica connects to a safekeeper to get the WAL
stream. That means that the safekeepers must be able to regurgitate
the original WAL as far back as the replay LSN of any running read
replica. (A static read-only node that does not follow the primary
does not require a WAL stream however). The primary does not need to
be running, and when it is, the replicas don't incur any extra
overhead to the primary (see hot standby feedback though).
### In-progress transactions
In PostgreSQL, when a hot standby server starts up, it cannot
immediately open up for queries (see [PostgreSQL startup
sequence]). It first needs to establish a complete list of in-progress
transactions, including subtransactions, that are running at the
primary, at the current replay LSN. Normally that happens quickly,
when the replica sees a "running-xacts" WAL record, because the
primary writes a running-xacts WAL record at every checkpoint, and in
PostgreSQL the replica always starts the WAL replay from a checkpoint
REDO point. (A shutdown checkpoint WAL record also implies that all
the non-prepared transactions have ended.) If there are a lot of
subtransactions in progress, however, the standby might need to wait
for old transactions to complete before it can open up for queries.
In Neon that problem is worse: a replica can start at any LSN, so
there's no guarantee that it will see a running-xacts record any time
soon. In particular, if the primary is not running when the replica is
started, it might never see a running-xacts record.
To make things worse, we initially missed this issue, and always
started accepting queries at replica startup, even if it didn't have
the transaction information. That could lead to incorrect query
results and data corruption later. However, as we fixed that, we
introduced a new problem compared to what we had before: previously
the replica would always start up, but after fixing that bug, it might
not. In a superficial way, the old behavior was better (but could lead
to serious issues later!). That made fixing that bug was very hard,
because as we fixed it, we made things (superficially) worse for
others.
See https://github.com/neondatabase/neon/pull/7288 which fixed the
bug, and follow-up PRs https://github.com/neondatabase/neon/pull/8323
and https://github.com/neondatabase/neon/pull/8484 to try to claw back
the cases that started to cause trouble as fixing it. As of this
writing, there are still cases where a replica might not immediately
start up, causing the control plane operation to fail, the remaining
issues are tracked in https://github.com/neondatabase/neon/issues/6211.
One long-term fix for this is to switch to using so-called CSN
snapshots in read replica. That would make it unnecessary to have the
full in-progress transaction list in the replica at startup time. See
https://commitfest.postgresql.org/48/4912/ for a work-in-progress
patch to upstream to implement that.
Another thing we could do is to teach the control plane about that
distinction between "starting up" and "running but haven't received
running-xacts information yet", so that we could keep the replica
waiting longer in that stage, and also give any client connections the
same `ERROR: the database system is not yet accepting connections`
error that you get in standalone PostgreSQL in that state.
### Recovery conflicts and Hot standby feedback
It's possible that a tuple version is vacuumed away in the primary,
even though it is still needed by a running transactions in the
replica. This is called a "recovery conflict", and PostgreSQL provides
various options for dealing with it. By default, the WAL replay will
wait up to 30 s for the conflicting query to finish. After that, it
will kill the running query, so that the WAL replay can proceed.
Another way to avoid the situation is to enable the
[`hot_standby_feedback`](https://www.postgresql.org/docs/current/runtime-config-replication.html#GUC-HOT-STANDBY-FEEDBACK)
option. When it is enabled, the primary will refrain from vacuuming
tuples that are still needed in the primary. That means potentially
bloating the primary, which violates the usual rule that read replicas
don't affect the operations on the primary, which is why it's off by
default. We leave it to users to decide if they want to turn it on,
same as PostgreSQL.
Neon supports `hot_standby_feedback` by passing the feedback messages
from the replica to the safekeepers, and from safekeepers to the
primary.
### Relationship of settings between primary and replica
In order to enter hot standby mode, some configuration options need to
be set to the same or larger values in the standby, compared to the
primary. See [explanation in the PostgreSQL
docs](https://www.postgresql.org/docs/current/hot-standby.html#HOT-STANDBY-ADMIN)
In Neon, we have this problem too. To prevent customers from hitting
it, the control plane automatically adjusts the settings of a replica,
so that they match or exceed the primary's settings (see
https://github.com/neondatabase/cloud/issues/14903). However, you
can still hit the issue if the primary is restarted with larger
settings, while the replica is running.
### Interaction with Pageserver GC
The read replica can lag behind the primary. If there are recovery
conflicts or the replica cannot keep up for some reason, the lag can
in principle grow indefinitely. The replica will issue all GetPage
requests to the pageservers at the current replay LSN, and needs to
see the old page versions.
If the retention period in the pageserver is set to be small, it may
have already garbage collected away the old page versions. That will
cause read errors in the compute, and can mean that the replica cannot
make progress with the replication anymore.
There is a mechanism for replica to pass information about its replay
LSN to the pageserver, so that the pageserver refrains from GC'ing
data that is still needed by the standby. It's called
'standby_horizon' in the pageserver code, see
https://github.com/neondatabase/neon/pull/7368. A separate "lease"
mechanism also is in the works, where the replica could hold a lease
on the old LSN, preventing the pageserver from advancing the GC
horizon past that point. The difference is that the standby_horizon
mechanism relies on a feedback message from replica to safekeeper,
while the least API is exposed directly from the pageserver. A static
read-only node is not connected to safekeepers, so it cannot use the
standby_horizon mechanism.
### Synchronous replication
We haven't put any effort into synchronous replication yet.
PostgreSQL provides multiple levels of synchronicity. In the weaker
levels, a transaction is not acknowledged as committed to the client
in the primary until the WAL has been streamed to a replica or flushed
to disk there. Those modes don't make senses in Neon, because the
safekeepers handle durability.
`synchronous_commit=remote_apply` mode would make sense. In that mode,
the commit is not acknowledged to the client until it has been
replayed in the replica. That ensures that after commit, you can see
the commit in the replica too (aka. read-your-write consistency).

View File

@@ -50,6 +50,7 @@ pub mod defaults {
DEFAULT_HTTP_LISTEN_ADDR, DEFAULT_HTTP_LISTEN_PORT, DEFAULT_PG_LISTEN_ADDR,
DEFAULT_PG_LISTEN_PORT,
};
use pageserver_api::models::ImageCompressionAlgorithm;
pub use storage_broker::DEFAULT_ENDPOINT as BROKER_DEFAULT_ENDPOINT;
pub const DEFAULT_WAIT_LSN_TIMEOUT: &str = "300 s";
@@ -89,7 +90,8 @@ pub mod defaults {
pub const DEFAULT_MAX_VECTORED_READ_BYTES: usize = 128 * 1024; // 128 KiB
pub const DEFAULT_IMAGE_COMPRESSION: &str = "zstd(1)";
pub const DEFAULT_IMAGE_COMPRESSION: ImageCompressionAlgorithm =
ImageCompressionAlgorithm::Disabled;
pub const DEFAULT_VALIDATE_VECTORED_GET: bool = false;
@@ -476,7 +478,7 @@ impl PageServerConfigBuilder {
max_vectored_read_bytes: Set(MaxVectoredReadBytes(
NonZeroUsize::new(DEFAULT_MAX_VECTORED_READ_BYTES).unwrap(),
)),
image_compression: Set(DEFAULT_IMAGE_COMPRESSION.parse().unwrap()),
image_compression: Set(DEFAULT_IMAGE_COMPRESSION),
ephemeral_bytes_per_memory_kb: Set(DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB),
l0_flush: Set(L0FlushConfig::default()),
compact_level0_phase1_value_access: Set(CompactL0Phase1ValueAccess::default()),
@@ -1063,7 +1065,7 @@ impl PageServerConf {
NonZeroUsize::new(defaults::DEFAULT_MAX_VECTORED_READ_BYTES)
.expect("Invalid default constant"),
),
image_compression: defaults::DEFAULT_IMAGE_COMPRESSION.parse().unwrap(),
image_compression: defaults::DEFAULT_IMAGE_COMPRESSION,
ephemeral_bytes_per_memory_kb: defaults::DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB,
l0_flush: L0FlushConfig::default(),
compact_level0_phase1_value_access: CompactL0Phase1ValueAccess::default(),
@@ -1303,7 +1305,7 @@ background_task_maximum_delay = '334 s'
NonZeroUsize::new(defaults::DEFAULT_MAX_VECTORED_READ_BYTES)
.expect("Invalid default constant")
),
image_compression: defaults::DEFAULT_IMAGE_COMPRESSION.parse().unwrap(),
image_compression: defaults::DEFAULT_IMAGE_COMPRESSION,
ephemeral_bytes_per_memory_kb: defaults::DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB,
l0_flush: L0FlushConfig::default(),
compact_level0_phase1_value_access: CompactL0Phase1ValueAccess::default(),
@@ -1376,7 +1378,7 @@ background_task_maximum_delay = '334 s'
NonZeroUsize::new(defaults::DEFAULT_MAX_VECTORED_READ_BYTES)
.expect("Invalid default constant")
),
image_compression: defaults::DEFAULT_IMAGE_COMPRESSION.parse().unwrap(),
image_compression: defaults::DEFAULT_IMAGE_COMPRESSION,
ephemeral_bytes_per_memory_kb: defaults::DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB,
l0_flush: L0FlushConfig::default(),
compact_level0_phase1_value_access: CompactL0Phase1ValueAccess::default(),

View File

@@ -29,16 +29,16 @@ pub(super) struct HeatMapTenant {
#[derive(Serialize, Deserialize)]
pub(crate) struct HeatMapTimeline {
#[serde_as(as = "DisplayFromStr")]
pub(crate) timeline_id: TimelineId,
pub(super) timeline_id: TimelineId,
pub(crate) layers: Vec<HeatMapLayer>,
pub(super) layers: Vec<HeatMapLayer>,
}
#[serde_as]
#[derive(Serialize, Deserialize)]
pub(crate) struct HeatMapLayer {
pub(crate) name: LayerName,
pub(crate) metadata: LayerFileMetadata,
pub(super) name: LayerName,
pub(super) metadata: LayerFileMetadata,
#[serde_as(as = "TimestampSeconds<i64>")]
pub(super) access_time: SystemTime,

View File

@@ -208,8 +208,6 @@ impl SplitDeltaLayerWriter {
#[cfg(test)]
mod tests {
use rand::{RngCore, SeedableRng};
use crate::{
tenant::{
harness::{TenantHarness, TIMELINE_ID},
@@ -231,10 +229,7 @@ mod tests {
}
fn get_large_img() -> Bytes {
let mut rng = rand::rngs::SmallRng::seed_from_u64(42);
let mut data = vec![0; 8192];
rng.fill_bytes(&mut data);
data.into()
vec![0; 8192].into()
}
#[tokio::test]

View File

@@ -2977,7 +2977,11 @@ impl Timeline {
LayerVisibilityHint::Visible => {
// Layer is visible to one or more read LSNs: elegible for inclusion in layer map
let last_activity_ts = layer.latest_activity();
Some((layer.layer_desc(), layer.metadata(), last_activity_ts))
Some(HeatMapLayer::new(
layer.layer_desc().layer_name(),
layer.metadata(),
last_activity_ts,
))
}
LayerVisibilityHint::Covered => {
// Layer is resident but unlikely to be read: not elegible for inclusion in heatmap.
@@ -2986,23 +2990,7 @@ impl Timeline {
}
});
let mut layers = resident.collect::<Vec<_>>();
// Sort layers in order of which to download first. For a large set of layers to download, we
// want to prioritize those layers which are most likely to still be in the resident many minutes
// or hours later:
// - Download L0s last, because they churn the fastest: L0s on a fast-writing tenant might
// only exist for a few minutes before being compacted into L1s.
// - For L1 & image layers, download most recent LSNs first: the older the LSN, the sooner
// the layer is likely to be covered by an image layer during compaction.
layers.sort_by_key(|(desc, _meta, _atime)| {
std::cmp::Reverse((!LayerMap::is_l0(&desc.key_range), desc.lsn_range.end))
});
let layers = layers
.into_iter()
.map(|(desc, meta, atime)| HeatMapLayer::new(desc.layer_name(), meta, atime))
.collect();
let layers = resident.collect();
Some(HeatMapTimeline::new(self.timeline_id, layers))
}
@@ -4528,7 +4516,6 @@ impl DurationRecorder {
/// the layer descriptor requires the user to provide the ranges, which should cover all
/// keys specified in the `data` field.
#[cfg(test)]
#[derive(Clone)]
pub struct DeltaLayerTestDesc {
pub lsn_range: Range<Lsn>,
pub key_range: Range<Key>,
@@ -4558,13 +4545,6 @@ impl DeltaLayerTestDesc {
data,
}
}
pub(crate) fn layer_name(&self) -> LayerName {
LayerName::Delta(super::storage_layer::DeltaLayerName {
key_range: self.key_range.clone(),
lsn_range: self.lsn_range.clone(),
})
}
}
impl Timeline {
@@ -5788,110 +5768,12 @@ fn is_send() {
#[cfg(test)]
mod tests {
use pageserver_api::key::Key;
use utils::{id::TimelineId, lsn::Lsn};
use crate::{
repository::Value,
tenant::{
harness::{test_img, TenantHarness},
layer_map::LayerMap,
storage_layer::{Layer, LayerName},
timeline::{DeltaLayerTestDesc, EvictionError},
Timeline,
},
use crate::tenant::{
harness::TenantHarness, storage_layer::Layer, timeline::EvictionError, Timeline,
};
#[tokio::test]
async fn test_heatmap_generation() {
let harness = TenantHarness::create("heatmap_generation").await.unwrap();
let covered_delta = DeltaLayerTestDesc::new_with_inferred_key_range(
Lsn(0x10)..Lsn(0x20),
vec![(
Key::from_hex("620000000033333333444444445500000000").unwrap(),
Lsn(0x11),
Value::Image(test_img("foo")),
)],
);
let visible_delta = DeltaLayerTestDesc::new_with_inferred_key_range(
Lsn(0x10)..Lsn(0x20),
vec![(
Key::from_hex("720000000033333333444444445500000000").unwrap(),
Lsn(0x11),
Value::Image(test_img("foo")),
)],
);
let l0_delta = DeltaLayerTestDesc::new(
Lsn(0x20)..Lsn(0x30),
Key::from_hex("000000000000000000000000000000000000").unwrap()
..Key::from_hex("FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF").unwrap(),
vec![(
Key::from_hex("720000000033333333444444445500000000").unwrap(),
Lsn(0x25),
Value::Image(test_img("foo")),
)],
);
let delta_layers = vec![
covered_delta.clone(),
visible_delta.clone(),
l0_delta.clone(),
];
let image_layer = (
Lsn(0x40),
vec![(
Key::from_hex("620000000033333333444444445500000000").unwrap(),
test_img("bar"),
)],
);
let image_layers = vec![image_layer];
let (tenant, ctx) = harness.load().await;
let timeline = tenant
.create_test_timeline_with_layers(
TimelineId::generate(),
Lsn(0x10),
14,
&ctx,
delta_layers,
image_layers,
Lsn(0x100),
)
.await
.unwrap();
// Layer visibility is an input to heatmap generation, so refresh it first
timeline.update_layer_visibility().await.unwrap();
let heatmap = timeline
.generate_heatmap()
.await
.expect("Infallible while timeline is not shut down");
assert_eq!(heatmap.timeline_id, timeline.timeline_id);
// L0 should come last
assert_eq!(heatmap.layers.last().unwrap().name, l0_delta.layer_name());
let mut last_lsn = Lsn::MAX;
for layer in heatmap.layers {
// Covered layer should be omitted
assert!(layer.name != covered_delta.layer_name());
let layer_lsn = match &layer.name {
LayerName::Delta(d) => d.lsn_range.end,
LayerName::Image(i) => i.lsn,
};
// Apart from L0s, newest Layers should come first
if !LayerMap::is_l0(layer.name.key_range()) {
assert!(layer_lsn <= last_lsn);
last_lsn = layer_lsn;
}
}
}
#[tokio::test]
async fn two_layer_eviction_attempts_at_the_same_time() {
let harness = TenantHarness::create("two_layer_eviction_attempts_at_the_same_time")

View File

@@ -41,8 +41,6 @@
#include "hll.h"
#define CriticalAssert(cond) do if (!(cond)) elog(PANIC, "Assertion %s failed at %s:%d: ", #cond, __FILE__, __LINE__); while (0)
/*
* Local file cache is used to temporary store relations pages in local file system.
* All blocks of all relations are stored inside one file and addressed using shared hash map.
@@ -53,43 +51,19 @@
*
* Cache is always reconstructed at node startup, so we do not need to save mapping somewhere and worry about
* its consistency.
*
* ## Holes
*
* The LFC can be resized on the fly, up to a maximum size that's determined
* at server startup (neon.max_file_cache_size). After server startup, we
* expand the underlying file when needed, until it reaches the soft limit
* (neon.file_cache_size_limit). If the soft limit is later reduced, we shrink
* the LFC by punching holes in the underlying file with a
* fallocate(FALLOC_FL_PUNCH_HOLE) call. The nominal size of the file doesn't
* shrink, but the disk space it uses does.
*
* Each hole is tracked by a dummy FileCacheEntry, which are kept in the
* 'holes' linked list. They are entered into the chunk hash table, with a
* special key where the blockNumber is used to store the 'offset' of the
* hole, and all other fields are zero. Holes are never looked up in the hash
* table, we only enter them there to have a FileCacheEntry that we can keep
* in the linked list. If the soft limit is raised again, we reuse the holes
* before extending the nominal size of the file.
*/
/* Local file storage allocation chunk.
* Should be power of two. Using larger than page chunks can
* Should be power of two and not less than 32. Using larger than page chunks can
* 1. Reduce hash-map memory footprint: 8TB database contains billion pages
* and size of hash entry is 40 bytes, so we need 40Gb just for hash map.
* 1Mb chunks can reduce hash map size to 320Mb.
* 2. Improve access locality, subsequent pages will be allocated together improving seqscan speed
*/
#define BLOCKS_PER_CHUNK 128 /* 1Mb chunk */
/*
* Smaller chunk seems to be better for OLTP workload
*/
// #define BLOCKS_PER_CHUNK 8 /* 64kb chunk */
#define MB ((uint64)1024*1024)
#define SIZE_MB_TO_CHUNKS(size) ((uint32)((size) * MB / BLCKSZ / BLOCKS_PER_CHUNK))
#define CHUNK_BITMAP_SIZE ((BLOCKS_PER_CHUNK + 31) / 32)
typedef struct FileCacheEntry
{
@@ -97,8 +71,8 @@ typedef struct FileCacheEntry
uint32 hash;
uint32 offset;
uint32 access_count;
uint32 bitmap[CHUNK_BITMAP_SIZE];
dlist_node list_node; /* LRU/holes list node */
uint32 bitmap[BLOCKS_PER_CHUNK / 32];
dlist_node lru_node; /* LRU list node */
} FileCacheEntry;
typedef struct FileCacheControl
@@ -113,7 +87,6 @@ typedef struct FileCacheControl
uint64 writes;
dlist_head lru; /* double linked list for LRU replacement
* algorithm */
dlist_head holes; /* double linked list of punched holes */
HyperLogLogState wss_estimation; /* estimation of working set size */
} FileCacheControl;
@@ -162,7 +135,6 @@ lfc_disable(char const *op)
lfc_ctl->used = 0;
lfc_ctl->limit = 0;
dlist_init(&lfc_ctl->lru);
dlist_init(&lfc_ctl->holes);
if (lfc_desc > 0)
{
@@ -242,18 +214,18 @@ lfc_shmem_startup(void)
if (!found)
{
int fd;
uint32 n_chunks = SIZE_MB_TO_CHUNKS(lfc_max_size);
uint32 lfc_size = SIZE_MB_TO_CHUNKS(lfc_max_size);
lfc_lock = (LWLockId) GetNamedLWLockTranche("lfc_lock");
info.keysize = sizeof(BufferTag);
info.entrysize = sizeof(FileCacheEntry);
/*
* n_chunks+1 because we add new element to hash table before eviction
* lfc_size+1 because we add new element to hash table before eviction
* of victim
*/
lfc_hash = ShmemInitHash("lfc_hash",
n_chunks + 1, n_chunks + 1,
lfc_size + 1, lfc_size + 1,
&info,
HASH_ELEM | HASH_BLOBS);
lfc_ctl->generation = 0;
@@ -263,7 +235,6 @@ lfc_shmem_startup(void)
lfc_ctl->misses = 0;
lfc_ctl->writes = 0;
dlist_init(&lfc_ctl->lru);
dlist_init(&lfc_ctl->holes);
/* Initialize hyper-log-log structure for estimating working set size */
initSHLL(&lfc_ctl->wss_estimation);
@@ -339,31 +310,14 @@ lfc_change_limit_hook(int newval, void *extra)
* Shrink cache by throwing away least recently accessed chunks and
* returning their space to file system
*/
FileCacheEntry *victim = dlist_container(FileCacheEntry, list_node, dlist_pop_head_node(&lfc_ctl->lru));
FileCacheEntry *hole;
uint32 offset = victim->offset;
uint32 hash;
bool found;
BufferTag holetag;
FileCacheEntry *victim = dlist_container(FileCacheEntry, lru_node, dlist_pop_head_node(&lfc_ctl->lru));
CriticalAssert(victim->access_count == 0);
Assert(victim->access_count == 0);
#ifdef FALLOC_FL_PUNCH_HOLE
if (fallocate(lfc_desc, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE, (off_t) victim->offset * BLOCKS_PER_CHUNK * BLCKSZ, BLOCKS_PER_CHUNK * BLCKSZ) < 0)
neon_log(LOG, "Failed to punch hole in file: %m");
#endif
/* We remove the old entry, and re-enter a hole to the hash table */
hash_search_with_hash_value(lfc_hash, &victim->key, victim->hash, HASH_REMOVE, NULL);
memset(&holetag, 0, sizeof(holetag));
holetag.blockNum = offset;
hash = get_hash_value(lfc_hash, &holetag);
hole = hash_search_with_hash_value(lfc_hash, &holetag, hash, HASH_ENTER, &found);
hole->hash = hash;
hole->offset = offset;
hole->access_count = 0;
CriticalAssert(!found);
dlist_push_tail(&lfc_ctl->holes, &hole->list_node);
lfc_ctl->used -= 1;
}
lfc_ctl->limit = new_size;
@@ -455,8 +409,6 @@ lfc_cache_contains(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno)
CopyNRelFileInfoToBufTag(tag, rinfo);
tag.forkNum = forkNum;
tag.blockNum = blkno & ~(BLOCKS_PER_CHUNK - 1);
CriticalAssert(BufTagGetRelNumber(&tag) != InvalidRelFileNumber);
hash = get_hash_value(lfc_hash, &tag);
LWLockAcquire(lfc_lock, LW_SHARED);
@@ -488,7 +440,6 @@ lfc_evict(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno)
tag.forkNum = forkNum;
tag.blockNum = (blkno & ~(BLOCKS_PER_CHUNK - 1));
CriticalAssert(BufTagGetRelNumber(&tag) != InvalidRelFileNumber);
hash = get_hash_value(lfc_hash, &tag);
LWLockAcquire(lfc_lock, LW_EXCLUSIVE);
@@ -519,7 +470,7 @@ lfc_evict(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno)
{
bool has_remaining_pages;
for (int i = 0; i < CHUNK_BITMAP_SIZE; i++)
for (int i = 0; i < (BLOCKS_PER_CHUNK / 32); i++)
{
if (entry->bitmap[i] != 0)
{
@@ -534,8 +485,8 @@ lfc_evict(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno)
*/
if (!has_remaining_pages)
{
dlist_delete(&entry->list_node);
dlist_push_head(&lfc_ctl->lru, &entry->list_node);
dlist_delete(&entry->lru_node);
dlist_push_head(&lfc_ctl->lru, &entry->lru_node);
}
}
@@ -574,8 +525,6 @@ lfc_read(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno,
CopyNRelFileInfoToBufTag(tag, rinfo);
tag.forkNum = forkNum;
tag.blockNum = blkno & ~(BLOCKS_PER_CHUNK - 1);
CriticalAssert(BufTagGetRelNumber(&tag) != InvalidRelFileNumber);
hash = get_hash_value(lfc_hash, &tag);
LWLockAcquire(lfc_lock, LW_EXCLUSIVE);
@@ -602,7 +551,7 @@ lfc_read(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno,
}
/* Unlink entry from LRU list to pin it for the duration of IO operation */
if (entry->access_count++ == 0)
dlist_delete(&entry->list_node);
dlist_delete(&entry->lru_node);
generation = lfc_ctl->generation;
entry_offset = entry->offset;
@@ -620,12 +569,12 @@ lfc_read(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno,
if (lfc_ctl->generation == generation)
{
CriticalAssert(LFC_ENABLED());
Assert(LFC_ENABLED());
lfc_ctl->hits += 1;
pgBufferUsage.file_cache.hits += 1;
CriticalAssert(entry->access_count > 0);
Assert(entry->access_count > 0);
if (--entry->access_count == 0)
dlist_push_tail(&lfc_ctl->lru, &entry->list_node);
dlist_push_tail(&lfc_ctl->lru, &entry->lru_node);
}
else
result = false;
@@ -664,8 +613,6 @@ lfc_write(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno, const void
tag.forkNum = forkNum;
tag.blockNum = blkno & ~(BLOCKS_PER_CHUNK - 1);
CopyNRelFileInfoToBufTag(tag, rinfo);
CriticalAssert(BufTagGetRelNumber(&tag) != InvalidRelFileNumber);
hash = get_hash_value(lfc_hash, &tag);
LWLockAcquire(lfc_lock, LW_EXCLUSIVE);
@@ -685,7 +632,7 @@ lfc_write(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno, const void
* operation
*/
if (entry->access_count++ == 0)
dlist_delete(&entry->list_node);
dlist_delete(&entry->lru_node);
}
else
{
@@ -708,26 +655,13 @@ lfc_write(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno, const void
if (lfc_ctl->used >= lfc_ctl->limit && !dlist_is_empty(&lfc_ctl->lru))
{
/* Cache overflow: evict least recently used chunk */
FileCacheEntry *victim = dlist_container(FileCacheEntry, list_node, dlist_pop_head_node(&lfc_ctl->lru));
FileCacheEntry *victim = dlist_container(FileCacheEntry, lru_node, dlist_pop_head_node(&lfc_ctl->lru));
CriticalAssert(victim->access_count == 0);
Assert(victim->access_count == 0);
entry->offset = victim->offset; /* grab victim's chunk */
hash_search_with_hash_value(lfc_hash, &victim->key, victim->hash, HASH_REMOVE, NULL);
neon_log(DEBUG2, "Swap file cache page");
}
else if (!dlist_is_empty(&lfc_ctl->holes))
{
/* We can reuse a hole that was left behind when the LFC was shrunk previously */
FileCacheEntry *hole = dlist_container(FileCacheEntry, list_node, dlist_pop_head_node(&lfc_ctl->holes));
uint32 offset = hole->offset;
bool found;
hash_search_with_hash_value(lfc_hash, &hole->key, hole->hash, HASH_REMOVE, &found);
CriticalAssert(found);
lfc_ctl->used += 1;
entry->offset = offset; /* reuse the hole */
}
else
{
lfc_ctl->used += 1;
@@ -755,11 +689,11 @@ lfc_write(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno, const void
if (lfc_ctl->generation == generation)
{
CriticalAssert(LFC_ENABLED());
Assert(LFC_ENABLED());
/* Place entry to the head of LRU list */
CriticalAssert(entry->access_count > 0);
Assert(entry->access_count > 0);
if (--entry->access_count == 0)
dlist_push_tail(&lfc_ctl->lru, &entry->list_node);
dlist_push_tail(&lfc_ctl->lru, &entry->lru_node);
entry->bitmap[chunk_offs >> 5] |= (1 << (chunk_offs & 31));
}
@@ -774,6 +708,7 @@ typedef struct
} NeonGetStatsCtx;
#define NUM_NEON_GET_STATS_COLS 2
#define NUM_NEON_GET_STATS_ROWS 3
PG_FUNCTION_INFO_V1(neon_get_lfc_stats);
Datum
@@ -809,6 +744,7 @@ neon_get_lfc_stats(PG_FUNCTION_ARGS)
INT8OID, -1, 0);
fctx->tupdesc = BlessTupleDesc(tupledesc);
funcctx->max_calls = NUM_NEON_GET_STATS_ROWS;
funcctx->user_fctx = fctx;
/* Return to original context when allocating transient memory */
@@ -842,11 +778,6 @@ neon_get_lfc_stats(PG_FUNCTION_ARGS)
if (lfc_ctl)
value = lfc_ctl->writes;
break;
case 4:
key = "file_cache_size";
if (lfc_ctl)
value = lfc_ctl->size;
break;
default:
SRF_RETURN_DONE(funcctx);
}
@@ -970,7 +901,7 @@ local_cache_pages(PG_FUNCTION_ARGS)
hash_seq_init(&status, lfc_hash);
while ((entry = hash_seq_search(&status)) != NULL)
{
for (int i = 0; i < CHUNK_BITMAP_SIZE; i++)
for (int i = 0; i < BLOCKS_PER_CHUNK / 32; i++)
n_pages += pg_popcount32(entry->bitmap[i]);
}
}

View File

@@ -54,10 +54,6 @@
#define BufTagGetNRelFileInfo(tag) tag.rnode
#define BufTagGetRelNumber(tagp) ((tagp)->rnode.relNode)
#define InvalidRelFileNumber InvalidOid
#define SMgrRelGetRelInfo(reln) \
(reln->smgr_rnode.node)

View File

@@ -151,21 +151,34 @@ impl<P: CancellationPublisherMut> CancellationHandler<Option<Arc<Mutex<P>>>> {
#[derive(Clone)]
pub struct CancelClosure {
socket_addr: SocketAddr,
cancel_token: CancelToken,
cancel_token: Option<CancelToken>,
}
impl CancelClosure {
pub fn new(socket_addr: SocketAddr, cancel_token: CancelToken) -> Self {
Self {
socket_addr,
cancel_token,
cancel_token: Some(cancel_token),
}
}
#[cfg(test)]
pub fn test() -> Self {
use std::net::{Ipv4Addr, SocketAddrV4};
Self {
socket_addr: SocketAddr::V4(SocketAddrV4::new(Ipv4Addr::from_bits(0), 0)),
cancel_token: None,
}
}
/// Cancels the query running on user's compute node.
pub async fn try_cancel_query(self) -> Result<(), CancelError> {
let socket = TcpStream::connect(self.socket_addr).await?;
self.cancel_token.cancel_query_raw(socket, NoTls).await?;
info!("query was cancelled");
if let Some(cancel_token) = self.cancel_token {
let socket = TcpStream::connect(self.socket_addr).await?;
cancel_token.cancel_query_raw(socket, NoTls).await?;
info!("query was cancelled");
}
Ok(())
}
}

View File

@@ -16,8 +16,10 @@ use rustls::{client::danger::ServerCertVerifier, pki_types::InvalidDnsNameError}
use std::{io, net::SocketAddr, sync::Arc, time::Duration};
use thiserror::Error;
use tokio::net::TcpStream;
use tokio_postgres::tls::MakeTlsConnect;
use tokio_postgres_rustls::MakeRustlsConnect;
use tokio_postgres::{
tls::{MakeTlsConnect, NoTlsError},
Client, Connection,
};
use tracing::{error, info, warn};
const COULD_NOT_CONNECT: &str = "Couldn't connect to compute node";
@@ -42,6 +44,12 @@ pub enum ConnectionError {
TooManyConnectionAttempts(#[from] ApiLockError),
}
impl From<NoTlsError> for ConnectionError {
fn from(value: NoTlsError) -> Self {
Self::CouldNotConnect(io::Error::new(io::ErrorKind::Other, value.to_string()))
}
}
impl UserFacingError for ConnectionError {
fn to_string_client(&self) -> String {
use ConnectionError::*;
@@ -273,6 +281,30 @@ pub struct PostgresConnection {
}
impl ConnCfg {
/// Connect to a corresponding compute node.
pub async fn managed_connect<M: MakeTlsConnect<tokio::net::TcpStream>>(
&self,
ctx: &RequestMonitoring,
timeout: Duration,
mktls: &mut M,
) -> Result<(SocketAddr, Client, Connection<TcpStream, M::Stream>), ConnectionError>
where
ConnectionError: From<M::Error>,
{
let pause = ctx.latency_timer_pause(crate::metrics::Waiting::Compute);
let (socket_addr, stream, host) = self.connect_raw(timeout).await?;
drop(pause);
let tls = mktls.make_tls_connect(host)?;
// connect_raw() will not use TLS if sslmode is "disable"
let pause = ctx.latency_timer_pause(crate::metrics::Waiting::Compute);
let (client, connection) = self.0.connect_raw(stream, tls).await?;
drop(pause);
Ok((socket_addr, client, connection))
}
/// Connect to a corresponding compute node.
pub async fn connect(
&self,
@@ -281,10 +313,6 @@ impl ConnCfg {
aux: MetricsAuxInfo,
timeout: Duration,
) -> Result<PostgresConnection, ConnectionError> {
let pause = ctx.latency_timer_pause(crate::metrics::Waiting::Compute);
let (socket_addr, stream, host) = self.connect_raw(timeout).await?;
drop(pause);
let client_config = if allow_self_signed_compute {
// Allow all certificates for creating the connection
let verifier = Arc::new(AcceptEverythingVerifier) as Arc<dyn ServerCertVerifier>;
@@ -298,21 +326,15 @@ impl ConnCfg {
let client_config = client_config.with_no_client_auth();
let mut mk_tls = tokio_postgres_rustls::MakeRustlsConnect::new(client_config);
let tls = <MakeRustlsConnect as MakeTlsConnect<tokio::net::TcpStream>>::make_tls_connect(
&mut mk_tls,
host,
)?;
// connect_raw() will not use TLS if sslmode is "disable"
let pause = ctx.latency_timer_pause(crate::metrics::Waiting::Compute);
let (client, connection) = self.0.connect_raw(stream, tls).await?;
drop(pause);
let (socket_addr, client, connection) =
self.managed_connect(ctx, timeout, &mut mk_tls).await?;
tracing::Span::current().record("pid", tracing::field::display(client.get_process_id()));
let stream = connection.stream.into_inner();
info!(
cold_start_info = ctx.cold_start_info().as_str(),
"connected to compute node at {host} ({socket_addr}) sslmode={:?}",
"connected to compute node ({socket_addr}) sslmode={:?}",
self.0.get_ssl_mode()
);

View File

@@ -5,7 +5,8 @@ use tracing::{field::display, info};
use crate::{
auth::{backend::ComputeCredentials, check_peer_addr_is_in_list, AuthError},
compute,
cancellation::CancelClosure,
compute::{self, ConnectionError},
config::{AuthenticationConfig, ProxyConfig},
console::{
errors::{GetAuthInfoError, WakeComputeError},
@@ -142,7 +143,7 @@ pub enum HttpConnError {
#[error("pooled connection closed at inconsistent state")]
ConnectionClosedAbruptly(#[from] tokio::sync::watch::error::SendError<uuid::Uuid>),
#[error("could not connection to compute")]
ConnectionError(#[from] tokio_postgres::Error),
ConnectionError(#[from] ConnectionError),
#[error("could not get auth info")]
GetAuthInfo(#[from] GetAuthInfoError),
@@ -229,17 +230,16 @@ impl ConnectMechanism for TokioMechanism {
let host = node_info.config.get_host()?;
let permit = self.locks.get_permit(&host).await?;
let mut config = (*node_info.config).clone();
let config = config
.user(&self.conn_info.user_info.user)
.password(&*self.conn_info.password)
.dbname(&self.conn_info.dbname)
.connect_timeout(timeout);
let (socket_addr, client, connection) = permit.release_result(
node_info
.config
.managed_connect(ctx, timeout, &mut tokio_postgres::NoTls)
.await,
)?;
let pause = ctx.latency_timer_pause(crate::metrics::Waiting::Compute);
let res = config.connect(tokio_postgres::NoTls).await;
drop(pause);
let (client, connection) = permit.release_result(res)?;
// NB: CancelToken is supposed to hold socket_addr, but we use connect_raw.
// Yet another reason to rework the connection establishing code.
let cancel_closure = CancelClosure::new(socket_addr, client.cancel_token());
tracing::Span::current().record("pid", tracing::field::display(client.get_process_id()));
Ok(poll_client(
@@ -250,8 +250,14 @@ impl ConnectMechanism for TokioMechanism {
connection,
self.conn_id,
node_info.aux.clone(),
cancel_closure,
))
}
fn update_connect_config(&self, _config: &mut compute::ConnCfg) {}
fn update_connect_config(&self, config: &mut compute::ConnCfg) {
config
.user(&self.conn_info.user_info.user)
.dbname(&self.conn_info.dbname)
.password(&self.conn_info.password);
}
}

View File

@@ -12,11 +12,13 @@ use std::{
ops::Deref,
sync::atomic::{self, AtomicUsize},
};
use tokio::net::TcpStream;
use tokio::time::Instant;
use tokio_postgres::tls::NoTlsStream;
use tokio_postgres::{AsyncMessage, ReadyForQueryStatus, Socket};
use tokio_postgres::{AsyncMessage, ReadyForQueryStatus};
use tokio_util::sync::CancellationToken;
use crate::cancellation::CancelClosure;
use crate::console::messages::{ColdStartInfo, MetricsAuxInfo};
use crate::metrics::{HttpEndpointPoolsGuard, Metrics};
use crate::usage_metrics::{Ids, MetricCounter, USAGE_METRICS};
@@ -463,14 +465,16 @@ impl<C: ClientInnerExt> GlobalConnPool<C> {
}
}
#[allow(clippy::too_many_arguments)]
pub fn poll_client<C: ClientInnerExt>(
global_pool: Arc<GlobalConnPool<C>>,
ctx: &RequestMonitoring,
conn_info: ConnInfo,
client: C,
mut connection: tokio_postgres::Connection<Socket, NoTlsStream>,
mut connection: tokio_postgres::Connection<TcpStream, NoTlsStream>,
conn_id: uuid::Uuid,
aux: MetricsAuxInfo,
cancel_closure: CancelClosure,
) -> Client<C> {
let conn_gauge = Metrics::get().proxy.db_connections.guard(ctx.protocol());
let mut session_id = ctx.session_id();
@@ -572,6 +576,7 @@ pub fn poll_client<C: ClientInnerExt>(
cancel,
aux,
conn_id,
cancel_closure,
};
Client::new(inner, conn_info, pool_clone)
}
@@ -582,6 +587,7 @@ struct ClientInner<C: ClientInnerExt> {
cancel: CancellationToken,
aux: MetricsAuxInfo,
conn_id: uuid::Uuid,
cancel_closure: CancelClosure,
}
impl<C: ClientInnerExt> Drop for ClientInner<C> {
@@ -646,7 +652,7 @@ impl<C: ClientInnerExt> Client<C> {
pool,
}
}
pub fn inner(&mut self) -> (&mut C, Discard<'_, C>) {
pub fn inner(&mut self) -> (&mut C, &CancelClosure, Discard<'_, C>) {
let Self {
inner,
pool,
@@ -654,7 +660,11 @@ impl<C: ClientInnerExt> Client<C> {
span: _,
} = self;
let inner = inner.as_mut().expect("client inner should not be removed");
(&mut inner.inner, Discard { pool, conn_info })
(
&mut inner.inner,
&inner.cancel_closure,
Discard { pool, conn_info },
)
}
}
@@ -751,6 +761,7 @@ mod tests {
cold_start_info: crate::console::messages::ColdStartInfo::Warm,
},
conn_id: uuid::Uuid::new_v4(),
cancel_closure: CancelClosure::test(),
}
}
@@ -785,7 +796,7 @@ mod tests {
{
let mut client = Client::new(create_inner(), conn_info.clone(), ep_pool.clone());
assert_eq!(0, pool.get_global_connections_count());
client.inner().1.discard();
client.inner().2.discard();
// Discard should not add the connection from the pool.
assert_eq!(0, pool.get_global_connections_count());
}

View File

@@ -26,7 +26,6 @@ use tokio_postgres::error::ErrorPosition;
use tokio_postgres::error::SqlState;
use tokio_postgres::GenericClient;
use tokio_postgres::IsolationLevel;
use tokio_postgres::NoTls;
use tokio_postgres::ReadyForQueryStatus;
use tokio_postgres::Transaction;
use tokio_util::sync::CancellationToken;
@@ -261,7 +260,9 @@ pub async fn handle(
let mut message = e.to_string_client();
let db_error = match &e {
SqlOverHttpError::ConnectCompute(HttpConnError::ConnectionError(e))
SqlOverHttpError::ConnectCompute(HttpConnError::ConnectionError(
crate::compute::ConnectionError::Postgres(e),
))
| SqlOverHttpError::Postgres(e) => e.as_db_error(),
_ => None,
};
@@ -622,8 +623,7 @@ impl QueryData {
client: &mut Client<tokio_postgres::Client>,
parsed_headers: HttpHeaders,
) -> Result<String, SqlOverHttpError> {
let (inner, mut discard) = client.inner();
let cancel_token = inner.cancel_token();
let (inner, cancel_token, mut discard) = client.inner();
let res = match select(
pin!(query_to_json(&*inner, self, &mut 0, parsed_headers)),
@@ -647,7 +647,7 @@ impl QueryData {
// The query was cancelled.
Either::Right((_cancelled, query)) => {
tracing::info!("cancelling query");
if let Err(err) = cancel_token.cancel_query(NoTls).await {
if let Err(err) = cancel_token.clone().try_cancel_query().await {
tracing::error!(?err, "could not cancel query");
}
// wait for the query cancellation
@@ -663,7 +663,9 @@ impl QueryData {
// query failed or was cancelled.
Ok(Err(error)) => {
let db_error = match &error {
SqlOverHttpError::ConnectCompute(HttpConnError::ConnectionError(e))
SqlOverHttpError::ConnectCompute(HttpConnError::ConnectionError(
crate::compute::ConnectionError::Postgres(e),
))
| SqlOverHttpError::Postgres(e) => e.as_db_error(),
_ => None,
};
@@ -694,8 +696,7 @@ impl BatchQueryData {
parsed_headers: HttpHeaders,
) -> Result<String, SqlOverHttpError> {
info!("starting transaction");
let (inner, mut discard) = client.inner();
let cancel_token = inner.cancel_token();
let (inner, cancel_token, mut discard) = client.inner();
let mut builder = inner.build_transaction();
if let Some(isolation_level) = parsed_headers.txn_isolation_level {
builder = builder.isolation_level(isolation_level);
@@ -728,7 +729,7 @@ impl BatchQueryData {
json_output
}
Err(SqlOverHttpError::Cancelled(_)) => {
if let Err(err) = cancel_token.cancel_query(NoTls).await {
if let Err(err) = cancel_token.clone().try_cancel_query().await {
tracing::error!(?err, "could not cancel query");
}
// TODO: after cancelling, wait to see if we can get a status. maybe the connection is still safe.

View File

@@ -92,7 +92,7 @@ impl TermHistory {
}
/// Find point of divergence between leader (walproposer) term history and
/// safekeeper. Arguments are not symmetric as proposer history ends at
/// safekeeper. Arguments are not symmetrics as proposer history ends at
/// +infinity while safekeeper at flush_lsn.
/// C version is at walproposer SendProposerElected.
pub fn find_highest_common_point(
@@ -701,13 +701,7 @@ where
.with_label_values(&["handle_elected"])
.start_timer();
info!(
"received ProposerElected {:?}, term={}, last_log_term={}, flush_lsn={}",
msg,
self.state.acceptor_state.term,
self.get_last_log_term(),
self.flush_lsn()
);
info!("received ProposerElected {:?}", msg);
if self.state.acceptor_state.term < msg.term {
let mut state = self.state.start_change();
state.acceptor_state.term = msg.term;
@@ -719,43 +713,22 @@ where
return Ok(None);
}
// Before truncating WAL check-cross the check divergence point received
// from the walproposer.
let sk_th = self.get_term_history();
let last_common_point = match TermHistory::find_highest_common_point(
&msg.term_history,
&sk_th,
self.flush_lsn(),
) {
// No common point. Expect streaming from the beginning of the
// history like walproposer while we don't have proper init.
None => *msg.term_history.0.first().ok_or(anyhow::anyhow!(
"empty walproposer term history {:?}",
msg.term_history
))?,
Some(lcp) => lcp,
};
// This is expected to happen in a rare race when another connection
// from the same walproposer writes + flushes WAL after this connection
// sent flush_lsn in VoteRequest; for instance, very late
// ProposerElected message delivery after another connection was
// established and wrote WAL. In such cases error is transient;
// reconnection makes safekeeper send newest term history and flush_lsn
// and walproposer recalculates the streaming point. OTOH repeating
// error indicates a serious bug.
if last_common_point.lsn != msg.start_streaming_at {
bail!("refusing ProposerElected with unexpected truncation point: lcp={:?} start_streaming_at={}, term={}, sk_th={:?} flush_lsn={}, wp_th={:?}",
last_common_point, msg.start_streaming_at,
self.state.acceptor_state.term, sk_th, self.flush_lsn(), msg.term_history,
);
// This might happen in a rare race when another (old) connection from
// the same walproposer writes + flushes WAL after this connection
// already sent flush_lsn in VoteRequest. It is generally safe to
// proceed, but to prevent commit_lsn surprisingly going down we should
// either refuse the session (simpler) or skip the part we already have
// from the stream (can be implemented).
if msg.term == self.get_last_log_term() && self.flush_lsn() > msg.start_streaming_at {
bail!("refusing ProposerElected which is going to overwrite correct WAL: term={}, flush_lsn={}, start_streaming_at={}; restarting the handshake should help",
msg.term, self.flush_lsn(), msg.start_streaming_at)
}
// We are also expected to never attempt to truncate committed data.
// Otherwise we must never attempt to truncate committed data.
assert!(
msg.start_streaming_at >= self.state.inmem.commit_lsn,
"attempt to truncate committed data: start_streaming_at={}, commit_lsn={}, term={}, sk_th={:?} flush_lsn={}, wp_th={:?}",
msg.start_streaming_at, self.state.inmem.commit_lsn,
self.state.acceptor_state.term, sk_th, self.flush_lsn(), msg.term_history,
"attempt to truncate committed data: start_streaming_at={}, commit_lsn={}",
msg.start_streaming_at,
self.state.inmem.commit_lsn
);
// Before first WAL write initialize its segment. It makes first segment
@@ -770,6 +743,9 @@ where
.await?;
}
// TODO: cross check divergence point, check if msg.start_streaming_at corresponds to
// intersection of our history and history from msg
// truncate wal, update the LSNs
self.wal_store.truncate_wal(msg.start_streaming_at).await?;
@@ -1093,7 +1069,7 @@ mod tests {
let pem = ProposerElected {
term: 1,
start_streaming_at: Lsn(3),
start_streaming_at: Lsn(1),
term_history: TermHistory(vec![TermLsn {
term: 1,
lsn: Lsn(3),

View File

@@ -520,19 +520,6 @@ async fn handle_node_status(req: Request<Body>) -> Result<Response<Body>, ApiErr
json_response(StatusCode::OK, node_status)
}
async fn handle_get_leader(req: Request<Body>) -> Result<Response<Body>, ApiError> {
check_permissions(&req, Scope::Admin)?;
let state = get_state(&req);
let leader = state.service.get_leader().await.map_err(|err| {
ApiError::InternalServerError(anyhow::anyhow!(
"Failed to read leader from database: {err}"
))
})?;
json_response(StatusCode::OK, leader)
}
async fn handle_node_drain(req: Request<Body>) -> Result<Response<Body>, ApiError> {
check_permissions(&req, Scope::Admin)?;
@@ -1029,9 +1016,6 @@ pub fn make_router(
.get("/control/v1/node/:node_id", |r| {
named_request_span(r, handle_node_status, RequestName("control_v1_node_status"))
})
.get("/control/v1/leader", |r| {
named_request_span(r, handle_get_leader, RequestName("control_v1_get_leader"))
})
.put("/control/v1/node/:node_id/drain", |r| {
named_request_span(r, handle_node_drain, RequestName("control_v1_node_drain"))
})

View File

@@ -1,7 +1,7 @@
use crate::tenant_shard::ObservedState;
use pageserver_api::shard::TenantShardId;
use serde::{Deserialize, Serialize};
use std::{collections::HashMap, time::Duration};
use std::collections::HashMap;
use tokio_util::sync::CancellationToken;
use hyper::Uri;
@@ -69,8 +69,6 @@ impl PeerClient {
req
};
let req = req.timeout(Duration::from_secs(2));
let res = req
.send()
.await

View File

@@ -20,8 +20,7 @@ use crate::{
metrics,
peer_client::{GlobalObservedState, PeerClient},
persistence::{
AbortShardSplitStatus, ControllerPersistence, DatabaseResult, MetadataHealthPersistence,
TenantFilter,
AbortShardSplitStatus, ControllerPersistence, MetadataHealthPersistence, TenantFilter,
},
reconciler::{ReconcileError, ReconcileUnits, ReconcilerConfig, ReconcilerConfigBuilder},
scheduler::{MaySchedule, ScheduleContext, ScheduleMode},
@@ -490,6 +489,11 @@ pub(crate) enum ReconcileResultRequest {
Stop,
}
struct LeaderStepDownState {
observed: GlobalObservedState,
leader: ControllerPersistence,
}
impl Service {
pub fn get_config(&self) -> &Config {
&self.config
@@ -500,8 +504,7 @@ impl Service {
#[instrument(skip_all)]
async fn startup_reconcile(
self: &Arc<Service>,
current_leader: Option<ControllerPersistence>,
leader_step_down_state: Option<GlobalObservedState>,
leader_step_down_state: Option<LeaderStepDownState>,
bg_compute_notify_result_tx: tokio::sync::mpsc::Sender<
Result<(), (TenantShardId, NotifyError)>,
>,
@@ -519,15 +522,17 @@ impl Service {
.checked_add(STARTUP_RECONCILE_TIMEOUT / 2)
.expect("Reconcile timeout is a modest constant");
let observed = if let Some(state) = leader_step_down_state {
let (observed, current_leader) = if let Some(state) = leader_step_down_state {
tracing::info!(
"Using observed state received from leader at {}",
current_leader.as_ref().unwrap().address
state.leader.address,
);
state
(state.observed, Some(state.leader))
} else {
self.build_global_observed_state(node_scan_deadline).await
(
self.build_global_observed_state(node_scan_deadline).await,
None,
)
};
// Accumulate a list of any tenant locations that ought to be detached
@@ -1377,32 +1382,13 @@ impl Service {
};
let leadership_status = this.inner.read().unwrap().get_leadership_status();
let leader = match this.get_leader().await {
Ok(ok) => ok,
Err(err) => {
tracing::error!(
"Failed to query database for current leader: {err}. Aborting start-up ..."
);
std::process::exit(1);
}
};
let leader_step_down_state = match leadership_status {
LeadershipStatus::Candidate => {
if let Some(ref leader) = leader {
this.request_step_down(leader).await
} else {
tracing::info!(
"No leader found to request step down from. Will build observed state."
);
None
}
}
let peer_observed_state = match leadership_status {
LeadershipStatus::Candidate => this.request_step_down().await,
LeadershipStatus::Leader => None,
LeadershipStatus::SteppedDown => unreachable!(),
};
this.startup_reconcile(leader, leader_step_down_state, bg_compute_notify_result_tx)
this.startup_reconcile(peer_observed_state, bg_compute_notify_result_tx)
.await;
drop(startup_completion);
@@ -4664,10 +4650,6 @@ impl Service {
))
}
pub(crate) async fn get_leader(&self) -> DatabaseResult<Option<ControllerPersistence>> {
self.persistence.get_leader().await
}
pub(crate) async fn node_register(
&self,
register_req: NodeRegisterRequest,
@@ -6360,7 +6342,6 @@ impl Service {
pub(crate) async fn step_down(&self) -> GlobalObservedState {
tracing::info!("Received step down request from peer");
failpoint_support::sleep_millis_async!("sleep-on-step-down-handling");
self.inner.write().unwrap().step_down();
// TODO: would it make sense to have a time-out for this?
@@ -6386,31 +6367,50 @@ impl Service {
///
/// On failures to query the database or step down error responses the process is killed
/// and we rely on k8s to retry.
async fn request_step_down(
&self,
leader: &ControllerPersistence,
) -> Option<GlobalObservedState> {
tracing::info!("Sending step down request to {leader:?}");
// TODO: jwt token
let client = PeerClient::new(
Uri::try_from(leader.address.as_str()).expect("Failed to build leader URI"),
self.config.jwt_token.clone(),
);
let state = client.step_down(&self.cancel).await;
match state {
Ok(state) => Some(state),
async fn request_step_down(&self) -> Option<LeaderStepDownState> {
let leader = match self.persistence.get_leader().await {
Ok(leader) => leader,
Err(err) => {
// TODO: Make leaders periodically update a timestamp field in the
// database and, if the leader is not reachable from the current instance,
// but inferred as alive from the timestamp, abort start-up. This avoids
// a potential scenario in which we have two controllers acting as leaders.
tracing::error!(
"Leader ({}) did not respond to step-down request: {}",
leader.address,
err
"Failed to query database for current leader: {err}. Aborting start-up ..."
);
std::process::exit(1);
}
};
match leader {
Some(leader) => {
tracing::info!("Sending step down request to {leader:?}");
// TODO: jwt token
let client = PeerClient::new(
Uri::try_from(leader.address.as_str()).expect("Failed to build leader URI"),
self.config.jwt_token.clone(),
);
let state = client.step_down(&self.cancel).await;
match state {
Ok(state) => Some(LeaderStepDownState {
observed: state,
leader: leader.clone(),
}),
Err(err) => {
// TODO: Make leaders periodically update a timestamp field in the
// database and, if the leader is not reachable from the current instance,
// but inferred as alive from the timestamp, abort start-up. This avoids
// a potential scenario in which we have two controllers acting as leaders.
tracing::error!(
"Leader ({}) did not respond to step-down request: {}",
leader.address,
err
);
None
}
}
}
None => {
tracing::info!(
"No leader found to request step down from. Will build observed state."
);
None
}
}

View File

@@ -3,10 +3,9 @@ use camino::Utf8PathBuf;
use pageserver_api::controller_api::{MetadataHealthUpdateRequest, MetadataHealthUpdateResponse};
use pageserver_api::shard::TenantShardId;
use reqwest::{Method, Url};
use storage_controller_client::control_api;
use storage_scrubber::garbage::{find_garbage, purge_garbage, PurgeMode};
use storage_scrubber::pageserver_physical_gc::GcMode;
use storage_scrubber::scan_pageserver_metadata::scan_pageserver_metadata;
use storage_scrubber::scan_pageserver_metadata::scan_metadata;
use storage_scrubber::tenant_snapshot::SnapshotDownloader;
use storage_scrubber::{find_large_objects, ControllerClientConfig};
use storage_scrubber::{
@@ -69,7 +68,7 @@ enum Command {
#[arg(long = "tenant-id", num_args = 0..)]
tenant_ids: Vec<TenantShardId>,
#[arg(long = "post", default_value_t = false)]
post_to_storcon: bool,
post_to_storage_controller: bool,
#[arg(long, default_value = None)]
/// For safekeeper node_kind only, points to db with debug dump
dump_db_connstr: Option<String>,
@@ -101,16 +100,6 @@ enum Command {
#[arg(long = "concurrency", short = 'j', default_value_t = 64)]
concurrency: usize,
},
CronJob {
// PageserverPhysicalGc
#[arg(long = "min-age")]
gc_min_age: humantime::Duration,
#[arg(short, long, default_value_t = GcMode::IndicesOnly)]
gc_mode: GcMode,
// ScanMetadata
#[arg(long = "post", default_value_t = false)]
post_to_storcon: bool,
},
}
#[tokio::main]
@@ -128,7 +117,6 @@ async fn main() -> anyhow::Result<()> {
Command::TenantSnapshot { .. } => "tenant-snapshot",
Command::PageserverPhysicalGc { .. } => "pageserver-physical-gc",
Command::FindLargeObjects { .. } => "find-large-objects",
Command::CronJob { .. } => "cron-job",
};
let _guard = init_logging(&format!(
"{}_{}_{}_{}.log",
@@ -138,13 +126,12 @@ async fn main() -> anyhow::Result<()> {
chrono::Utc::now().format("%Y_%m_%d__%H_%M_%S")
));
let controller_client = cli.controller_api.map(|controller_api| {
let controller_client_conf = cli.controller_api.map(|controller_api| {
ControllerClientConfig {
controller_api,
// Default to no key: this is a convenience when working in a development environment
controller_jwt: cli.controller_jwt.unwrap_or("".to_owned()),
}
.build_client()
});
match cli.command {
@@ -152,7 +139,7 @@ async fn main() -> anyhow::Result<()> {
json,
tenant_ids,
node_kind,
post_to_storcon,
post_to_storage_controller,
dump_db_connstr,
dump_db_table,
} => {
@@ -191,14 +178,53 @@ async fn main() -> anyhow::Result<()> {
}
Ok(())
} else {
scan_pageserver_metadata_cmd(
bucket_config,
controller_client.as_ref(),
tenant_ids,
json,
post_to_storcon,
)
.await
if controller_client_conf.is_none() && post_to_storage_controller {
return Err(anyhow!("Posting pageserver scan health status to storage controller requires `--controller-api` and `--controller-jwt` to run"));
}
match scan_metadata(bucket_config.clone(), tenant_ids).await {
Err(e) => {
tracing::error!("Failed: {e}");
Err(e)
}
Ok(summary) => {
if json {
println!("{}", serde_json::to_string(&summary).unwrap())
} else {
println!("{}", summary.summary_string());
}
if post_to_storage_controller {
if let Some(conf) = controller_client_conf {
let controller_client = conf.build_client();
let body = summary.build_health_update_request();
controller_client
.dispatch::<MetadataHealthUpdateRequest, MetadataHealthUpdateResponse>(
Method::POST,
"control/v1/metadata_health/update".to_string(),
Some(body),
)
.await?;
}
}
if summary.is_fatal() {
tracing::error!("Fatal scrub errors detected");
} else if summary.is_empty() {
// Strictly speaking an empty bucket is a valid bucket, but if someone ran the
// scrubber they were likely expecting to scan something, and if we see no timelines
// at all then it's likely due to some configuration issues like a bad prefix
tracing::error!(
"No timelines found in bucket {} prefix {}",
bucket_config.bucket,
bucket_config
.prefix_in_bucket
.unwrap_or("<none>".to_string())
);
}
Ok(())
}
}
}
}
Command::FindGarbage {
@@ -228,14 +254,31 @@ async fn main() -> anyhow::Result<()> {
min_age,
mode,
} => {
pageserver_physical_gc_cmd(
&bucket_config,
controller_client.as_ref(),
match (&controller_client_conf, mode) {
(Some(_), _) => {
// Any mode may run when controller API is set
}
(None, GcMode::Full) => {
// The part of physical GC where we erase ancestor layers cannot be done safely without
// confirming the most recent complete shard split with the controller. Refuse to run, rather
// than doing it unsafely.
return Err(anyhow!("Full physical GC requires `--controller-api` and `--controller-jwt` to run"));
}
(None, GcMode::DryRun | GcMode::IndicesOnly) => {
// These GcModes do not require the controller to run.
}
}
let summary = pageserver_physical_gc(
bucket_config,
controller_client_conf,
tenant_ids,
min_age,
min_age.into(),
mode,
)
.await
.await?;
println!("{}", serde_json::to_string(&summary).unwrap());
Ok(())
}
Command::FindLargeObjects {
min_size,
@@ -252,142 +295,5 @@ async fn main() -> anyhow::Result<()> {
println!("{}", serde_json::to_string(&summary).unwrap());
Ok(())
}
Command::CronJob {
gc_min_age,
gc_mode,
post_to_storcon,
} => {
run_cron_job(
bucket_config,
controller_client.as_ref(),
gc_min_age,
gc_mode,
post_to_storcon,
)
.await
}
}
}
/// Runs the scrubber cron job.
/// 1. Do pageserver physical gc
/// 2. Scan pageserver metadata
pub async fn run_cron_job(
bucket_config: BucketConfig,
controller_client: Option<&control_api::Client>,
gc_min_age: humantime::Duration,
gc_mode: GcMode,
post_to_storcon: bool,
) -> anyhow::Result<()> {
tracing::info!(%gc_min_age, %gc_mode, "Running pageserver-physical-gc");
pageserver_physical_gc_cmd(
&bucket_config,
controller_client,
Vec::new(),
gc_min_age,
gc_mode,
)
.await?;
tracing::info!(%post_to_storcon, node_kind = %NodeKind::Pageserver, "Running scan-metadata");
scan_pageserver_metadata_cmd(
bucket_config,
controller_client,
Vec::new(),
true,
post_to_storcon,
)
.await?;
Ok(())
}
pub async fn pageserver_physical_gc_cmd(
bucket_config: &BucketConfig,
controller_client: Option<&control_api::Client>,
tenant_shard_ids: Vec<TenantShardId>,
min_age: humantime::Duration,
mode: GcMode,
) -> anyhow::Result<()> {
match (controller_client, mode) {
(Some(_), _) => {
// Any mode may run when controller API is set
}
(None, GcMode::Full) => {
// The part of physical GC where we erase ancestor layers cannot be done safely without
// confirming the most recent complete shard split with the controller. Refuse to run, rather
// than doing it unsafely.
return Err(anyhow!(
"Full physical GC requires `--controller-api` and `--controller-jwt` to run"
));
}
(None, GcMode::DryRun | GcMode::IndicesOnly) => {
// These GcModes do not require the controller to run.
}
}
let summary = pageserver_physical_gc(
bucket_config,
controller_client,
tenant_shard_ids,
min_age.into(),
mode,
)
.await?;
println!("{}", serde_json::to_string(&summary).unwrap());
Ok(())
}
pub async fn scan_pageserver_metadata_cmd(
bucket_config: BucketConfig,
controller_client: Option<&control_api::Client>,
tenant_shard_ids: Vec<TenantShardId>,
json: bool,
post_to_storcon: bool,
) -> anyhow::Result<()> {
if controller_client.is_none() && post_to_storcon {
return Err(anyhow!("Posting pageserver scan health status to storage controller requires `--controller-api` and `--controller-jwt` to run"));
}
match scan_pageserver_metadata(bucket_config.clone(), tenant_shard_ids).await {
Err(e) => {
tracing::error!("Failed: {e}");
Err(e)
}
Ok(summary) => {
if json {
println!("{}", serde_json::to_string(&summary).unwrap())
} else {
println!("{}", summary.summary_string());
}
if post_to_storcon {
if let Some(client) = controller_client {
let body = summary.build_health_update_request();
client
.dispatch::<MetadataHealthUpdateRequest, MetadataHealthUpdateResponse>(
Method::POST,
"control/v1/metadata_health/update".to_string(),
Some(body),
)
.await?;
}
}
if summary.is_fatal() {
tracing::error!("Fatal scrub errors detected");
} else if summary.is_empty() {
// Strictly speaking an empty bucket is a valid bucket, but if someone ran the
// scrubber they were likely expecting to scan something, and if we see no timelines
// at all then it's likely due to some configuration issues like a bad prefix
tracing::error!(
"No timelines found in bucket {} prefix {}",
bucket_config.bucket,
bucket_config
.prefix_in_bucket
.unwrap_or("<none>".to_string())
);
}
Ok(())
}
}
}

View File

@@ -4,7 +4,9 @@ use std::time::{Duration, SystemTime};
use crate::checks::{list_timeline_blobs, BlobDataParseResult};
use crate::metadata_stream::{stream_tenant_timelines, stream_tenants};
use crate::{init_remote, BucketConfig, NodeKind, RootTarget, TenantShardTimelineId};
use crate::{
init_remote, BucketConfig, ControllerClientConfig, NodeKind, RootTarget, TenantShardTimelineId,
};
use aws_sdk_s3::Client;
use futures_util::{StreamExt, TryStreamExt};
use pageserver::tenant::remote_timeline_client::index::LayerFileMetadata;
@@ -471,8 +473,8 @@ async fn gc_ancestor(
/// This type of GC is not necessary for correctness: rather it serves to reduce wasted storage capacity, and
/// make sure that object listings don't get slowed down by large numbers of garbage objects.
pub async fn pageserver_physical_gc(
bucket_config: &BucketConfig,
controller_client: Option<&control_api::Client>,
bucket_config: BucketConfig,
controller_client_conf: Option<ControllerClientConfig>,
tenant_shard_ids: Vec<TenantShardId>,
min_age: Duration,
mode: GcMode,
@@ -556,7 +558,7 @@ pub async fn pageserver_physical_gc(
let timelines = timelines.map_ok(|ttid| {
gc_timeline(
&s3_client,
bucket_config,
&bucket_config,
&min_age,
&target,
mode,
@@ -572,7 +574,7 @@ pub async fn pageserver_physical_gc(
}
// Execute cross-shard GC, using the accumulator's full view of all the shards built in the per-shard GC
let Some(client) = controller_client else {
let Some(controller_client) = controller_client_conf.map(|c| c.build_client()) else {
tracing::info!("Skipping ancestor layer GC, because no `--controller-api` was specified");
return Ok(summary);
};
@@ -581,13 +583,13 @@ pub async fn pageserver_physical_gc(
.unwrap()
.into_inner()
.unwrap()
.into_gc_ancestors(client, &mut summary)
.into_gc_ancestors(&controller_client, &mut summary)
.await;
for ancestor_shard in ancestor_shards {
gc_ancestor(
&s3_client,
bucket_config,
&bucket_config,
&target,
&min_age,
ancestor_shard,

View File

@@ -116,7 +116,7 @@ Index versions: {version_summary}
}
/// Scan the pageserver metadata in an S3 bucket, reporting errors and statistics.
pub async fn scan_pageserver_metadata(
pub async fn scan_metadata(
bucket_config: BucketConfig,
tenant_ids: Vec<TenantShardId>,
) -> anyhow::Result<MetadataSummary> {

View File

@@ -3,7 +3,6 @@ pytest_plugins = (
"fixtures.parametrize",
"fixtures.httpserver",
"fixtures.compute_reconfigure",
"fixtures.storage_controller_proxy",
"fixtures.neon_fixtures",
"fixtures.benchmark_fixture",
"fixtures.pg_stats",

View File

@@ -497,7 +497,6 @@ class NeonEnvBuilder:
pageserver_aux_file_policy: Optional[AuxFileStore] = None,
pageserver_default_tenant_config_compaction_algorithm: Optional[Dict[str, Any]] = None,
safekeeper_extra_opts: Optional[list[str]] = None,
storage_controller_port_override: Optional[int] = None,
):
self.repo_dir = repo_dir
self.rust_log_override = rust_log_override
@@ -550,8 +549,6 @@ class NeonEnvBuilder:
self.safekeeper_extra_opts = safekeeper_extra_opts
self.storage_controller_port_override = storage_controller_port_override
assert test_name.startswith(
"test_"
), "Unexpectedly instantiated from outside a test function"
@@ -1057,7 +1054,6 @@ class NeonEnv:
"""
BASE_PAGESERVER_ID = 1
storage_controller: NeonStorageController | NeonProxiedStorageController
def __init__(self, config: NeonEnvBuilder):
self.repo_dir = config.repo_dir
@@ -1088,41 +1084,27 @@ class NeonEnv:
self.initial_tenant = config.initial_tenant
self.initial_timeline = config.initial_timeline
# Find two adjacent ports for storage controller and its postgres DB. This
# loop would eventually throw from get_port() if we run out of ports (extremely
# unlikely): usually we find two adjacent free ports on the first iteration.
while True:
self.storage_controller_port = self.port_distributor.get_port()
storage_controller_pg_port = self.port_distributor.get_port()
if storage_controller_pg_port == self.storage_controller_port + 1:
break
# The URL for the pageserver to use as its control_plane_api config
if config.storage_controller_port_override is not None:
log.info(
f"Using storage controller api override {config.storage_controller_port_override}"
)
self.storage_controller_port = config.storage_controller_port_override
self.storage_controller = NeonProxiedStorageController(
self, config.storage_controller_port_override, config.auth_enabled
)
else:
# Find two adjacent ports for storage controller and its postgres DB. This
# loop would eventually throw from get_port() if we run out of ports (extremely
# unlikely): usually we find two adjacent free ports on the first iteration.
while True:
storage_controller_port = self.port_distributor.get_port()
storage_controller_pg_port = self.port_distributor.get_port()
if storage_controller_pg_port == storage_controller_port + 1:
break
self.storage_controller_port = storage_controller_port
self.storage_controller = NeonStorageController(
self, storage_controller_port, config.auth_enabled
)
log.info(
f"Using generated control_plane_api: {self.storage_controller.upcall_api_endpoint()}"
)
self.storage_controller_api: str = self.storage_controller.api_root()
self.control_plane_api: str = self.storage_controller.upcall_api_endpoint()
self.control_plane_api: str = f"http://127.0.0.1:{self.storage_controller_port}/upcall/v1"
# The base URL of the storage controller
self.storage_controller_api: str = f"http://127.0.0.1:{self.storage_controller_port}"
# For testing this with a fake HTTP server, enable passing through a URL from config
self.control_plane_compute_hook_api = config.control_plane_compute_hook_api
self.storage_controller: NeonStorageController = NeonStorageController(
self, config.auth_enabled
)
self.pageserver_virtual_file_io_engine = config.pageserver_virtual_file_io_engine
self.pageserver_aux_file_policy = config.pageserver_aux_file_policy
@@ -1162,6 +1144,7 @@ class NeonEnv:
"listen_http_addr": f"localhost:{pageserver_port.http}",
"pg_auth_type": pg_auth_type,
"http_auth_type": http_auth_type,
"image_compression": "zstd",
}
if self.pageserver_virtual_file_io_engine is not None:
ps_cfg["virtual_file_io_engine"] = self.pageserver_virtual_file_io_engine
@@ -1886,24 +1869,16 @@ class NeonCli(AbstractNeonCli):
def storage_controller_start(
self,
timeout_in_seconds: Optional[int] = None,
instance_id: Optional[int] = None,
base_port: Optional[int] = None,
):
cmd = ["storage_controller", "start"]
if timeout_in_seconds is not None:
cmd.append(f"--start-timeout={timeout_in_seconds}s")
if instance_id is not None:
cmd.append(f"--instance-id={instance_id}")
if base_port is not None:
cmd.append(f"--base-port={base_port}")
return self.raw_cli(cmd)
def storage_controller_stop(self, immediate: bool, instance_id: Optional[int] = None):
def storage_controller_stop(self, immediate: bool):
cmd = ["storage_controller", "stop"]
if immediate:
cmd.extend(["-m", "immediate"])
if instance_id is not None:
cmd.append(f"--instance-id={instance_id}")
return self.raw_cli(cmd)
def pageserver_start(
@@ -2214,30 +2189,17 @@ class PageserverSchedulingPolicy(str, Enum):
PAUSE_FOR_RESTART = "PauseForRestart"
class StorageControllerLeadershipStatus(str, Enum):
LEADER = "leader"
STEPPED_DOWN = "stepped_down"
CANDIDATE = "candidate"
class NeonStorageController(MetricsGetter, LogUtils):
def __init__(self, env: NeonEnv, port: int, auth_enabled: bool):
def __init__(self, env: NeonEnv, auth_enabled: bool):
self.env = env
self.port: int = port
self.api: str = f"http://127.0.0.1:{port}"
self.running = False
self.auth_enabled = auth_enabled
self.allowed_errors: list[str] = DEFAULT_STORAGE_CONTROLLER_ALLOWED_ERRORS
self.logfile = self.env.repo_dir / "storage_controller_1" / "storage_controller.log"
self.logfile = self.workdir / "storage_controller.log"
def start(
self,
timeout_in_seconds: Optional[int] = None,
instance_id: Optional[int] = None,
base_port: Optional[int] = None,
):
def start(self, timeout_in_seconds: Optional[int] = None):
assert not self.running
self.env.neon_cli.storage_controller_start(timeout_in_seconds, instance_id, base_port)
self.env.neon_cli.storage_controller_start(timeout_in_seconds)
self.running = True
return self
@@ -2247,12 +2209,6 @@ class NeonStorageController(MetricsGetter, LogUtils):
self.running = False
return self
def upcall_api_endpoint(self) -> str:
return f"{self.api}/upcall/v1"
def api_root(self) -> str:
return self.api
@staticmethod
def retryable_node_operation(op, ps_id, max_attempts, backoff):
while max_attempts > 0:
@@ -2281,9 +2237,7 @@ class NeonStorageController(MetricsGetter, LogUtils):
def assert_no_errors(self):
assert_no_errors(
self.logfile,
"storage_controller",
self.allowed_errors,
self.env.repo_dir / "storage_controller.log", "storage_controller", self.allowed_errors
)
def pageserver_api(self) -> PageserverHttpClient:
@@ -2295,7 +2249,7 @@ class NeonStorageController(MetricsGetter, LogUtils):
auth_token = None
if self.auth_enabled:
auth_token = self.env.auth_keys.generate_token(scope=TokenScope.PAGE_SERVER_API)
return PageserverHttpClient(self.port, lambda: True, auth_token)
return PageserverHttpClient(self.env.storage_controller_port, lambda: True, auth_token)
def request(self, method, *args, **kwargs) -> requests.Response:
resp = requests.request(method, *args, **kwargs)
@@ -2312,13 +2266,13 @@ class NeonStorageController(MetricsGetter, LogUtils):
return headers
def get_metrics(self) -> Metrics:
res = self.request("GET", f"{self.api}/metrics")
res = self.request("GET", f"{self.env.storage_controller_api}/metrics")
return parse_metrics(res.text)
def ready(self) -> bool:
status = None
try:
resp = self.request("GET", f"{self.api}/ready")
resp = self.request("GET", f"{self.env.storage_controller_api}/ready")
status = resp.status_code
except StorageControllerApiException as e:
status = e.status_code
@@ -2351,7 +2305,7 @@ class NeonStorageController(MetricsGetter, LogUtils):
response = self.request(
"POST",
f"{self.api}/debug/v1/attach-hook",
f"{self.env.storage_controller_api}/debug/v1/attach-hook",
json=body,
headers=self.headers(TokenScope.ADMIN),
)
@@ -2362,7 +2316,7 @@ class NeonStorageController(MetricsGetter, LogUtils):
def attach_hook_drop(self, tenant_shard_id: Union[TenantId, TenantShardId]):
self.request(
"POST",
f"{self.api}/debug/v1/attach-hook",
f"{self.env.storage_controller_api}/debug/v1/attach-hook",
json={"tenant_shard_id": str(tenant_shard_id), "node_id": None},
headers=self.headers(TokenScope.ADMIN),
)
@@ -2373,7 +2327,7 @@ class NeonStorageController(MetricsGetter, LogUtils):
"""
response = self.request(
"POST",
f"{self.api}/debug/v1/inspect",
f"{self.env.storage_controller_api}/debug/v1/inspect",
json={"tenant_shard_id": str(tenant_shard_id)},
headers=self.headers(TokenScope.ADMIN),
)
@@ -2396,7 +2350,7 @@ class NeonStorageController(MetricsGetter, LogUtils):
log.info(f"node_register({body})")
self.request(
"POST",
f"{self.api}/control/v1/node",
f"{self.env.storage_controller_api}/control/v1/node",
json=body,
headers=self.headers(TokenScope.ADMIN),
)
@@ -2405,7 +2359,7 @@ class NeonStorageController(MetricsGetter, LogUtils):
log.info(f"node_delete({node_id})")
self.request(
"DELETE",
f"{self.api}/control/v1/node/{node_id}",
f"{self.env.storage_controller_api}/control/v1/node/{node_id}",
headers=self.headers(TokenScope.ADMIN),
)
@@ -2413,7 +2367,7 @@ class NeonStorageController(MetricsGetter, LogUtils):
log.info(f"node_drain({node_id})")
self.request(
"PUT",
f"{self.api}/control/v1/node/{node_id}/drain",
f"{self.env.storage_controller_api}/control/v1/node/{node_id}/drain",
headers=self.headers(TokenScope.ADMIN),
)
@@ -2421,7 +2375,7 @@ class NeonStorageController(MetricsGetter, LogUtils):
log.info(f"cancel_node_drain({node_id})")
self.request(
"DELETE",
f"{self.api}/control/v1/node/{node_id}/drain",
f"{self.env.storage_controller_api}/control/v1/node/{node_id}/drain",
headers=self.headers(TokenScope.ADMIN),
)
@@ -2429,7 +2383,7 @@ class NeonStorageController(MetricsGetter, LogUtils):
log.info(f"node_fill({node_id})")
self.request(
"PUT",
f"{self.api}/control/v1/node/{node_id}/fill",
f"{self.env.storage_controller_api}/control/v1/node/{node_id}/fill",
headers=self.headers(TokenScope.ADMIN),
)
@@ -2437,22 +2391,14 @@ class NeonStorageController(MetricsGetter, LogUtils):
log.info(f"cancel_node_fill({node_id})")
self.request(
"DELETE",
f"{self.api}/control/v1/node/{node_id}/fill",
f"{self.env.storage_controller_api}/control/v1/node/{node_id}/fill",
headers=self.headers(TokenScope.ADMIN),
)
def node_status(self, node_id):
response = self.request(
"GET",
f"{self.api}/control/v1/node/{node_id}",
headers=self.headers(TokenScope.ADMIN),
)
return response.json()
def get_leader(self):
response = self.request(
"GET",
f"{self.api}/control/v1/leader",
f"{self.env.storage_controller_api}/control/v1/node/{node_id}",
headers=self.headers(TokenScope.ADMIN),
)
return response.json()
@@ -2460,7 +2406,7 @@ class NeonStorageController(MetricsGetter, LogUtils):
def node_list(self):
response = self.request(
"GET",
f"{self.api}/control/v1/node",
f"{self.env.storage_controller_api}/control/v1/node",
headers=self.headers(TokenScope.ADMIN),
)
return response.json()
@@ -2468,7 +2414,7 @@ class NeonStorageController(MetricsGetter, LogUtils):
def tenant_list(self):
response = self.request(
"GET",
f"{self.api}/debug/v1/tenant",
f"{self.env.storage_controller_api}/debug/v1/tenant",
headers=self.headers(TokenScope.ADMIN),
)
return response.json()
@@ -2478,7 +2424,7 @@ class NeonStorageController(MetricsGetter, LogUtils):
body["node_id"] = node_id
self.request(
"PUT",
f"{self.api}/control/v1/node/{node_id}/config",
f"{self.env.storage_controller_api}/control/v1/node/{node_id}/config",
json=body,
headers=self.headers(TokenScope.ADMIN),
)
@@ -2513,7 +2459,7 @@ class NeonStorageController(MetricsGetter, LogUtils):
response = self.request(
"POST",
f"{self.api}/v1/tenant",
f"{self.env.storage_controller_api}/v1/tenant",
json=body,
headers=self.headers(TokenScope.PAGE_SERVER_API),
)
@@ -2526,7 +2472,7 @@ class NeonStorageController(MetricsGetter, LogUtils):
"""
response = self.request(
"GET",
f"{self.api}/debug/v1/tenant/{tenant_id}/locate",
f"{self.env.storage_controller_api}/debug/v1/tenant/{tenant_id}/locate",
headers=self.headers(TokenScope.ADMIN),
)
body = response.json()
@@ -2539,7 +2485,7 @@ class NeonStorageController(MetricsGetter, LogUtils):
"""
response = self.request(
"GET",
f"{self.api}/control/v1/tenant/{tenant_id}",
f"{self.env.storage_controller_api}/control/v1/tenant/{tenant_id}",
headers=self.headers(TokenScope.ADMIN),
)
response.raise_for_status()
@@ -2550,7 +2496,7 @@ class NeonStorageController(MetricsGetter, LogUtils):
) -> list[TenantShardId]:
response = self.request(
"PUT",
f"{self.api}/control/v1/tenant/{tenant_id}/shard_split",
f"{self.env.storage_controller_api}/control/v1/tenant/{tenant_id}/shard_split",
json={"new_shard_count": shard_count, "new_stripe_size": shard_stripe_size},
headers=self.headers(TokenScope.ADMIN),
)
@@ -2562,7 +2508,7 @@ class NeonStorageController(MetricsGetter, LogUtils):
def tenant_shard_migrate(self, tenant_shard_id: TenantShardId, dest_ps_id: int):
self.request(
"PUT",
f"{self.api}/control/v1/tenant/{tenant_shard_id}/migrate",
f"{self.env.storage_controller_api}/control/v1/tenant/{tenant_shard_id}/migrate",
json={"tenant_shard_id": str(tenant_shard_id), "node_id": dest_ps_id},
headers=self.headers(TokenScope.ADMIN),
)
@@ -2573,7 +2519,7 @@ class NeonStorageController(MetricsGetter, LogUtils):
log.info(f"tenant_policy_update({tenant_id}, {body})")
self.request(
"PUT",
f"{self.api}/control/v1/tenant/{tenant_id}/policy",
f"{self.env.storage_controller_api}/control/v1/tenant/{tenant_id}/policy",
json=body,
headers=self.headers(TokenScope.ADMIN),
)
@@ -2581,14 +2527,14 @@ class NeonStorageController(MetricsGetter, LogUtils):
def tenant_import(self, tenant_id: TenantId):
self.request(
"POST",
f"{self.api}/debug/v1/tenant/{tenant_id}/import",
f"{self.env.storage_controller_api}/debug/v1/tenant/{tenant_id}/import",
headers=self.headers(TokenScope.ADMIN),
)
def reconcile_all(self):
r = self.request(
"POST",
f"{self.api}/debug/v1/reconcile_all",
f"{self.env.storage_controller_api}/debug/v1/reconcile_all",
headers=self.headers(TokenScope.ADMIN),
)
r.raise_for_status()
@@ -2621,7 +2567,7 @@ class NeonStorageController(MetricsGetter, LogUtils):
"""
self.request(
"POST",
f"{self.api}/debug/v1/consistency_check",
f"{self.env.storage_controller_api}/debug/v1/consistency_check",
headers=self.headers(TokenScope.ADMIN),
)
log.info("storage controller passed consistency check")
@@ -2694,7 +2640,7 @@ class NeonStorageController(MetricsGetter, LogUtils):
self.request(
"POST",
f"{self.api}/control/v1/metadata_health/update",
f"{self.env.storage_controller_api}/control/v1/metadata_health/update",
json=body,
headers=self.headers(TokenScope.SCRUBBER),
)
@@ -2702,7 +2648,7 @@ class NeonStorageController(MetricsGetter, LogUtils):
def metadata_health_list_unhealthy(self):
response = self.request(
"GET",
f"{self.api}/control/v1/metadata_health/unhealthy",
f"{self.env.storage_controller_api}/control/v1/metadata_health/unhealthy",
headers=self.headers(TokenScope.ADMIN),
)
return response.json()
@@ -2712,7 +2658,7 @@ class NeonStorageController(MetricsGetter, LogUtils):
response = self.request(
"POST",
f"{self.api}/control/v1/metadata_health/outdated",
f"{self.env.storage_controller_api}/control/v1/metadata_health/outdated",
json=body,
headers=self.headers(TokenScope.ADMIN),
)
@@ -2735,7 +2681,7 @@ class NeonStorageController(MetricsGetter, LogUtils):
log.info("Asking storage controller to step down")
response = self.request(
"PUT",
f"{self.api}/control/v1/step_down",
f"{self.env.storage_controller_api}/control/v1/step_down",
headers=self.headers(TokenScope.ADMIN),
)
@@ -2752,7 +2698,7 @@ class NeonStorageController(MetricsGetter, LogUtils):
res = self.request(
"PUT",
f"{self.api}/debug/v1/failpoints",
f"{self.env.storage_controller_api}/debug/v1/failpoints",
json=[{"name": name, "actions": actions} for name, actions in pairs],
headers=self.headers(TokenScope.ADMIN),
)
@@ -2822,21 +2768,9 @@ class NeonStorageController(MetricsGetter, LogUtils):
parsed_tid, wait_ms=250
)
def get_leadership_status(self) -> StorageControllerLeadershipStatus:
metric_values = {}
for status in StorageControllerLeadershipStatus:
metric_value = self.get_metric_value(
"storage_controller_leadership_status", filter={"status": status}
)
metric_values[status] = metric_value
assert list(metric_values.values()).count(1) == 1
for status, metric_value in metric_values.items():
if metric_value == 1:
return status
raise AssertionError("unreachable")
@property
def workdir(self) -> Path:
return self.env.repo_dir
def __enter__(self) -> "NeonStorageController":
return self
@@ -2850,59 +2784,6 @@ class NeonStorageController(MetricsGetter, LogUtils):
self.stop(immediate=True)
class NeonProxiedStorageController(NeonStorageController):
def __init__(self, env: NeonEnv, proxy_port: int, auth_enabled: bool):
super(NeonProxiedStorageController, self).__init__(env, proxy_port, auth_enabled)
self.instances: dict[int, dict[str, Any]] = {}
def start(
self,
timeout_in_seconds: Optional[int] = None,
instance_id: Optional[int] = None,
base_port: Optional[int] = None,
):
assert instance_id is not None and base_port is not None
self.env.neon_cli.storage_controller_start(timeout_in_seconds, instance_id, base_port)
self.instances[instance_id] = {"running": True}
self.running = True
return self
def stop_instance(
self, immediate: bool = False, instance_id: Optional[int] = None
) -> "NeonStorageController":
assert instance_id in self.instances
if self.instances[instance_id]["running"]:
self.env.neon_cli.storage_controller_stop(immediate, instance_id)
self.instances[instance_id]["running"] = False
self.running = any(meta["running"] for meta in self.instances.values())
return self
def stop(self, immediate: bool = False) -> "NeonStorageController":
for iid, details in self.instances.items():
if details["running"]:
self.env.neon_cli.storage_controller_stop(immediate, iid)
self.instances[iid]["running"] = False
self.running = False
return self
def assert_no_errors(self):
for instance_id in self.instances.keys():
assert_no_errors(
self.env.repo_dir / f"storage_controller_{instance_id}" / "storage_controller.log",
"storage_controller",
self.allowed_errors,
)
def log_contains(
self, pattern: str, offset: None | LogCursor = None
) -> Optional[Tuple[str, LogCursor]]:
raise NotImplementedError()
@dataclass
class LogCursor:
_line_no: int
@@ -4639,7 +4520,7 @@ class StorageScrubber:
base_args = [
str(self.env.neon_binpath / "storage_scrubber"),
f"--controller-api={self.env.storage_controller.api_root()}",
f"--controller-api={self.env.storage_controller_api}",
]
args = base_args + args

View File

@@ -1,73 +0,0 @@
import re
from typing import Any, Optional
import pytest
import requests
from pytest_httpserver import HTTPServer
from werkzeug.datastructures import Headers
from werkzeug.wrappers.request import Request
from werkzeug.wrappers.response import Response
from fixtures.log_helper import log
class StorageControllerProxy:
def __init__(self, server: HTTPServer):
self.server: HTTPServer = server
self.listen: str = f"http://{server.host}:{server.port}"
self.routing_to: Optional[str] = None
def route_to(self, storage_controller_api: str):
self.routing_to = storage_controller_api
def port(self) -> int:
return self.server.port
def upcall_api_endpoint(self) -> str:
return f"{self.listen}/upcall/v1"
def proxy_request(method: str, url: str, **kwargs) -> requests.Response:
return requests.request(method, url, **kwargs)
@pytest.fixture(scope="function")
def storage_controller_proxy(make_httpserver):
"""
Proxies requests into the storage controller to the currently
selected storage controller instance via `StorageControllerProxy.route_to`.
This fixture is intended for tests that need to run multiple instances
of the storage controller at the same time.
"""
server = make_httpserver
self = StorageControllerProxy(server)
log.info(f"Storage controller proxy listening on {self.listen}")
def handler(request: Request):
if self.route_to is None:
log.info(f"Storage controller proxy has no routing configured for {request.url}")
return Response("Routing not configured", status=503)
route_to_url = f"{self.routing_to}{request.path}"
log.info(f"Routing {request.url} to {route_to_url}")
args: dict[str, Any] = {"headers": request.headers}
if request.is_json:
args["json"] = request.json
response = proxy_request(request.method, route_to_url, **args)
headers = Headers()
for key, value in response.headers.items():
headers.add(key, value)
return Response(response.content, headers=headers, status=response.status_code)
self.server.expect_request(re.compile(".*")).respond_with_handler(handler)
yield self
server.clear()

View File

@@ -403,7 +403,7 @@ def wait_until(
try:
res = func()
except Exception as e:
log.info("waiting for %s iteration %s failed: %s", func, i + 1, e)
log.info("waiting for %s iteration %s failed", func, i + 1)
last_exception = e
if show_intermediate_error:
log.info(e)

View File

@@ -282,16 +282,15 @@ def test_snap_files(
env = benchmark_project_pub.pgbench_env
connstr = benchmark_project_pub.connstr
pg_bin.run_capture(["pgbench", "-i", "-s100"], env=env)
with psycopg2.connect(connstr) as conn:
conn.autocommit = True
with conn.cursor() as cur:
cur.execute("SELECT rolsuper FROM pg_roles WHERE rolname = 'neondb_owner'")
is_super = cur.fetchall()[0][0]
is_super = cur.fetchall()[0]
assert is_super, "This benchmark won't work if we don't have superuser"
pg_bin.run_capture(["pgbench", "-i", "-s100"], env=env)
conn = psycopg2.connect(connstr)
conn.autocommit = True
cur = conn.cursor()

View File

@@ -1,7 +1,3 @@
import os
import random
import re
import subprocess
import threading
import time
@@ -21,17 +17,17 @@ def test_lfc_resize(neon_simple_env: NeonEnv, pg_bin: PgBin):
"test_lfc_resize",
config_lines=[
"neon.file_cache_path='file.cache'",
"neon.max_file_cache_size=512MB",
"neon.file_cache_size_limit=512MB",
"neon.max_file_cache_size=1GB",
"neon.file_cache_size_limit=1GB",
],
)
n_resize = 10
scale = 100
scale = 10
def run_pgbench(connstr: str):
log.info(f"Start a pgbench workload on pg {connstr}")
pg_bin.run_capture(["pgbench", "-i", f"-s{scale}", connstr])
pg_bin.run_capture(["pgbench", "-c10", f"-T{n_resize}", "-Mprepared", "-S", connstr])
pg_bin.run_capture(["pgbench", "-c4", f"-T{n_resize}", "-Mprepared", connstr])
thread = threading.Thread(target=run_pgbench, args=(endpoint.connstr(),), daemon=True)
thread.start()
@@ -39,21 +35,9 @@ def test_lfc_resize(neon_simple_env: NeonEnv, pg_bin: PgBin):
conn = endpoint.connect()
cur = conn.cursor()
for _ in range(n_resize):
size = random.randint(1, 512)
cur.execute(f"alter system set neon.file_cache_size_limit='{size}MB'")
for i in range(n_resize):
cur.execute(f"alter system set neon.file_cache_size_limit='{i*10}MB'")
cur.execute("select pg_reload_conf()")
time.sleep(1)
cur.execute("alter system set neon.file_cache_size_limit='100MB'")
cur.execute("select pg_reload_conf()")
thread.join()
lfc_file_path = f"{endpoint.pg_data_dir_path()}/file.cache"
lfc_file_size = os.path.getsize(lfc_file_path)
res = subprocess.run(["ls", "-sk", lfc_file_path], check=True, text=True, capture_output=True)
lfc_file_blocks = re.findall("([0-9A-F]+)", res.stdout)[0]
log.info(f"Size of LFC file {lfc_file_size}, blocks {lfc_file_blocks}")
assert lfc_file_size <= 512 * 1024 * 1024
assert int(lfc_file_blocks) <= 128 * 1024

View File

@@ -1,4 +1,3 @@
import concurrent.futures
import json
import threading
import time
@@ -17,7 +16,6 @@ from fixtures.neon_fixtures import (
PageserverSchedulingPolicy,
PgBin,
StorageControllerApiException,
StorageControllerLeadershipStatus,
TokenScope,
last_flush_lsn_upload,
)
@@ -32,9 +30,7 @@ from fixtures.pageserver.utils import (
timeline_delete_wait_completed,
)
from fixtures.pg_version import PgVersion
from fixtures.port_distributor import PortDistributor
from fixtures.remote_storage import RemoteStorageKind, s3_storage
from fixtures.storage_controller_proxy import StorageControllerProxy
from fixtures.utils import run_pg_bench_small, subprocess_capture, wait_until
from fixtures.workload import Workload
from mypy_boto3_s3.type_defs import (
@@ -2097,131 +2093,6 @@ def test_storage_controller_step_down(neon_env_builder: NeonEnvBuilder):
)
# This is a copy of NeonEnv.start which injects the instance id and port
# into the call to NeonStorageController.start
def start_env(env: NeonEnv, storage_controller_port: int):
timeout_in_seconds = 30
# Storage controller starts first, so that pageserver /re-attach calls don't
# bounce through retries on startup
env.storage_controller.start(timeout_in_seconds, 1, storage_controller_port)
# Wait for storage controller readiness to prevent unnecessary post start-up
# reconcile.
env.storage_controller.wait_until_ready()
# Start up broker, pageserver and all safekeepers
futs = []
with concurrent.futures.ThreadPoolExecutor(
max_workers=2 + len(env.pageservers) + len(env.safekeepers)
) as executor:
futs.append(
executor.submit(lambda: env.broker.try_start() or None)
) # The `or None` is for the linter
for pageserver in env.pageservers:
futs.append(
executor.submit(
lambda ps=pageserver: ps.start(timeout_in_seconds=timeout_in_seconds)
)
)
for safekeeper in env.safekeepers:
futs.append(
executor.submit(
lambda sk=safekeeper: sk.start(timeout_in_seconds=timeout_in_seconds)
)
)
for f in futs:
f.result()
@pytest.mark.parametrize("step_down_times_out", [False, True])
def test_storage_controller_leadership_transfer(
neon_env_builder: NeonEnvBuilder,
storage_controller_proxy: StorageControllerProxy,
port_distributor: PortDistributor,
step_down_times_out: bool,
):
neon_env_builder.num_pageservers = 3
neon_env_builder.storage_controller_config = {
"database_url": f"127.0.0.1:{port_distributor.get_port()}",
"start_as_candidate": True,
}
neon_env_builder.storage_controller_port_override = storage_controller_proxy.port()
storage_controller_1_port = port_distributor.get_port()
storage_controller_2_port = port_distributor.get_port()
storage_controller_proxy.route_to(f"http://127.0.0.1:{storage_controller_1_port}")
env = neon_env_builder.init_configs()
start_env(env, storage_controller_1_port)
assert (
env.storage_controller.get_leadership_status() == StorageControllerLeadershipStatus.LEADER
)
leader = env.storage_controller.get_leader()
assert leader["address"] == f"http://127.0.0.1:{storage_controller_1_port}/"
if step_down_times_out:
env.storage_controller.configure_failpoints(
("sleep-on-step-down-handling", "return(10000)")
)
env.storage_controller.allowed_errors.append(".*request was dropped before completing.*")
tenant_count = 2
shard_count = 4
tenants = set(TenantId.generate() for _ in range(0, tenant_count))
for tid in tenants:
env.storage_controller.tenant_create(
tid, shard_count=shard_count, placement_policy={"Attached": 1}
)
env.storage_controller.reconcile_until_idle()
env.storage_controller.start(
timeout_in_seconds=30, instance_id=2, base_port=storage_controller_2_port
)
if not step_down_times_out:
def previous_stepped_down():
assert (
env.storage_controller.get_leadership_status()
== StorageControllerLeadershipStatus.STEPPED_DOWN
)
wait_until(5, 1, previous_stepped_down)
storage_controller_proxy.route_to(f"http://127.0.0.1:{storage_controller_2_port}")
def new_becomes_leader():
assert (
env.storage_controller.get_leadership_status()
== StorageControllerLeadershipStatus.LEADER
)
wait_until(15, 1, new_becomes_leader)
leader = env.storage_controller.get_leader()
assert leader["address"] == f"http://127.0.0.1:{storage_controller_2_port}/"
env.storage_controller.wait_until_ready()
env.storage_controller.consistency_check()
if step_down_times_out:
env.storage_controller.allowed_errors.extend(
[
".*Leader.*did not respond to step-down request.*",
".*Send step down request failed.*",
".*Send step down request still failed.*",
]
)
def test_storage_controller_ps_restarted_during_drain(neon_env_builder: NeonEnvBuilder):
# single unsharded tenant, two locations
neon_env_builder.num_pageservers = 2